From 869b66f6b3129f3ec5550dffbfffbca67c9261ac Mon Sep 17 00:00:00 2001
From: Martin Malmsten <martin.malmsten@kb.se>
Date: Sun, 23 Feb 2020 21:05:22 +0100
Subject: [PATCH 01/80] * Added support for Albert when fine-tuning for NER

* Added support for Albert in NER pipeline

* Added command-line options to examples/ner/run_ner.py to better control tokenization

* Added class AlbertForTokenClassification

* Changed output for NerPipeline to use .convert_ids_to_tokens(...) instead of .decode(...) to better reflect tokens
---
 examples/ner/run_ner.py             | 30 ++++++++-
 src/transformers/__init__.py        |  1 +
 src/transformers/modeling_albert.py | 99 ++++++++++++++++++++++++++++-
 src/transformers/modeling_auto.py   |  2 +
 src/transformers/pipelines.py       |  2 +-
 5 files changed, 129 insertions(+), 5 deletions(-)

diff --git a/examples/ner/run_ner.py b/examples/ner/run_ner.py
index 70a7e9f8a7..9c24f7ade3 100644
--- a/examples/ner/run_ner.py
+++ b/examples/ner/run_ner.py
@@ -33,6 +33,9 @@ from tqdm import tqdm, trange
 from transformers import (
     WEIGHTS_NAME,
     AdamW,
+    AlbertConfig,
+    AlbertForTokenClassification,
+    AlbertTokenizer,
     BertConfig,
     BertForTokenClassification,
     BertTokenizer,
@@ -70,6 +73,7 @@ ALL_MODELS = sum(
 )
 
 MODEL_CLASSES = {
+    "albert": (AlbertConfig, AlbertForTokenClassification, AlbertTokenizer),
     "bert": (BertConfig, BertForTokenClassification, BertTokenizer),
     "roberta": (RobertaConfig, RobertaForTokenClassification, RobertaTokenizer),
     "distilbert": (DistilBertConfig, DistilBertForTokenClassification, DistilBertTokenizer),
@@ -77,6 +81,8 @@ MODEL_CLASSES = {
     "xlmroberta": (XLMRobertaConfig, XLMRobertaForTokenClassification, XLMRobertaTokenizer),
 }
 
+TOKENIZER_ARGS = ["do_lower_case", "strip_accents", "keep_accents", "use_fast"]
+
 
 def set_seed(args):
     random.seed(args.seed)
@@ -463,6 +469,22 @@ def main():
         "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model."
     )
 
+    parser.add_argument(
+        "--keep_accents", action="store_const", const=True, help="Set this flag if model is trained with accents."
+    )
+
+    parser.add_argument(
+        "--strip_accents", action="store_const", const=True, help="Set this flag if model is trained without accents."
+    )
+
+    parser.add_argument(
+        "--nouse_fast",
+        action="store_const",
+        dest="use_fast",
+        const=False,
+        help="Set this flag to not use fast tokenization.",
+    )
+
     parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.")
     parser.add_argument(
         "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation."
@@ -590,10 +612,12 @@ def main():
         label2id={label: i for i, label in enumerate(labels)},
         cache_dir=args.cache_dir if args.cache_dir else None,
     )
+    tokenizer_args = {k: v for k, v in vars(args).items() if v != None and k in TOKENIZER_ARGS}
+    logger.info("Tokenizer arguments: %s", tokenizer_args)
     tokenizer = tokenizer_class.from_pretrained(
         args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
-        do_lower_case=args.do_lower_case,
         cache_dir=args.cache_dir if args.cache_dir else None,
+        **tokenizer_args
     )
     model = model_class.from_pretrained(
         args.model_name_or_path,
@@ -636,7 +660,7 @@ def main():
     # Evaluation
     results = {}
     if args.do_eval and args.local_rank in [-1, 0]:
-        tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
+        tokenizer = tokenizer_class.from_pretrained(args.output_dir, **tokenizer_args)
         checkpoints = [args.output_dir]
         if args.eval_all_checkpoints:
             checkpoints = list(
@@ -658,7 +682,7 @@ def main():
                 writer.write("{} = {}\n".format(key, str(results[key])))
 
     if args.do_predict and args.local_rank in [-1, 0]:
-        tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
+        tokenizer = tokenizer_class.from_pretrained(args.output_dir, **tokenizer_args)
         model = model_class.from_pretrained(args.output_dir)
         model.to(args.device)
         result, predictions = evaluate(args, model, tokenizer, labels, pad_token_label_id, mode="test")
diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index ebacba6fdc..2d5bdd5e6a 100755
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -255,6 +255,7 @@ if is_torch_available():
         AlbertForMaskedLM,
         AlbertForSequenceClassification,
         AlbertForQuestionAnswering,
+        AlbertForTokenClassification,
         load_tf_weights_in_albert,
         ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
     )
diff --git a/src/transformers/modeling_albert.py b/src/transformers/modeling_albert.py
index 589a20d30d..dbaa50f565 100644
--- a/src/transformers/modeling_albert.py
+++ b/src/transformers/modeling_albert.py
@@ -600,7 +600,7 @@ class AlbertMLMHead(nn.Module):
         hidden_states = self.LayerNorm(hidden_states)
         hidden_states = self.decoder(hidden_states)
 
-        prediction_scores = hidden_states
+        prediction_scores = hidden_states + self.bias
 
         return prediction_scores
 
@@ -788,6 +788,103 @@ class AlbertForSequenceClassification(AlbertPreTrainedModel):
         return outputs  # (loss), logits, (hidden_states), (attentions)
 
 
+@add_start_docstrings(
+    """Albert Model with a token classification head on top (a linear layer on top of
+    the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
+    ALBERT_START_DOCSTRING,
+)
+class AlbertForTokenClassification(AlbertPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+
+        self.albert = AlbertModel(config)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.classifier = nn.Linear(config.hidden_size, self.config.num_labels)
+
+        self.init_weights()
+
+    @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING)
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        labels=None,
+    ):
+        r"""
+        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
+            Labels for computing the token classification loss.
+            Indices should be in ``[0, ..., config.num_labels - 1]``.
+
+    Returns:
+        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.AlbertConfig`) and inputs:
+        loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``labels`` is provided) :
+            Classification loss.
+        scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`)
+            Classification scores (before SoftMax).
+        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
+            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
+            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
+            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
+
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+            heads.
+
+    Examples::
+
+        from transformers import AlbertTokenizer, AlbertForTokenClassification
+        import torch
+
+        tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
+        model = AlbertForTokenClassification.from_pretrained('albert-base-v2')
+
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
+        labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0)  # Batch size 1
+        outputs = model(input_ids, labels=labels)
+
+        loss, scores = outputs[:2]
+
+        """
+
+        outputs = self.albert(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+        )
+
+        sequence_output = outputs[0]
+
+        sequence_output = self.dropout(sequence_output)
+        logits = self.classifier(sequence_output)
+
+        outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here
+
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()
+            # Only keep active parts of the loss
+            if attention_mask is not None:
+                active_loss = attention_mask.view(-1) == 1
+                active_logits = logits.view(-1, self.num_labels)[active_loss]
+                active_labels = labels.view(-1)[active_loss]
+                loss = loss_fct(active_logits, active_labels)
+            else:
+                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+            outputs = (loss,) + outputs
+
+        return outputs  # (loss), logits, (hidden_states), (attentions)
+
+
 @add_start_docstrings(
     """Albert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
     the hidden-states output to compute `span start logits` and `span end logits`). """,
diff --git a/src/transformers/modeling_auto.py b/src/transformers/modeling_auto.py
index ae7d88d5a3..dfca7d9d78 100644
--- a/src/transformers/modeling_auto.py
+++ b/src/transformers/modeling_auto.py
@@ -42,6 +42,7 @@ from .modeling_albert import (
     AlbertForMaskedLM,
     AlbertForQuestionAnswering,
     AlbertForSequenceClassification,
+    AlbertForTokenClassification,
     AlbertModel,
 )
 from .modeling_bart import BART_PRETRAINED_MODEL_ARCHIVE_MAP, BartForMaskedLM, BartForSequenceClassification, BartModel
@@ -233,6 +234,7 @@ MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING = OrderedDict(
         (RobertaConfig, RobertaForTokenClassification),
         (BertConfig, BertForTokenClassification),
         (XLNetConfig, XLNetForTokenClassification),
+        (AlbertConfig, AlbertForTokenClassification),
     ]
 )
 
diff --git a/src/transformers/pipelines.py b/src/transformers/pipelines.py
index 9761bb461b..904666ddea 100755
--- a/src/transformers/pipelines.py
+++ b/src/transformers/pipelines.py
@@ -636,7 +636,7 @@ class NerPipeline(Pipeline):
                 if self.model.config.id2label[label_idx] not in self.ignore_labels:
                     answer += [
                         {
-                            "word": self.tokenizer.decode([int(input_ids[idx])]),
+                            "word": self.tokenizer.convert_ids_to_tokens(int(input_ids[idx])),
                             "score": score[idx][label_idx].item(),
                             "entity": self.model.config.id2label[label_idx],
                         }

From 33eb8a165d1f6b95df2d79fe24edd773d98b24f5 Mon Sep 17 00:00:00 2001
From: Martin Malmsten <martin.malmsten@kb.se>
Date: Sun, 23 Feb 2020 21:43:31 +0100
Subject: [PATCH 02/80] Added trailing comma after **tokenizer_args

---
 examples/ner/run_ner.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/ner/run_ner.py b/examples/ner/run_ner.py
index 9c24f7ade3..d44edfd597 100644
--- a/examples/ner/run_ner.py
+++ b/examples/ner/run_ner.py
@@ -617,7 +617,7 @@ def main():
     tokenizer = tokenizer_class.from_pretrained(
         args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
         cache_dir=args.cache_dir if args.cache_dir else None,
-        **tokenizer_args
+        **tokenizer_args,
     )
     model = model_class.from_pretrained(
         args.model_name_or_path,

From 105dcb4162e6f8629bfb58a34ae0dcce2d0d5d10 Mon Sep 17 00:00:00 2001
From: Martin Malmsten <martin.malmsten@kb.se>
Date: Sun, 23 Feb 2020 21:47:59 +0100
Subject: [PATCH 03/80] Now passes style guide enforcement

---
 examples/ner/run_ner.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/ner/run_ner.py b/examples/ner/run_ner.py
index d44edfd597..08330dba7f 100644
--- a/examples/ner/run_ner.py
+++ b/examples/ner/run_ner.py
@@ -612,7 +612,7 @@ def main():
         label2id={label: i for i, label in enumerate(labels)},
         cache_dir=args.cache_dir if args.cache_dir else None,
     )
-    tokenizer_args = {k: v for k, v in vars(args).items() if v != None and k in TOKENIZER_ARGS}
+    tokenizer_args = {k: v for k, v in vars(args).items() if v is not None and k in TOKENIZER_ARGS}
     logger.info("Tokenizer arguments: %s", tokenizer_args)
     tokenizer = tokenizer_class.from_pretrained(
         args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,

From 65e7c90a770f574e7f776f424ab4d746eea4a834 Mon Sep 17 00:00:00 2001
From: Lysandre Debut <lysandre@huggingface.co>
Date: Tue, 25 Feb 2020 13:48:24 -0500
Subject: [PATCH 04/80] Adding usage examples for common tasks (#2850)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Usage: Sequence Classification & Question Answering

* Pipeline example

* Language modeling

* TensorFlow code for Sequence classification

* Custom TF/PT toggler in docs

* QA + LM for TensorFlow

* Finish Usage for both PyTorch and TensorFlow

* Addressing Julien's comments

* More assertive

* cleanup

* Favicon
- added favicon option in conf.py along with the favicon image
- updated 🤗 logo. slightly smaller and should appear more consistent across editing programs (no more tongue on the outside of the mouth)

Co-authored-by: joshchagani <joshua@joshuachagani.com>
---
 docs/source/_static/css/huggingface.css     |  22 +
 docs/source/_static/js/custom.js            |  69 +++
 docs/source/_static/js/huggingface_logo.svg |  48 +-
 docs/source/conf.py                         |   8 +-
 docs/source/favicon.ico                     | Bin 0 -> 47890 bytes
 docs/source/index.rst                       |   1 +
 docs/source/usage.rst                       | 597 ++++++++++++++++++++
 7 files changed, 697 insertions(+), 48 deletions(-)
 create mode 100644 docs/source/favicon.ico
 create mode 100644 docs/source/usage.rst

diff --git a/docs/source/_static/css/huggingface.css b/docs/source/_static/css/huggingface.css
index 4adf8f7533..808f8005fc 100644
--- a/docs/source/_static/css/huggingface.css
+++ b/docs/source/_static/css/huggingface.css
@@ -1,3 +1,25 @@
+/* Our DOM objects */
+
+.framework-selector {
+    display: flex;
+    flex-direction: row;
+    justify-content: flex-end;
+}
+
+.framework-selector > button {
+    background-color: white;
+    color: #6670FF;
+    border: 1px solid #6670FF;
+    padding: 5px;
+}
+
+.framework-selector > button.selected{
+    background-color: #6670FF;
+    color: white;
+    border: 1px solid #6670FF;
+    padding: 5px;
+}
+
 /* The literal code blocks */
 .rst-content tt.literal, .rst-content tt.literal, .rst-content code.literal {
     color: #6670FF;
diff --git a/docs/source/_static/js/custom.js b/docs/source/_static/js/custom.js
index 04cdfc1de6..ac9388531b 100644
--- a/docs/source/_static/js/custom.js
+++ b/docs/source/_static/js/custom.js
@@ -68,6 +68,74 @@ function addHfMenu() {
     document.body.insertAdjacentHTML('afterbegin', div);
 }
 
+function platformToggle() {
+    const codeBlocks = Array.from(document.getElementsByClassName("highlight"));
+    const pytorchIdentifier = "## PYTORCH CODE";
+    const tensorflowIdentifier = "## TENSORFLOW CODE";
+    const pytorchSpanIdentifier = `<span class="c1">${pytorchIdentifier}</span>`;
+    const tensorflowSpanIdentifier = `<span class="c1">${tensorflowIdentifier}</span>`;
+
+    const getFrameworkSpans = filteredCodeBlock => {
+        const spans = filteredCodeBlock.element.innerHTML;
+        const pytorchSpanPosition = spans.indexOf(pytorchSpanIdentifier);
+        const tensorflowSpanPosition = spans.indexOf(tensorflowSpanIdentifier);
+
+        let pytorchSpans;
+        let tensorflowSpans;
+
+        if(pytorchSpanPosition < tensorflowSpanPosition){
+            pytorchSpans = spans.slice(pytorchSpanPosition + pytorchSpanIdentifier.length + 1, tensorflowSpanPosition);
+            tensorflowSpans = spans.slice(tensorflowSpanPosition + tensorflowSpanIdentifier.length + 1, spans.length);
+        }else{
+            tensorflowSpans = spans.slice(tensorflowSpanPosition + tensorflowSpanIdentifier.length + 1, pytorchSpanPosition);
+            pytorchSpans = spans.slice(pytorchSpanPosition + pytorchSpanIdentifier.length + 1, spans.length);
+        }
+
+        return {
+            ...filteredCodeBlock,
+            pytorchSample: pytorchSpans ,
+            tensorflowSample: tensorflowSpans
+        }
+    };
+
+    const createFrameworkButtons = sample => {
+            const pytorchButton = document.createElement("button");
+            pytorchButton.innerText = "PyTorch";
+
+            const tensorflowButton = document.createElement("button");
+            tensorflowButton.innerText = "TensorFlow";
+
+            const selectorDiv = document.createElement("div");
+            selectorDiv.classList.add("framework-selector");
+            selectorDiv.appendChild(pytorchButton);
+            selectorDiv.appendChild(tensorflowButton);
+            sample.element.parentElement.prepend(selectorDiv);
+
+            // Init on PyTorch
+            sample.element.innerHTML = sample.pytorchSample;
+            pytorchButton.classList.add("selected");
+            tensorflowButton.classList.remove("selected");
+
+            pytorchButton.addEventListener("click", () => {
+                sample.element.innerHTML = sample.pytorchSample;
+                pytorchButton.classList.add("selected");
+                tensorflowButton.classList.remove("selected");
+            });
+            tensorflowButton.addEventListener("click", () => {
+               sample.element.innerHTML = sample.tensorflowSample;
+                tensorflowButton.classList.add("selected");
+                pytorchButton.classList.remove("selected");
+            });
+        };
+
+    codeBlocks
+        .map(element => {return {element: element.firstChild, innerText: element.innerText}})
+        .filter(codeBlock => codeBlock.innerText.includes(pytorchIdentifier) && codeBlock.innerText.includes(tensorflowIdentifier))
+        .map(getFrameworkSpans)
+        .forEach(createFrameworkButtons);
+}
+
+
 /*!
  * github-buttons v2.2.10
  * (c) 2019 なつき
@@ -85,6 +153,7 @@ function onLoad() {
     addGithubButton();
     parseGithubButtons();
     addHfMenu();
+    platformToggle();
 }
 
 window.addEventListener("load", onLoad);
diff --git a/docs/source/_static/js/huggingface_logo.svg b/docs/source/_static/js/huggingface_logo.svg
index 84974866ce..79a9e5d8a8 100644
--- a/docs/source/_static/js/huggingface_logo.svg
+++ b/docs/source/_static/js/huggingface_logo.svg
@@ -1,47 +1 @@
-<svg width="95px" height="88px" viewBox="0 0 95 88" version="1.1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
-    <!-- Generator: Sketch 43.2 (39069) - http://www.bohemiancoding.com/sketch -->
-    <title>icon</title>
-    <desc>Created with Sketch.</desc>
-    <defs>
-        <path d="M13,14.7890193 C22.8284801,14.7890193 26,6.02605902 26,1.5261751 C26,-0.812484109 24.4279133,-0.0763570998 21.9099482,1.17020987 C19.5830216,2.32219957 16.4482998,3.91011313 13,3.91011313 C5.82029825,3.91011313 0,-2.97370882 0,1.5261751 C0,6.02605902 3.17151989,14.7890193 13,14.7890193 Z" id="path-1"></path>
-    </defs>
-    <g id="Page-1" stroke="none" stroke-width="1" fill="none" fill-rule="evenodd">
-        <g id="icon_desktop">
-            <g id="icon">
-                <g id="icon_desktop">
-                    <g id="Group-2">
-                        <g id="Group">
-                            <path d="M93.7930402,70.08 C94.5430402,72.24 94.3630402,74.54 93.3630402,76.54 C92.6430402,78 91.6130402,79.13 90.3530402,80.14 C88.8330402,81.34 86.9430402,82.36 84.6630402,83.34 C81.9430402,84.5 78.6230402,85.59 77.1030402,85.99 C73.2130402,87 69.4730402,87.64 65.6830402,87.67 C60.2630402,87.72 55.5930402,86.44 52.2730402,83.17 C50.5530402,83.38 48.8130402,83.5 47.0630402,83.5 C45.4030402,83.5 43.7630402,83.4 42.1330402,83.2 C38.8030402,86.45 34.1530402,87.72 28.7530402,87.67 C24.9630402,87.64 21.2230402,87 17.3230402,85.99 C15.8130402,85.59 12.4930402,84.5 9.77304019,83.34 C7.49304019,82.36 5.60304019,81.34 4.09304019,80.14 C2.82304019,79.13 1.79304019,78 1.07304019,76.54 C0.0830401858,74.54 -0.106959814,72.24 0.653040186,70.08 C-0.0469598142,68.43 -0.226959814,66.54 0.323040186,64.45 C0.573040186,63.5 0.983040186,62.62 1.50304019,61.84 C1.39304019,61.43 1.30304019,61.01 1.24304019,60.55 C0.863040186,57.81 1.81304019,55.31 3.60304019,53.37 C4.48304019,52.4 5.43304019,51.73 6.42304019,51.3 C5.69304019,48.2 5.31304019,45.01 5.31304019,41.75 C5.31304019,18.69 24.0030402,0 47.0630402,0 C54.9830402,0 62.3930402,2.2 68.7130402,6.04 C69.8530402,6.74 70.9730402,7.49 72.0430402,8.29 C72.5730402,8.69 73.1030402,9.1 73.6130402,9.53 C74.1330402,9.95 74.6430402,10.39 75.1330402,10.84 C76.6130402,12.19 78.0030402,13.64 79.2730402,15.19 C79.7030402,15.7 80.1130402,16.23 80.5130402,16.77 C81.3230402,17.84 82.0730402,18.95 82.7630402,20.1 C83.8130402,21.82 84.7330402,23.62 85.5330402,25.49 C86.0630402,26.74 86.5230402,28.02 86.9330402,29.33 C87.5430402,31.29 88.0130402,33.31 88.3330402,35.39 C88.4330402,36.08 88.5230402,36.78 88.5930402,37.48 C88.7330402,38.88 88.8130402,40.3 88.8130402,41.75 C88.8130402,44.97 88.4330402,48.13 87.7230402,51.18 C88.8230402,51.61 89.8630402,52.31 90.8330402,53.37 C92.6230402,55.31 93.5730402,57.82 93.1930402,60.56 C93.1330402,61.01 93.0430402,61.43 92.9330402,61.84 C93.4530402,62.62 93.8630402,63.5 94.1130402,64.45 
C94.6630402,66.54 94.4830402,68.43 93.7930402,70.08" id="Fill-1" fill="#FFFFFF" fill-rule="nonzero"></path>
-                            <circle id="Oval" fill="#FFD21E" fill-rule="nonzero" cx="46.75" cy="41.75" r="34.75"></circle>
-                            <path d="M81.5,41.75 C81.5,22.5581049 65.9418951,7 46.75,7 C27.5581049,7 12,22.5581049 12,41.75 C12,60.9418951 27.5581049,76.5 46.75,76.5 C65.9418951,76.5 81.5,60.9418951 81.5,41.75 Z M8,41.75 C8,20.3489659 25.3489659,3 46.75,3 C68.1510341,3 85.5,20.3489659 85.5,41.75 C85.5,63.1510341 68.1510341,80.5 46.75,80.5 C25.3489659,80.5 8,63.1510341 8,41.75 Z" id="Oval" fill="#FFAC03" fill-rule="nonzero"></path>
-                            <path d="M57.1723547,31.7151181 C58.0863134,32.7107502 57.3040427,35.2620959 58.7620957,35.2620959 C61.5235194,35.2620959 63.7620957,33.0235196 63.7620957,30.2620959 C63.7620957,27.5006721 61.5235194,25.2620959 58.7620957,25.2620959 C56.0006719,25.2620959 53.7620957,27.5006721 53.7620957,30.2620959 C53.7620957,31.5654666 56.3553563,30.8251108 57.1723547,31.7151181 Z" id="Oval-2" fill="#3A3B45" fill-rule="nonzero" transform="translate(58.762096, 30.262096) rotate(-28.000000) translate(-58.762096, -30.262096) "></path>
-                            <path d="M32.1723553,31.7151181 C33.086314,32.7107502 32.3040433,35.2620959 33.7620963,35.2620959 C36.52352,35.2620959 38.7620963,33.0235196 38.7620963,30.2620959 C38.7620963,27.5006721 36.52352,25.2620959 33.7620963,25.2620959 C31.0006725,25.2620959 28.7620963,27.5006721 28.7620963,30.2620959 C28.7620963,31.5654666 31.3553569,30.8251108 32.1723553,31.7151181 Z" id="Oval-2" fill="#3A3B45" fill-rule="nonzero" transform="translate(33.762096, 30.262096) scale(-1, 1) rotate(-28.000000) translate(-33.762096, -30.262096) "></path>
-                            <g id="Oval-4" transform="translate(33.500000, 41.500000)">
-                                <g id="Mask" fill-rule="nonzero" fill="#3A3B45">
-                                    <path d="M13,14.7890193 C22.8284801,14.7890193 26,6.02605902 26,1.5261751 C26,-0.812484109 24.4279133,-0.0763570998 21.9099482,1.17020987 C19.5830216,2.32219957 16.4482998,3.91011313 13,3.91011313 C5.82029825,3.91011313 0,-2.97370882 0,1.5261751 C0,6.02605902 3.17151989,14.7890193 13,14.7890193 Z" id="path-1"></path>
-                                </g>
-                                <g id="Clipped">
-                                    <mask id="mask-2" fill="white">
-                                        <use xlink:href="#path-1"></use>
-                                    </mask>
-                                    <g id="path-1"></g>
-                                    <path d="M13.25,25 C18.0399291,25 21.9229338,21.1169953 21.9229338,16.3270662 C21.9229338,12.5962324 19.5672252,9.41560375 16.2620987,8.19147116 C16.1404592,8.14641904 16.0175337,8.10401696 15.8933923,8.06433503 C15.0599892,7.79793679 14.1717882,10.6623144 13.25,10.6623144 C12.3886883,10.6623144 11.5567012,7.77968641 10.7713426,8.01349068 C7.18916268,9.07991937 4.57706621,12.3984489 4.57706621,16.3270662 C4.57706621,21.1169953 8.46007093,25 13.25,25 Z" id="Shape" fill="#EF4E4E" fill-rule="nonzero" mask="url(#mask-2)"></path>
-                                </g>
-                            </g>
-                            <circle id="Oval-3" fill="#FFD21E" fill-rule="nonzero" style="mix-blend-mode: multiply;" cx="70.25" cy="33.75" r="3.25"></circle>
-                            <circle id="Oval-3" fill="#FFD21E" fill-rule="nonzero" style="mix-blend-mode: multiply;" cx="23.75" cy="33.75" r="3.25"></circle>
-                        </g>
-                    </g>
-                </g>
-                <g id="Group-4" transform="translate(3.000000, 48.000000)" fill-rule="nonzero">
-                    <path d="M14.0619453,0 L14.0619453,0 C12.4429453,0 10.9959453,0.665 9.98694534,1.871 C9.36294534,2.618 8.71094534,3.822 8.65794534,5.625 C7.97894534,5.43 7.32594534,5.321 6.71594534,5.321 C5.16594534,5.321 3.76594534,5.915 2.77594534,6.994 C1.50394534,8.379 0.938945345,10.081 1.18494534,11.784 C1.30194534,12.595 1.57294534,13.322 1.97794534,13.995 C1.12394534,14.686 0.494945345,15.648 0.190945345,16.805 C-0.0470546551,17.712 -0.291054655,19.601 0.982945345,21.547 C0.901945345,21.674 0.825945345,21.806 0.754945345,21.941 C-0.0110546551,23.395 -0.0600546551,25.038 0.615945345,26.568 C1.64094534,28.887 4.18794534,30.714 9.13394534,32.675 C12.2109453,33.895 15.0259453,34.675 15.0509453,34.682 C19.1189453,35.737 22.7979453,36.273 25.9829453,36.273 C31.8369453,36.273 36.0279453,34.48 38.4399453,30.944 C42.3219453,25.25 41.7669453,20.042 36.7439453,15.022 C33.9639453,12.244 32.1159453,8.148 31.7309453,7.249 C30.9549453,4.587 28.9029453,1.628 25.4919453,1.628 L25.4909453,1.628 C25.2039453,1.628 24.9139453,1.651 24.6279453,1.696 C23.1339453,1.931 21.8279453,2.791 20.8949453,4.085 C19.8879453,2.833 18.9099453,1.837 18.0249453,1.275 C16.6909453,0.429 15.3579453,0 14.0619453,0 M14.0619453,4 C14.5719453,4 15.1949453,4.217 15.8819453,4.653 C18.0149453,6.006 22.1309453,13.081 23.6379453,15.833 C24.1429453,16.755 25.0059453,17.145 25.7829453,17.145 C27.3249453,17.145 28.5289453,15.612 25.9239453,13.664 C22.0069453,10.733 23.3809453,5.942 25.2509453,5.647 C25.3329453,5.634 25.4139453,5.628 25.4919453,5.628 C27.1919453,5.628 27.9419453,8.558 27.9419453,8.558 C27.9419453,8.558 30.1399453,14.078 33.9159453,17.851 C37.6919453,21.625 37.8869453,24.654 35.1349453,28.69 C33.2579453,31.442 29.6649453,32.273 25.9829453,32.273 C22.1639453,32.273 18.2489453,31.379 16.0549453,30.81 C15.9469453,30.782 2.60394534,27.013 4.29394534,23.805 C4.57794534,23.266 5.04594534,23.05 5.63494534,23.05 C8.01494534,23.05 12.3439453,26.592 14.2049453,26.592 C14.6209453,26.592 
14.9139453,26.415 15.0339453,25.983 C15.8269453,23.138 2.97694534,21.942 4.05994534,17.821 C4.25094534,17.092 4.76894534,16.796 5.49694534,16.797 C8.64194534,16.797 15.6979453,22.328 17.1769453,22.328 C17.2899453,22.328 17.3709453,22.295 17.4149453,22.225 C18.1559453,21.029 17.7499453,20.194 12.5269453,17.033 C7.30394534,13.871 3.63794534,11.969 5.72294534,9.699 C5.96294534,9.437 6.30294534,9.321 6.71594534,9.321 C9.88694534,9.322 17.3789453,16.14 17.3789453,16.14 C17.3789453,16.14 19.4009453,18.243 20.6239453,18.243 C20.9049453,18.243 21.1439453,18.132 21.3059453,17.858 C22.1729453,16.396 13.2529453,9.636 12.7499453,6.847 C12.4089453,4.957 12.9889453,4 14.0619453,4" id="Fill-1" fill="#FFAC03"></path>
-                    <path d="M35.1348,28.6899 C37.8868,24.6539 37.6918,21.6249 33.9158,17.8509 C30.1398,14.0779 27.9418,8.5579 27.9418,8.5579 C27.9418,8.5579 27.1208,5.3519 25.2508,5.6469 C23.3808,5.9419 22.0078,10.7329 25.9248,13.6639 C29.8418,16.5939 25.1448,18.5849 23.6378,15.8329 C22.1308,13.0809 18.0158,6.0059 15.8818,4.6529 C13.7488,3.2999 12.2468,4.0579 12.7498,6.8469 C13.2528,9.6359 22.1738,16.3959 21.3058,17.8589 C20.4378,19.3209 17.3788,16.1399 17.3788,16.1399 C17.3788,16.1399 7.8068,7.4289 5.7228,9.6989 C3.6388,11.9689 7.3038,13.8709 12.5268,17.0329 C17.7508,20.1939 18.1558,21.0289 17.4148,22.2249 C16.6728,23.4209 5.1428,13.6999 4.0598,17.8209 C2.9778,21.9419 15.8268,23.1379 15.0338,25.9829 C14.2408,28.8289 5.9828,20.5979 4.2938,23.8049 C2.6038,27.0129 15.9468,30.7819 16.0548,30.8099 C20.3648,31.9279 31.3108,34.2969 35.1348,28.6899" id="Fill-4" fill="#FFD21E"></path>
-                </g>
-                <g id="Group-4" transform="translate(70.500000, 66.500000) scale(-1, 1) translate(-70.500000, -66.500000) translate(50.000000, 48.000000)" fill-rule="nonzero">
-                    <path d="M14.0619453,0 L14.0619453,0 C12.4429453,0 10.9959453,0.665 9.98694534,1.871 C9.36294534,2.618 8.71094534,3.822 8.65794534,5.625 C7.97894534,5.43 7.32594534,5.321 6.71594534,5.321 C5.16594534,5.321 3.76594534,5.915 2.77594534,6.994 C1.50394534,8.379 0.938945345,10.081 1.18494534,11.784 C1.30194534,12.595 1.57294534,13.322 1.97794534,13.995 C1.12394534,14.686 0.494945345,15.648 0.190945345,16.805 C-0.0470546551,17.712 -0.291054655,19.601 0.982945345,21.547 C0.901945345,21.674 0.825945345,21.806 0.754945345,21.941 C-0.0110546551,23.395 -0.0600546551,25.038 0.615945345,26.568 C1.64094534,28.887 4.18794534,30.714 9.13394534,32.675 C12.2109453,33.895 15.0259453,34.675 15.0509453,34.682 C19.1189453,35.737 22.7979453,36.273 25.9829453,36.273 C31.8369453,36.273 36.0279453,34.48 38.4399453,30.944 C42.3219453,25.25 41.7669453,20.042 36.7439453,15.022 C33.9639453,12.244 32.1159453,8.148 31.7309453,7.249 C30.9549453,4.587 28.9029453,1.628 25.4919453,1.628 L25.4909453,1.628 C25.2039453,1.628 24.9139453,1.651 24.6279453,1.696 C23.1339453,1.931 21.8279453,2.791 20.8949453,4.085 C19.8879453,2.833 18.9099453,1.837 18.0249453,1.275 C16.6909453,0.429 15.3579453,0 14.0619453,0 M14.0619453,4 C14.5719453,4 15.1949453,4.217 15.8819453,4.653 C18.0149453,6.006 22.1309453,13.081 23.6379453,15.833 C24.1429453,16.755 25.0059453,17.145 25.7829453,17.145 C27.3249453,17.145 28.5289453,15.612 25.9239453,13.664 C22.0069453,10.733 23.3809453,5.942 25.2509453,5.647 C25.3329453,5.634 25.4139453,5.628 25.4919453,5.628 C27.1919453,5.628 27.9419453,8.558 27.9419453,8.558 C27.9419453,8.558 30.1399453,14.078 33.9159453,17.851 C37.6919453,21.625 37.8869453,24.654 35.1349453,28.69 C33.2579453,31.442 29.6649453,32.273 25.9829453,32.273 C22.1639453,32.273 18.2489453,31.379 16.0549453,30.81 C15.9469453,30.782 2.60394534,27.013 4.29394534,23.805 C4.57794534,23.266 5.04594534,23.05 5.63494534,23.05 C8.01494534,23.05 12.3439453,26.592 14.2049453,26.592 C14.6209453,26.592 
14.9139453,26.415 15.0339453,25.983 C15.8269453,23.138 2.97694534,21.942 4.05994534,17.821 C4.25094534,17.092 4.76894534,16.796 5.49694534,16.797 C8.64194534,16.797 15.6979453,22.328 17.1769453,22.328 C17.2899453,22.328 17.3709453,22.295 17.4149453,22.225 C18.1559453,21.029 17.7499453,20.194 12.5269453,17.033 C7.30394534,13.871 3.63794534,11.969 5.72294534,9.699 C5.96294534,9.437 6.30294534,9.321 6.71594534,9.321 C9.88694534,9.322 17.3789453,16.14 17.3789453,16.14 C17.3789453,16.14 19.4009453,18.243 20.6239453,18.243 C20.9049453,18.243 21.1439453,18.132 21.3059453,17.858 C22.1729453,16.396 13.2529453,9.636 12.7499453,6.847 C12.4089453,4.957 12.9889453,4 14.0619453,4" id="Fill-1" fill="#FFAC03"></path>
-                    <path d="M35.1348,28.6899 C37.8868,24.6539 37.6918,21.6249 33.9158,17.8509 C30.1398,14.0779 27.9418,8.5579 27.9418,8.5579 C27.9418,8.5579 27.1208,5.3519 25.2508,5.6469 C23.3808,5.9419 22.0078,10.7329 25.9248,13.6639 C29.8418,16.5939 25.1448,18.5849 23.6378,15.8329 C22.1308,13.0809 18.0158,6.0059 15.8818,4.6529 C13.7488,3.2999 12.2468,4.0579 12.7498,6.8469 C13.2528,9.6359 22.1738,16.3959 21.3058,17.8589 C20.4378,19.3209 17.3788,16.1399 17.3788,16.1399 C17.3788,16.1399 7.8068,7.4289 5.7228,9.6989 C3.6388,11.9689 7.3038,13.8709 12.5268,17.0329 C17.7508,20.1939 18.1558,21.0289 17.4148,22.2249 C16.6728,23.4209 5.1428,13.6999 4.0598,17.8209 C2.9778,21.9419 15.8268,23.1379 15.0338,25.9829 C14.2408,28.8289 5.9828,20.5979 4.2938,23.8049 C2.6038,27.0129 15.9468,30.7819 16.0548,30.8099 C20.3648,31.9279 31.3108,34.2969 35.1348,28.6899" id="Fill-4" fill="#FFD21E"></path>
-                </g>
-            </g>
-        </g>
-    </g>
-</svg>
\ No newline at end of file
+<svg clip-rule="evenodd" fill-rule="evenodd" stroke-linejoin="round" stroke-miterlimit="2" viewBox="0 0 127 118" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink"><clipPath id="a"><path clip-rule="nonzero" d="m62 75.052c13.105 0 17.333-11.684 17.333-17.684 0-3.118-2.096-2.136-5.453-.474-3.103 1.536-7.282 3.653-11.88 3.653-9.573 0-17.333-9.179-17.333-3.179s4.228 17.684 17.333 17.684z"/></clipPath><path d="m125.057 93.44c1 2.88.76 5.947-.573 8.613-.96 1.947-2.333 3.454-4.013 4.8-2.027 1.6-4.547 2.96-7.587 4.267-3.627 1.547-8.053 3-10.08 3.533-5.187 1.347-10.173 2.2-15.227 2.24-7.226.067-13.453-1.64-17.88-6-2.293.28-4.613.44-6.946.44-2.214 0-4.4-.133-6.574-.4-4.44 4.334-10.64 6.027-17.84 5.96-5.053-.04-10.04-.893-15.24-2.24-2.013-.533-6.44-1.986-10.066-3.533-3.04-1.307-5.56-2.667-7.574-4.267-1.693-1.346-3.066-2.853-4.026-4.8-1.32-2.666-1.574-5.733-.56-8.613-.934-2.2-1.174-4.72-.44-7.507.333-1.266.88-2.44 1.573-3.48-.147-.546-.267-1.106-.347-1.72-.506-3.653.76-6.986 3.147-9.573 1.173-1.293 2.44-2.187 3.76-2.76-.973-4.133-1.48-8.387-1.48-12.733 0-30.747 24.92-55.667 55.667-55.667 10.56 0 20.44 2.933 28.866 8.053 1.52.934 3.014 1.934 4.44 3 .707.534 1.414 1.08 2.094 1.654.693.56 1.373 1.146 2.026 1.746 1.974 1.8 3.827 3.734 5.52 5.8.574.68 1.12 1.387 1.654 2.107 1.08 1.427 2.08 2.907 3 4.44 1.4 2.293 2.626 4.693 3.693 7.187.707 1.666 1.32 3.373 1.867 5.12.813 2.613 1.44 5.306 1.866 8.08.134.92.254 1.853.347 2.786.187 1.867.293 3.76.293 5.694 0 4.293-.506 8.506-1.453 12.573 1.467.573 2.853 1.507 4.147 2.92 2.386 2.587 3.653 5.933 3.146 9.587-.08.6-.2 1.16-.346 1.706.693 1.04 1.24 2.214 1.573 3.48.733 2.787.493 5.307-.427 7.507" fill="#fff" fill-rule="nonzero"/><circle cx="62.333" cy="55.667" fill="#ffd21e" r="46.333"/><g fill-rule="nonzero"><path d="m108.667 55.667c0-25.59-20.744-46.334-46.334-46.334-25.589 0-46.333 20.744-46.333 46.334 0 25.589 20.744 46.333 46.333 46.333 25.59 0 46.334-20.744 46.334-46.333zm-98 0c0-28.535 23.132-51.667 
51.666-51.667 28.535 0 51.667 23.132 51.667 51.667 0 28.534-23.132 51.666-51.667 51.666-28.534 0-51.666-23.132-51.666-51.666z" fill="#ffac03"/><path d="m77.387 43.055c1.7.6 2.376 4.093 4.092 3.181 3.251-1.729 4.485-5.765 2.757-9.016-1.729-3.251-5.765-4.485-9.016-2.757-3.251 1.729-4.485 5.765-2.757 9.016.816 1.535 3.406-.96 4.924-.424z" fill="#3a3b45"/><path d="m45.978 43.055c-1.699.6-2.375 4.093-4.092 3.181-3.251-1.729-4.485-5.765-2.756-9.016 1.728-3.251 5.765-4.485 9.016-2.757 3.251 1.729 4.485 5.765 2.756 9.016-.815 1.535-3.405-.96-4.924-.424z" fill="#3a3b45"/><path d="m62 75.052c13.105 0 17.333-11.684 17.333-17.684 0-3.118-2.096-2.136-5.453-.474-3.103 1.536-7.282 3.653-11.88 3.653-9.573 0-17.333-9.179-17.333-3.179s4.228 17.684 17.333 17.684z" fill="#3a3b45"/></g><g clip-path="url(#a)"><path d="m62.333 88.667c6.387 0 11.564-5.178 11.564-11.564 0-4.975-3.141-9.216-7.548-10.848-.162-.06-.326-.116-.491-.169-1.111-.355-2.296 3.464-3.525 3.464-1.148 0-2.257-3.844-3.305-3.532-4.776 1.422-8.259 5.847-8.259 11.085 0 6.386 5.178 11.564 11.564 11.564z" fill="#ef4e4e" fill-rule="nonzero"/></g><circle cx="93.667" cy="45" fill="#ffd21e" r="4.333"/><circle cx="31.667" cy="45" fill="#ffd21e" r="4.333"/><path d="m22.749 64c-2.158 0-4.088.887-5.433 2.495-.832.996-1.701 2.601-1.772 5.005-.905-.26-1.776-.405-2.589-.405-2.067 0-3.934.792-5.254 2.23-1.696 1.847-2.449 4.116-2.121 6.387.156 1.081.517 2.051 1.057 2.948-1.138.921-1.977 2.204-2.382 3.747-.318 1.209-.643 3.728 1.056 6.322-.108.17-.21.346-.304.526-1.022 1.938-1.087 4.129-.186 6.169 1.367 3.092 4.763 5.528 11.358 8.143 4.102 1.626 7.856 2.666 7.889 2.676 5.424 1.406 10.329 2.121 14.576 2.121 7.805 0 13.393-2.391 16.609-7.105 5.176-7.592 4.436-14.536-2.261-21.23-3.707-3.704-6.171-9.165-6.684-10.364-1.035-3.549-3.771-7.494-8.319-7.494h-.001c-.383 0-.769.03-1.151.09-1.992.314-3.733 1.46-4.977 3.186-1.343-1.67-2.647-2.998-3.827-3.747-1.778-1.128-3.556-1.7-5.284-1.7m0 5.333c.68 0 1.511.29 2.427.871 2.844 1.804 8.332 11.237 10.341 
14.907.674 1.229 1.824 1.749 2.86 1.749 2.056 0 3.662-2.044.188-4.641-5.222-3.908-3.39-10.296-.897-10.69.109-.017.217-.025.321-.025 2.267 0 3.267 3.907 3.267 3.907s2.931 7.36 7.965 12.39c5.035 5.032 5.295 9.071 1.626 14.452-2.503 3.67-7.294 4.778-12.203 4.778-5.092 0-10.312-1.192-13.237-1.951-.144-.037-17.935-5.063-15.682-9.34.379-.719 1.003-1.007 1.788-1.007 3.174 0 8.946 4.723 11.427 4.723.555 0 .945-.236 1.105-.812 1.058-3.793-16.076-5.388-14.632-10.883.255-.972.946-1.366 1.916-1.365 4.194 0 13.602 7.375 15.574 7.375.15 0 .258-.044.317-.138.988-1.594.447-2.708-6.517-6.922-6.964-4.216-11.852-6.752-9.072-9.779.32-.349.773-.504 1.324-.504 4.228.001 14.217 9.092 14.217 9.092s2.696 2.804 4.327 2.804c.374 0 .693-.148.909-.513 1.156-1.95-10.737-10.963-11.408-14.682-.455-2.52.319-3.796 1.749-3.796" fill="#ffac03" fill-rule="nonzero"/><path d="m50.846 102.253c3.67-5.381 3.41-9.42-1.625-14.452-5.035-5.03-7.965-12.39-7.965-12.39s-1.095-4.275-3.588-3.882c-2.494.394-4.324 6.782.898 10.69 5.223 3.906-1.04 6.561-3.049 2.892-2.009-3.67-7.496-13.103-10.341-14.907-2.844-1.804-4.847-.793-4.176 2.925.67 3.719 12.565 12.732 11.408 14.683-1.158 1.949-5.236-2.292-5.236-2.292s-12.763-11.615-15.542-8.588c-2.778 3.027 2.108 5.563 9.072 9.779 6.966 4.214 7.506 5.328 6.518 6.922-.99 1.595-16.363-11.366-17.807-5.872-1.443 5.495 15.689 7.09 14.632 10.883-1.057 3.795-12.068-7.18-14.32-2.904-2.253 4.277 15.537 9.303 15.681 9.34 5.747 1.491 20.342 4.649 25.44-2.827" fill="#ffd21e" fill-rule="nonzero"/><path d="m102.584 64c2.159 0 4.088.887 5.433 2.495.832.996 1.702 2.601 1.772 5.005.906-.26 1.776-.405 2.59-.405 2.066 0 3.933.792 5.253 2.23 1.696 1.847 2.449 4.116 2.121 6.387-.156 1.081-.517 2.051-1.057 2.948 1.139.921 1.977 2.204 2.383 3.747.317 1.209.642 3.728-1.056 6.322.108.17.209.346.304.526 1.021 1.938 1.086 4.129.185 6.169-1.367 3.092-4.763 5.528-11.357 8.143-4.103 1.626-7.856 2.666-7.89 2.676-5.424 1.406-10.329 2.121-14.576 2.121-7.805 0-13.393-2.391-16.609-7.105-5.176-7.592-4.436-14.536 
2.261-21.23 3.707-3.704 6.171-9.165 6.684-10.364 1.035-3.549 3.771-7.494 8.319-7.494h.001c.383 0 .77.03 1.151.09 1.992.314 3.733 1.46 4.977 3.186 1.343-1.67 2.647-2.998 3.827-3.747 1.779-1.128 3.556-1.7 5.284-1.7m0 5.333c-.68 0-1.511.29-2.427.871-2.844 1.804-8.332 11.237-10.341 14.907-.673 1.229-1.824 1.749-2.86 1.749-2.056 0-3.661-2.044-.188-4.641 5.223-3.908 3.391-10.296.897-10.69-.109-.017-.217-.025-.321-.025-2.267 0-3.267 3.907-3.267 3.907s-2.93 7.36-7.965 12.39c-5.035 5.032-5.295 9.071-1.625 14.452 2.502 3.67 7.293 4.778 12.202 4.778 5.092 0 10.312-1.192 13.238-1.951.144-.037 17.934-5.063 15.681-9.34-.379-.719-1.003-1.007-1.788-1.007-3.173 0-8.945 4.723-11.427 4.723-.554 0-.945-.236-1.105-.812-1.057-3.793 16.076-5.388 14.632-10.883-.255-.972-.945-1.366-1.916-1.365-4.193 0-13.601 7.375-15.573 7.375-.151 0-.259-.044-.318-.138-.988-1.594-.446-2.708 6.518-6.922 6.964-4.216 11.852-6.752 9.072-9.779-.32-.349-.774-.504-1.324-.504-4.228.001-14.218 9.092-14.218 9.092s-2.696 2.804-4.326 2.804c-.375 0-.694-.148-.91-.513-1.156-1.95 10.738-10.963 11.408-14.682.455-2.52-.318-3.796-1.749-3.796" fill="#ffac03" fill-rule="nonzero"/><path d="m74.487 102.253c-3.669-5.381-3.409-9.42 1.625-14.452 5.035-5.03 7.966-12.39 7.966-12.39s1.094-4.275 3.588-3.882c2.493.394 4.324 6.782-.899 10.69-5.223 3.906 1.04 6.561 3.049 2.892 2.01-3.67 7.496-13.103 10.342-14.907 2.844-1.804 4.846-.793 4.176 2.925-.671 3.719-12.566 12.732-11.408 14.683 1.157 1.949 5.236-2.292 5.236-2.292s12.762-11.615 15.541-8.588-2.108 5.563-9.072 9.779c-6.965 4.214-7.505 5.328-6.517 6.922.989 1.595 16.362-11.366 17.806-5.872 1.443 5.495-15.689 7.09-14.632 10.883 1.058 3.795 12.068-7.18 14.32-2.904 2.254 4.277-15.537 9.303-15.681 9.34-5.747 1.491-20.341 4.649-25.44-2.827" fill="#ffd21e" fill-rule="nonzero"/></svg>
\ No newline at end of file
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 18d52e9f20..763b3ac707 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -20,7 +20,7 @@ sys.path.insert(0, os.path.abspath('../../src'))
 # -- Project information -----------------------------------------------------
 
 project = u'transformers'
-copyright = u'2019, huggingface'
+copyright = u'2020, huggingface'
 author = u'huggingface'
 
 # The short X.Y version
@@ -105,6 +105,12 @@ html_static_path = ['_static']
 #
 # html_sidebars = {}
 
+# This must be the name of an image file (path relative to the configuration 
+# directory) that is the favicon of the docs. Modern browsers use this as 
+# the icon for tabs, windows and bookmarks. It should be a Windows-style 
+# icon file (.ico).
+html_favicon = 'favicon.ico'
+
 
 # -- Options for HTMLHelp output ---------------------------------------------
 
diff --git a/docs/source/favicon.ico b/docs/source/favicon.ico
new file mode 100644
index 0000000000000000000000000000000000000000..424101de717bf411d1db8456030c97d184d4c005
GIT binary patch
literal 47890
zcmeFZ1z1(h+Bd%GZjg{xL=3t+MFBxY2_*#qu|SXzNht-zqCv1g1qB5qq*J9!QIS+S
zHod7m-^^B?<MGkwod0>x|GTdD`ksAV!&+<B%zgjvn0sdKS&G3BVMsBIj2L+3$806T
zV0K|J7=Hfm&#V~CX;>#N{{49;1{1>x8^9lGf6yI+DS;if!e>YV)_kjn7YqjW`&U08
zg&_mGF%&<yLv`rg&-LgTJocvr^E3P3j)T~=UQm9w8`QkVf-7LMPZ^B#DL0OTE2*&V
zQ6DI8ngp?6e(C@yh@c_#9edA}Hw22zngI}g3jq5Y05-rP`5J&jZ@`)fEMo8;)p5Wg
z0^3-%08l;(iU91pv<eRCGy*_X4ghBz0DSEL@OJ>f3j0u(!1`j?7xqDQ=p6@aLs15R
zF{A;O7R<0qz)S_UWq|!zs~`>7mb(c6#?tRtY5`z^?NJ>Ee23c5!Dl^42du7|gUL~q
z1F#<PVyOUt8PY+8u@drxGV!&3qrLtPT`7tHu!Z!%&?j}!Uc-E+4-~WKLAtLXotFU6
zLd@F}>VU3gJJ7ao0h^zG0yMA>dPf7>Y=$Lw5dRAGMLMh<172I83~Uf97v#xbItny*
z8vr$v6F_sXHBhm11REPyVS(7H0Z@e4v5Vjo>Wh9^1K_g2cir(K{ngEbEzdguZ_6^^
ztDgtL&)NZRD<1H+0l))`cs&4}Sm6JyHwN=&6r9}-`=fWfEqJg!r3h>;=mCfpZ_64G
zdGr=+ebNMYpbfa7-qjiam{`O+uJ;Gi6AFfJ!73ZX$J@FF6r8<)WM~rLc@Op1xCE5!
zP6M%Pj{t8cv<1}liDrPq_qWOX(LcqSGUYRP!3A}IFz`X12nR3JJ;Kb}yh^x7yoI0-
zl@Ei*z(MMtd>IV)@AQ2K29ORpkli45K12)khxZ$xy*9#k3ElY5pr;iLuRr|&c1ZSC
zH@;2@>W2&BWI_G{%SOm^7nC{kE2zeQ{UrIPjz3g@(s<Udi@@$qH^>Tk2b#lSdD0Iu
z7gm84$c<t8q3K`V0yyk}|89D4$&l_Z8~#835*d;o7Ij!q{3QHfjQ!vAgOP+7%*?QX
z=}{qoo8AIuhc^5x{HP6~E$V~%LK^?7&yVi`7Rv+{u!q3x)E%(A*o<4n505Q^`O!tN
zfbIcqt^&r_LqGOG_ppyCz|EI0tl)=7VB2vVm|r5aUuXgI*i<-=97ME#?zjGopPS=^
z?IIxF?)EtVZuY@>0**0J7!x<<!f^+SXdwU|?*JHBfUTAm%VC@@!mlpB0!#B{9Wwwp
z!8jmN0Aqa)j2E!IB#f;VaLgus0pRl@?6bPk2Nq{8!#j?3zUYcyUQh&UtKBar0MLQ)
zo$?6)m@ELu^S{p@Ft)PR06-7t2h39#YhX+_fpmAlx;-#<Sitzk1$kng!TABs5p1xY
z1@=Mh$zWU5hYr#^0Nd6;oM3gi7c4I;Al_hnoF1&Ly*meEBIYriLm*EyAMrw22(haD
zAMpx3qxp&?`x{MaIG<2K88I2KACwox#UJC_`aKVfd+acVlEQh06t=zA3o)!LeT0AM
z!1$0{{UiWHncr}8Ls^(0U3Q2S^(Vwy6mP#5REOq4awsc`q1=!*DU@#mYL8;-4{g`+
zutA+NmqMFB>_|V15dT+bL;O5e222i!bzs2~4XmSqx?+YpLvuA+6A<S8pJ*cfXs#uL
z?RlV%(A-A~?e#1E2oHhIcX|i|CDaqo3joG&U<oYE?gSHGb*rHK#E=GROIRa7`RSk^
zaW#CS|A$;iUMK@O<V_F#a=jkS-vqi4&$|5R9m0lWp#3fv!b@0ZKws!tgxa6QYJ-Ws
z?X6fa%aQ|a39(Vb!UoqOs4X4zcjQY<5H}0dA)<}iP(1&JgP>o?hYR{58MFZ{#Eked
zqBRR_kMzU<WkIx9pkApU4+>Z|LmwN(;b*|ixClf+0!y<BY)g)G)BFuL71R;3Gs2H}
z67&LbAYCK95M+nE&^(Op(fW=A`Vf*2;X(R9^3y~Aq=Ebp2IO1BP<9L0AI1>q@rd_h
zlI1l}F9q=--)Dh5kt|3CL>KX9gJX%{XOK6NiMR8cE(G4<0e=G?@VtX<VPSy0kZn*u
zCWw`wkM?i2MD?stCqy{_?1%n<UtVklj~|l|#^A!V+sjD+HbI$*AAi&DIt`+S-)JE{
zBA?@iv_&6af!4`+u*YH!?6#Q&N<oWYQx%S|215EldP8eIqDPP?#ExV~V}t^ZJyB?*
zM(F!sVfrlMjed_GLp@+euDu$EeIS3-7L5f&ix%nxJ)?WH?huX10K1i}fzj?RV4(X7
z?A_A>3^iW?wL^|Tux1=ELH!^*kU>4K(?C}gBY2^FFfOlw`RR*@=Qr-({^Cb^EpNcM
zfZ`s~9qO|I`X15)vL{y)0D@KHKvP!_?2?lJy7D?eS5^_|%4vaJ3MxQXN)$*1+yXq%
zHmLo^hHt)s{1DMXen0``Xq^FgFg2w9BX4vMW+yJXzK8mU82O>i5MNXX;|X37-5tpV
zV7J(Ipsl6`l#ZSN(!N)Lta}hp+jj`^7YFJ_hX7v_T-&04-}|o9L4J*aIxLukwx1b0
z{wMwm*eE+VzQ~}y|KR&WEJJJ7?RQFm+}R)?^zsAXff%_zzyfjc!?pCzxEvto8UWYW
z3xDulj|J#T;1A`0G74=!Gvf6p{<D)2PLL*w(ZAyVhaMmfwBG)aHevm~UcauhAF%((
z|2gzAFg5J<C;n4I9)X{r|2_W&e?$|lz2RN~$-A!GA2FaYNEnBW%is0~Kk|qDkw2gq
zTQd#$W5*8viT`M?!NafLY=m_6BW>PxJdp9c1Y|wK!M1y)K%jc^PZ&fV)`Fce&w$F|
zqhM3n;6GtN`b6u0re^^3&4U&E)aakq>iFJwoSoD7QAQ~LKg}huFMr)4(A;AHbfv|C
zuA~^)n$bXzm4FfH2JS=D_M1Q-6a%}(cR)WWCG2t5Wk7Ql6^!Lta4Z0@2It+c)IZCQ
zp5b~c(*){?432>xeSoI}4<y2F11$|rpy+TG@V}k?C;5@y(LPE`Qwu0t9|r<86aOSX
zns+fUCZrBQ_Op}mzv4~cKQXYUWE`v!!?tL>@FNDaR}!q60DNfA1@G5oTfd_5fb4?C
zAG%+!N3zqw*eMA88oL4(!Q33oaQylQ(GkOS-3v4hQ5+)p0n+cfFCn@=KCjoWUw?X!
zcymBG(7GyP3{s!R-u*S_f7F8U!EK|<_|Y9Oexvyit;7D3=Q{77u822^gGlcV(4WET
zQa@Vb{)5x+?*Mk#{QW#w63m0~AP2^nMraEp+fO`x`3&`g+9LZQ-Ur}y39QY6#aWHt
zbNurgJa){n3$Ei;;9Q32pm_z^1JNe<1+v5Uf}Yn4n)7L(ucLYFB*eP}qYcU-{h7nR
zt^w0yd*@a`SNLZrPY%QZ<0log3tEd4<`~Ee#TB$(WQJplu-1lH#bBFf$lj~V9bjQW
z>0fjDeG4!@#s%l<*w1rd@<t!rAHeyXBlp`}hT<6s#6%5!RRG$`tRBYq5dfx_;XY{=
zdl@45?SAL?9Dn@=jDFby*K2-ouTb)30SwhofaNkMV<jxDQ($Rg348%dxMKYD*g5>d
z1kbOj{C8@>@Mm@~+9LtR2Gqf1zZzV_i-ND6|I2pQ{~cZYP33P5{H=liy&6DyLbM>U
zf#r`pAv}Be`}e<718~kIhJ_0jd06(tV)MQ1$K&ym@SgVHl?=80-shj_z<NX*@%aCg
zTm2nhNQVWM{cvuN1<SY=fSX^0&%gMs0{9-TLI7G@aQ_88Zb&Nv?kD=;BIp-%U=?m*
z0jw-H6L<o!57w~!j^6L;VIKv!pNYdSEsm};`2>+ahWS>1xJR|E1HcHbmrz0X5pciU
zI{Url>I!TH%wf@kd0bOiOyIpPEM{PNX%JpP#Uua$kd6h+Eg8bR(tfz^bAao;Ft`WJ
z9fo<}Wz_2155KZB3}uOfPlUYc?=Yi!*hT{Kdk*j`gw8dPk1Jf;?kxD8`+W%4n(45h
zE6m{#%J({QxV}6B@r-@z`i=Qdf7_sp>ZP#XV_1HsgPy<F(ZhA40^Az~!Tnpu+;<`f
z)7siI<afW40k*P*x!VOiytxhYHFB^&amII=@QQMS^l<%+@@VW(U!1RDfp|AS8=$-{
z$_pX>JD{xkP^Yr64&~Lw;hI^c6y~*Sp<~1KEs~uQ(qe{nC>M-qpuH5LgX-w=zvbH2
zWu$^@VLiA8e=!8U{Z?0yM7BT50H1e3y|1-Q!vO^OuIqQ54;S<iLf*UWAN!2;eFfp#
z{w=47a)Jb#ApIiQp-l)^^o;BUuPA?u@^0(<m7nRLXI^M?Bo`yZg7Sj|Jh1OUsJD?N
zh<0ra4IXq%k9>v%>b>d<tVd@Hs4dFXA|0c%V3e~W>=ohtFZX5N>k(g+Q$_haRFF>5
z9)=qhR=CGPwxoFWi@on(&_!q9h&Rf!Qy@8@ebvk05Qm_l43H`z$IAxyfQ!K&;QO?1
zK7z0y+z0~=%sC*t67=#HypU|D4a!TRJsu%P2x%ZaGs6BT=ZVhM(eq#FVf`IFq=Rg=
z5$>Ik9B7Y8urJha{s=^lo1ce&H^2hVrDmsm-+z;j0s1S_J@P~99O&<m=eqB#)BfF+
zfEDJK5muB>UiT4{Z%1_~=l<{N0qxBhVQ!P^Im|ypOaxuRTnF;QY%~^d3ljwFP{jZ@
z_x0H@d<XT0>`V*WAsG-(WQTQm{*?cM?|+m9VMAvQ1pULYiJp-ROr_uOqg?f0=^wR6
z`)4}1XGQ1P>v~82gXBPTX<(cnoS7{z^+WtLuxtS<IN$>P4EYxm)E~l(`Xbw*GY*uu
zNBRFBc@wZfe(U(rJJjZ%{2a=M+M#iQa@Q!|jqXw1AN}C{&(G+LfDDcyq<3^afv}*l
zh_ImXc^@H%m<iY+DX9N72WT&ZANe{O`zTf-8mJE%he#%rzei^Yh#xu=BAmy4$4EFw
zgY6N1bT08TCZv1hLud>mdh72|KcsUMlhD{fIYxvR(MSG^bcb|`Vjvn12xDR|M7}hS
zVh}GZBoKdf`6TQQ+amtxtb!8qA;chv8}UN8*Y$^VNbobr59uA9+o1T5`Y}TJ*RieZ
zc>PKk+fW8H4iR2d(0L!~i+mH=57`Oj_SbceVi%GD(I$e}_+9|;1*v{@32O7(Sp$sy
z=ixor33ZOf00X?D7>sBj`=N6J5~y3G6NDF?K~X~8Y=rL+R`ed}7LC*2VMjSBgb|%T
zp|(g)WK)zQLimt<5I;2Tk#5j4lAC}5>IcO^<U44N|Ni?UR}XWOQ6sRD2GSv%Gky07
z6a&#S(h0INqJ?yX;ybcAAACkJg>be6^@z^0{>UjJyb#|X&);bxyhs=8G!RWh1Mxt1
zUH4NoR*{}i`*j)6T!`j(!f$OYa(xblXU?aVU?n;SLTymoBIq36lfZ)f8PPyFJY;L6
z`*oi{zJPeGk8eW0AGYPM$HQ}{H6ZW?o)17-xZi)11=$as`5+shE5eKHh~z<d5na><
zjbTI=)uXXUk9;5Me-HGnZ@<+Of8d8C;2b=0?+YAXP!Fix`Zz#meTWCbfczKf8rcNt
z8rcN3Bh0JFpWgwnEgR<Sy=K8~D=g4A#ezLYut4+FG?2KBgWnlJI#4!bJ7fc-3#1bi
z2ax^|ZKl$1_>mlF&4uVAIncOehtG9bc+Rx8ItpdjxQ-uP;j<ykOU~9!!i6r30jLk6
zht8AHI78zXJrm@DG?5LE97vDM&}Vrec61hCu)7@W(^~{a23TOYcam`3r#B7u?ZyJ-
zlMev@n<;_}D3+kU2qPN9>vY%0@A`a;#w<D?x&9Hx{dtITWqA%^{So{6J*-!QaS1p<
zouGLR(M0r+o{&6foFP9!KC}J~$%lMH6rR!Sk`V{HlnsG_UNtZ@#Dd)#o<Lvs8N8na
zd$g|*&JyKMUx4QfYlOIf=DKy<$j&G}uRkN6$afKb3TUfNgc>(L4gX55<Nf&x$AR=J
zj7(_Fi`I_F2a#@2+(f*Posn-Md!Sf^#v3~G5P%y64ZXc!m%JR%QIZ8Z%4$GY%^Kdz
zL3wn+E>#ntqbLKP72p}E3=m1H{&o(8=s=oij1v4F`Y0M}$hL?l(i1ft6Ud*x$NU9|
z|F_@zz_WAcwNS?pGdhz->mFoB6hqhPqwD%Tik0Zx4xLfz%1A<diV!EnC@TqeNsGd(
zG_;8XEMo917s&?Cb!DW0{7Iib&bbhcA7d`z9`Z%w0*yJec0j)G3hSWfqd)RL>wWzh
zkDK%Efd)YKLf<5;WBscuI)9aQ4<?-RA{o$Gmx8k=5WD^mp0^c4*`5O#uW+DYun)>3
zMZm9aX#Csr*dKWzSvcTWMEXbjpYK1Tb-X`c;rV%ZJDl5)57Iz?`B5)F(*5xs#T<ke
z;g<5h0X7x)6LgF0f@DKiRQTUa1JPSgfZ72w_^qct^f~M=&)a^ajrgOr3>u67h?~IT
zXWZ-0a364~4dO>*=HJDSY`m%D3lOXrCE)zi8T5C5Bg_Fv4tNeOl-C7)6P+9XwT*wo
zzx$8453JVl{(J=sGk#qw(D&ikMPmo8`Ti9T^d8N#Kb}qgWgjT-Z)N<6AHjd%9E1Gl
z0Gy-0|BQdez5Wc>5y!@1BlH~<iaY;u--xiFdG#m0|IRa^AC&vA+9Leun<F%So#A-?
z{!?AY`|}m7tY|{@gRS|{hoBA6Iv3gfmpmYEf!b-XBdLJ!9(^-`#=<Y#ewPouCyWc&
zcgO8lgzrxN%5ErDqud6<f4vI=TV7m)e>eS%d;J-{<A!Y(4}G^8Vg39Y{5<g61GzJP
zV7J6hAQ_whzcI)HI}%?0);EyPY)>f$qBk>v#O1q$c%|s*2Ic+st@5wrM*fT9KbkwM
zQD9$ML@R&Fb-X`c;X7hD2AUHGzR8UCKS&3EqK$M9&*U|=cL5z3*U_AWzF$y1WC4Bd
z`*+ko`xNY>Z0P`Wp}eTQj-ovL-bNj4Djoc_%usJAcY(ecKzWe`2o9{Rr2UNf=VvJ6
z#jnsxXfK22aP*DBpYTK4d<~0)Z!1ttC(KLmJ0At7bHBy{WkLQf=Nd?m^V>Wu4d-k_
z!2f#smp+R0#tPRH$aieuJpcV?{xja6pJBV5Xm5$W=|tlm(fE}O2y4x>Dxk4vFJaDA
zGd72D@zd||L;pc@gQ^)^SHL-1Q+E&ar&qrpGjP5|`7^X;ESrF+;QAQ;{g-F4uniHw
z%@tKo!}SyFi}FUlv;op79FJ%o5_<805ZC`zTtx5r8gTGCsgFRgdg8a(k-pgAnilC#
z73Ocg{noDktn1%B!}b~|zr7E}Tx35cxOPUd>`#7#d>ry3;Qrlv)aJ(m+Y#Ese*aXz
z-Uhx$eov0ppHP2AqY%yN@;v;z`FD8#Tn}xqyqML!1hGT?qxHo4UjCOn*LnXJt|;c9
z{14Kb?LYiJ^v@Xox(3QC2*>htav!7x*B@x@k8(=C8Vmm|nUIZ<t&!h|z&U6fO-8G$
z&`SZsU!(o?TiEw7TC=X_wvbK#e~|&=CSXT%71R}4zt>M8>??~f->>*<bpKv0jJd&(
z5;$7_&8BEy!M>iKf_(p{x<|Ot9)qwyhuD$7zCu3#ZLNRw-$VKPR*(=8%vIk&6y2b2
zBK@Eo5hI-2c;K3Z;7|X790)fmXnl|3B=UEDIM-I9DESThpT33sJ<7j+0|`PIRzeWv
zyZztt0Ei!&-<V6k#R~!+7*o*omp1v1mmn{U?I@p27^hGd$luW!OCMaj!*jq@=(kRP
zjp#pn3#nVcF^%nq<H{0_Wpw6+<Urp$p?o{a^`Y^Ba^EO_M_9i?tjIsmS`g_P<ucL!
z7TFN>-SiUfZF?YP^jlf(hJSVbGj#u6FGzn2imk}TO>+R)!Eu4|GDwFA55k4yLHlQN
zSWnph!`6her0;D|UO@`_-wn8?gd~v8(RoTNy!)qrKl1M(`p;THUb|qfr3mIx0Tw=F
zjll2fpq`aWzxgpUj3MaXmN3A=0p-{Z^CRfIH5{(98)sn$^uxg%^2#!l3dsIvDE{ZY
z2<PZ4OQ8@8I^RYSn{YCXWEzEK92PWr{_$I!ga1Nfp%(u89&cDD{69zV-=P4p5W~U`
z%PxrRI9zW9f|bQ9aJ*cB{^1YpbPT?O>ym%Y`~7#g|4rp@4g9Tvzcui;2L9H--x~N^
z1AlAaZw>t4(ZHpBhPrgrT<G7e(e2r-eE@wejS3|>yjYxl=zyNw4(MuPo;Grizz6wb
zW_uj=?!|0__mmi-u;cJQzxmdQ3tll8;#A`G8X`<8$v^JVOKm}UxGPu3?9tXZ=tVSM
zOp)}!)EwvU;kmqE_578{=KGAA%+3#~iggPq1*wUHsJBs@;7wTftGD+s=#WjPhX<t2
zaUC8G=54(>;hW>@>EW>m)^7Xsyh&DC^zCt*>WVLx@=0k*DmbGsuVAFqByxt*a`<R!
zM4%{nA;*99W+Woeye)6^bAnt%pmDlk!<H-Meoaq}T$4j?TBO9uoQPSplqV5yX2Y}<
z-LxH5O~{VRj&#{zD`UH7zT2@xik9hoM!B@rBBTAn^#qr=v5Uc4!6d=S!8|ghi3Qy{
zomW!D4cXHq$P0;W65M$sRd*jUWoi_*(9CD(;|?-Nktj4!`H&i&osu1!DxQA@%Tlp3
zIFV{2Bkc-qqC&b%-4x?)BV0bd5E1Ds(pMf#!8p0sOe;=z_@KOD^-D%p@}5l>ZelGf
zk|{&%Rz2=iBAh&}0^)1uAI~sWY@(E04I$mTI6|-MRK^p0hUMv|3(+pyi8X9nLfRq&
zpJL?q@?a{P9t^KAKNV08brG`NBCg!9m6R+<D|C!9h5rJHt&3c#r92(SFm-^4_W5(%
zS$lwSurRaqSks(pCx*U68^_qE7#v7nBWl1z-ss++=q_D3cmAWyM2dzNbKg--b}l>h
z^nOPwum`7Hp?=~H&CPJgHg5Bm3f$Q1<@n$s7G7=nEBOv-Q|*_W%P#R{EhXzjJ2c#J
z-a&=+G?|YT=E11lr=Jp96%OisNB^)XGPC0N=C|=-MsZfB3lp${6*s92oA&Z11zpmm
z4;3#`3by;CC0`^GyeYU(r6WK_rT*k+${Y{j=>kr}rMleGQc3(t2YN>)2fK$jB}eh#
z5}n`&9od_M@n16q&G;%X`n2RG+2?HprpdQtS=za!&GZ=ud$Op8iMu3gvVRPQ(49yG
zZ7-w`a=$N)Ol2=!bKzE%e}4UeLLQ?ZcZDkJmphO>^#EV&8}l4D=D^<PyjrK4-dP>V
zS@s!tuQ}lx6MOu;&GKzBO42VMd(8#vOXY(@Vom&pZdcAmvjrYVH1)|k?uBEoc*R62
zx^J|CRAQeIHB0?uy&nC5)^s0Pa4d6gsBB<@<=&Dh3W4z@#lDT=YkS#ts#Rpiy}y2G
zqPX+oYFiRt?T1s3o1A2KlZ>V96So&ne7=&+8p<)hqcG@Pc)s$6o)k73_UvxPz!tww
z^%$Uyb0j4SAJxp@2>;UN+%F}=6B)(B9gk;Q)to;SV5ekuWO3()&weo>FRg7QL=*5F
zX%}jo%*I?JRYwfE^^UO8nm<gNvXz&}-w^2HO<Bjc*UP^8fXnL^)y`ceQ(KByG@9pG
zas%E&)e#>TS0x$1&IZE~AYedC-^Ed}&{Q@Ttg<6P>G_Ka?1G+q^PBlLHdabAn#0>Q
zr}&9`1xd^AX__tJ7-Ouer7qA-5N$q&^AFDBr<`;$igDZ}YDsjhxk+5LeWR787hRtf
z)t;Gq+%^TP9`8y|F1b|a^3_}t)~3s1a7`m77MR_?``m0}24#@UdqG;>k5BJZc$~#=
ztmxT9IeGh0+)2k*g~X|(iQ>Tc(#%RQMYMK<zwb#?$|Gvb6*Y^`!>&eWSIW;h1jjU;
zIXkj3UGN1x>jgJ4n^=;YTgF_{cX(qMwiv|3ne;VLPU$d4Vn2m&(;9HJDJ&VT)QaQR
zoT{dnZtFM^896B@dv44&=vbhTJLqz^vVLhnwCiG^0mt0jq0GQ}F?(aFv!3dFW5K<%
z_wg*jWkXwT)zKPu5!*J|oNSin@K;qU)xez#df!lfsZx9ugu1LKxeWTo56E0iAsX~`
z+xj-jQpzCC$At1k3VTI?LH3S(d9^)D8+QhHvUCUVcUthS)n9I5!^o`B&8mfdD&_Ag
z9d!NpRU++y%=8p4opBOxCDd?IW^F`WBJ!3-QT=4y=mC~9(xbyVF$3+Z4kj9Wl4fe<
z-eQjC9hJfd7A!a8Rf5Y(_inx!AG07mf7<a5Q;p*>eJ>KCZ4ts>@0&J{y!r%~YJ|Ci
zYpRCFAGr*cXL6P*k~x;U4hHn(7dGiCr?eRj_kASF3w)MW=XZsrc&;)!H_-7mhl9zc
z+HgH(is7eRk?*^>)Sk*c+!9aSw_(gl>EYDd+ULI111Cey79R6b6Y;0eaM7<*s80>P
zF~&(J4hGZ%Zrcq$Ou)qUbWb-Wh0QLC<CCwOZW|j-e0?we;q^DRq)pT<Vf@C6jOXbL
z4$PG$`K~qwx+Y(IzZ^&Art2`Ic=l!26UK)WZ&JCAvd)B8VD7jCDH~qH%a?Ms;pzFZ
zR&R^n)+o2xV6Hm9ls9nua&l^tLi=$x`brw>2p#Ne6RB>)iuVGf-JhMwrG%1IJa!RB
zGUNsK2&`@(nlcuYuZf*L<Gnp&_H?3hP1A;Vug9OZO+C20Mj~HhGwG)(*5u7yJS>@~
zBC^ThnmO4AU;Wa-BSP0B3)mQ`1#qg7E>Z^Zy3aVS3UFMV&S=t#&=}jK-*(`G_odfz
z)Wj~m&At6S4$oSmy|35ZE{&k1Ca%wyVrkq>gqgM0^8`El_U}FS%Gg^CZ<jWMos1}W
znXt_0VSirI<ke#N_(gI9oF5sc$+7OBxP|@cw?hlLn!8k}AKmQQ%6eB_oswJivJHPE
zKSwSzD|x*5as3C!i-J-;dGUd<lG3=nE-%uelaeJb$Y}PqJ=uJF^7wPnq{)fhPu1!H
zv0&+yY-xp-lBr6Ze-%?4U#7VACyvs$bC0JLoN(m=@7gNov@8}|WZWbj<gCt~GCHFz
zqPNSFQlesNe+^w&HZ47$hCJh(NUkx5y=qXw80Q9?`s5X6EIEB5$u^GRUhEJh@io7}
zfY0Z4VIs5Qqn7C(UVd0w_sKWNIO`04jPdmH_Ha+WV6y$fTXxnfxAk5h8G62rz9uEZ
zoFUec%db%Uf~{7@aI0o>lnZOOV)!8!$|Cw%pU2y1KL#0R(wciNRchsri)|aDdOIqM
zU-R*v+fEs1IbhvTFYFkM30QjH>N~}+tWdDg(~7-<!K%pu|9<<x$>ifK#PZ9V%coZ{
z2Bny`bGHqQ(xk48%wjjc=%Bv1hBKz;tvaC2p{*85>=Be|_nbZOJU*-coiES(y;E}e
zfm}tT0=-j_VVq(Z8+^uawI_?{?$L&U9Ri+7{-1j8yvU{%cAl(tRS{Y|8q;`jgk3%9
zBIEK&N*SNngU>iOh#cV6oR?=0!?KKPF>+T+hMSa=k_A=<u~2>zinXOHx-?QLEFGb|
zGr*LeRJHkJ#mEN#T{M_D5byq4jX7rXxyZ4(GdQ-S3q3Vm6f({kCISgdUs@<C#Vjxb
z1NCiH?~MnI*tc<9Bc7*9(&pd}b)DM9$o-Ka!DVyd$@7AhB#xULq-NaSV*7g@7>z`@
zt8c63Bu?{j5Do9+q!L$4;_fzZ8Mmf&PCnAjToHw*bYmq>3V+wc;m>%gl(Oc1{gguG
z3tx$aDesMsm^6CBdrxzRQe@Xy+>&O}xTZj*64gt6q4c&<Gxw@Yi*=MtfOdkn_y8#;
zl7Xx`*`{$SwzBV4^Kxlq7}&(7dE(HRW272pK+Nj3C~?iQ=v&vWo{&Lt{zIc~3N_VM
ztE4N8q^|O^L=gr(>T03A$NU=&%Ee?&Qy9B-gJK0pv7}8q;`S3`22DRnc}1nq-&$;M
zYzo6ZiM@0DbclKM#wQrsr;Fw_*3Tc-;hv<5i|yS)d375lmSOx;9hOR`>-n{J%=fZq
zt=|Zn$B7KzX=?3r)Am)>w9!3(qcKXsjucZr^|r)+a}ht)L>~TBb)6cq?F;7_pR$m~
z;_4kDX7wEY^jepm));B}8($ZU7EI-23;v9wpg+n~Wq#4*H9M8KX6cg)tb8^XEtK=b
zd0d!7h6G7v9|(_($33yh9ym-o+mwD_N>Di`_pIpGwaKgymld^xNk&hcSZbql#HZPh
z9uPgImlfp`qQSm}o#a49kGVqlmYxt9mxKOSbY800&{J6`-)F>zKX=E8?oYYIn7>J7
z&rr=tBDdrYj<^&ePn>7PP5l!d?i+hD*1Q+S{E26c-X4Fs<ixOafpMt&a&=~BPSs=c
zkz?#><Rpq@x{Rbg;#3Qhca5&LMeoE>Bnb^`1XxcmFg6B?w)U-P`zmVoKAqc~%$Z|O
zKm5?7Y<14c9-kg-QcogZ+Bj^}MEB&%V#l(f$WHTJW#cJyEdmUME?g45S8FSSdCW+3
zu}wP8r*GW<oZf3PyiyeSsG?^;QK2}d>;+{YT{;s9i>PA33l`Bw4V-1Gw1or2DMQf?
z#bXiTph>dSz*=Tv{9@g)IStw?(N1d`q#EzL#B`Q!>0ApklISQ58>#rLG_o;mD)9w*
zQu#BZrdojurFj~gC~?v_`j*?FjGZ?&6we;rs}G}W+d!bk1us_-y*yv;L4Sc0O^eb~
z<2S{Vt{#4#^h{N?=IJ{zoyFr3*UU*idD_wSZjDTrkTKr4{PA<hSW1L-Z%RVE0FL>A
z6O~P<fm+;C;sA{!xUUD?U->jil%{kzziZ2S(^D(mqG#ae81dZLRgY$3m;5P?YY9fF
z8$J4li#ML&S3Hrp8O~*nDp#aZ!HalbQoaXYh6G6C4kTDLZ}u?sA85vpH&dP2@#GEp
zYEaLz4hQ>DCyz~0#@@5j*sO~2sR_f0=>EbsL3ynl2BH!>CE~ziAr16Yti&07>=hRQ
ziO{v86RRqzeM3ydn{TU%KY>0RwUe(~r%Y5VfDVKX`ipP{xY|n9^O??GpKoR3mJ5j}
z-}Lo5nPprk$F-5D*4QKQLh_+xotGKSyw6dZ7MF4gU>zv~dXf!06J+5)^Op`^6W6$?
z%qQ!g+K*Lw!@57Geu^kzyXkXRid^j}7e9&5a%~ur19@6OrzitRaO4Rp*S8)fHlpkk
zn5ty9xYlaZ5uzX36VF`0(r-Y6<~`Eqs`_Te;ki#}t4kIxD6XmBtdA<_tc&dDk_P-6
zEOJSy^+!vRL^-f0HBG~(v<1d(clY`(ALPp=Rmm)l>BDHf&HWr4sc_F2E7eA%!vo{7
zpUzY|C4;E;!?EQo|1I=tF_&cTEhXbQ4_C!SN%*I1DJ5+U;S3`>ew3lU$n8XR;fWfq
zeUwRSZ34<zOZl76nCk+U2C-Xr1S-|tyWPz}drOhs#*eK12xWDB=@dn@j%sCtq*}cy
zpK#M*%@i7+!up<qcJcCrg0DUznl|(1;u0G)q<kknURUbbSg~lGc=*!1_NwEs+*7u_
zd)*cL&*fq;0;LAH5X~y?545v(3w*nt%Sm;lr~4B{mXaF*?hR4y-ldEUZ9u57>QPsS
z`<!{kh?{glTkxaa*6BnLGb_+@E1{oDutY*;X5{kwXIhy1L-*9*)>1AVa3>vVG`(Fu
z^ioYXOz?rR#-Md>(s9+AHA^G^ZSNcml^Z~1b6w1-agj2)Yl>Qo@7b<R22MGYH#gny
z)p{{AOG~!9IllQiw6f#;UL3Q=khNX2K4@UKOOW{bo_6-c^3KDT@_Ig3#)VkQOG)&G
z-mT1J?@Xc!)Hb;`dEP}|Wp$GQo0=e*r%(7CWpG)Pv7phpBtstJRiW_Lj(*SFXA?8(
zd$d{FS7};SEeBJa6~~HZ?uQ7ls&muR-d`bg-%|K|E-})8;f%p?_U^A(|NYL+Nmnm!
z$#--frN8SJbHmEQEMaWLe~!FnRc}K34NrWEM5%+n=`8!*#Yk_?5KLoFj>#ckV#*NK
z=J#Jj4)l1NAA4Zy&9yh=ss;b8*D7Ckp46p{1H>~+l7Uo9-13zDr7m=>p2mf3O^-_I
z``CxEqV5v=B8b@D5LIm8e#5H}^Y~yAZxK#T!D`1FiSTp<<>)(Q=_evoKfmX(BPv@B
z@4%X9s?MgA+0H(R&3=+cHTd3Ds!F!IqkewnMHiwKMYJ<;N!g-$T;1Z>Z6_++gNKQD
z#o#NzJjL#w&meDQkNObf(INF(O3c0)w_suRyMr}q<pZGFJw2s<>S~D$Wxxl+`E$3*
zjXT>X>qKe-O!_s4?Kmp~?G*(|49ln1xb#L|-l86RKl<>Fx~>c90au>zB}EOUE{>OW
z3cewy5~gwK!Omk7Q?^e%C%6p9t26akN3YOKsj5hcW3Ja9ohtL#(NO%Uz%W&cX<(mC
z!%a=E;jsPE_r=Ix3Ww@SvErOvb02ws64_9e($;lhQ0?WtvmT|6mBQyP^L)6tCrp~J
ztnJe={G&T*$E`1O@tij=Z@uX)KlOt8K-ux(o$_gG`34H&Um5XS59G{v&&}8+c@SX)
zZyQRbSa-Hh4LE)9bSrE#3bhX1UhZlsrLRh}y~WqN#-=O^^W^Bm?k23nxfiKdwM#_`
zh7v`b&Qo5uif49xK9FPrLxz%YZCOd#MB$QF^b+3d@)=^qad8#Az65xcj@!(ix$zh`
z`9XdaT%<{kLzORy$d>yVR8f5PrDa*^hIA#o%W94mAB<OfbEz>Om#Hg!-99N7MMQZP
zU45T$lxjdCzBJ{~;q6qPM@lSmJ*+p|s0)absys2H0>aI=%@tWYr;nBFXt*7C*5$<*
z-!0w5E{^oQ82+_)t<on~h@0<!OmSXrv(p<p<OCEf)2)e2US%DSNM*2LD%rgA?Lq$l
zt-y^vuM2}V8tZurT)>9k3ctxVluL!-ntMfBb%K6;Rq)0fkGR`Ww&+%Ui4V6v>P<mE
zYTiIQ%vu$HYm~udtIJ6St-zij`goqL<mt3{N`Zh|3M$uLTTVRTrYlL}_7{~iZSnw2
zGku$_0Bf7|h%!TZ19R<~h3B1!Ko48>W33k%vF=WeF$ee?sW05RvI`_LuQ&^iP^#^8
zIZTS#dCn$9jjsRB?W+$CEG*V8`8_`zh>43&aZ%eD+_E#5oTQb5g?M^}@hO)9asOvr
z*hwYZfqe?AqIxYO<sxAs>=@~}z}y+mf}I`Ho&zZrs|6A@CE02*0?8}aTOavK$%ai@
zvXgw}aM|@HsuIVEyY)yT`h7Q_WaOIKSMM+vq3f5^D~!(`7cktG_~P2R)66f_UY@7#
zsy}*}B=25>h7G5ETItmU`!^Ob#}|FsNfbD+vs2q$C_}661kjf!f`lt4JO}ON2dA^{
zahnH9zYu>w96+vmYw&B7Cw)0CZ<B<M1+`aZ+u6PTifJ^Kv<9-|2PL0cnBS7$o~nfv
z&oE;n{^}yK|F$LXlVXXqgP48u?-^&<^?PPaO2%HYhZ;VNNq!dhHNh`4=frYe?6$%J
zHk*JuWs>*R`Q9oXiyq;=^&uHtK2+!Kp|JY#;0iIb{F9=0A`(uWzTu1QUHKHJsry73
zg(}{vS1q;?>2usXSByW_Ia+c?nEarupXh0-x58ux9V%Bkg|FF@Ss2EQdnGEyNfN2w
zw3v^bS~6YXBiS$VvFB0w1NVS7DVS)rc`T7SUvN#wk%Xo-6BCx}mvI(E9dKUWcI#LW
z$qu268F^ztj8n7uL7|>H?_Oax(g%@i^JVmbII^~~>2!%Ul4o=KhphaD+Jj3*-BOaN
zY?L<kX%yisl!Z9_c`k0_398$$KbdsHWs@?ZOw26{1?8lzp(26Wo`b<-Ge^%h`+nq@
z#MQo~;PzoAO^J|*5zUmxD@J?XzjXViZQQ+dfA`sa`O}SoF0(cG>?s?gq|7)Hsm#M?
zl`SqTw74u4GlZSXFvHm=476P$;qv^NpDu~JWZu`<u+|rPB(!3$CtdJ_PkH4VbKH5O
z5Fr}FV4o~CE>R-(*d!n6dQO{2Uv7JM=N<>TcIr5a;>L>`7$!yXEDoJc>~7khmsNY+
zD{DvWu2RYJ?NruIbI(6FZL!3pu5o6Dv41sL9e#It!H8?+o`}V?oGEc%Aq!EETDHF-
z?zL%5aFqWk4g1xBO*#8dr`K>FbGh-LPW6cVl(tuIby(#rZ;qUW>$JcHE@|q)xP^T$
zPL?bbd-wUK9N{jx$gk|OTg23MHPd&7>GXVakB=mERl4#?Yp+(Vg^=<B{<{NYf+l6m
znJro?$F-<boXNOfDFrLx6@vB6hYrvhg`SzLz9W#wrkjZSv?pDf>#p&D|JjVuSd)_4
z)`0stYa0E9FKCIHnUm85sx~`hlEN><4>O(CZksP7Elzkz#!tc>*1W>WtUP~br9w>Q
zgriw5rh+Se?y1Pb4pM6YQqRfBD_*#Exxp}|9RaG?eeN+4W?Xj~gc<!7m4J)ayqjn6
zilux0Y)HU{5eL#T{AV*)>U`pT$LS=w{Ko0zpP3h#(3W1rc-gt$x>TC%zD)~bvl#3s
zlli!Avszt6*)1M_!85$~!e1y_*<3Ru&i0~qTd>A^ntKUr1c}+LE{`fbb9MLf$e-)l
z)E8y-QYa}W#+Q*Ni)idfxk^2_`H7;EZ?!VP<x>p);fDy0QVn`r4h``D_0?#{Dat0e
zN|(Wn%Lk?$vY=NC>L{_5VsAZ0rSE;AGx`io->P-P+1D$_IBmW7HG7?`f<2VQn9NzW
z6A!9Iar34m*1d~LPno-S-?HLqrlE2h-(Jwju#>?b^we!imaSXu*WJQ7uEyyX`4#&y
zFN<ztUhlD|2`vS<v0QeirNpU+?cnMuU-9WTnom+peV>Fdd5tuH63L>)oBG@6D~evH
zuN0)!@h8i%e>5;>PrZ;oYZ=MGh^_9;UD_0J&tP@pJ~ig?qD7F!Crnw=EMA0hZ@PE%
z9&M)XJppOUxLEd3Pd|HeX*c67l9wO%M3EQUov_BWc|M`9&8B?Is<+?Dc=`0gBNuFM
z61EmRS3LQtjm1^s`pk>8b1`_j%12ojD42=Gl4*8l=-ep4zhcg|_4F4DwzL+x#NmV2
zId*-^^6ifX^r>q;Z<Z7a3Z_c0pLt;&n?D}we7V=0mPC<?zxk9?VV-}$EJdKh-p@s8
zPI$jrVYf;jwT`}zSucs2@umEv{uhS$9sGUb4%t37>fH$n1iKpQHQq)ZITf8Y{&8?L
zj#qn6v|{@4TM_SDZ}qVB-iyBZ;p!e1%)#a7&Q2EK%8p2mnm5KTvBFQ6?MA*TQ}S0=
zIa_@EqD<y9Rj&Oxv{FCJTIo*I@hX2U%*k@DVcZ?emX)1(48frAwCiwXVviwXR^EWV
z%a-y-9alp>L{6~xrRZHhp!f0=U$tjyPb%Xat7@8f4XL$Z8YVxcu>1v`%@=;RyC6C^
zL;j&v_(BRvkP=lp{?kfDdT9Z_V&O+inVr=tE9WbOOB$8k5<~P~jd6(swX<s#F2k?K
zzxY}-v}@d%X~`0$U0JX>Xt<?X*7(tKzN;nXOLyJT_mVmfT9uwhe7MNaG1NMYkH5nE
za_qzXRF)jF7l*Rbycg%a@MrEw)m_%jSh&`q?6G!eOXR1H$xA~d##Z2Bfvudu;g69U
z^v5)E{FW1ED_gWW-z!EbG7VR(`S?_x>q%~^ul?fWWEAr0c}}b?lfK@TN0Bd6-xW6K
znnzP%XmZr#lr&5(=VbJycWjBL-8xWaQdg7|96PvW;o<%8`^=AJuXZ1*sp6rC+*?{L
z@4KPR|0bDRb7rFFyyhyU##1Hchp~orJdrKJ`ivh>vm5#zA!@$k6z)5K^CzNMsmdaW
zKG<#>(8IX3M`)P(V{Of<a9@u-Re<pR_cep1JtjT)N7}~p4S8iD!8{K5HqXkqSDt+s
z1C^ZI=#sWaUSEdCT{hM^8FSY^xh2sjO&M@RmM+*fW-2CpmWu6xcsskx+hY|s&6~BI
zMVc7)6$m5;3K5q*t9Lc=JH2M6cgaZqQoCN#OBzf)+biK7*Jp_%Q5QWoD*_H9IswwR
zE+6#jgl~jyD0FgEODh=rAk5N!;ZwZHnDXYXwo$#c$%2%~?QDae#~-+K*y~pAxYa?N
z^e&y*tS;^3lDMlrTbEYf_M+5@u;9b8mc!={$dfwJJ}Z~pyGM7|ET49x2+t-NQO9k!
z4JBtC_wJ)~mo%upPJ8d@%bflD<NG&Om_M3&Y~ZHg@Ns1)s~|r6sLh&-m%8h%cio1e
z7%z@Pw!PgLLvCB5Dt3KuwX~!w?-Fah9=lt;&``L!M(NzHEEmZ<XdnCG%~IY8BJDjJ
z=kwg*zX<L3;HkQXzbrYLDpMOz-Ln*OBc}F!6jZ^<q&JM1sHP{u6oTAEPr6pb`j;gD
z#X~DjdN-QHF7}Hg!shW81W%txuD`K)`4~-<vG&jn{PU2TrE*%^?UwE`v2G`Axg#y1
zM<HsKm_D`bdhtvD^U}x5tM74;i}Riq+x$-a$-Zq)87jCOiSbos{iP2pR=eswuollp
z56-46&u5<oR*pOo+dH`AJo=5IFFvJKK0_xz)_$F_^5$&0*7g;?gz{^ZsWhGZG8s41
ziE_wAo`h!G+zch!{BibN(OaLKcJEU5*`t=-btWl!hcNpt%~CoJnK;xsx>;ZG?$f*^
zpxl$}gm+}qk+Sw98azTXy1PcT@v59>aY4Hb!-YJLB^P-|nT@(#Oz$tL7b`d0Ro@-R
zec(!V{ItAAkuA5^%TFr~<>mn^mSv33E7t^~Jz7lji7GHACqIan4NO@wJrqmj;5O+g
zmDig+^JypM>3!<+YBiND6Zx)Z%C>D2uxENlGt1zfx>)-15ckY9wvIkjrf%0+s#4kW
z9up?EVfAnx-0g)G3ac1BCfT^dfz9EHQqMsL?=N2|mNt-C^_%2x+3qgsZ@U~4H^@*t
zbw{4f?8w0fme)AWS-(0ful%J|tFLwTs&;|ZW}NdZ_Ha`2t3eKmbn#Q<hG}H-v>XR8
zo7>;(oL0)xKmKmh0YMp-^l1xUDTT-@iD&Ric(uZueua&RX_NH!TNNzvZ>?*I*Is$O
z+3ht>vJmF!f8lu7+ao5XoXa~n@GrzVH#`lpy%lo%#;n#tlmBI*>K<H6mzPD1)T6iM
z4<esr`O2riF2nH9wa>>#6)K6oKc|pQ+&Hp7<kKPXwE^CIHH!<4E#jV7ZTms7E^)P0
zwj;itQ&W3BY6*1TUcI(v>!eyj?z!8LBAbLlg2de+_KDpyA0Be;AhGc=3W@sL4V**@
zFHL5{k0)YMj}Atw9kL~fc0ab=F-a{sVzEb-x%eJY&4qH7g<!D@o>rap&c%1>G@K=p
z&CZ<fNOLJK7C$CrpCF%-#c3T9BRB3~+#=_v%zC;Pm+6}+;k8`37?NWmg`aYNqPsTf
z+jt!tn$p9pI)8<X@+nIno}HJPp;tRahw@El)Jr1$^e_jt(3?>)uA?nyRwoWGFl5IN
z&#E$TZ}}Xib>Ox)ot-^9(O&lKJ!|cG3?~*iV)R=KF2T>eYpXt4dK_q8s<JTbG<2e2
zy`1Qvr;f=K#xqc!zQo1w9<Xi<Fq_-wF&Arhcm6~+aatfIJ6e8370i?M&4ZyG_D2Tn
zvJR3fKI_KKH1s88EVQ&J1n=0ib(awD#paQE>qp1FmbO{H`!u+e@l{h!mNe-hAHMzu
zRu~j+TO1ECur8YD*jTwHc8uhMJ2OW1u=P8WIRBofZ_jCI<MtH=W@M-}?WqhgDKYb9
z-B1>LHph8Jhh9iFUSDXLY*pI9FAD#0agExR_1MnzVTFkYhh@(xV(b=Ah`U<*O*9U-
zpU8E5b5Z4X1o^AM@?L?crBfO78Z!~Pfp2hamzJKCX3S-}y$e+yloOTrUheRzG-eFf
z-Po(GTPRFXI;fyz;}_>tC{cRKo};QWyP~Umh**Ah;GR#(^^NzIgTnhRKi}O@bS52>
zzd5h>VxFmTvR;)Y#+HOXvn{XmT^`ZiMH-_XHos`8@!UD?9EaW;ecQ#9xvi!RR{5N(
zL=TFqn?_tQRXd}be{EBFRL73#j&aY!)91vxHr!srsj!tx@=T{`ZY;YW;TI50eyfQz
zZV!Ijr(nU+d$(NeeOK?UtPGYYd$CkCnZ0z!^WE5o$!)5=e&oT+lGK!rZn5(pC$X`S
z(@m9jzH^<K`D%E<@D&ED=OH1syK#b7I^R+6y;;u~9x)xU+NYlvoCDLFRW-t7gF^DO
zJ<ADK`r^5xRd+J%YG@X-oYQP8mM38&*GZM0oAvri(QWl`Q})&jhi--bB=LO$8UC)P
zcdhu{a%g=ed!C=jynrmZhJ8Wo@VKohH*IX+mzA!(q1xmNMrGsHMI5?jR}Q&wo%rCb
zb1{d8@rFP6c%@G&$u~QWA;iM%D^UqeKw0K9YQ?PnOEMJ?*mweO`yINEk>QCtX&;_X
z!dpfW!!U$(79bjtcXf)uq;i=IcXf^4_hor<beo7o`W2E?jR=9mDl6|*3qPu5vJXr8
zlPGtnYRW&gP~^74N+%r~W#;tXB3^o`Jz=6Y>s)~Bo{!$C7#Eq|J4$t=0XdixRHv9Q
z6Ouy8&+IvEm6NLGJoAkA5A0>*32^dHr_#@SFdgFF$ubrtlDK#NoY{n8j-sOaIr35U
zq{%hws6N&Clbj^j?la`L<ocrldny#F_qX=EG{%T~oxvnX>%P2|*JI_`=<eZ}+J%Yu
ztlP#$vQ3su{jKXBYE`2;(XgDld-toIVlieZN4oZtmbng=)Hz_J*R*dL3!|NFnvMEd
zkHbD!FTV<YDZ@<P6Wu!Fc0lk!)$v%&>#XMMd%a3sgQDZrWlM}GZ1i5daxLVD+r{A$
zX!^F<g!h{D+jxHKKG{n#9#jE2Zgm|w@fGb)F;$q|5lKf5FY2(9B!>|dWJZ0RI2M_T
zX+FxUu<u=H&SR-F$ENjq^7Jv8bvyTDIZzLioh!+OIf8m=TKUW?iJ!h;1$)=Tr3Ah@
zl4Nd<y>mC|K+j$?4F9F15sMo~Y^0B$?J=<$-s<||mbK}gXB+gMkCjXwP0bO<>|dmd
zG2J61U-8`WNx)H;kHu1?7`B#|)7e2a;`Ij_S%)<+^_T1dZgqZ^hY1NW+ReTnsEJZb
z$Gt?Fw(gzreo9qu&`F%SV|$wy%QfL;ZpUKH_^);$Dux9tYHO!H7dBUyX^S(y&(G#?
zdRcRp%87`<ja*wo{_9gVaa*S_Oz-$4w_BBtdgXY)b~a3IAw|S4mWN0xZCdRy+3O`?
z3hb#I($Px0FlN_lA_7(^hpB3$kEOQHCIvjjY^^-FxaMA6w!_fF@becFQA|}uzenm_
zo(%zZ*J~zy;(_*?E#aALIbAy+ZXVWTCy`<#*#^wFO)_&Wo=fOCxnC)}ot~&UZs|#~
z7Dh`XAlXSUnm=7S<#ezpxgsOUGRA^xSXZlVw}l2%w8L>Ft|Olxys6n0XF=2}*F<|P
zt@z4S>w`Q;3Q)dCih_jEo0HG}ipofsjoux2DmU4{jdi&=!@y_440k?AqJy*#zZ?*F
z<J_p=r}iS!xyZwyh@F<`n*AfbMVFjwX{`nnrokP&-h;AcQPfH#3A~>ydlsYyQ&|<|
z&)xgPdKfdoh*?V8earsRt<RCcNl!J2`+J@jixq*w>goLK98<cm?UTOp7O!K^Ik@Sa
zR>ny(S+<drEo>iJ@s`FXC$3a_D$J}7?q$2-y{-15bwAUH6h2e?x=rmCkI6kHq=)F2
zE|}i-klv+4s(v(MSDmtySZZM91?d4#yzO4)#nN=ee$G*~a$TvbAt(5}J@W@IlSD^Z
zdtGLnu{nHE8YUAWT^f5#OO(_u;36U_JkM}`@u6V5F)MX61vIW=8{hY^HTEl)#fy<D
z=E!{#z|2&4ocv4@Ket_@%z%B96WBP#K`qy<z%fYIUHREhP`@U>Z?GJDJ0~VMU9Eqz
zS0PA0R`NotvPZahvU;8*A6EHo*j`oeLZ^=;#hTl%EH=s>r@A9Bu*BzM2|W47jWP^|
zIA+=Q)p6RAZC>~~zWJ+PE{}Z+J<&C_rgTT^CP!+wvF*FBe7YKAbdFa%lvHe!{l}hd
zonOZ53O!FwF!;~&ZLjoMvZ-m~eJOUBY&%Zg$^NRL<F#!Ae7$bgDa*@uvE^br=;h7~
z^PRFjl_=G|ay<8x&qs|Q^Q+0r+euzi1hei=WU4bQ5;=BoAeUriiMT9;A@o*!07p7c
z$ML)K#$59U<-NRQ{R}UqKGU9SJ1Y71YH@KN@N@FUF*?i|Gm<vhcx7|>YphmTq*Ddq
zWbPUFogQ7C&e}=ES9IDo-%j7(qVvO0Y>=J}u&vd+6uJL~scxyLZna8BRLrbOo`Pwc
z2Bx{daBAeUSON)~Ld&eeO|k$A-szES?!l}Hdo>B!Fr%vz-D<+CZ9tR;%svR|dt*8g
zwW0ijW6M<kn7dEY{2+_R#3&74v7bPNvu$Vi#WuFjYwap*+#8ZO)s>7L3tD#sT;(Lr
zcP_W$7ORQ2-*ZJ^>C$*-cDx>APCt8a#1|WtO<tGVxohfI=o>Ft^W(*X%9=s|t95^I
zWYfu}v&t&b^jbb<p$|Gg!L3--1#erNw6#ok;Va=e**7@zBVkEvuLV+eH0FXAZJ!5E
za5M?oB?>P!?U7dskh)d7<>bUuo^7UkHZNYaW)Kg%66B_w7qGMZn9DXdju@S!w5gF!
zp;@ENJoHC;DsaWDacknW4j{^)mfcJ2*z)+E3#L<ju{^Z;Ys6W-Sam;VVS9rn?gfpS
z#hxNp(KLJz?iHVI3O*umi?p9_M+{b|Pl|Ze$$oXz=n^URoHc$)Q6~TF&4%DRJj7)U
zb$a)viBlbUiOW7$2-)0QlRx_S(!m>+x+S3!v7+3UtXxWYIwOUyz0ggMuivuMcO~r(
z_DKqDNQeH_Fbazc^2*e8y?&V@W(J()uHG3mk>e3w;SWgbh8nU}{oGg7aWjiUVPpRC
zawj|AvDalS4m+MZIrzRQD9bF9==dEeZo_ikM}}o3H7T7BEBB4u?kmL6oq@wkx%`@M
zDbeXHk}yp1_=eD!%PSnj@>kWBI%15kj&5ra$aBXttNMIp?d7dmctRPFDU{Yw)2OsK
z9}KFuxp0?Uexy~J;kiT>R_XhU@m&@N{VYzeWk<5>y^QAFC;cg}PdT4vG|)B6$o4ip
zgPZ0d>3fw&AHvrq%ON{C9KokNajZV2!R(-<(^~0+1v`~Y&43r5osSnEz8Nb??<!>x
z%W8a@$Z19>u0!Z6wvdNd-B^3OO-FveMfa!J{1XzqJDtw8Q%W%ouY}C%RwNv&t!#Yd
z!$G~7S&M4hHoqV}t47|Ff!;i^43aP3^_x*$d~(E3In(i3193^-P@Ipb=I#>}QLem$
zo6;zztLnZadMt#VdgWucMlV3=ZvJ-8jh01xcB4$Wd5=yUIAJj*{`sAvwDZ0eDWL_*
z9Qnfqf+ds}KARSH^w@gBBd_h1v1(`4kH=_^>Frj#5-fFm7qNWYl`RW(<vq47O}OdS
zsv|s(3`tQGeZscc6PZe#m(_6#Mi)qrP0MZ6=U>@<MeEW{ackK<_x2q~<w);;V>FWg
zZghh9p-f#U(`EX#!*zCtA1GeGdqRl0e?Ep9vvaaFr_vb9@<vGLs8y1A&fQb-YP_5l
ziLrCHhxB%n1tbKNa?+%Z9w=TkZr;8BvHI&IcM>N*(>?_@bMrGU&hE!<ztg-Bw&yfg
zo1yGhjHp9Cv!kR#5e9Q-daO*y>=5%}uhd3t`;=yL|LF)8mkQ$4!wty=9mzh1lNl0X
zCs$p=WTiN;txB>fa_&cWttc6v->2aQXy#W80(18~YPPR=cBQdd+tU0XqjGbebT#p7
z?kSykq2d#<n9gfQKK2$$Kbh1kdkv)G_?yq4Fy&^LB<3E;W%DiI>BO_s8@Y#Dot3?(
zTj*PJ(eBB>)m!>biB5O?$%xC07pQIRI%Y&8Z_PZnB<_vrs$j*4)cStF*>Y<d+xich
zc-*jVuDKn^(kHN}P<1<i`Domi$4QOjQ75P*SPy$_k;mxd`c7=Q_FOcUB9vP)R);n(
z;gnJ;Bt^f?Zoak>!~CT7QCUrvE{hW{ogMN0xM1o_yKJR%=t}}Po|det&$Y(hxqtD9
zMBihTWjr@~(|ASh#a-^E_jA8|9msX7`tXFgW&@AbnDM?=S)yzG2?mdvw+MRU7jZ8z
zYjv*9nGTN_{VFOJX3xgWYYQ*c?7rv4c|$jwbDk}dfBy?1U*4gLYsZSEYfs!UE{G^t
zppnOXsW_)QAp60iA;XmG^AbotXZFl%v+^)=4G^`PbPvxp6WZ<gD3q!-nah&vW_MTl
z*TYV7nv^ecg?%4)8W_K`h=}*!Q-AkM&ncm)o7QkWI8555!6Am}`lJkqO{iD1^A8B>
ziHICc-c)z%Vw0~2V~|4Gh(v9iQS{P@>zP$9elI9!7jtk<yw=+j^|Os5W4|7<xG3WD
zNy<f1c|>4kKr^x1Fr#a;oh=8^@Z)N)?sC`VOk<5gmV>escV202O}A@Kr*@*9VAB0~
zbBW?k@zM^5n+q3Ws*N#KexlA<6~W3IEbJfYq~1N$_mb?b{ec4l@|!c;2fJR=Ox|2^
zuBTzwa}uTEe7bObp4`69q$Jis_x0+u3e)&~4ln-a(Tx<0tTn|E(w|&oF0W=vZ5J^D
z{&Bs}7`pfxk7kuVpD-YL<wVK%`US)P0bM|%zo%cS-Q00)m+qvOqbQn;j<!)>Y414n
zC5+NtmHupt@AgA`n|B3PRaN2}?-VSi(khz)5IHUYntN_6qTruX?iA1-Xo~=-R^_JL
zDlNDa1mwQG=Y;~`C{osyDl)nqU{LaANWUlc_$$lE;@Kg)Su3#t<75}(&1|?ZK2>W^
zQreT3GiLB2Ht;-=lYfSfx3dfGDNveJ?+kO{bt)KF89W(sN$vk&O#cgU0Jd#A^F;cg
z9x2j}bhDjBcG8W?M+vPqk91X^pyu9sWVCk#hTjaI?jWDo3EY=4fC4~=dEfGyOdDR_
z##0vpwFTNqec#Tl(foU3-e4Y;k|Ib@@(zP^CFwpSYKGL@%yVAYA$FV{xRZs7cIaWu
zIP+^3Ok-Yx9c7*piof1xb!;YUl0WO}y>TtP*|ZDH>KIiLE?`LX=>9ojfO@4AL6t&3
zaqrhVJ1nNX<|F;K14L6li;!OU_&vcD;w?ee5B@gl2Q$r>wb=s9Q|s|A=wuK4mGx`0
z`55241oPRn9V8~uoqaYBJ}KWme?z;b0Ig!Xd;sS4U4nErzI*4Mha+$e40tEVsX9uC
zI1WVIQKar>G=x$#W4dt>n|xqCS`nM*+~te-sv52eJGcx?CwD414l$3bbut1=IIo+5
z<F-D~IE1fbE}^fw$hRzirovXTKL5Fo-W%Tj{uN;ktMMiR)<|u?5bQq;rhj~YPq+vH
zFrWL+9y<}<c!1#sajK*_t|&{N2mXJ1es6e4-<+`h#HnySL#j_6JrQmK2d0|{u^hwF
zc?#Q?FTaTXZFcM*8H&cK91hbDhxv7EUMo4Z8uj@MplI{~`D?nee#kJPlGuaAX#953
zU8HB~8|$AHRPejx0R=BdUwvovstp$t>C^h+Z{-1MJFKXG5qAVGx(vhRPAq|OZ|`{^
zqK?zK{3Rppp47!xkr;@gc_-m4n@?xwB#Ci(c{jwm7b&jrkm)1$@Gt&W)9Ms*jHmWc
z7)X3e$h#jwP)2-s!Jm2c-QlBOT^KfSnvP3!TLEepzY_73fjQwhgFWFq7~h;m_;-Y$
z6>!@~Pk8s?k*MM8%P_9t6?1#T!(o2YEQbl6O<qqN=m}p!fc(di<KcEP?5k^jeO`_3
z9epiiBhrhXj9FZZyGD`gpV=1*;Jq-XjyU@+%fojWIAnAk$w3%SMjk{ME9mFbAA$|6
z2a}U5n(Fqo`PJrGAHS9Rr2MFy0&K~H%1W@Se^?f&nfH|1HlTOI&AGWPd2sHlaW886
zb=<p&KCnZS_WA`Hb&k)XwSX+s|De_p)C#`iZw$m|)pt^Q=Cs^R{z9WNDsLk3Envsq
ztDd<tMk|vSS+d$+&Ghq`iu%(9{o!@<`@%U~YY8}x=P!cy&mHUy7a|xkA8TzMzJClt
z;z<aQqt!%O3tnr_`24L4`@_!+v5S%WW`_li{eSj}yTf_sG%@xUZwg%n_C_o>XMi&8
zoZ-B-5%!DEGwrRVumzIbLLWL@Yt#?6)%-K&ugz~NuiYBy`v3VCmzBDe2zQi4YUX|0
z1VD9M^7Y)pm2R<f#QJFl>!ARUNoxtiYY9z))8zNQ_I*mVjW6NuK*&gd&K(!!fjN9N
zGXhV2=-tSg^zIOgGP7F$%?ta&gZe6r--tO?at;N45e?r<^=f;-_^ap43coPa8>?OW
z&gOcrAaHo_OQiq91$|)!_)}=4z%cE*|Aj}w&oK5+p7$4DYV%D9VaL%z5e_Rc#{SRZ
z>zUt3K1Uji!<{8kU}tg1<}h`4;&K7Z-@mM^%$wk|s%9yRn)W`O;!5&6b?VglM7dXP
zG-m>AU@K}aH`58cu+y~!GU@NV#3Ivb)=!iu6f*k}CYuuSgF}A_E8u(%{PeV-GLWIA
z8JYGs=k<m8)HQSb<)xKj6%tRyq|f4JJgi_?gAh>b=YeIK?x743{sJLzA&$Z<pS(0K
zclHc|U;pXtp`W!6NuEgYo>^crZB!b0B_upIKZS{2`7A%5sQ?__d40OUj)>-eX<e>>
z&)!(&wYJoH!UBNGEF%0FwfP3&r*dnhiL>a6igwJ6l=wn~WrD<OHEwpFm_>|c=kGuZ
zIL0(=!yGifb`gcdE75HwUxx>I0Hb)W-|zqY)_8(X(LRCLbn<b${&RDCDkp{K<y-e$
zpTLVS<1fJsIK;I=;rWd7HUG<)Im^m;=P;D`V}={|K?vko0$F#w_W9exMdu?}69ikt
zyTVr?+|KnI_GfLrYJL3#tq2zzmNkyc(d!IBt^8V#<yWVq)I@ka;T!PW|3_GulBg&S
zCASV#;A&F8jev(Q0d9Iz2;X~M#aWgRy<rrQ@6t;hJUV*3cbu;C$6xQ8VPHE^@39ca
zij~W@{#QUPp}s4bfv~(hV88yy`4+$*{_`4qLXG$3%WyA&@qZ5O?{!gjjf&ChyOsL9
z_s9tTf?45OECTarHXnTh=JVV8kA=IKF{sXcj23x>;mpgPwky2iIjmVp_4->0ncqb>
z&Gg}DsU;WEhR#_XhD8FoQ8uQPQKdN@z0t`_L>-fNV@cS@t#FgOU<c0%;c?%?le@H0
z$wkjd6n>du!Iz676}@Yv$}vBf{}o)knJ~AgfHCj(e3K8tJ&Z>hn=KZtw{$InaENIh
z2}Uq0JHT<fi+>u8_hQ7Hf3^AKM49ko324m{-QktG=4HrS`Fr>lz}r8tJkAQ#@`2j-
z$&C3m`?V*sYUgE@_grz;1I|oe>nzNRpZDAufgc|~8D5VNI7ntyF&cdt&wA9}@LMn1
z7RyPhsLbV%Y$v99E1Eg4lJ(DRwDi&p*mv^=E3-S&ot<Jng4g@8D0DWq0{V#7JtKsx
zKG;fX(?IaywFrTKXeEP|h5(u$6!w1z>CA8K7GH@s>cZxwuW1FDeBn@q`MtJd3|<R%
z(mI60(gl_K)c+DhvqX`acWpe0yUfbaV~*zhm1ph5e>iKjX*%m%*rVw$R9hUcoBgNw
zRhYD5LDZII`b!xGyi^AwFRYV}%u(i-x>71C6S?j~hu>r}zm;#MpRK!fTe&!dv(V12
z_3+sk+N0&J|JN#z7F&T;PiZApmS#Q|?|>(NFMH6;Z_@&x%KUF=rjT%)G4TEyak^ju
zXzn!C3pFL#d08jaJRZHY1MINn42@P`Zchf-$C=P+ro*k5<JveqXhU+q{HsFXSD(8h
zV#>oqG~Hhg^SeS%2JRGyRtbKDhcYZXG(0Oj?=gESnx7#|lGD$Ih{s)M|5nWMj2R~b
zAqu0_SZxm9guMQ?JKX@(G|=u|(s_LTF0A-HUt{=l3;nNBZyNp>-u#1RQx#C+_^c@4
zRXqLM@oySDIeZtM*AFqz`bn7oho~IJb|*2>TV;NWkcrM-&iG67Urs3iJI~>OPCnDM
z11!@m$JuT#z12Dotx9SU*6ZvQl@)_UZ7@I1=)3Nl^TVIKXH|Icv4QYAi+dW*<F}(}
z6Un0&C+3gY$8q~(H-_gq-6t|M2~xnmap>lhUn+<<hk_~t$KtLEjEjU2jqU1d>*A1Y
z0opyYF!SnXnffhkL$l?__za#-LPxHW^+P&tp<i@4!sn?3tOHL8N(X%cf0r6vGD`xO
z|A|EWZ$h!CJF<o4K!3@a>VLx?tahq)#^?g@<RZt{Ym|9l!V><EiAAmpJ=$SfN_3if
zI`T57q0e+aI(v{hbto|{#cI<+DG~r#>v0U;dgnkGe&0D^?Us4rWG|dG0TgE0w(S>_
ze`N7+7`k?27`hxhxVb5%Z?X#57N<&69e?6XreB3wu0ziBmZvbbADZ?5Wc8^1H9z}j
z>AZ60H{C=8KrGh+s||@X!F?t`=dKIkyq7cn`xO*I($H2d`wWD@Ct}tm|B^@v=3hfT
zUnQ(B%x~-ehH%5H>C8Kj(8MfS_g|P%!OL|>$2bfSX^Eg`*#$n$O6M$lJ4LO37XAf=
zP%YznuN5E>*Dh1d($@c4Z3T!{Kz4XZeVqDxEP<1UdcwZzR)mw+FAsB$A^;TxrxO^a
zoM7qltjqU=xsTr(`j>%&W_LE)AI{vkQL7fGdF?Wedo6SVv)5saW;QQ{u#3191{3LP
zmvW~HXzOZO*&32o%gt>wZL+JE;6J<a@A-XjUd-C(8@Ra&A%He62HCU_V18N`z5T_^
ze;bv%{@IvfcT}eNCWHAU0KKNofE}W?NEtTd<*T`=U!k7)jju*mAo)H;^@&YQR)vQ7
zr2xz}dvGKye(L5h`0`uAiHr6jYe?IywoNIkEDm)!Nf=poDD=PRwy^l8?+X1(VB%((
z-*Rle7F6w~o}%(&U!s}oncp<hhSS5b9}@9eUzUBTBiTGs1X7mX;&kqkyzLN&1+;7L
zcS88mlbCx&$(Dn>`j=JSEsR}?DKdEqIaq3b+&kYD!na;Z6hzgisn2b@Av2#f!@W)g
z2}je=C85o9tod8cqPdmvU{RT3$NaM;#6@+p4h|>uEH_&ptpS{ZM{b=Lj(!bIc=N(A
z`v{*J<9Yn1Us1WOzEjvx=7}Eq_uPY_?;$(G(8c@191izriSe^~*o7YUq7_i4ImNwe
zKUO2mi~7O5?FaV)l;xS#!(aPB|IT_WOn;l{7yG`MVN7kg&0Kq4h9?F;>_Z5tN0+4(
zyHCXm_*hXytDXxmKdLQ!fsk)^=)L^XlOd1;ARI|GnP8JxiW<#lwQ@6{wsrNF>%W^K
z3<_~-(P^F{!ZUw|84^MvR;!h^4h{q2f)X3nB*5^}=+R9>;plf3hZ8p~46}CkhuLW2
zD1-IFdkRfTUFwUL;o<dD{GP(hKe=c)%s&4>=)ZDL7(DMFAFtu1=lQiz?a`uazvF(F
z=%#@9U7vujte-u_$LnT5fjl1zps1hg^RtL!SahC1)r`=n&nNX>AuzNqgsZP(SbAPr
zs;?2@YJ>pVuQ4F%3gM}|&1e4OAo+%0Mhn1LZiSs}5ZO+p?LCA9#39m5M;Ui9($o27
z$7`uVfVua`q-ACKUO&MAW7HIlrrGO~)Uwrqc{s%k7afCZ4~N0^hr-CAZQ<CK!EpSx
z`QgL{ES8<f?t{HymZmw{yNnm4-BSpSQwo;(;ArWw(6jDv=(`shA9KHV0pn^%gOocV
ztv}ix3UnWacuT3=To^M&Sdv|I4wmIPS`RL4ylY1^|I7NhHeI|1A1W57FjHx2Tpmq4
zm<^vD+Q{h}uVG`x2Pw0Vg&#uBTJS_{47sVR<A;|vPF8(->)+7$?^VWLg%R8MQY5=K
zT#f9Z56wzlojWq9=LeEiA(*%0%2a2aM5hK($du)6Tm_M4txiD+w-tc7W$+#j6zMtt
zU<mg<5Xa~za2lT6&vM~CePP!AJ`A*oKPiF$05iTxL_t)ZFuV_efab0oJHS{T3*jU;
zJ@bx-KDH&F@E69P4YT*~A=BA98u<$+5F)LN@$=eaA37v+xRlDMGN(8#!;^g`KJB-%
zMOW6(`1)|-`4DB9*EPs?XvH+6Q0lU1=~jB&^1oP{@5eg*>9Q<d3NQaRRzSnX<q80D
zy@YUeZth1mhoIw+_2g3yYEMmhP4i}?ZblrQ3Y2PkN=%kv+A=VTg30v<De)yxwcv^-
zolcGFVcBYDxCHVDv-$j3n8yJiSO`gsV^Z_b{Lg-y*q%(v=D?kJ;;5|utczoEnKV^9
zn`aa(CbnS(5FNbGgX5-@KFhMQ4Es+yOle1!t}VaSHr@0W_@=If2N90>a1F06Cj7Wj
z6lo<79>)u)IR`fVCytiw6~yI48S~gf4@8uGqPdjakBO9cBl6WCt{pEUp!zH&BDJ)R
zy%d6KCV$)KY=cLGr4W-u>M+1DWVKK0tK0Iuc-A^lbIZv77KJR1X%qsJ1>(~gkkn~e
z@o0Z3h}#Y3?V1da^G@ln+VYL>8k33aS~L%NK|18G^6NzcD7f}weed{eS!8)wUTz5F
zk_(T`ed7+MO&>@nTH7>_RaL82<N030FJaMg9C~!D<H}}{Z^a}ewTL0rj2vwc6<Wra
z&*`MHvQvb_Xuj2pg_!lJpcZjR({>4vT7fB!C%*UWFUNaUEzMmwP$lw3maO#GdJMrE
z&iR+oKz(M?p2|n_zZKe3`Dv&14^jR~_+eF*FZdlytxuJ2HomXEc8%^I)99hk8V&&%
zGf<0g-@HV@Wxg)&P0@}r(6SXc?dZf=XgZNS3rE>0f?omhfXCgt(CCZ@g_JxiF)ML*
zVuGn1n(^LBOpX!t2b2v3TwG!sz;_bZJ}m%ERyAE&d!iM9nXW}Lk+lj*l{?Lm#Dz@b
zvfVPgSXFW9Qrr~hUv1OuFSaNbcAA34p_lb&MgZ7ft}NB@m(qs%dU-nb0u;nTSZshg
z%z?fHg@SysXc^;s*z&+0?s;DX(xm$bMyO9uq0+fbp2iQLrWpfyC7`MUDmn)a`~=en
zDPC5=$P6&~GCJ`TjBdx+FWC^l*@=0yVm*zHTYQnorfH@_6sDftmoooe+Z@Z#u_S@z
zm1hSl0GeHVB#|sC?2y`;kUx#cj=mcEW@g=qXBj6C-}S8(k2*=40sAbT7)-x$*$|{d
zXM~zH{WD?R?EaFhz0%s*u-0mLLLbd6W>(@5fc~dS6-C3dRVb$30=mo^<uKsLCc?(-
zN!~9SIAC6khdcqWj;L@)4rl>b;Zm>zk+QRCY{TB*eTE#FQK#OTrOiovJ6Q(W!gwrx
zfyD=@KqDLqXhCH4s86Z@Xm)9-1g0}EJFuia&-!}_OHXo&#ZaGiy4*M0M*BmWXm+N@
z?spc(K4=8yE91MvFE37`bk<+L#Ws{pPS)d+8PD>x232sbGw6q&PH72`i(Pr(^B&?X
zf^YKTj3%W{C$O9kBLs>;US8KIN}d3KD+=H^+CK%Hv*ZAaT;{6HBV$1K`@7&n#!NNO
zqNWC^n)6D3?^8eod!~QU`2of`0Fo2494%JqoNXf=eOPc|`{S0_(q1pfq(fhcJAEBx
zTcK;KFT*)LZx&=~hZ%YOC2hoI-Fc+npg_|ooV7H~KQHgvr1qz4NIXkG#pW6jpYc};
z4hQ6$G7H;`%<}|}<Z@`p(|K5c3yUBxS%WljD?x+jd5=V}oxcnT=*FK6ae48x=XR1F
zS%rWf??@E_mZy`+a=fkw^Em`TMQ?BY%Yu75Mh&L)_qq%+B6C*2_RlDwJv#nUXFKc%
zmr6IS{VB&a9&sH;100&KzLr#)vJdPR^Yp|YEzL5zva9WH&3zSFR08CYLIb0HPJD+3
zNz%eZ4ca=I*XTt<p3dXs;vCD5FRqk&0!D0hY@XFuglVd2Iw`S5d>In6&e}|8X}Xtn
zR#ukrEThqaA2AU<^#{}QtIb#IqM3TAB_pF0cR^M}$N$=PwWq5$Qw5v-5VgPD)FkBj
z7|bfO_R8oeE7H0<tW32m%~!4OAtDZCEw7PM$@0(pRrB*9mQs3GK#0>`tc-GK$m<Os
zk{29Q0Di$Vc4kQjQMa%cFa3z-OhJBmud<Gki8P~9oHllvYXsZ^oGJt?$GmkoHR>?i
z>Vh3-EK-@-Hkq&5Mt?xomb|uR&#kjF8_Ci?OW2{mYBHT>sqN3SboBf;(I)%IuQSEr
zS|ehkFQlW?^#qamVZvsZUyDzzAEzxTFSX6J&9v%}Z`j8)W8|^gmvz_X-@`CzXl>n#
zhPMrQI!^%XB@1%T0nhbIIpbd>4r4y=!1LR9#xbXWy2mzDhnhnInbex)*RH~(PBo31
zZFTuIx|Q%W*W%iM1`#A=SAF^0h`t&msXQWI{TccMQb3&PY*OHq5kN|Qr!$Ui@%;Tf
zlf1re_Nn-9c5Xg0E7qnTV`)Sf(vN~z|576XUtf<%w^imZTj-N9xfY~OKDB=KtUlke
zd+F*V>2)-op!_nhoS}2!jU)k}Rv8C*RJom)P+$%WtcDREkdDlKXyYAZ=f(va*$UWL
zJ4UC9TS7Gxix4gIIQ-BOa85X-TsJi7duT*NmVFeVl6^ZMwG9*<oZrv0E*j=TSsozj
z$4D)AI)e2U|4l8_0$EF%(9+_?a-62sL6_$MmKHLnav3hoFY_h&L6IW-OMf!F2gPKL
z<RY3ZDw|>>^KV6n9is1x%FXi8O|blG6b1BE3~J!bjiTfUfUQI}<ion><-e<cV`KL0
zgn1Z8pJcIvjcpY~uqIYEBq);TcDx5pq<U}7Tj!0YT%(S(9ZuI}LLjvr<`u0A+RkMu
zn6E@_InfzZnxeBJh$opb;wh!UR9_sqGA~t2+hWUc$fTv`u+RH?kUpfB6lZK!Z<R*b
z)UWlGPP9s!8RFH~&->U8Xk#<|y`PX|EmD3D!e{NX8|7I4><%7msekBwLjk~wA#Bia
zL!$(v&)kHt@ZNMB9gqSxaxX%_!y1OE_(aTGk*|k%Ey!O?=X#$9mZM26kw^>;Xk=W{
zB9q;A=&9dKJN&?e_bCvq3$$C31R$FE>O9k|O)Vp(9#av={*d|eG;*UlNc5Srx6%9h
zXW~RA^&rkXN3&$%Mk2Ly5sjF)`9)y@^A~OB&PYuf(c~{$LsDPoM)$V+FlBGphO)v+
zPvr|0gW7-TRgH43fA&3iy5X3jp#Z3+hVlxacTouE{Sr)75U2i?jXi>SvjO8`zsyMI
z(Ds(V4i#CemQ<GXnn~6<^%-VDAbz6Da_b}1+?%fs!6Hp0v1nO%{#+5U)MOh8Q0t;7
z-`e<ITW`NQ<6ePO!l=(<s_(^(>*Vv8vQd2$%#Qox$6oR}u_mEqn13n$%@@B;WYU`S
z^_Lm*>+mab^hz5JJ@xV{tb~N>RwCB@rwNh%_h2b)LOJUzOVa$+d^~*Tf?p#Ke#-%5
zHHL-)fOW%H68-s&lT@D`yp>Z>{$KNi=@iEZnr$<U-Ueaw-TDb>I6K81T}or8V7zYx
zGElAg-Y8D?%q%3xBEm7hOampa;3H#w4AD`P6;|?_G1OG=IWMVJ!`If;X}D(A$=hEt
zzKmKloXZN#`Y~EySf7^J2+UV2v253;>ElXLnIF2$6SU^%%!EVFW`ua7y2<FnFzzNa
z{&x7lm4SKfXp}xaL-V>Ff0%lg!}F^=Juc=r0YF4C`UV0$k^@K4;Me~Y9sMmTJ}!%#
zL^Y3XtmhNly$d0N>2rYViC6;?j+=oTj}<K*v2*t^hL6XB)zqi{($1Di%jTkOM>Ym6
za~N3$*~WY|ADdAfT3K2zYP~!HCCX`qh+Bq6&2G__emtKcMf-_wXLq7+M4I6~7kH=X
z-)L<|md50o(vJDfHxIYcLpI)*zET8LRz~e_tS)q=*1roLI!K?!HWam^HiMdL@wVlo
zA$<L35Ma=BG04krsPguhn2osip~QIdm{_$pCmBlI@Jg8XM<6%J{HRSm_S!}qhp4p%
zwlFk^zJW$n8k?-)ot>hDcHxknmDfpjvSzhj_96l@z1qVgz9fJ=Fb~JN5`Iy<EWP>a
zlq%CqS$uz`4+&LgW|4_nn`SDn#i<sG`{N5xHLgnH!jpJX9CRiY2d*jbx2*4U%*`j{
z(Si9{SA|KDV6{)3!6~ya%`Xq^KbOTHRqIpUHt7ANYf4(A^{gEEZ_B4b_|flYPl|r!
zX?c4LG^)YNh=G{Ofer6r?De6EGk=!79jP_oRN)2?vl{_(s>NY}o#imV&T=}*4sHaN
z>54nY)<uM*_Q<Ap8s(EclL7PzG~*9Y>!mDg%F3{wnUkvNO;1@SlkjW3;!F#KEwgc(
z1jb!}piZ6h&&o-`v8PnUek-Vq`6;xu2)(X{jCLIBuUK4pn?0+KTXb*UC!psYjPW;Y
zM%W<0qNUNG%%Xhq-btf3{|#;VLSBsQxjg;y>?ZpiBLFZF*6`w~*~@Xi9s=qozS~sy
zG5Ko(o<J&Y$28i&&_K_q4vuEa$~wuzHVs1dta4ZRZ^!6iS98S<seM_!mPj316j>Bm
zGKG)x$Eje)`Mm;v8`ZB-KGs=l!==tCLyksywDXzuaoikdSW2{VZMot~Yt1zssr_p_
zeJ*9Vvd=t_rAWA1Zxw*Pibj2J{d{i!L_tzkXl>)F*Vflzt<G*w4Bka>#%!WmzrHBj
zauS;O$)A#cDg8aA%HyNCj1d4ly@s%4^lkmcmj6Lw-UM=B%t-+nBjS(JF&mMVTQSv-
zgTzLiR93z_sx;mFzMJXo)OzrdW^3U}HPWgDJV2yT9<_NkGbA!EiPS?$N~yZ05IbDv
zkE<LdsG4_Ed1*alNQWh^S@1p;6!tP##A#|G;u*gPWPXQiI)(H4%6d92><Yz3bvB*-
zs1@ljEia#<df2{W<e`&q3v@hOpk#{jNqwgh+5S0%+Skic$lKS{MyeM8PpEx4_8P=^
z3TY~cI6EXRHIR9@^gbSk<V^uYDG38dCm|NW9tg})NfR^{bGHsoy(t9j)cVdeKRv+F
zsl;z1Fpb(uf^bebwxOo}Sv$29=EBhRWi-k&duCqx101eITxBmxekELmmS^ABRHY<U
z`(8)9(I+lhZDy|D8i*fH;z>O>F{`!s5yuM6*az8*YMwrm`Q3z*Cn&79!ZZ3R8X(TP
z*~e?iBX4(TO|j!HqoN`GPOPH47|uo~AF&z<nlXtfSH2i+7qlv|d!N+7#-%^Dd~i&z
zIX7c?oZ40Z(*VqdE3^A?zel&jh+nJl^SGCF4@9Kr{VhTg#}7EtAa~S>e81Ro6p1(j
z0ox%($Wf9yW$0AN0hq*b^AL$NrNm7Nh(t~JwdAF?s$bikF_Y$?Q*@0O*k?Mb{A!F#
zTOIQ{9v37S2{X<Z*q|oCy#2<3TbLb>rdm73bNJ$Hg%(PqdN)~>rlLEOqC+yZzGiM;
zI0NEoNs+jlDiOw}4|VdYy>;Bh?YZR93*+0zYV1Nw4V;<QzN@D@>!XKXWud@t2A{Fp
z>4*LFp{Bm{t!gbZ_hot~-Fl&|0qQ@5vg#$F?QuA)kiUB$^_5p*A^8aN_4dzzS9okT
z>GR=Ad7XtpWJ<PKJRY9Q^$!SOu>tJ<N(i5SWHrTTvhfi`oXo#&1;1QJjc+v4*oadI
zgJGUK?Xn+cI>OYEU%XF2mSd_#n2t+8SRaULhKMX%ZKbxCf$IrZGwPg`l`;AFR#l$F
zX7f)kv;s|Vt$>268G@;#9Q)t&E<|Zmx3MwD-)Wu5028gl_?*W-5pN31IF%X^GCoeD
zC-#-AHXZj*1hPJG*b!G^l9^0j)(7^#4n)V}wLrB%9F`(NX$8WmP=M4XtX@ZRpZ-^{
z>_;fZ&*PQgMV^qBhbVIaIMEvN({8h9Q&u0dmnRPMU=vg2GWI>}>f5nko_#3D$9|W5
z*N=TS_D+VoF9P5WA5)7_FjZsbH_CyVD}<ji&Rv2=lKEwnQQ$rjs1@a*Qq}_y4?_@p
zk#dJ1s)(2z<YmKxJ-et29j?YzlSq&$gl`<RwHr4iU}pfbrMwoYR?s>qnvx=m*h3PN
z^^dn%yDGOW|F<1#pST(g;b<8tV7r_eQ+p}ICKIfaTB&RbwZ0BAzuI}C%x}I~AINz9
z(18zpmO=yeuQZtn3)5*q=`+xA=xkI<J|wFsGwQGheC$GSpRmuwe?h;QxBj#FFzRB2
z%>Z#dE>Gr;Z7|~+kROJ~b0Pe+duN5#YhI7Jv~Px?1CQ$~769`2dcqy1uN44i`5+gL
z!FX1(5Nsff;iisL1rjSZmc}V9t<GE!a)=J_)R;pGD4yprfQ%yJO7uE&G=Gbjwzj@@
z0Lh~ns@d+A+su^ozHH9FKCjyQiC&8ng{xy&*SDukcUh)*@&FO1v|OT^O2Dnu*Su5a
z;cN3YjeeMRFn?{nUN_TL<4Px<G$d{nmiD<m7!`x>`6~tKM<SB;_7I{4>m!*z2QrPn
z06`NEBQ<VS+sg(;kkxxnEVv*h?=2^7W#A~U*9w4Q+bnt04~<jGMFCJy-|SUHI>?wy
z1BR%=VI<>jh6aZb2KGoJK@f4lKX-s_LNba}rFbUUv%K?uMXDOyX+p=5-cPpn%&SZ7
zbL*iAs!8us*VOI63Pt?^GFmgoUDcLdc5OfC|43f;Ooq0{`nY{tn&<Ykf_*G*?Q?PF
zy>&FCfH=@n^z24Agqf@neP(%?uYzGdf_VvoW>6$ZZ{*PwAmEEEi`u->@*d^&S^<DK
zEPx(Wbs2yl&6!~&Y;@L@#jklE`vBz31xX5m16TyR_rL^5S@G5g0V3aY&gtq?(7W7>
zHDI<9LZiHWcRE0#Q6p&yxY^}6II)UlU^>U4ddZ83vMirQ?=4@SheM~N$`H-^I4sdB
zNfpeE%ASm8nXqrxhosi`v?eVs=_apVY`ZWB4KIV1=OWYtY1GiIfD4H#(xyE1oD@F2
z=fv;GTGd>&+4I3uM`~l|#Z&_A2T^Iel*Z>0kq5FCQkb95Ua=BMjp;VVpHP=fg7>fv
zO=rK$C**Zb<!xvG7YAA>k$DoUDh{-WT5%$l6=B}{(Ef1-r7SY<rGNLT>(gk~sg{ua
zlt!`bw#T85dj`ZoR2q$HK}6}a7hUStuht{P<;S99z#C;V<{83cUzMiEJg>i@Y(F|O
zNV(-eBPvkK()4fv%)d9CsgY13#WZ>L?S&AjP%feh%E?|So%M<?gZa;<QHv1-N}xO`
z8as+KN@y3!WDdS8zv=dC`!%1P`ph&+N*i=;#;@X}L{5yO_SfVU$0=YmKW?EAYo_ig
zc~4~=+bhGXSfceq;dwZa@%^&TvehA+vkH1u>w~w6v|r2VZmj9U^l4tX89wBTeiUZf
z@nEA8rB(oJ&GKsaJMVm^n(hVvLzrct6|i+X6({?N2m#B95*6cQHRIcL>oD=F43URe
z5#@N;PKz2Vk!Z`>bUNOPYgaSsxJtphZP!chT8lD*No|k)Cq7d?r*59s(-|5aY$<h4
zt;bZyi@3z1au`;GPwCLP{P{~!RBN$VQCPE~*9o<qJoTgPgO7IXpf9;E1NHKvE11>5
z>M9536)R9H05<Rl>dArGG;GzAXl%~=+Q%Z#1w0$K&<JWa8*`9HErVsc-g$_yjs~te
z3u^@Jm^UntX!Z2!NNaYbTFaH3tb{b)g+i`3NCg?TT_7Pl_D$(OnPCtur~sG{1GKC1
zE#{Lkf8mM1wAD8DW8<F%<FCd4)Gy=1v=agGtR=A<`rV37j_dnVkWacbPfsbCN(Om7
zLahL}izjew0jz&vh4E6PfJoFKXd|{_ad_~DCe1_ue8bUbMe;cZQ_C-_1{kB!u)y;5
zqQ_;vG2ui?K7DDex~8;Dm1p(T*3Ne85RHPb_C*WV5r+_B>Xf8KEQJwVu{WI|JCCu^
zib2~Yy?GGl3QT?dR14@NXT2suW^`NRk8BUY4)`Vxw|!9p)@Nu90^r(`i1D=<-<3ri
z?I~6UIWG8i_M+kM!pqRW&#6qKG>lO9Py!Mwd(p%m{IN$QDF_l8RhQM6TP1G~d9A|C
zTz?j0Wrqb8Ro4TS<2---RMZ5^98-ZQzOlW|SN9<Z>YCWFCK9kifh4uvFo()RQ6~OU
zw7Iy~XW>5ll=ub3p4L~=tRFFHGzBZ8Y3?t;++PpFEkg*nqIM$f6~`(@6{LFz+!EjH
zK_QlZuUDWbmZzZZJMjAcEaIpDs6O~a_9FN9qTiz+84hHmf(`WGk8KE_I~misxySR*
zeJ%o80U1GaYml&(sg~7rh~}8cs1*dpi(6+~DWrqas8v0Rs<v%;)w9iUzCJHP;&X8s
z4<Jd5?`pq^3irgKCN-M$QVZf-oDMP+TZ_V3gEiLAa5N4<TO{Df_*k#*c9rq(gg18L
zH`8h^T2xe9FR%5tEjT}f^Iu*6Chz`}wJ#`r41A0`I5c3_<iOF*SO8QwYWJKtjUF4{
zM!Pj~J7Eu+VgU&CP>6mLC|$*JlMs{88H-?$`BZx!*L!y}S4$b0^CWtso6NV<qKg}y
zfqJ9MG>&-IY>%5|ylDYU<{E@3JWKMh55%!6K+EdZLRF?TiA^1#9dV(L4dcN848H^=
zGarVX1%0J0wrN~#1*H37j(}UsIDj(FHpaCiJju_G^&W$5d*P>IfHA_yYNON(0LKR#
z2yjvZQ0xEX&#5p@p_DxV4YC*&99jgs$Y(EvGZzHA7JDw$nXMo&G)i3XOv2L~TLN+y
z;2ZE0Q3pUUav{t`$h0C{3W}TZSK`BU>n^U*EH4tkzH{nM0Z??XX<;MC!ySs|Q3m4H
z1z1`s(_H{q|Ao-wVekhE!hEcndEk5w_t~2LGJMp(Hy^e72;l%4-p_s=hBkF<%k(WV
zezg$uJp%LJ@|F;;_`CX&>+d)601bW*ShZ1LN8tI~+(3XE8^E4#gmB%%VJ47~YP(0q
zlX@vCmNuryN+EGLK|==6aITl02Qh(jFw{EZMoCMavHS!guGFNn|Jx=v;<)ZVg=pNq
z>~`!{gr7oWKSb)9iXven#?0T9i!McOrL2y`I<ynpJy9A&AsquhE{;2Z^)`&~maZF`
z`NWe|;y>_ThW89_@{gbDdn~Lw$PgtTFpI<e@d5lKi&Ad^Jd78Oncozi2sDBOOVR!k
zf{pCMoN~Xj>7qtKOWVNO#%0Jj3c<O#Vz*WS)o`oi5LKLGCm5_mB1BE3pqfEo`~7xa
z0~tE5mEzyUP_2SxnBFqW%Cj%EPG%G|NpD%(#g+7&Y5V;MY_+}L`V&u)?s-27AUFz5
zp7m%&LHl#S<#7atOX}-+Wi(yQh2<UD$>9C7%u2kd=Eb;6By^C+a|OT^<C2U%prX6K
zTuuzew?X}swGyo`K;@z~=J^N+aJj0c`C){SLLnl|e?UYKznWV6(LC<2?#Fzek5#Y$
zK@zP3m|CJdQ^9=#k%$ry5G|{OczIFs)>+iY*O?pF*Pp>16G@o9ISLM3hoQDV4y|X}
zUL4gzWO^R-K-6>L6^9xF2mqCxd7W4Yk1#~o4r8Yr2x8tI3|cGzj5UVGsqq-NWXxUd
zjR%4ry1i1l+UMi0L4b`~vz!Le7(MkFUG#F{m~;V+JI+kPeh_pw`Dzx~=uyJvCcD;|
zr)990Jm(<@JOf}bV(6ZH$JL?xAYx#m<MEXA_N@Zm{>u9KOhSKbH#B_^I+vCYv!Xs%
z+k==Wl);q>a`9ql8W!|05RrFEfl<Yy@?rt`&R)T))~(#YOGQAkaNtJ%nU^=YFowrR
z1pwoJN(*?;a=^m|jw98IGpC+ZKqm?cdz?ckesiSSSS+15kGhB=Jxa+#L5_hu>z*+`
zm+{(rt%AFE6PI{84Lz;3D+t6;KGfndKUV&ml=SwO+RSb}lOXaK>~j#>-Gj9%<7X#Z
z)oXk4l4<+_l%I!zIHRDpuT_xsyrl)EW89O24ng;2{oz>vCs3wyu$uE>IlSJB9{`3z
z_SD2fBn3Dt2jY$<8e0lrCQ7sti52Alag=nK=YAiE5t)vEqLV9PxmqIE@fm4cDB%f0
z9uKw*<jkR7X9@wPc;&M`iulTITKG*(FQFRT0h$qd3&l;FBr@VKgsU_503l=iM%!gG
z;piHbrAyH2ShBMj+spW%tQ4He){o|gLyH7}tm3Rf`O)C=xV&z~#|Qu|fVzQ-QL5n5
z$Kt~Vs}ZU2acg5%G%^Jwp4mvQrO2ZNs}Ta2{O%}p?9)h^MyDZZ{N(r%(&>Tjs~Kvf
zlsO&ah*p>QYGsu&Cy^WBu9<c_uD8rGQ24rR*0lgL9b`f`xoz1-UqE{i$Bozqq}Wfr
zFt2L}7NM0@R3}MuG6ktsL>f1)X-P;Ec@#b^1}i&S;n52o%B}ev+Xs{N(vUJ$D`6hQ
z^LdotggTR96^N`LaEb3qkQTQ9Q>Jyzb7qNrzabDm6j^J`WO>f8Um48>J}F>7xEH^y
zzw+|3EFF$1ohYIEjew^5JFnmles24YL35j7emA={6tLhTDl~-QQpWG@Sy>8GYH^FF
zW{_kbx*oxqu_Em+%FKFT6?MXqSPb&`sO?cJ01!=ql5(I2Q+wV8HA!>g*+88j+cD)&
zf`n#?!iI=IHTo*771#Q^)lxy=^Lo%4&`Ed*=~}84>l072d)x0C{}f_yQI0`;wge=M
zCmttspix|U+;UkhRq!+<qM?3Qas*Zx*9SD-{&6SY;^ojSjK31C-=o%VK(}SJ7pHzf
zc>6B!nvh=HFTAuYSKrIyjIk>Kg;b#G6hO2U!16~GysUX8at>hy_zA04gn(JuAQv1B
z()hFR4Xj_w@PILt$0>`@F=I_4i?Ept;?xZ)NvHCF4>t~FM_;vq7xAwJ5myp=QvlXY
z!IV_Df_$dKt9m+<4&#svE$=DUXQQxIqb&QO(MYE)G)jER{93ld@T&J^V3g%om&Nk^
zc6fc%j2smJ)s(js=HPx~k2|`@g2;~^hXOb*Hrfc7Hjdl#qvyVn_H=Bx>E|pws2;lD
zsVh#S#5`%ay8-o<MiVPHr|JE|4nDw?Xi6K!Sv$@K)Wr3q<P^{ca43;jQ}XM`I3U((
z=mCPD({m8#-ikFLoRcxW8lPYvv;yMX?_zG+fxvfG%#JSsErcQXB255{;c*9fweSsY
zURnEK%=LmxkZw<Ax4;L`E@R#_zOgY!umtYHJY0@hv`AuV5jhc|q;-h10%^OH{9NyT
ziZc92$m|yLZCvxtdMXX5T|Jm1Wtd}}xCD5D&9K}-{4sv9s0@s2Jhv^U5N%n0o%miF
zbZF`lYiSR%i(m4Y&5H@22LJ+{iQmQWVi!tNrI(6Fm4at==%*`(Z`MY!3ANx$eE~T@
zB`^GKOhc`J$dNhmY96bl{mhZyME7Hfek0xEjwF~?z+&TAiw;m2xY#J}&}t>KX_7`l
zaOaprR|cA-anlgOcFZuTEMbH9rgxK1%F=J^sDsk87gB^&gW{&eBk50NGa#P5p-);h
z^=+_xpe1pLiw(P(T67fN1oJyg_ArnOe%0K{08JdeUK^=i0NljO@76{bcfI)jA)NP8
z>Imbdg2Z7%oxk@R>23#(U;U+cpSKF)-!PLw{HXo?h)D`#i4PCvn8^4)!>4&EOop(y
zQiaJLon%eWMpo8z`<*E5*{?FbCjf0>CTr8CinScIeiKX~sxNvy0%>&$2ERw7pVUSi
zBLHv?sA4`?8=>L#ef}7e{y_-81dUXif!2T$YxfR@1y}?-FgBtf00~XBxi-t0e+^<h
z?>|NRi;OwM_dw26hAl)ORBpex%P$2aUY&t{KqN(2rQ|u&;)u_T@wb7eyHJSx(EMt$
zX8mOH-}>i-aIaUUV(EkMINIbMusUOaWrAyYprpX%UI6=k5W<(9K!<Oovti(-fd)Hv
zai^er13c}9%V<*ZxC>xFhSH?=aLN>r=^eALK^wS&%)TBykX5ACYG%8>7SY}-p!rou
zo7X9Z5l-n2lg>>kUZ;S}?>M=?3CkfR-{~ADYIg8@1iU(x=w=~@#tqjJBs5g!Uwq#X
z9`R|k|H^~}VGoh+hXBCTxCiX@aexqb8xP9|6@D@J`e!lr+uouOuxJpXAeaNO&0A1$
zA`Kx>T0IzKTx+KB1M#u}Ybh?WY1mcc8>bd=E;<STUVD173>l3(h(?tjSB!z<$mB4!
zUhC-5;@L^kOPB^dy>ZDZ?-PqBQ6*mwVOhI0-k!1l#F_o!)?jz`-3`;bXCONGc-AV&
z{EO}r!b3j<&zzr{%iqBKuda{Wuv@EEfDXY`^$Et^@A@)sh3DhMK(M4Zi^gS8^B^vG
zfav^Ix?6zfV-lc5pvEXCYGTg(OEtz;d?sp(rp;*thY)|N*GU{)6L$<{jJc`gUG`t$
z;)h$8A=DLQXA;<7di32I?99#RK_J{mOJD-Xh?@U?AzbzTW|&{k;sfiozqQEiS%koA
zc>fk7mBIo13g3DOWBTjSI7+tMY)i9>5u#?|%EJ*RdN4>@iJcWoBJu1$KSwoDf;pWn
zlMGg41$ge`Ocw&S)v>*2UL3-z+K=EKcM!+~Iz#0s4%a=C9sPfwYW8OZ{u<2x*6gPC
z*J?Ke1OUAjK0$b7!wB^ckM9fN`@b2&?eF9b6>Z5cP@gIJC=4`D`>4*mxdcqw0TA8F
zDM9twcjCPauh!oV^G|oM>vT#g1m@FkGffD{J(-Dr6cb-<@8|8^BqR?s>(tBZZ--%X
zz~4o$3*qA5Bj299n2nx)k%v!208qK{kShR;iV(PfA2TQ)3Yb9H{J{`@@OmWREfv$X
zT}X&l0Z3J_j1dM1hne6nknwG!A2;=D$1?>K0-D5rirqy*QJ_!b#@SDX6VTmJ7&Bvf
z54BNCXYbq5M4ho@8J0abgv;K<Z<;QyDO%J2kuX2Z*cfO@F(3qfiWvV%h@<i1*wwc>
z1#kS55bpRVOd<qNoRcmsDk<w2X;c^>3EeC0kxI^eN6SEp6Kni>IUPU-ajsvJUxJc|
zW)fumuowZLPe5PMbP)u0h5g}*(|DvE{p6D8!vr<G+PR%fZ@*{e{s5Mf3w$p44Ti(u
zt~p>F5}pV1|94TcBuYaFyo^j3cqA<VzNfss-(uOppJ5ez7(~I8?G^-<lnDdJUwt?Y
z2N^2pfgV6O^wK<+X(|w6ynCwF5mz$)N$Rb4csI<S_bIB5sB9Z5VIR0jY5B5>gV&jf
zP8$};5gEg~7ymeQb`zFNf5ZiB%#O5M(~qK-vOMs`m9}TUkX5&@WypK)(YH-pfY|UV
znExGRX>4v1AOsLa;jM(^G@}`B`$PygzZnhp`N<Om78bP%NYrN#hYAP;Kl0$ZcsI55
zP<8k%%&H26(_tluz06s30AkvQx$k~&p?S<!(oLuTv5)8<k9Au_a1+<*^5?|9i2tA+
zH9WLqI0D0a(9E%86^1{7){oPw)wu1(cc$?@6?pYiVf^1>70!cNEky?Sb(mjXErq5?
z4k7STa&(OVHg~f?y|#Zggj?UvzJQM-=|Hj?uw6caM%9U0UwlxGM=cxr7|HZ25cusw
z8fYa%%S2O-sCGsMCT$w3<(ba>1lt%1jO$|%(?JO9AdWtdJ`-9o`87Rb+UmcUM^<3v
zRr~+bwoJ&{%&e)cID8|X!Hc^(JrUPE2mSQN95Njh(A6>~(vXOfDqf;yH&4nvK{<Z;
z$o_DhMaJwNT_4T*YU5hzo7DO`#<v^$=dpO}8GPRBRWSa8Q&TsJhtvwMNzwdS;-)DE
zA@C#;|1)7#mPymUE+D!M>tyRkLHwQyqqIUq%@!Pqmqh;Bv1($c8ha52o_^!r_kJt}
zhYeoq!FKqd;D~+&d&_kEjB>Zv#Ft(o5I1fIB@(n0hDjrFNoYr5c#n#ns0k^7F1w0+
z_&u_PJeh$RVHlz8%IXp8$bI%3>(=1j=UJL8PVK$d=3$rvo_f&Q{Y$Z`&Vq3-!DsOk
z1o~%r2fkW?Oi0@ot_pPs=eG|O(h6v;gHc66dxhAhtYOv4#vlKY)}Po1q}Qa*G`r^e
zF@i%a`@uQ&Y}I3=htbTvm?WTb+2$cP@vMDT2<QGH4$}Khw7i{U@Cz{iN1Dkf!(uaO
z;Ga|UeDn!Cs;Pwakv(R7!@pO;{u?VDH`>|tktge(4v}E|=Wk|_xH3`fxRlmo&K<96
z2F8_j5SH<xX6Km<>2^@RbKn9HtsBoXlp!M_5~p1zq(pn(&xLU1KaeM7jWC2bM&Ijr
zJ-m_M&D<kYJs~!Ry~I&CST{oaRm*W5+dw-#>oG1F=GPNtdl$ldO9(K!!LQB^t%o@w
z*#4C~TSUC75jaf#Uwt;#-RD{t0-!0vsC5D2Wk?0DeIU-Y(zFchi&$6tfOpY+=DB^~
ziMZlFT3cMpX!;o9O2=y4Orx8&B1ycMe8@cdG$Sm2AY=R&RhU0*6Hi(FzD$Hy!u&rd
zi&+0YqM`m_$~{5=N(sMDz*7;RoC$ygy6uw)g7;KDZg32ZX3gR{>6WWFfeM)v!WsSX
zI;#iQ)mN1W)&KFe|Eh6E@5jWNzzzAT&CY!>A9s2OxPcK;A!<2=lz}|_jnauqMj;?G
zNtyi0T%&DGEa2F#%4*PUpP9H27*k{w(ebB>SMghYJSOTWzO$FM%^30Dn5a{CLtQd$
zKe&Ra(8pKy?XS3|x&F(qur}sj6aEn9$3xjXOc*-|f+tYypAoKXzW4?y4&K7{$PaOn
z&<F62f2(4qS<BNZ3>wsPDj!A0r9c;gT>2-_v`pGz;1_Cd_@xl;{CE0_4vPY%!|FIC
z)u~~6PtRJ)P;Kq=DhKHdu5D)xZXmPYf$={vkxVBn04y9KfEOeDGC^wqo3RbxhKW62
zM-cp1#c6o(=8Do%QEB>~DWJ*keDGy&4&j^^cXq4ovzy)&!jJz5!Oa+ZD*QSffsq=f
z*B7$jQk;5E=LoxJG1GCscGr;cU4R(E-@^Qu&J%;S2!N~#2!ac_{$0Xz33zeZ1S1C#
z24AbJC)mzVK`Q}EHd=Qx)iO})+s0LotuTN438_t;(;yo^Qu#e}!QSs81nj4oD6r`q
z{R{j>2dGT%ae1qs#uy)CxcqqT28b(sghUPRBm5bRkLR^L&{hGERS7}xV6K0Iz}&`|
zkB?+gTL0Bwup7%@+s9Z}@HxEDH{#L7;T8#vY@_KpDa~rV&dsFE7=PY5m4zVZy^J<Z
z%*E17@okIi%*4%aVYcV*n5ozm-3!xO5XkH@eHBB!bMf3?#<(5<z3PcorpH;?esG-6
zm^i!{#)t6R3zNzz2!cma-B%NyLTH~wkm2AYLeYaswtog;K-m8yyx`mJ>7*zWGRcr7
z>crm~6LRjlimC4@FMjA@y2HU+L%0P&u<^YJ0<2fNV_FOiYWl=Ja28^KS&&7SSDbk(
z9?dl9MU!dWsPR8acpHrWMXN9wFsT9{t1yD#{#?JD@EpRT?0MUNoeEP3=sPed9FA<P
zOwU=Et&Kq{oRX$nY7SHhOpPLOExrN|@eA1Q`TS02e8wiD_Et{Y8a!4I-1Q#LzQc;$
zj|EHfE4g$?U&PrF>SVnHKj}4ePGt<g<SG{EJfOnxPK!=1+)eg4zQQ|T{BKWgQB%wX
z2!iuz(2EE^O}JpP8yECU>pa{G9`}5sGF&)(7xUzpq|VdZ*`q?i>&YTU3d$C|WCZ3q
zgZa2jG$Geqi|h2EFhpOgC_&1FR}l^nG#i!^R#t=83kYUz&=Sk8d$^XN)i(9v2+wpl
zVN%7ggI@zuHkR>E{9>*&2@c`>j=lRsc7C2U_IF^hRqblgZ--3E3_`UJ>~#J{gJrtT
z!1WC>{DS+?7Y~N<8OP5%58*sSged_WTj9Nge}(apmQw?g|C9rQU?GJ*f$+10#}Eby
zQvme*vE5h&n_!ijD%QgOA6J};$DnNm#tEiVB@k&c!%p^V4QsEx&%iVpWY_*0Un1*;
z2M5<PZj``ac7w!L{3+)w$D&_Mp74B#26Lwvz`Hfc!W^y*3IKyGK1n`RZse{2$m=p*
z{|Dicf@sCB^b_p+K8K!u8Bz1Kie=zJrjbLH+GO}Mms-5nUe}gTi}&og&up;^fZS2n
zV?V5}Y_hR0=3iV9#I9`Yojl_-t4`goLiV$S_Ygh-?jgXbg*;SFSuG%>`%;(|!LteX
zp0a{#3vyN5@x2w{FuW510TH^yRzYEKwVN{$hA1FL5EAfATv7}V%|WToMvHi8gWg|P
z(haR6oLl{c@m)pdJaBux!ylm;FNbib?9>B&N29_&5T4SO(q(+{eU#>CsU6zD@12!r
z5%(A=eqGlUfBJtk+qD*Az@q9wTl)>mIfVXY6@2@}9LMf5cwE(p2`{6c-nA4rgRl1!
z-kT!0+k)GiOisH1$m#_#&FA_Ngy$33!Y6Q9mgS;#|AK;b)_vlu_v<a5I>T-H@>L<~
zvTFnCe{dg~@8ocJxv=*aX(4^TzyHMc$NBTQ%+W5{>=3`>$aT$QIw_}Ts;6=oK1|D1
zZTA<t=l;X!@3VrpTMjP?&eGOxSs^i@^|+u2Tfd6W&Rn;dbGhy>Oxn-NSi5bqmdAYC
zX}|V#Hkfbb4&_O(YSR}?`E~w6`Y(SgE~_AhD@h!OtY%ktzJA%#E<8=(oXdeq!R<-=
zw+3%<uT{Sg9#y#C?}sSUwr`T0-!4z*{O|ljr?xa)J$d0{RtMcr3L8rA*w6Z)a>$wU
z-uaRXhF{*d>~ETpH<zz>>HKrc=dD-$!{fNn;4)*n>yCLJI1fF4vAp;4(u0@17_KOZ
zDn+09%ghnQ9xhn$oXy7GnC*j0-#PaGDOTIM>^Aww`Bj)P>fh;Mv^o0Ft;y%V<e}|}
zRnOlqm-UHe4Cv47@Lm3JX-j)Z^W~6i&dZzy&BudhT>UY>WrfO?O#Uxti=;lfG>IKl
z<|yX8%sR&-M52EFhft=^Qy;c}Icrw8-1dwFW59i3fxANUVqJFFJ0>nLvzor3IWW9-
zdBeSj;QI)94h3yK6R{v(Z($+d!*yym%(>qAH42Bg9NRbf0Mo=1U#w#Dm&BL6IBPLu
zBh!LFf5#fDWizJmiwnK;zK~?mzo0EB!C<-h$_Ev3ZTnjl694{xJ7=R8v!jTsbl~bK
zljJvi6aMr4Ph-a9V@^3rN)cU}Mduzl2{30iO|azG+uXu(XlMTWb17|WE-;_{dd?!|
zVY~M>>yrLEwmV$e4l&K$bi;X$pQY8|DmVWZf@?2w*Zpw5Ctw=F@aL(9Z^gma_x_ll
zP;KU25wlr}C(p5HQ+n2^Y`6RN-KSs9|6fyd=GUWJY@2rO*=YVk^k$6N?KZZ@D&Ajb
zGsK;^E;(yqj>qeRLXsj?i4F0J=>?m=9J&QO%B$_IdQt8=algmB@k{0NUoh+cxwFBv
zhC$}BO|PbUI{S3{#|s%Y%-ol`s@`_a;c|=9huUlZwKE-9zWdMDgI}vf8#!DQm;^f<
jSd;`vWInjV@rGg9=CkTyyRYSfZuj(b^>bP0l+XkKpBP0x

literal 0
HcmV?d00001

diff --git a/docs/source/index.rst b/docs/source/index.rst
index 215e6cba6c..5c593eacf4 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -61,6 +61,7 @@ The library currently contains PyTorch and Tensorflow implementations, pre-train
     quickstart
     glossary
     pretrained_models
+    usage
     model_sharing
     examples
     notebooks
diff --git a/docs/source/usage.rst b/docs/source/usage.rst
new file mode 100644
index 0000000000..8fb7a44727
--- /dev/null
+++ b/docs/source/usage.rst
@@ -0,0 +1,597 @@
+Usage
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+This page shows the most frequent use-cases when using the library. The models available allow for many different
+configurations and great versatility in use-cases. The simplest ones are presented here, showcasing usage
+for tasks such as question answering, sequence classification, named entity recognition and others.
+
+These examples leverage auto-models, which are classes that will instantiate a model according to a given checkpoint,
+automatically selecting the correct model architecture. Please check the :class:`~transformers.AutoModel` documentation
+for more information.
+Feel free to modify the code to be more specific and adapt it to your specific use-case.
+
+In order for a model to perform well on a task, it must be loaded from a checkpoint corresponding to that task. These
+checkpoints are usually pre-trained on a large corpus of data and fine-tuned on a specific task. This means the
+following:
+
+- Not all models were fine-tuned on all tasks. If you want to fine-tune a model on a specific task, you can leverage
+  one of the `run_$TASK.py` scripts in the
+  `examples <https://github.com/huggingface/transformers/tree/master/examples>`_ directory.
+- Fine-tuned models were fine-tuned on a specific dataset. This dataset may or may not overlap with your use-case
+  and domain. As mentioned previously, you may leverage the
+  `examples <https://github.com/huggingface/transformers/tree/master/examples>`_ scripts to fine-tune your model, or you
+  may create your own training script.
+
+In order to perform inference on a task, several mechanisms are made available by the library:
+
+- Pipelines: very easy-to-use abstractions, which require as little as two lines of code.
+- Using a model directly with a tokenizer (PyTorch/TensorFlow): the full inference using the model. Less abstraction,
+  but much more powerful.
+
+Both approaches are showcased here.
+
+.. note::
+
+    All tasks presented here leverage pre-trained checkpoints that were fine-tuned on specific tasks. Loading a
+    checkpoint that was not fine-tuned on a specific task would load only the base transformer layers and not the
+    additional head that is used for the task, initializing the weights of that head randomly.
+
+    This would produce random output.
+
+Sequence Classification
+--------------------------
+
+Sequence classification is the task of classifying sequences according to a given number of classes. An example
+of sequence classification is the GLUE dataset, which is entirely based on that task. If you would like to fine-tune
+a model on a GLUE sequence classification task, you may leverage the
+`run_glue.py <https://github.com/huggingface/transformers/tree/master/examples/run_glue.py>`_ or
+`run_tf_glue.py <https://github.com/huggingface/transformers/tree/master/examples/run_tf_glue.py>`_ scripts.
+
+Here is an example using the pipelines to do sentiment analysis: identifying if a sequence is positive or negative.
+It leverages a fine-tuned model on sst2, which is a GLUE task.
+
+::
+
+    from transformers import pipeline
+
+    nlp = pipeline("sentiment-analysis")
+
+    print(nlp("I hate you"))
+    print(nlp("I love you"))
+
+This returns a label ("POSITIVE" or "NEGATIVE") alongside a score, as follows:
+
+::
+
+    [{'label': 'NEGATIVE', 'score': 0.9991129}]
+    [{'label': 'POSITIVE', 'score': 0.99986565}]
+
+
+Here is an example of doing a sequence classification using a model to determine if two sequences are paraphrases
+of each other. The process is the following:
+
+- Instantiate a tokenizer and a model from the checkpoint name. The model is identified as a BERT model and is loaded
+  with the weights stored in the checkpoint.
+- Build a sequence from the two sentences, with the correct model-specific separators, token type ids
+  and attention masks (:func:`~transformers.PreTrainedTokenizer.encode` and
+  :func:`~transformers.PreTrainedTokenizer.encode_plus` take care of this)
+- Pass this sequence through the model so that it is classified in one of the two available classes: 0
+  (not a paraphrase) and 1 (is a paraphrase)
+- Compute the softmax of the result to get probabilities over the classes
+- Print the results
+
+::
+
+    ## PYTORCH CODE
+    from transformers import AutoTokenizer, AutoModelForSequenceClassification
+    import torch
+
+    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased-finetuned-mrpc")
+    model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased-finetuned-mrpc")
+
+    classes = ["not paraphrase", "is paraphrase"]
+
+    sequence_0 = "The company HuggingFace is based in New York City"
+    sequence_1 = "Apples are especially bad for your health"
+    sequence_2 = "HuggingFace's headquarters are situated in Manhattan"
+
+    paraphrase = tokenizer.encode_plus(sequence_0, sequence_2, return_tensors="pt")
+    not_paraphrase = tokenizer.encode_plus(sequence_0, sequence_1, return_tensors="pt")
+
+    paraphrase_classification_logits = model(**paraphrase)[0]
+    not_paraphrase_classification_logits = model(**not_paraphrase)[0]
+
+    paraphrase_results = torch.softmax(paraphrase_classification_logits, dim=1).tolist()[0]
+    not_paraphrase_results = torch.softmax(not_paraphrase_classification_logits, dim=1).tolist()[0]
+
+    print("Should be paraphrase")
+    for i in range(len(classes)):
+        print(f"{classes[i]}: {round(paraphrase_results[i] * 100)}%")
+
+    print("\nShould not be paraphrase")
+    for i in range(len(classes)):
+        print(f"{classes[i]}: {round(not_paraphrase_results[i] * 100)}%")
+    ## TENSORFLOW CODE
+    from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
+    import tensorflow as tf
+
+    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased-finetuned-mrpc")
+    model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased-finetuned-mrpc")
+
+    classes = ["not paraphrase", "is paraphrase"]
+
+    sequence_0 = "The company HuggingFace is based in New York City"
+    sequence_1 = "Apples are especially bad for your health"
+    sequence_2 = "HuggingFace's headquarters are situated in Manhattan"
+
+    paraphrase = tokenizer.encode_plus(sequence_0, sequence_2, return_tensors="tf")
+    not_paraphrase = tokenizer.encode_plus(sequence_0, sequence_1, return_tensors="tf")
+
+    paraphrase_classification_logits = model(paraphrase)[0]
+    not_paraphrase_classification_logits = model(not_paraphrase)[0]
+
+    paraphrase_results = tf.nn.softmax(paraphrase_classification_logits, axis=1).numpy()[0]
+    not_paraphrase_results = tf.nn.softmax(not_paraphrase_classification_logits, axis=1).numpy()[0]
+
+    print("Should be paraphrase")
+    for i in range(len(classes)):
+        print(f"{classes[i]}: {round(paraphrase_results[i] * 100)}%")
+
+    print("\nShould not be paraphrase")
+    for i in range(len(classes)):
+        print(f"{classes[i]}: {round(not_paraphrase_results[i] * 100)}%")
+
+This outputs the following results:
+
+::
+
+    Should be paraphrase
+    not paraphrase: 10%
+    is paraphrase: 90%
+
+    Should not be paraphrase
+    not paraphrase: 94%
+    is paraphrase: 6%
+
+Extractive Question Answering
+----------------------------------------------------
+
+Extractive Question Answering is the task of extracting an answer from a text given a question. An example of a
+question answering dataset is the SQuAD dataset, which is entirely based on that task. If you would like to fine-tune
+a model on a SQuAD task, you may leverage the `run_squad.py`.
+
+Here is an example using the pipelines to do question answering: extracting an answer from a text given a question.
+It leverages a fine-tuned model on SQuAD.
+
+::
+
+    from transformers import pipeline
+
+    nlp = pipeline("question-answering")
+
+    context = r"""
+    Extractive Question Answering is the task of extracting an answer from a text given a question. An example of a
+    question answering dataset is the SQuAD dataset, which is entirely based on that task. If you would like to fine-tune
+    a model on a SQuAD task, you may leverage the `run_squad.py`.
+    """
+
+    print(nlp(question="What is extractive question answering?", context=context))
+    print(nlp(question="What is a good example of a question answering dataset?", context=context))
+
+This returns an answer extracted from the text, a confidence score, alongside "start" and "end" values which
+are the positions of the extracted answer in the text.
+
+::
+
+    {'score': 0.622232091629833, 'start': 34, 'end': 96, 'answer': 'the task of extracting an answer from a text given a question.'}
+    {'score': 0.5115299158662765, 'start': 147, 'end': 161, 'answer': 'SQuAD dataset,'}
+
+
+Here is an example of question answering using a model and a tokenizer. The process is the following:
+
+- Instantiate a tokenizer and a model from the checkpoint name. The model is identified as a BERT model and loads it
+  with the weights stored in the checkpoint.
+- Define a text and a few questions.
+- Iterate over the questions and build a sequence from the text and the current question, with the correct
+  model-specific separators, token type ids and attention masks.
+- Pass this sequence through the model. This outputs a range of scores across the entire sequence tokens (question and
+  text), for both the start and end positions.
+- Compute the softmax of the result to get probabilities over the tokens
+- Fetch the tokens from the identified start and stop values, convert those tokens to a string.
+- Print the results
+
+::
+
+    ## PYTORCH CODE
+    from transformers import AutoTokenizer, AutoModelForQuestionAnswering
+    import torch
+
+    tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
+    model = AutoModelForQuestionAnswering.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
+
+    text = r"""
+    🤗 Transformers (formerly known as pytorch-transformers and pytorch-pretrained-bert) provides general-purpose
+    architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet…) for Natural Language Understanding (NLU) and Natural
+    Language Generation (NLG) with over 32+ pretrained models in 100+ languages and deep interoperability between
+    TensorFlow 2.0 and PyTorch.
+    """
+
+    questions = [
+        "How many pretrained models are available in Transformers?",
+        "What does Transformers provide?",
+        "Transformers provides interoperability between which frameworks?",
+    ]
+
+    for question in questions:
+        inputs = tokenizer.encode_plus(question, text, add_special_tokens=True, return_tensors="pt")
+        input_ids = inputs["input_ids"].tolist()[0]
+
+        text_tokens = tokenizer.convert_ids_to_tokens(input_ids)
+        answer_start_scores, answer_end_scores = model(**inputs)
+
+        answer_start = torch.argmax(
+            answer_start_scores
+        )  # Get the most likely beginning of answer with the argmax of the score
+        answer_end = torch.argmax(answer_end_scores) + 1  # Get the most likely end of answer with the argmax of the score
+
+        answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))
+
+        print(f"Question: {question}")
+        print(f"Answer: {answer}\n")
+    ## TENSORFLOW CODE
+    from transformers import AutoTokenizer, TFAutoModelForQuestionAnswering
+    import tensorflow as tf
+
+    tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
+    model = TFAutoModelForQuestionAnswering.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
+
+    text = r"""
+    🤗 Transformers (formerly known as pytorch-transformers and pytorch-pretrained-bert) provides general-purpose
+    architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet…) for Natural Language Understanding (NLU) and Natural
+    Language Generation (NLG) with over 32+ pretrained models in 100+ languages and deep interoperability between
+    TensorFlow 2.0 and PyTorch.
+    """
+
+    questions = [
+        "How many pretrained models are available in Transformers?",
+        "What does Transformers provide?",
+        "Transformers provides interoperability between which frameworks?",
+    ]
+
+    for question in questions:
+        inputs = tokenizer.encode_plus(question, text, add_special_tokens=True, return_tensors="tf")
+        input_ids = inputs["input_ids"].numpy()[0]
+
+        text_tokens = tokenizer.convert_ids_to_tokens(input_ids)
+        answer_start_scores, answer_end_scores = model(inputs)
+
+        answer_start = tf.argmax(
+            answer_start_scores, axis=1
+        ).numpy()[0]  # Get the most likely beginning of answer with the argmax of the score
+        answer_end = (
+            tf.argmax(answer_end_scores, axis=1) + 1
+        ).numpy()[0]  # Get the most likely end of answer with the argmax of the score
+        answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))
+
+        print(f"Question: {question}")
+        print(f"Answer: {answer}\n")
+
+This outputs the questions followed by the predicted answers:
+
+::
+
+    Question: How many pretrained models are available in Transformers?
+    Answer: over 32 +
+
+    Question: What does Transformers provide?
+    Answer: general - purpose architectures
+
+    Question: Transformers provides interoperability between which frameworks?
+    Answer: tensorflow 2 . 0 and pytorch
+
+
+
+Language Modeling
+----------------------------------------------------
+
+Language modeling is the task of fitting a model to a corpus, which can be domain specific. All popular transformer
+based models are trained using a variant of language modeling, e.g. BERT with masked language modeling, GPT-2 with
+causal language modeling.
+
+Language modeling can be useful outside of pre-training as well, for example to shift the model distribution to be
+domain-specific: using a language model trained over a very large corpus, and then fine-tuning it to a news dataset
+or on scientific papers e.g. `LysandreJik/arxiv-nlp <https://huggingface.co/lysandre/arxiv-nlp>`__.
+
+Masked Language Modeling
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Masked language modeling is the task of masking tokens in a sequence with a masking token, and prompting the model to
+fill that mask with an appropriate token. This allows the model to attend to both the right context (tokens on the
+right of the mask) and the left context (tokens on the left of the mask). Such a training creates a strong basis
+for downstream tasks requiring bi-directional context such as SQuAD (question answering,
+see `Lewis, Liu, Goyal et al. <https://arxiv.org/abs/1910.13461>`__, part 4.2).
+
+Here is an example of using pipelines to replace a mask from a sequence:
+
+::
+
+    from transformers import pipeline
+
+    nlp = pipeline("fill-mask")
+    print(nlp(f"HuggingFace is creating a {nlp.tokenizer.mask_token} that the community uses to solve NLP tasks."))
+
+This outputs the sequences with the mask filled, the confidence score as well as the token id in the tokenizer
+vocabulary:
+
+::
+
+    [
+        {'sequence': '<s> HuggingFace is creating a tool that the community uses to solve NLP tasks.</s>', 'score': 0.15627853572368622, 'token': 3944},
+        {'sequence': '<s> HuggingFace is creating a framework that the community uses to solve NLP tasks.</s>', 'score': 0.11690319329500198, 'token': 7208},
+        {'sequence': '<s> HuggingFace is creating a library that the community uses to solve NLP tasks.</s>', 'score': 0.058063216507434845, 'token': 5560},
+        {'sequence': '<s> HuggingFace is creating a database that the community uses to solve NLP tasks.</s>', 'score': 0.04211743175983429, 'token': 8503},
+        {'sequence': '<s> HuggingFace is creating a prototype that the community uses to solve NLP tasks.</s>', 'score': 0.024718601256608963, 'token': 17715}
+    ]
+
+Here is an example doing masked language modeling using a model and a tokenizer. The process is the following:
+
+- Instantiate a tokenizer and a model from the checkpoint name. The model is identified as a DistilBERT model and
+  loads it with the weights stored in the checkpoint.
+- Define a sequence with a masked token, placing the :obj:`tokenizer.mask_token` instead of a word.
+- Encode that sequence into IDs and find the position of the masked token in that list of IDs.
+- Retrieve the predictions at the index of the mask token: this tensor has the same size as the vocabulary, and the
+  values are the scores attributed to each token. The model gives a higher score to tokens it deems probable in that
+  context.
+- Retrieve the top 5 tokens using the PyTorch :obj:`topk` or TensorFlow :obj:`top_k` methods.
+- Replace the mask token by the tokens and print the results
+
+::
+
+    ## PYTORCH CODE
+    from transformers import AutoModelWithLMHead, AutoTokenizer
+    import torch
+
+    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased")
+    model = AutoModelWithLMHead.from_pretrained("distilbert-base-cased")
+
+    sequence = f"Distilled models are smaller than the models they mimic. Using them instead of the large versions would help {tokenizer.mask_token} our carbon footprint."
+
+    input = tokenizer.encode(sequence, return_tensors="pt")
+    mask_token_index = torch.where(input == tokenizer.mask_token_id)[1]
+
+    token_logits = model(input)[0]
+    mask_token_logits = token_logits[0, mask_token_index, :]
+
+    top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()
+
+    for token in top_5_tokens:
+        print(sequence.replace(tokenizer.mask_token, tokenizer.decode([token])))
+    ## TENSORFLOW CODE
+    from transformers import TFAutoModelWithLMHead, AutoTokenizer
+    import tensorflow as tf
+
+    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased")
+    model = TFAutoModelWithLMHead.from_pretrained("distilbert-base-cased")
+
+    sequence = f"Distilled models are smaller than the models they mimic. Using them instead of the large versions would help {tokenizer.mask_token} our carbon footprint."
+
+    input = tokenizer.encode(sequence, return_tensors="tf")
+    mask_token_index = tf.where(input == tokenizer.mask_token_id)[0, 1]
+
+    token_logits = model(input)[0]
+    mask_token_logits = token_logits[0, mask_token_index, :]
+
+    top_5_tokens = tf.math.top_k(mask_token_logits, 5).indices.numpy()
+
+    for token in top_5_tokens:
+        print(sequence.replace(tokenizer.mask_token, tokenizer.decode([token])))
+
+This prints five sequences, with the top 5 tokens predicted by the model:
+
+::
+
+    Distilled models are smaller than the models they mimic. Using them instead of the large versions would help reduce our carbon footprint.
+    Distilled models are smaller than the models they mimic. Using them instead of the large versions would help increase our carbon footprint.
+    Distilled models are smaller than the models they mimic. Using them instead of the large versions would help decrease our carbon footprint.
+    Distilled models are smaller than the models they mimic. Using them instead of the large versions would help offset our carbon footprint.
+    Distilled models are smaller than the models they mimic. Using them instead of the large versions would help improve our carbon footprint.
+
+
+Causal Language Modeling
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Causal language modeling is the task of predicting the token following a sequence of tokens. In this situation, the
+model only attends to the left context (tokens on the left of the mask). Such a training is particularly interesting
+for generation tasks.
+
+There is currently no pipeline to do causal language modeling/generation.
+
+Here is an example using the tokenizer and model, leveraging the :func:`~transformers.PreTrainedModel.generate` method
+to generate the tokens following the initial sequence in PyTorch, and creating a simple loop in TensorFlow.
+
+::
+
+    ## PYTORCH CODE
+    from transformers import AutoModelWithLMHead, AutoTokenizer
+
+    tokenizer = AutoTokenizer.from_pretrained("gpt2")
+    model = AutoModelWithLMHead.from_pretrained("gpt2")
+
+    sequence = f"Hugging Face is based in DUMBO, New York City, and is"
+
+    input = tokenizer.encode(sequence, return_tensors="pt")
+    generated = model.generate(input, max_length=50)
+
+    resulting_string = tokenizer.decode(generated.tolist()[0])
+    print(resulting_string)
+    ## TENSORFLOW CODE
+    from transformers import TFAutoModelWithLMHead, AutoTokenizer
+    import tensorflow as tf
+
+    tokenizer = AutoTokenizer.from_pretrained("gpt2")
+    model = TFAutoModelWithLMHead.from_pretrained("gpt2")
+
+    sequence = f"Hugging Face is based in DUMBO, New York City, and is"
+    generated = tokenizer.encode(sequence)
+
+    for i in range(50):
+        predictions = model(tf.constant([generated]))[0]
+        token = tf.argmax(predictions[0], axis=1)[-1].numpy()
+        generated += [token]
+
+    resulting_string = tokenizer.decode(generated)
+    print(resulting_string)
+
+
+This outputs a (hopefully) coherent string from the original sequence, as the
+:func:`~transformers.PreTrainedModel.generate` samples from a top_p/top_k distribution:
+
+::
+
+    Hugging Face is based in DUMBO, New York City, and is a live-action TV series based on the novel by John
+    Carpenter, and its producers, David Kustlin and Steve Pichar. The film is directed by!
+
+
+Named Entity Recognition
+----------------------------------------------------
+
+Named Entity Recognition (NER) is the task of classifying tokens according to a class, for example identifying a
+token as a person, an organisation or a location.
+An example of a named entity recognition dataset is the CoNLL-2003 dataset, which is entirely based on that task.
+If you would like to fine-tune a model on an NER task, you may leverage the `ner/run_ner.py` (PyTorch),
+`ner/run_pl_ner.py` (leveraging pytorch-lightning) or the `ner/run_tf_ner.py` (TensorFlow) scripts.
+
+Here is an example using the pipelines to do named entity recognition, trying to identify tokens as belonging to one
+of 9 classes:
+
+- O, Outside of a named entity
+- B-MISC, Beginning of a miscellaneous entity right after another miscellaneous entity
+- I-MISC, Miscellaneous entity
+- B-PER, Beginning of a person's name right after another person's name
+- I-PER, Person's name
+- B-ORG, Beginning of an organisation right after another organisation
+- I-ORG, Organisation
+- B-LOC, Beginning of a location right after another location
+- I-LOC, Location
+
+It leverages a fine-tuned model on CoNLL-2003, fine-tuned by `@stefan-it <https://github.com/stefan-it>`__ from
+`dbmdz <https://github.com/dbmdz>`__.
+
+::
+
+    from transformers import pipeline
+
+    nlp = pipeline("ner")
+
+    sequence = "Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, therefore very" \
+               "close to the Manhattan Bridge which is visible from the window."
+
+    print(nlp(sequence))
+
+This outputs a list of all words that have been identified as an entity from the 9 classes defined above. Here are the
+expected results:
+
+::
+
+    [
+        {'word': 'Hu', 'score': 0.9995632767677307, 'entity': 'I-ORG'},
+        {'word': '##gging', 'score': 0.9915938973426819, 'entity': 'I-ORG'},
+        {'word': 'Face', 'score': 0.9982671737670898, 'entity': 'I-ORG'},
+        {'word': 'Inc', 'score': 0.9994403719902039, 'entity': 'I-ORG'},
+        {'word': 'New', 'score': 0.9994346499443054, 'entity': 'I-LOC'},
+        {'word': 'York', 'score': 0.9993270635604858, 'entity': 'I-LOC'},
+        {'word': 'City', 'score': 0.9993864893913269, 'entity': 'I-LOC'},
+        {'word': 'D', 'score': 0.9825621843338013, 'entity': 'I-LOC'},
+        {'word': '##UM', 'score': 0.936983048915863, 'entity': 'I-LOC'},
+        {'word': '##BO', 'score': 0.8987102508544922, 'entity': 'I-LOC'},
+        {'word': 'Manhattan', 'score': 0.9758241176605225, 'entity': 'I-LOC'},
+        {'word': 'Bridge', 'score': 0.990249514579773, 'entity': 'I-LOC'}
+    ]
+
+Note how the words "Hugging Face" have been identified as an organisation, and "New York City", "DUMBO" and
+"Manhattan Bridge" have been identified as locations.
+
+Here is an example doing named entity recognition using a model and a tokenizer. The process is the following:
+
+- Instantiate a tokenizer and a model from the checkpoint name. The model is identified as a BERT model and
+  loads it with the weights stored in the checkpoint.
+- Define the label list with which the model was trained.
+- Define a sequence with known entities, such as "Hugging Face" as an organisation and "New York City" as a location.
+- Split words into tokens so that they can be mapped to the predictions. We use a small hack by firstly completely
+  encoding and decoding the sequence, so that we're left with a string that contains the special tokens.
+- Encode that sequence into IDs (special tokens are added automatically).
+- Retrieve the predictions by passing the input to the model and getting the first output. This results in a
+  distribution over the 9 possible classes for each token. We take the argmax to retrieve the most likely class
+  for each token.
+- Zip together each token with its prediction and print it.
+
+::
+
+    ## PYTORCH CODE
+    from transformers import AutoModelForTokenClassification, AutoTokenizer
+    import torch
+
+    model = AutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
+    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
+
+    label_list = [
+        "O",       # Outside of a named entity
+        "B-MISC",  # Beginning of a miscellaneous entity right after another miscellaneous entity
+        "I-MISC",  # Miscellaneous entity
+        "B-PER",   # Beginning of a person's name right after another person's name
+        "I-PER",   # Person's name
+        "B-ORG",   # Beginning of an organisation right after another organisation
+        "I-ORG",   # Organisation
+        "B-LOC",   # Beginning of a location right after another location
+        "I-LOC"    # Location
+    ]
+
+    sequence = "Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, therefore very" \
+               "close to the Manhattan Bridge."
+
+    # Bit of a hack to get the tokens with the special tokens
+    tokens = tokenizer.tokenize(tokenizer.decode(tokenizer.encode(sequence)))
+    inputs = tokenizer.encode(sequence, return_tensors="pt")
+
+    outputs = model(inputs)[0]
+    predictions = torch.argmax(outputs, dim=2)
+
+    print([(token, label_list[prediction]) for token, prediction in zip(tokens, predictions[0].tolist())])
+    ## TENSORFLOW CODE
+    from transformers import TFAutoModelForTokenClassification, AutoTokenizer
+    import tensorflow as tf
+
+    model = TFAutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
+    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
+
+    label_list = [
+        "O",       # Outside of a named entity
+        "B-MISC",  # Beginning of a miscellaneous entity right after another miscellaneous entity
+        "I-MISC",  # Miscellaneous entity
+        "B-PER",   # Beginning of a person's name right after another person's name
+        "I-PER",   # Person's name
+        "B-ORG",   # Beginning of an organisation right after another organisation
+        "I-ORG",   # Organisation
+        "B-LOC",   # Beginning of a location right after another location
+        "I-LOC"    # Location
+    ]
+
+    sequence = "Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, therefore very" \
+               "close to the Manhattan Bridge."
+
+    # Bit of a hack to get the tokens with the special tokens
+    tokens = tokenizer.tokenize(tokenizer.decode(tokenizer.encode(sequence)))
+    inputs = tokenizer.encode(sequence, return_tensors="tf")
+
+    outputs = model(inputs)[0]
+    predictions = tf.argmax(outputs, axis=2)
+
+    print([(token, label_list[prediction]) for token, prediction in zip(tokens, predictions[0].numpy())])
+
+This outputs a list of each token mapped to their prediction. Differently from the pipeline, here every token has
+a prediction as we didn't remove the "O" class, which means that no particular entity was found on that token. The
+following array should be the output:
+
+::
+
+    [('[CLS]', 'O'), ('Hu', 'I-ORG'), ('##gging', 'I-ORG'), ('Face', 'I-ORG'), ('Inc', 'I-ORG'), ('.', 'O'), ('is', 'O'), ('a', 'O'), ('company', 'O'), ('based', 'O'), ('in', 'O'), ('New', 'I-LOC'), ('York', 'I-LOC'), ('City', 'I-LOC'), ('.', 'O'), ('Its', 'O'), ('headquarters', 'O'), ('are', 'O'), ('in', 'O'), ('D', 'I-LOC'), ('##UM', 'I-LOC'), ('##BO', 'I-LOC'), (',', 'O'), ('therefore', 'O'), ('very', 'O'), ('##c', 'O'), ('##lose', 'O'), ('to', 'O'), ('the', 'O'), ('Manhattan', 'I-LOC'), ('Bridge', 'I-LOC'), ('.', 'O'), ('[SEP]', 'O')]

From 7a7ee28cb9b930018e9ca49d18c0445fb14badd6 Mon Sep 17 00:00:00 2001
From: Jhuo IH <41447049+autoih@users.noreply.github.com>
Date: Tue, 25 Feb 2020 11:06:57 -0800
Subject: [PATCH 05/80] missing ner link (#2967)

---
 examples/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/README.md b/examples/README.md
index 8621fc75d9..17b4c2f7f6 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -22,7 +22,7 @@ pip install -r ./examples/requirements.txt
 | [GLUE](#glue) | Examples running BERT/XLM/XLNet/RoBERTa on the 9 GLUE tasks. Examples feature distributed training as well as half-precision. |
 | [SQuAD](#squad) | Using BERT/RoBERTa/XLNet/XLM for question answering, examples with distributed training. |
 | [Multiple Choice](#multiple-choice) | Examples running BERT/XLNet/RoBERTa on the SWAG/RACE/ARC tasks. |
-| [Named Entity Recognition](#named-entity-recognition) | Using BERT for Named Entity Recognition (NER) on the CoNLL 2003 dataset, examples with distributed training. |
+| [Named Entity Recognition](https://github.com/huggingface/transformers/tree/master/examples/ner) | Using BERT for Named Entity Recognition (NER) on the CoNLL 2003 dataset, examples with distributed training. |
 | [XNLI](#xnli) | Examples running BERT/XLM on the XNLI benchmark. |
 | [Adversarial evaluation of model performances](#adversarial-evaluation-of-model-performances) | Testing a model with adversarial evaluation of natural language inference on the Heuristic Analysis for NLI Systems (HANS) dataset (McCoy et al., 2019.) |
 

From e8ce63ff2163259276fc0a4a2f35b836fe9f4aa0 Mon Sep 17 00:00:00 2001
From: srush <sasha.rush@gmail.com>
Date: Tue, 25 Feb 2020 14:47:43 -0500
Subject: [PATCH 06/80] Change masking to direct labeling for TPU support.
 (#2982)

* change masking to direct labelings

* fix black

* switch to ignore index

* .

* fix black
---
 src/transformers/modeling_bert.py       | 6 ++++--
 src/transformers/modeling_distilbert.py | 6 ++++--
 src/transformers/modeling_roberta.py    | 7 +++++--
 src/transformers/modeling_xlnet.py      | 6 ++++--
 4 files changed, 17 insertions(+), 8 deletions(-)

diff --git a/src/transformers/modeling_bert.py b/src/transformers/modeling_bert.py
index 902af3111b..400ecb33ff 100644
--- a/src/transformers/modeling_bert.py
+++ b/src/transformers/modeling_bert.py
@@ -1382,8 +1382,10 @@ class BertForTokenClassification(BertPreTrainedModel):
             # Only keep active parts of the loss
             if attention_mask is not None:
                 active_loss = attention_mask.view(-1) == 1
-                active_logits = logits.view(-1, self.num_labels)[active_loss]
-                active_labels = labels.view(-1)[active_loss]
+                active_logits = logits.view(-1, self.num_labels)
+                active_labels = torch.where(
+                    active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels)
+                )
                 loss = loss_fct(active_logits, active_labels)
             else:
                 loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
diff --git a/src/transformers/modeling_distilbert.py b/src/transformers/modeling_distilbert.py
index 787346a71d..ce715034ac 100644
--- a/src/transformers/modeling_distilbert.py
+++ b/src/transformers/modeling_distilbert.py
@@ -818,8 +818,10 @@ class DistilBertForTokenClassification(DistilBertPreTrainedModel):
             # Only keep active parts of the loss
             if attention_mask is not None:
                 active_loss = attention_mask.view(-1) == 1
-                active_logits = logits.view(-1, self.num_labels)[active_loss]
-                active_labels = labels.view(-1)[active_loss]
+                active_logits = logits.view(-1, self.num_labels)
+                active_labels = torch.where(
+                    active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels)
+                )
                 loss = loss_fct(active_logits, active_labels)
             else:
                 loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
diff --git a/src/transformers/modeling_roberta.py b/src/transformers/modeling_roberta.py
index e4cde61a33..0fdc1a255f 100644
--- a/src/transformers/modeling_roberta.py
+++ b/src/transformers/modeling_roberta.py
@@ -542,13 +542,16 @@ class RobertaForTokenClassification(BertPreTrainedModel):
         logits = self.classifier(sequence_output)
 
         outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here
+
         if labels is not None:
             loss_fct = CrossEntropyLoss()
             # Only keep active parts of the loss
             if attention_mask is not None:
                 active_loss = attention_mask.view(-1) == 1
-                active_logits = logits.view(-1, self.num_labels)[active_loss]
-                active_labels = labels.view(-1)[active_loss]
+                active_logits = logits.view(-1, self.num_labels)
+                active_labels = torch.where(
+                    active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels)
+                )
                 loss = loss_fct(active_logits, active_labels)
             else:
                 loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
diff --git a/src/transformers/modeling_xlnet.py b/src/transformers/modeling_xlnet.py
index 6ac65f4f1d..531b0f9a4c 100644
--- a/src/transformers/modeling_xlnet.py
+++ b/src/transformers/modeling_xlnet.py
@@ -1264,8 +1264,10 @@ class XLNetForTokenClassification(XLNetPreTrainedModel):
             # Only keep active parts of the loss
             if attention_mask is not None:
                 active_loss = attention_mask.view(-1) == 1
-                active_logits = logits.view(-1, self.num_labels)[active_loss]
-                active_labels = labels.view(-1)[active_loss]
+                active_logits = logits.view(-1, self.num_labels)
+                active_labels = torch.where(
+                    active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels)
+                )
                 loss = loss_fct(active_logits, active_labels)
             else:
                 loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

From c913eb9c3894b4031dc059d22b42e38a5fcef989 Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Tue, 25 Feb 2020 22:51:25 +0100
Subject: [PATCH 07/80] Add integration tests for xlm roberta modelling and xlm
 roberta tokenzier (#3014)

* add first files

* add xlm roberta integration tests

* make style

* flake 8 issues solved
---
 tests/test_modeling_xlm_roberta.py     |  68 +++++++++++++++
 tests/test_tokenization_xlm_roberta.py | 111 +++++++++++++++++++++++++
 2 files changed, 179 insertions(+)
 create mode 100644 tests/test_modeling_xlm_roberta.py
 create mode 100644 tests/test_tokenization_xlm_roberta.py

diff --git a/tests/test_modeling_xlm_roberta.py b/tests/test_modeling_xlm_roberta.py
new file mode 100644
index 0000000000..8c6bd0069b
--- /dev/null
+++ b/tests/test_modeling_xlm_roberta.py
@@ -0,0 +1,68 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import unittest
+
+from transformers import is_torch_available
+
+from .utils import slow
+
+
+if is_torch_available():
+    import torch
+    from transformers import XLMRobertaModel
+
+
+class XLMRobertaModelIntegrationTest(unittest.TestCase):
+    @slow
+    def test_xlm_roberta_base(self):
+        model = XLMRobertaModel.from_pretrained("xlm-roberta-base")
+        input_ids = torch.tensor([0, 581, 10269, 83, 99942, 136, 60742, 23, 70, 80583, 18276, 2]).unsqueeze(
+            0
+        )  # The dog is cute and lives in the garden house
+
+        expected_output_shape = torch.Size((1, 12, 768))  # batch_size, sequence_length, embedding_vector_dim
+        expected_output_values_last_dim = torch.tensor(
+            [-0.0101, 0.1218, -0.0803, 0.0801, 0.1327, 0.0776, -0.1215, 0.2383, 0.3338, 0.3106, 0.0300, 0.0252]
+        ).unsqueeze(0)
+        #  xlmr = torch.hub.load('pytorch/fairseq', 'xlmr.base')
+        #  xlmr.eval()
+        #  expected_output_values_last_dim = xlmr.extract_features(input_ids[0])[:, :, -1]
+
+        output = model(input_ids)[0].detach()
+        self.assertEqual(output.shape, expected_output_shape)
+        # compare the actual values for a slice of last dim
+        self.assertTrue(torch.allclose(output[:, :, -1], expected_output_values_last_dim, atol=1e-3))
+
+    @slow
+    def test_xlm_roberta_large(self):
+        model = XLMRobertaModel.from_pretrained("xlm-roberta-large")
+        input_ids = torch.tensor([0, 581, 10269, 83, 99942, 136, 60742, 23, 70, 80583, 18276, 2]).unsqueeze(
+            0
+        )  # The dog is cute and lives in the garden house
+
+        expected_output_shape = torch.Size((1, 12, 1024))  # batch_size, sequence_length, embedding_vector_dim
+        expected_output_values_last_dim = torch.tensor(
+            [-0.0699, -0.0318, 0.0705, -0.1241, 0.0999, -0.0520, 0.1004, -0.1838, -0.4704, 0.1437, 0.0821, 0.0126]
+        ).unsqueeze(0)
+        #  xlmr = torch.hub.load('pytorch/fairseq', 'xlmr.large')
+        #  xlmr.eval()
+        #  expected_output_values_last_dim = xlmr.extract_features(input_ids[0])[:, :, -1]
+
+        output = model(input_ids)[0].detach()
+        self.assertEqual(output.shape, expected_output_shape)
+        # compare the actual values for a slice of last dim
+        self.assertTrue(torch.allclose(output[:, :, -1], expected_output_values_last_dim, atol=1e-3))
diff --git a/tests/test_tokenization_xlm_roberta.py b/tests/test_tokenization_xlm_roberta.py
new file mode 100644
index 0000000000..bf1169c8ab
--- /dev/null
+++ b/tests/test_tokenization_xlm_roberta.py
@@ -0,0 +1,111 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import unittest
+
+from transformers.tokenization_xlm_roberta import XLMRobertaTokenizer
+
+from .utils import slow
+
+
+class XLMRobertaTokenizationIntegrationTest(unittest.TestCase):
+    @slow
+    def test_tokenization_base_easy_symbols(self):
+        tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")
+
+        symbols = "Hello World!"
+        original_tokenizer_encodings = [0, 35378, 6661, 38, 2]
+        # xlmr = torch.hub.load('pytorch/fairseq', 'xlmr.base')  # xlmr.large has same tokenizer
+        # xlmr.eval()
+        # xlmr.encode(symbols)
+
+        self.assertListEqual(original_tokenizer_encodings, tokenizer.encode(symbols))
+
+    @slow
+    def test_tokenization_base_hard_symbols(self):
+        tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")
+
+        symbols = 'This is a very long text with a lot of weird characters, such as: . , ~ ? ( ) " [ ] ! : - . Also we will add words that should not exsist and be tokenized to <unk>, such as saoneuhaoesuth'
+        original_tokenizer_encodings = [
+            0,
+            3293,
+            83,
+            10,
+            4552,
+            4989,
+            7986,
+            678,
+            10,
+            5915,
+            111,
+            179459,
+            124850,
+            4,
+            6044,
+            237,
+            12,
+            6,
+            5,
+            6,
+            4,
+            6780,
+            705,
+            15,
+            1388,
+            44,
+            378,
+            10114,
+            711,
+            152,
+            20,
+            6,
+            5,
+            22376,
+            642,
+            1221,
+            15190,
+            34153,
+            450,
+            5608,
+            959,
+            1119,
+            57702,
+            136,
+            186,
+            47,
+            1098,
+            29367,
+            47,
+            4426,
+            3678,
+            2740,
+            4,
+            6044,
+            237,
+            6284,
+            50901,
+            528,
+            31,
+            90,
+            34,
+            927,
+            2,
+        ]
+        # xlmr = torch.hub.load('pytorch/fairseq', 'xlmr.base')  # xlmr.large has same tokenizer
+        # xlmr.eval()
+        # xlmr.encode(symbols)
+
+        self.assertListEqual(original_tokenizer_encodings, tokenizer.encode(symbols))

From bb7c46852051f7d031dd4be0240c9c9db82f6ed9 Mon Sep 17 00:00:00 2001
From: Lysandre Debut <lysandre@huggingface.co>
Date: Tue, 25 Feb 2020 18:43:36 -0500
Subject: [PATCH 08/80] Documentation (#2989)

* All Tokenizers

BertTokenizer + few fixes
RobertaTokenizer
OpenAIGPTTokenizer + Fixes
GPT2Tokenizer + fixes
TransfoXLTokenizer
Correct rst for TransformerXL
XLMTokenizer + fixes
XLNet Tokenizer + Style
DistilBERT + Fix XLNet RST
CTRLTokenizer
CamemBERT Tokenizer
FlaubertTokenizer
XLMRobertaTokenizer
cleanup

* cleanup
---
 docs/source/model_doc/albert.rst             |   3 +-
 docs/source/model_doc/bert.rst               |   3 +-
 docs/source/model_doc/camembert.rst          |   3 +-
 docs/source/model_doc/ctrl.rst               |   2 +-
 docs/source/model_doc/gpt.rst                |   2 +-
 docs/source/model_doc/gpt2.rst               |   4 +-
 docs/source/model_doc/roberta.rst            |   3 +-
 docs/source/model_doc/transformerxl.rst      |   2 +-
 docs/source/model_doc/xlm.rst                |   3 +-
 docs/source/model_doc/xlmroberta.rst         |   3 +-
 docs/source/model_doc/xlnet.rst              |   3 +-
 src/transformers/configuration_flaubert.py   |  11 +-
 src/transformers/configuration_gpt2.py       |  11 +-
 src/transformers/configuration_openai.py     |  11 +-
 src/transformers/configuration_xlm.py        |  11 +-
 src/transformers/modeling_bert.py            |   2 +-
 src/transformers/modeling_tf_bert.py         |  53 ++++----
 src/transformers/tokenization_albert.py      | 123 +++++++++++++++---
 src/transformers/tokenization_bert.py        | 127 ++++++++++++------
 src/transformers/tokenization_camembert.py   | 120 ++++++++++++++---
 src/transformers/tokenization_ctrl.py        |  28 +++-
 src/transformers/tokenization_distilbert.py  |  13 +-
 src/transformers/tokenization_flaubert.py    |  14 +-
 src/transformers/tokenization_gpt2.py        |  45 ++++++-
 src/transformers/tokenization_openai.py      |  28 +++-
 src/transformers/tokenization_roberta.py     | 109 +++++++++++++---
 src/transformers/tokenization_transfo_xl.py  |  14 +-
 src/transformers/tokenization_xlm.py         | 130 +++++++++++++++----
 src/transformers/tokenization_xlm_roberta.py | 114 +++++++++++++---
 src/transformers/tokenization_xlnet.py       | 113 +++++++++++++---
 30 files changed, 866 insertions(+), 242 deletions(-)

diff --git a/docs/source/model_doc/albert.rst b/docs/source/model_doc/albert.rst
index 06a9b5bfd5..0740fd5b98 100644
--- a/docs/source/model_doc/albert.rst
+++ b/docs/source/model_doc/albert.rst
@@ -41,7 +41,8 @@ AlbertTokenizer
 ~~~~~~~~~~~~~~~~~~~~~
 
 .. autoclass:: transformers.AlbertTokenizer
-    :members:
+    :members: build_inputs_with_special_tokens, get_special_tokens_mask,
+        create_token_type_ids_from_sequences, save_vocabulary
 
 
 AlbertModel
diff --git a/docs/source/model_doc/bert.rst b/docs/source/model_doc/bert.rst
index 5e785eed1c..b66189493f 100644
--- a/docs/source/model_doc/bert.rst
+++ b/docs/source/model_doc/bert.rst
@@ -46,7 +46,8 @@ BertTokenizer
 ~~~~~~~~~~~~~~~~~~~~~
 
 .. autoclass:: transformers.BertTokenizer
-    :members:
+    :members: build_inputs_with_special_tokens, get_special_tokens_mask,
+        create_token_type_ids_from_sequences, save_vocabulary
 
 
 BertModel
diff --git a/docs/source/model_doc/camembert.rst b/docs/source/model_doc/camembert.rst
index 611d930d6e..5db97c4402 100644
--- a/docs/source/model_doc/camembert.rst
+++ b/docs/source/model_doc/camembert.rst
@@ -33,7 +33,8 @@ CamembertTokenizer
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 .. autoclass:: transformers.CamembertTokenizer
-    :members:
+    :members: build_inputs_with_special_tokens, get_special_tokens_mask,
+        create_token_type_ids_from_sequences, save_vocabulary
 
 
 CamembertModel
diff --git a/docs/source/model_doc/ctrl.rst b/docs/source/model_doc/ctrl.rst
index a8a04837d7..8d8ca57bbd 100644
--- a/docs/source/model_doc/ctrl.rst
+++ b/docs/source/model_doc/ctrl.rst
@@ -43,7 +43,7 @@ CTRLTokenizer
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 .. autoclass:: transformers.CTRLTokenizer
-    :members:
+    :members: save_vocabulary
 
 
 CTRLModel
diff --git a/docs/source/model_doc/gpt.rst b/docs/source/model_doc/gpt.rst
index 9604b39cea..0b693cd928 100644
--- a/docs/source/model_doc/gpt.rst
+++ b/docs/source/model_doc/gpt.rst
@@ -47,7 +47,7 @@ OpenAIGPTTokenizer
 ~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 .. autoclass:: transformers.OpenAIGPTTokenizer
-    :members:
+    :members: save_vocabulary
 
 
 OpenAIGPTModel
diff --git a/docs/source/model_doc/gpt2.rst b/docs/source/model_doc/gpt2.rst
index 54ef3cea08..e86713f40f 100644
--- a/docs/source/model_doc/gpt2.rst
+++ b/docs/source/model_doc/gpt2.rst
@@ -5,7 +5,7 @@ Overview
 ~~~~~~~~~~~~~~~~~~~~~
 
 OpenAI GPT-2 model was proposed in
-`Language Models are Unsupervised Multitask Learners`_
+`Language Models are Unsupervised Multitask Learners <https://cdn.openai.com/better-language-models/language_models_are_unsupervised_multitask_learners.pdf>`_
 by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**.
 It's a causal (unidirectional) transformer pre-trained using  language modeling on a very large
 corpus of ~40 GB of text data.
@@ -46,7 +46,7 @@ GPT2Tokenizer
 ~~~~~~~~~~~~~~~~~~~~~
 
 .. autoclass:: transformers.GPT2Tokenizer
-    :members:
+    :members: save_vocabulary
 
 
 GPT2Model
diff --git a/docs/source/model_doc/roberta.rst b/docs/source/model_doc/roberta.rst
index 62138bb72e..92438abd91 100644
--- a/docs/source/model_doc/roberta.rst
+++ b/docs/source/model_doc/roberta.rst
@@ -39,7 +39,8 @@ RobertaTokenizer
 ~~~~~~~~~~~~~~~~~~~~~
 
 .. autoclass:: transformers.RobertaTokenizer
-    :members:
+    :members: build_inputs_with_special_tokens, get_special_tokens_mask,
+        create_token_type_ids_from_sequences, save_vocabulary
 
 
 RobertaModel
diff --git a/docs/source/model_doc/transformerxl.rst b/docs/source/model_doc/transformerxl.rst
index 5240df3df4..84951f8a6d 100644
--- a/docs/source/model_doc/transformerxl.rst
+++ b/docs/source/model_doc/transformerxl.rst
@@ -42,7 +42,7 @@ TransfoXLTokenizer
 ~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 .. autoclass:: transformers.TransfoXLTokenizer
-    :members:
+    :members: save_vocabulary
 
 
 TransfoXLModel
diff --git a/docs/source/model_doc/xlm.rst b/docs/source/model_doc/xlm.rst
index 7346693752..b65c4f4dbd 100644
--- a/docs/source/model_doc/xlm.rst
+++ b/docs/source/model_doc/xlm.rst
@@ -41,7 +41,8 @@ XLMTokenizer
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 .. autoclass:: transformers.XLMTokenizer
-    :members:
+    :members: build_inputs_with_special_tokens, get_special_tokens_mask,
+        create_token_type_ids_from_sequences, save_vocabulary
 
 XLMModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/docs/source/model_doc/xlmroberta.rst b/docs/source/model_doc/xlmroberta.rst
index 6e5bcce7c9..38735696ef 100644
--- a/docs/source/model_doc/xlmroberta.rst
+++ b/docs/source/model_doc/xlmroberta.rst
@@ -39,7 +39,8 @@ XLMRobertaTokenizer
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 .. autoclass:: transformers.XLMRobertaTokenizer
-    :members:
+    :members: build_inputs_with_special_tokens, get_special_tokens_mask,
+        create_token_type_ids_from_sequences, save_vocabulary
 
 
 XLMRobertaModel
diff --git a/docs/source/model_doc/xlnet.rst b/docs/source/model_doc/xlnet.rst
index 0f8c61098c..b768e6ec75 100644
--- a/docs/source/model_doc/xlnet.rst
+++ b/docs/source/model_doc/xlnet.rst
@@ -44,7 +44,8 @@ XLNetTokenizer
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 .. autoclass:: transformers.XLNetTokenizer
-    :members:
+    :members: build_inputs_with_special_tokens, get_special_tokens_mask,
+        create_token_type_ids_from_sequences, save_vocabulary
 
 
 XLNetModel
diff --git a/src/transformers/configuration_flaubert.py b/src/transformers/configuration_flaubert.py
index 5110330819..0c9860cbed 100644
--- a/src/transformers/configuration_flaubert.py
+++ b/src/transformers/configuration_flaubert.py
@@ -109,11 +109,12 @@ class FlaubertConfig(XLMConfig):
                 Argument used when doing sequence summary. Used in for the multiple choice head in
                 :class:`~transformers.XLMForSequenceClassification`.
                 Is one of the following options:
-                    - 'last' => take the last token hidden state (like XLNet)
-                    - 'first' => take the first token hidden state (like Bert)
-                    - 'mean' => take the mean of all tokens hidden states
-                    - 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
-                    - 'attn' => Not implemented now, use multi-head attention
+
+                - 'last' => take the last token hidden state (like XLNet)
+                - 'first' => take the first token hidden state (like Bert)
+                - 'mean' => take the mean of all tokens hidden states
+                - 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
+                - 'attn' => Not implemented now, use multi-head attention
             summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`):
                 Argument used when doing sequence summary. Used in for the multiple choice head in
                 :class:`~transformers.XLMForSequenceClassification`.
diff --git a/src/transformers/configuration_gpt2.py b/src/transformers/configuration_gpt2.py
index 7fff0b6c49..ed639dc18c 100644
--- a/src/transformers/configuration_gpt2.py
+++ b/src/transformers/configuration_gpt2.py
@@ -73,11 +73,12 @@ class GPT2Config(PretrainedConfig):
                 Argument used when doing sequence summary. Used in for the multiple choice head in
                 :class:`~transformers.GPT2DoubleHeadsModel`.
                 Is one of the following options:
-                    - 'last' => take the last token hidden state (like XLNet)
-                    - 'first' => take the first token hidden state (like Bert)
-                    - 'mean' => take the mean of all tokens hidden states
-                    - 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
-                    - 'attn' => Not implemented now, use multi-head attention
+
+                - 'last' => take the last token hidden state (like XLNet)
+                - 'first' => take the first token hidden state (like Bert)
+                - 'mean' => take the mean of all tokens hidden states
+                - 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
+                - 'attn' => Not implemented now, use multi-head attention
             summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`):
                 Argument used when doing sequence summary. Used in for the multiple choice head in
                 :class:`~transformers.GPT2DoubleHeadsModel`.
diff --git a/src/transformers/configuration_openai.py b/src/transformers/configuration_openai.py
index d4a965bde1..528558144a 100644
--- a/src/transformers/configuration_openai.py
+++ b/src/transformers/configuration_openai.py
@@ -73,11 +73,12 @@ class OpenAIGPTConfig(PretrainedConfig):
                 Argument used when doing sequence summary. Used in for the multiple choice head in
                 :class:`~transformers.OpenAIGPTDoubleHeadsModel`.
                 Is one of the following options:
-                    - 'last' => take the last token hidden state (like XLNet)
-                    - 'first' => take the first token hidden state (like Bert)
-                    - 'mean' => take the mean of all tokens hidden states
-                    - 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
-                    - 'attn' => Not implemented now, use multi-head attention
+
+                - 'last' => take the last token hidden state (like XLNet)
+                - 'first' => take the first token hidden state (like Bert)
+                - 'mean' => take the mean of all tokens hidden states
+                - 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
+                - 'attn' => Not implemented now, use multi-head attention
             summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`):
                 Argument used when doing sequence summary. Used in for the multiple choice head in
                 :class:`~transformers.OpenAIGPTDoubleHeadsModel`.
diff --git a/src/transformers/configuration_xlm.py b/src/transformers/configuration_xlm.py
index c4d61808d6..36b3bd0711 100644
--- a/src/transformers/configuration_xlm.py
+++ b/src/transformers/configuration_xlm.py
@@ -108,11 +108,12 @@ class XLMConfig(PretrainedConfig):
                 Argument used when doing sequence summary. Used in for the multiple choice head in
                 :class:`~transformers.XLMForSequenceClassification`.
                 Is one of the following options:
-                    - 'last' => take the last token hidden state (like XLNet)
-                    - 'first' => take the first token hidden state (like Bert)
-                    - 'mean' => take the mean of all tokens hidden states
-                    - 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
-                    - 'attn' => Not implemented now, use multi-head attention
+
+                - 'last' => take the last token hidden state (like XLNet)
+                - 'first' => take the first token hidden state (like Bert)
+                - 'mean' => take the mean of all tokens hidden states
+                - 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
+                - 'attn' => Not implemented now, use multi-head attention
             summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`):
                 Argument used when doing sequence summary. Used in for the multiple choice head in
                 :class:`~transformers.XLMForSequenceClassification`.
diff --git a/src/transformers/modeling_bert.py b/src/transformers/modeling_bert.py
index 400ecb33ff..47d7b2301f 100644
--- a/src/transformers/modeling_bert.py
+++ b/src/transformers/modeling_bert.py
@@ -1230,7 +1230,7 @@ class BertForMultipleChoice(BertPreTrainedModel):
 
     Returns:
         :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
-        loss (:obj:`torch.FloatTensor`` of shape ``(1,)`, `optional`, returned when :obj:`labels` is provided):
+        loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
             Classification loss.
         classification_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices)`):
             `num_choices` is the second dimension of the input tensors. (see `input_ids` above).
diff --git a/src/transformers/modeling_tf_bert.py b/src/transformers/modeling_tf_bert.py
index 01bc1c2be7..1904623581 100644
--- a/src/transformers/modeling_tf_bert.py
+++ b/src/transformers/modeling_tf_bert.py
@@ -668,38 +668,39 @@ class TFBertModel(TFBertPreTrainedModel):
     @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING)
     def call(self, inputs, **kwargs):
         r"""
-        Returns:
+    Returns:
         :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
-            last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
-                Sequence of hidden-states at the output of the last layer of the model.
-            pooler_output (:obj:`tf.Tensor` of shape :obj:`(batch_size, hidden_size)`):
-                Last layer hidden-state of the first token of the sequence (classification token)
-                further processed by a Linear layer and a Tanh activation function. The Linear
-                layer weights are trained from the next sentence prediction (classification)
-                objective during Bert pretraining. This output is usually *not* a good summary
-                of the semantic content of the input, you're often better with averaging or pooling
-                the sequence of hidden-states for the whole input sequence.
-            hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`):
-                tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
-                of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+        last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
+            Sequence of hidden-states at the output of the last layer of the model.
+        pooler_output (:obj:`tf.Tensor` of shape :obj:`(batch_size, hidden_size)`):
+            Last layer hidden-state of the first token of the sequence (classification token)
+            further processed by a Linear layer and a Tanh activation function. The Linear
+            layer weights are trained from the next sentence prediction (classification)
+            objective during Bert pretraining. This output is usually *not* a good summary
+            of the semantic content of the input, you're often better with averaging or pooling
+            the sequence of hidden-states for the whole input sequence.
+        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`):
+            tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
+            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
 
-                Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-            attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
-                tuple of :obj:`tf.Tensor` (one for each layer) of shape
-                :obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
+            tuple of :obj:`tf.Tensor` (one for each layer) of shape
+            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
 
-                Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
 
-        Examples::
 
-            import tensorflow as tf
-            from transformers import BertTokenizer, TFBertModel
+    Examples::
 
-            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-            model = TFBertModel.from_pretrained('bert-base-uncased')
-            input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
-            outputs = model(input_ids)
-            last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
+        import tensorflow as tf
+        from transformers import BertTokenizer, TFBertModel
+
+        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+        model = TFBertModel.from_pretrained('bert-base-uncased')
+        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
+        outputs = model(input_ids)
+        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
         """
         outputs = self.bert(inputs, **kwargs)
         return outputs
diff --git a/src/transformers/tokenization_albert.py b/src/transformers/tokenization_albert.py
index 224636c997..e85efbb6a9 100644
--- a/src/transformers/tokenization_albert.py
+++ b/src/transformers/tokenization_albert.py
@@ -19,6 +19,7 @@ import logging
 import os
 import unicodedata
 from shutil import copyfile
+from typing import List, Optional
 
 from .tokenization_utils import PreTrainedTokenizer
 
@@ -55,9 +56,55 @@ SPIECE_UNDERLINE = "▁"
 
 class AlbertTokenizer(PreTrainedTokenizer):
     """
-        SentencePiece based tokenizer. Peculiarities:
+    Constructs an ALBERT tokenizer. Based on `SentencePiece <https://github.com/google/sentencepiece>`__
 
-            - requires `SentencePiece <https://github.com/google/sentencepiece>`_
+    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
+    should refer to the superclass for more information regarding methods.
+
+    Args:
+        vocab_file (:obj:`string`):
+            `SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a .spm extension) that
+            contains the vocabulary necessary to instantiate a tokenizer.
+        do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            Whether to lowercase the input when tokenizing.
+        remove_space (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            Whether to strip the text when tokenizing (removing excess spaces before and after the string).
+        keep_accents (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            Whether to keep accents when tokenizing.
+        bos_token (:obj:`string`, `optional`, defaults to "[CLS]"):
+            The beginning of sequence token that was used during pre-training. Can be used as a sequence classifier token.
+
+            .. note::
+
+                When building a sequence using special tokens, this is not the token that is used for the beginning
+                of sequence. The token used is the :obj:`cls_token`.
+        eos_token (:obj:`string`, `optional`, defaults to "[SEP]"):
+            The end of sequence token.
+
+            .. note::
+
+                When building a sequence using special tokens, this is not the token that is used for the end
+                of sequence. The token used is the :obj:`sep_token`.
+        unk_token (:obj:`string`, `optional`, defaults to "<unk>"):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        sep_token (:obj:`string`, `optional`, defaults to "[SEP]"):
+            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
+            for sequence classification or for a text and a question for question answering.
+            It is also used as the last token of a sequence built with special tokens.
+        pad_token (:obj:`string`, `optional`, defaults to "<pad>"):
+            The token used for padding, for example when batching sequences of different lengths.
+        cls_token (:obj:`string`, `optional`, defaults to "[CLS]"):
+            The classifier token which is used when doing sequence classification (classification of the whole
+            sequence instead of per-token classification). It is the first token of the sequence when built with
+            special tokens.
+        mask_token (:obj:`string`, `optional`, defaults to "[MASK]"):
+            The token used for masking values. This is the token used when training this model with masked language
+            modeling. This is the token which the model will try to predict.
+
+    Attributes:
+        sp_model (:obj:`SentencePieceProcessor`):
+            The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
@@ -185,17 +232,28 @@ class AlbertTokenizer(PreTrainedTokenizer):
         return self.sp_model.IdToPiece(index)
 
     def convert_tokens_to_string(self, tokens):
-        """Converts a sequence of tokens (strings for sub-words) in a single string."""
         out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
         return out_string
 
-    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+    def build_inputs_with_special_tokens(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+    ) -> List[int]:
         """
         Build model inputs from a sequence or a pair of sequence for sequence classification tasks
         by concatenating and adding special tokens.
         An ALBERT sequence has the following format:
-            single sequence: [CLS] X [SEP]
-            pair of sequences: [CLS] A [SEP] B [SEP]
+
+        - single sequence: ``[CLS] X [SEP]``
+        - pair of sequences: ``[CLS] A [SEP] B [SEP]``
+
+        Args:
+            token_ids_0 (:obj:`List[int]`):
+                List of IDs to which the special tokens will be added
+            token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
         """
         sep = [self.sep_token_id]
         cls = [self.cls_token_id]
@@ -203,27 +261,30 @@ class AlbertTokenizer(PreTrainedTokenizer):
             return cls + token_ids_0 + sep
         return cls + token_ids_0 + sep + token_ids_1 + sep
 
-    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
+    def get_special_tokens_mask(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
+    ) -> List[int]:
         """
         Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
         special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
 
         Args:
-            token_ids_0: list of ids (must not contain special tokens)
-            token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
-                for sequence pairs
-            already_has_special_tokens: (default False) Set to True if the token list is already formated with
-                special tokens for the model
+            token_ids_0 (:obj:`List[int]`):
+                List of ids.
+            token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
+                Optional second list of IDs for sequence pairs.
+            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+                Set to True if the token list is already formatted with special tokens for the model
 
         Returns:
-            A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token.
+            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
         """
 
         if already_has_special_tokens:
             if token_ids_1 is not None:
                 raise ValueError(
                     "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formated with special tokens for the model."
+                    "ids is already formatted with special tokens for the model."
                 )
             return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
 
@@ -231,14 +292,29 @@ class AlbertTokenizer(PreTrainedTokenizer):
             return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
         return [1] + ([0] * len(token_ids_0)) + [1]
 
-    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
+    def create_token_type_ids_from_sequences(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+    ) -> List[int]:
         """
         Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
         An ALBERT sequence pair mask has the following format:
-        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1
-        | first sequence    | second sequence
 
-        if token_ids_1 is None, only returns the first portion of the mask (0's).
+        ::
+
+            0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
+            | first sequence    | second sequence |
+
+        If :obj:`token_ids_1` is :obj:`None`, only returns the first portion of the mask (0s).
+
+        Args:
+            token_ids_0 (:obj:`List[int]`):
+                List of ids.
+            token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`__ according to the given
+            sequence(s).
         """
         sep = [self.sep_token_id]
         cls = [self.cls_token_id]
@@ -248,8 +324,15 @@ class AlbertTokenizer(PreTrainedTokenizer):
         return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
 
     def save_vocabulary(self, save_directory):
-        """ Save the sentencepiece vocabulary (copy original file) and special tokens file
-            to a directory.
+        """
+        Save the sentencepiece vocabulary (copy original file) and special tokens file to a directory.
+
+        Args:
+            save_directory (:obj:`str`):
+                The directory in which to save the vocabulary.
+
+        Returns:
+            :obj:`Tuple(str)`: Paths to the files saved.
         """
         if not os.path.isdir(save_directory):
             logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
diff --git a/src/transformers/tokenization_bert.py b/src/transformers/tokenization_bert.py
index de74ee579c..159600a37c 100644
--- a/src/transformers/tokenization_bert.py
+++ b/src/transformers/tokenization_bert.py
@@ -19,6 +19,7 @@ import collections
 import logging
 import os
 import unicodedata
+from typing import List, Optional
 
 from tokenizers import BertWordPieceTokenizer
 
@@ -117,17 +118,41 @@ def whitespace_tokenize(text):
 
 class BertTokenizer(PreTrainedTokenizer):
     r"""
-    Constructs a BertTokenizer.
-    :class:`~transformers.BertTokenizer` runs end-to-end tokenization: punctuation splitting + wordpiece
+    Constructs a BERT tokenizer. Based on WordPiece.
+
+    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
+    should refer to the superclass for more information regarding methods.
 
     Args:
-        vocab_file: Path to a one-wordpiece-per-line vocabulary file
-        do_lower_case: Whether to lower case the input. Only has an effect when do_basic_tokenize=True
-        do_basic_tokenize: Whether to do basic tokenization before wordpiece.
-        max_len: An artificial maximum length to truncate tokenized sequences to; Effective maximum length is always the
-            minimum of this value (if specified) and the underlying BERT model's sequence length.
-        never_split: List of tokens which will never be split during tokenization. Only has an effect when
-            do_basic_tokenize=True
+        vocab_file (:obj:`string`):
+            File containing the vocabulary.
+        do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            Whether to lowercase the input when tokenizing.
+        do_basic_tokenize (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            Whether to do basic tokenization before WordPiece.
+        never_split (:obj:`List[str]`, `optional`, defaults to :obj:`None`):
+            List of tokens which will never be split during tokenization. Only has an effect when
+            :obj:`do_basic_tokenize=True`
+        unk_token (:obj:`string`, `optional`, defaults to "[UNK]"):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        sep_token (:obj:`string`, `optional`, defaults to "[SEP]"):
+            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
+            for sequence classification or for a text and a question for question answering.
+            It is also used as the last token of a sequence built with special tokens.
+        pad_token (:obj:`string`, `optional`, defaults to "[PAD]"):
+            The token used for padding, for example when batching sequences of different lengths.
+        cls_token (:obj:`string`, `optional`, defaults to "[CLS]"):
+            The classifier token which is used when doing sequence classification (classification of the whole
+            sequence instead of per-token classification). It is the first token of the sequence when built with
+            special tokens.
+        mask_token (:obj:`string`, `optional`, defaults to "[MASK]"):
+            The token used for masking values. This is the token used when training this model with masked language
+            modeling. This is the token which the model will try to predict.
+        tokenize_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            Whether to tokenize Chinese characters.
+            This should likely be deactivated for Japanese:
+            see: https://github.com/huggingface/transformers/issues/328
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
@@ -149,23 +174,6 @@ class BertTokenizer(PreTrainedTokenizer):
         tokenize_chinese_chars=True,
         **kwargs
     ):
-        """Constructs a BertTokenizer.
-
-        Args:
-            **vocab_file**: Path to a one-wordpiece-per-line vocabulary file
-            **do_lower_case**: (`optional`) boolean (default True)
-                Whether to lower case the input
-                Only has an effect when do_basic_tokenize=True
-            **do_basic_tokenize**: (`optional`) boolean (default True)
-                Whether to do basic tokenization before wordpiece.
-            **never_split**: (`optional`) list of string
-                List of tokens which will never be split during tokenization.
-                Only has an effect when do_basic_tokenize=True
-            **tokenize_chinese_chars**: (`optional`) boolean (default True)
-                Whether to tokenize Chinese characters.
-                This should likely be deactivated for Japanese:
-                see: https://github.com/huggingface/pytorch-pretrained-BERT/issues/328
-        """
         super().__init__(
             unk_token=unk_token,
             sep_token=sep_token,
@@ -221,13 +229,25 @@ class BertTokenizer(PreTrainedTokenizer):
         out_string = " ".join(tokens).replace(" ##", "").strip()
         return out_string
 
-    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+    def build_inputs_with_special_tokens(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+    ) -> List[int]:
         """
         Build model inputs from a sequence or a pair of sequence for sequence classification tasks
         by concatenating and adding special tokens.
         A BERT sequence has the following format:
-            single sequence: [CLS] X [SEP]
-            pair of sequences: [CLS] A [SEP] B [SEP]
+
+        - single sequence: ``[CLS] X [SEP]``
+        - pair of sequences: ``[CLS] A [SEP] B [SEP]``
+
+        Args:
+            token_ids_0 (:obj:`List[int]`):
+                List of IDs to which the special tokens will be added
+            token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
         """
         if token_ids_1 is None:
             return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
@@ -235,20 +255,23 @@ class BertTokenizer(PreTrainedTokenizer):
         sep = [self.sep_token_id]
         return cls + token_ids_0 + sep + token_ids_1 + sep
 
-    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
+    def get_special_tokens_mask(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
+    ) -> List[int]:
         """
         Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
         special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
 
         Args:
-            token_ids_0: list of ids (must not contain special tokens)
-            token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
-                for sequence pairs
-            already_has_special_tokens: (default False) Set to True if the token list is already formated with
-                special tokens for the model
+            token_ids_0 (:obj:`List[int]`):
+                List of ids.
+            token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
+                Optional second list of IDs for sequence pairs.
+            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+                Set to True if the token list is already formatted with special tokens for the model
 
         Returns:
-            A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
         """
 
         if already_has_special_tokens:
@@ -263,14 +286,29 @@ class BertTokenizer(PreTrainedTokenizer):
             return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
         return [1] + ([0] * len(token_ids_0)) + [1]
 
-    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
+    def create_token_type_ids_from_sequences(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+    ) -> List[int]:
         """
         Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
         A BERT sequence pair mask has the following format:
-        0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
-        | first sequence    | second sequence
+
+        ::
+
+            0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
+            | first sequence    | second sequence |
 
         if token_ids_1 is None, only returns the first portion of the mask (0's).
+
+        Args:
+            token_ids_0 (:obj:`List[int]`):
+                List of ids.
+            token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
+            sequence(s).
         """
         sep = [self.sep_token_id]
         cls = [self.cls_token_id]
@@ -279,7 +317,16 @@ class BertTokenizer(PreTrainedTokenizer):
         return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
 
     def save_vocabulary(self, vocab_path):
-        """Save the tokenizer vocabulary to a directory or file."""
+        """
+        Save the tokenizer vocabulary to a directory or file.
+
+        Args:
+            vocab_path (:obj:`str`):
+                The directory or file path in which to save the vocabulary.
+
+        Returns:
+            :obj:`Tuple(str)`: Paths to the files saved.
+        """
         index = 0
         if os.path.isdir(vocab_path):
             vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES["vocab_file"])
diff --git a/src/transformers/tokenization_camembert.py b/src/transformers/tokenization_camembert.py
index a158419470..cc4fdc650b 100644
--- a/src/transformers/tokenization_camembert.py
+++ b/src/transformers/tokenization_camembert.py
@@ -18,6 +18,7 @@
 import logging
 import os
 from shutil import copyfile
+from typing import List, Optional
 
 import sentencepiece as spm
 
@@ -53,7 +54,50 @@ class CamembertTokenizer(PreTrainedTokenizer):
         Adapted from RobertaTokenizer and XLNetTokenizer
         SentencePiece based tokenizer. Peculiarities:
 
-            - requires `SentencePiece <https://github.com/google/sentencepiece>`_
+        - requires `SentencePiece <https://github.com/google/sentencepiece>`_
+
+    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
+    should refer to the superclass for more information regarding methods.
+
+    Args:
+        vocab_file (:obj:`str`):
+            Path to the vocabulary file.
+        bos_token (:obj:`string`, `optional`, defaults to "<s>"):
+            The beginning of sequence token used during pre-training. Can be used as a sequence classifier token.
+
+            .. note::
+
+                When building a sequence using special tokens, this is not the token that is used for the beginning
+                of sequence. The token used is the :obj:`cls_token`.
+        eos_token (:obj:`string`, `optional`, defaults to "</s>"):
+            The end of sequence token.
+
+            .. note::
+
+                When building a sequence using special tokens, this is not the token that is used for the end
+                of sequence. The token used is the :obj:`sep_token`.
+        sep_token (:obj:`string`, `optional`, defaults to "</s>"):
+            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
+            for sequence classification or for a text and a question for question answering.
+            It is also used as the last token of a sequence built with special tokens.
+        cls_token (:obj:`string`, `optional`, defaults to "<s>"):
+            The classifier token which is used when doing sequence classification (classification of the whole
+            sequence instead of per-token classification). It is the first token of the sequence when built with
+            special tokens.
+        unk_token (:obj:`string`, `optional`, defaults to "<unk>"):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        pad_token (:obj:`string`, `optional`, defaults to "<pad>"):
+            The token used for padding, for example when batching sequences of different lengths.
+        mask_token (:obj:`string`, `optional`, defaults to "<mask>"):
+            The token used for masking values. This is the token used when training this model with masked language
+            modeling. This is the token which the model will try to predict.
+        additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
+            Additional special tokens used by the tokenizer.
+
+    Attributes:
+        sp_model (:obj:`SentencePieceProcessor`):
+            The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
@@ -97,34 +141,50 @@ class CamembertTokenizer(PreTrainedTokenizer):
         self.fairseq_tokens_to_ids["<mask>"] = len(self.sp_model) + len(self.fairseq_tokens_to_ids)
         self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}
 
-    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+    def build_inputs_with_special_tokens(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+    ) -> List[int]:
         """
         Build model inputs from a sequence or a pair of sequence for sequence classification tasks
         by concatenating and adding special tokens.
-        A RoBERTa sequence has the following format:
-            single sequence: <s> X </s>
-            pair of sequences: <s> A </s></s> B </s>
+        A CamemBERT sequence has the following format:
+
+        - single sequence: ``<s> X </s>``
+        - pair of sequences: ``<s> A </s></s> B </s>``
+
+        Args:
+            token_ids_0 (:obj:`List[int]`):
+                List of IDs to which the special tokens will be added
+            token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
         """
+
         if token_ids_1 is None:
             return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
         cls = [self.cls_token_id]
         sep = [self.sep_token_id]
         return cls + token_ids_0 + sep + sep + token_ids_1 + sep
 
-    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
+    def get_special_tokens_mask(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
+    ) -> List[int]:
         """
         Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
         special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
 
         Args:
-            token_ids_0: list of ids (must not contain special tokens)
-            token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
-                for sequence pairs
-            already_has_special_tokens: (default False) Set to True if the token list is already formated with
-                special tokens for the model
+            token_ids_0 (:obj:`List[int]`):
+                List of ids.
+            token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
+                Optional second list of IDs for sequence pairs.
+            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+                Set to True if the token list is already formatted with special tokens for the model
 
         Returns:
-            A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
         """
         if already_has_special_tokens:
             if token_ids_1 is not None:
@@ -138,14 +198,29 @@ class CamembertTokenizer(PreTrainedTokenizer):
             return [1] + ([0] * len(token_ids_0)) + [1]
         return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
 
-    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
+    def create_token_type_ids_from_sequences(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+    ) -> List[int]:
         """
         Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
-        A RoBERTa sequence pair mask has the following format:
-        0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
-        | first sequence    | second sequence
+        A CamemBERT sequence pair mask has the following format:
 
-        if token_ids_1 is None, only returns the first portion of the mask (0's).
+        ::
+
+            0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
+            | first sequence  | | second sequence |
+
+        if token_ids_1 is None, only returns the first portion of the mask (0s).
+
+        Args:
+            token_ids_0 (:obj:`List[int]`):
+                List of ids.
+            token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
+            sequence(s).
         """
         sep = [self.sep_token_id]
         cls = [self.cls_token_id]
@@ -200,8 +275,15 @@ class CamembertTokenizer(PreTrainedTokenizer):
         return out_string
 
     def save_vocabulary(self, save_directory):
-        """ Save the sentencepiece vocabulary (copy original file) and special tokens file
-            to a directory.
+        """
+        Save the sentencepiece vocabulary (copy original file) and special tokens file to a directory.
+
+        Args:
+            save_directory (:obj:`str`):
+                The directory in which to save the vocabulary.
+
+        Returns:
+            :obj:`Tuple(str)`: Paths to the files saved.
         """
         if not os.path.isdir(save_directory):
             logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
diff --git a/src/transformers/tokenization_ctrl.py b/src/transformers/tokenization_ctrl.py
index 691824b92b..5c487952c4 100644
--- a/src/transformers/tokenization_ctrl.py
+++ b/src/transformers/tokenization_ctrl.py
@@ -116,8 +116,21 @@ def get_pairs(word):
 
 class CTRLTokenizer(PreTrainedTokenizer):
     """
-    CTRL BPE tokenizer. Peculiarities:
-        - Byte-Pair-Encoding
+    Constructs a CTRL tokenizer. Peculiarities:
+
+    - Byte-Pair-Encoding
+
+    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
+    should refer to the superclass for more information regarding methods.
+
+    Args:
+        vocab_file (:obj:`str`):
+            Path to the vocabulary file.
+        merges_file (:obj:`str`):
+            Path to the merges file.
+        unk_token (:obj:`string`, `optional`, defaults to "<unk>"):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
@@ -219,7 +232,16 @@ class CTRLTokenizer(PreTrainedTokenizer):
         return out_string
 
     def save_vocabulary(self, save_directory):
-        """Save the tokenizer vocabulary and merge files to a directory."""
+        """
+        Save the vocabulary and merges files to a directory.
+
+        Args:
+            save_directory (:obj:`str`):
+                The directory in which to save the vocabulary.
+
+        Returns:
+            :obj:`Tuple(str)`: Paths to the files saved.
+        """
         if not os.path.isdir(save_directory):
             logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
             return
diff --git a/src/transformers/tokenization_distilbert.py b/src/transformers/tokenization_distilbert.py
index 8ca0a531d3..7a5bf34367 100644
--- a/src/transformers/tokenization_distilbert.py
+++ b/src/transformers/tokenization_distilbert.py
@@ -58,16 +58,11 @@ PRETRAINED_INIT_CONFIGURATION = {
 class DistilBertTokenizer(BertTokenizer):
     r"""
     Constructs a DistilBertTokenizer.
-    :class:`~transformers.DistilBertTokenizer` is identical to BertTokenizer and runs end-to-end tokenization: punctuation splitting + wordpiece
+    :class:`~transformers.DistilBertTokenizer` is identical to :class:`~transformers.BertTokenizer` and runs end-to-end
+    tokenization: punctuation splitting + wordpiece.
 
-    Args:
-        vocab_file: Path to a one-wordpiece-per-line vocabulary file
-        do_lower_case: Whether to lower case the input. Only has an effect when do_basic_tokenize=True
-        do_basic_tokenize: Whether to do basic tokenization before wordpiece.
-        max_len: An artificial maximum length to truncate tokenized sequences to; Effective maximum length is always the
-            minimum of this value (if specified) and the underlying BERT model's sequence length.
-        never_split: List of tokens which will never be split during tokenization. Only has an effect when
-            do_basic_tokenize=True
+    Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning
+    parameters.
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
diff --git a/src/transformers/tokenization_flaubert.py b/src/transformers/tokenization_flaubert.py
index e648a61c94..dd0115b0cd 100644
--- a/src/transformers/tokenization_flaubert.py
+++ b/src/transformers/tokenization_flaubert.py
@@ -80,14 +80,14 @@ class FlaubertTokenizer(XLMTokenizer):
     """
     BPE tokenizer for Flaubert
 
-        - Moses preprocessing & tokenization
+    - Moses preprocessing & tokenization
+    - Normalize all input text
+    - argument ``special_tokens`` and function ``set_special_tokens``, can be used to add additional symbols \
+      (ex: "__classify__") to a vocabulary
+    - `do_lowercase` controls lower casing (automatically set for pretrained vocabularies)
 
-        - Normalize all inputs text
-
-        - argument ``special_tokens`` and function ``set_special_tokens``, can be used to add additional symbols \
-        (ex: "__classify__") to a vocabulary
-
-        - `do_lowercase` controle lower casing (automatically set for pretrained vocabularies)
+    This tokenizer inherits from :class:`~transformers.XLMTokenizer`. Please check the superclass for usage examples
+    and documentation regarding arguments.
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
diff --git a/src/transformers/tokenization_gpt2.py b/src/transformers/tokenization_gpt2.py
index 961797b97a..45331445b1 100644
--- a/src/transformers/tokenization_gpt2.py
+++ b/src/transformers/tokenization_gpt2.py
@@ -101,11 +101,35 @@ def get_pairs(word):
 class GPT2Tokenizer(PreTrainedTokenizer):
     """
     GPT-2 BPE tokenizer. Peculiarities:
-        - Byte-level Byte-Pair-Encoding
-        - Requires a space to start the input string => the encoding and tokenize methods should be called with the
-          ``add_prefix_space`` flag set to ``True``.
-          Otherwise, this tokenizer's ``encode``, ``decode``, and ``tokenize`` methods will not conserve
-          the spaces at the beginning of a string: `tokenizer.decode(tokenizer.encode(" Hello")) = "Hello"`
+
+    - Byte-level Byte-Pair-Encoding
+    - Requires a space to start the input string => the encoding methods should be called with the
+      ``add_prefix_space`` flag set to ``True``.
+      Otherwise, this tokenizer's ``encode`` and ``decode`` methods will not conserve
+      the absence of a space at the beginning of a string:
+
+    ::
+
+        tokenizer.decode(tokenizer.encode("Hello")) = " Hello"
+
+    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
+    should refer to the superclass for more information regarding methods.
+
+    Args:
+        vocab_file (:obj:`str`):
+            Path to the vocabulary file.
+        merges_file (:obj:`str`):
+            Path to the merges file.
+        errors (:obj:`str`, `optional`, defaults to "replace"):
+            Paradigm to follow when decoding bytes to UTF-8. See `bytes.decode
+            <https://docs.python.org/3/library/stdtypes.html#bytes.decode>`__ for more information.
+        unk_token (:obj:`string`, `optional`, defaults to `<|endoftext|>`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        bos_token (:obj:`string`, `optional`, defaults to `<|endoftext|>`):
+            The beginning of sequence token.
+        eos_token (:obj:`string`, `optional`, defaults to `<|endoftext|>`):
+            The end of sequence token.
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
@@ -219,7 +243,16 @@ class GPT2Tokenizer(PreTrainedTokenizer):
         return text
 
     def save_vocabulary(self, save_directory):
-        """Save the tokenizer vocabulary and merge files to a directory."""
+        """
+        Save the vocabulary and merges files to a directory.
+
+        Args:
+            save_directory (:obj:`str`):
+                The directory in which to save the vocabulary.
+
+        Returns:
+            :obj:`Tuple(str)`: Paths to the files saved.
+        """
         if not os.path.isdir(save_directory):
             logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
             return
diff --git a/src/transformers/tokenization_openai.py b/src/transformers/tokenization_openai.py
index 912ab852a7..ea0f52a806 100644
--- a/src/transformers/tokenization_openai.py
+++ b/src/transformers/tokenization_openai.py
@@ -82,8 +82,21 @@ def text_standardize(text):
 class OpenAIGPTTokenizer(PreTrainedTokenizer):
     """
     BPE tokenizer. Peculiarities:
-        - lower case all inputs
-        - uses SpaCy tokenizer and ftfy for pre-BPE tokenization if they are installed, fallback to BERT's BasicTokenizer if not.
+
+    - lower case all inputs
+    - uses SpaCy tokenizer and ftfy for pre-BPE tokenization if they are installed, fallback to BERT's BasicTokenizer if not.
+
+    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
+    should refer to the superclass for more information regarding methods.
+
+    Args:
+        vocab_file (:obj:`str`):
+            Path to the vocabulary file.
+        merges_file (:obj:`str`):
+            Path to the merges file.
+        unk_token (:obj:`string`, `optional`, defaults to "<unk>"):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
@@ -201,7 +214,16 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer):
         return out_string
 
     def save_vocabulary(self, save_directory):
-        """Save the tokenizer vocabulary and merge files to a directory."""
+        """
+        Save the vocabulary and merges files to a directory.
+
+        Args:
+            save_directory (:obj:`str`):
+                The directory in which to save the vocabulary.
+
+        Returns:
+            :obj:`Tuple(str)`: Paths to the files saved.
+        """
         if not os.path.isdir(save_directory):
             logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
             return
diff --git a/src/transformers/tokenization_roberta.py b/src/transformers/tokenization_roberta.py
index fda82fb307..7275ceb4ca 100644
--- a/src/transformers/tokenization_roberta.py
+++ b/src/transformers/tokenization_roberta.py
@@ -16,6 +16,7 @@
 
 
 import logging
+from typing import List, Optional
 
 from tokenizers.processors import RobertaProcessing
 
@@ -60,12 +61,59 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
 
 class RobertaTokenizer(GPT2Tokenizer):
     """
-    RoBERTa BPE tokenizer, derived from the GPT-2 tokenizer. Peculiarities:
-        - Byte-level Byte-Pair-Encoding
-        - Requires a space to start the input string => the encoding methods should be called with the
-          ``add_prefix_space`` flag set to ``True``.
-          Otherwise, this tokenizer ``encode`` and ``decode`` method will not conserve
-          the absence of a space at the beginning of a string: `tokenizer.decode(tokenizer.encode("Hello")) = " Hello"`
+    Constructs a RoBERTa BPE tokenizer, derived from the GPT-2 tokenizer. Peculiarities:
+
+    - Byte-level Byte-Pair-Encoding
+    - Requires a space to start the input string => the encoding methods should be called with the
+      ``add_prefix_space`` flag set to ``True``.
+      Otherwise, this tokenizer's ``encode`` and ``decode`` methods will not conserve
+      the absence of a space at the beginning of a string:
+
+    ::
+
+        tokenizer.decode(tokenizer.encode("Hello")) = " Hello"
+
+    This tokenizer inherits from :class:`~transformers.GPT2Tokenizer` which contains most of the methods. Users
+    should refer to the superclass for more information regarding methods.
+
+    Args:
+        vocab_file (:obj:`str`):
+            Path to the vocabulary file.
+        merges_file (:obj:`str`):
+            Path to the merges file.
+        errors (:obj:`str`, `optional`, defaults to "replace"):
+            Paradigm to follow when decoding bytes to UTF-8. See `bytes.decode
+            <https://docs.python.org/3/library/stdtypes.html#bytes.decode>`__ for more information.
+        bos_token (:obj:`string`, `optional`, defaults to "<s>"):
+            The beginning of sequence token used during pre-training. Can be used as a sequence classifier token.
+
+            .. note::
+
+                When building a sequence using special tokens, this is not the token that is used for the beginning
+                of sequence. The token used is the :obj:`cls_token`.
+        eos_token (:obj:`string`, `optional`, defaults to "</s>"):
+            The end of sequence token.
+
+            .. note::
+
+                When building a sequence using special tokens, this is not the token that is used for the end
+                of sequence. The token used is the :obj:`sep_token`.
+        sep_token (:obj:`string`, `optional`, defaults to "</s>"):
+            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
+            for sequence classification or for a text and a question for question answering.
+            It is also used as the last token of a sequence built with special tokens.
+        cls_token (:obj:`string`, `optional`, defaults to "<s>"):
+            The classifier token which is used when doing sequence classification (classification of the whole
+            sequence instead of per-token classification). It is the first token of the sequence when built with
+            special tokens.
+        unk_token (:obj:`string`, `optional`, defaults to "<unk>"):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        pad_token (:obj:`string`, `optional`, defaults to "<pad>"):
+            The token used for padding, for example when batching sequences of different lengths.
+        mask_token (:obj:`string`, `optional`, defaults to "<mask>"):
+            The token used for masking values. This is the token used when training this model with masked language
+            modeling. This is the token which the model will try to predict.
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
@@ -102,13 +150,25 @@ class RobertaTokenizer(GPT2Tokenizer):
         self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
         self.max_len_sentences_pair = self.max_len - 4  # take into account special tokens
 
-    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+    def build_inputs_with_special_tokens(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+    ) -> List[int]:
         """
         Build model inputs from a sequence or a pair of sequence for sequence classification tasks
         by concatenating and adding special tokens.
         A RoBERTa sequence has the following format:
-            single sequence: <s> X </s>
-            pair of sequences: <s> A </s></s> B </s>
+
+        - single sequence: ``<s> X </s>``
+        - pair of sequences: ``<s> A </s></s> B </s>``
+
+        Args:
+            token_ids_0 (:obj:`List[int]`):
+                List of IDs to which the special tokens will be added
+            token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
         """
         if token_ids_1 is None:
             return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
@@ -116,20 +176,23 @@ class RobertaTokenizer(GPT2Tokenizer):
         sep = [self.sep_token_id]
         return cls + token_ids_0 + sep + sep + token_ids_1 + sep
 
-    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
+    def get_special_tokens_mask(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
+    ) -> List[int]:
         """
         Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
         special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
 
         Args:
-            token_ids_0: list of ids (must not contain special tokens)
-            token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
-                for sequence pairs
-            already_has_special_tokens: (default False) Set to True if the token list is already formated with
-                special tokens for the model
+            token_ids_0 (:obj:`List[int]`):
+                List of ids.
+            token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
+                Optional second list of IDs for sequence pairs.
+            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+                Set to True if the token list is already formatted with special tokens for the model
 
         Returns:
-            A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
         """
         if already_has_special_tokens:
             if token_ids_1 is not None:
@@ -143,12 +206,22 @@ class RobertaTokenizer(GPT2Tokenizer):
             return [1] + ([0] * len(token_ids_0)) + [1]
         return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
 
-    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
+    def create_token_type_ids_from_sequences(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+    ) -> List[int]:
         """
         Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
         RoBERTa does not make use of token type ids, therefore a list of zeros is returned.
 
-        if token_ids_1 is None, only returns the first portion of the mask (0's).
+        Args:
+            token_ids_0 (:obj:`List[int]`):
+                List of ids.
+            token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            :obj:`List[int]`: List of zeros.
+
         """
         sep = [self.sep_token_id]
         cls = [self.cls_token_id]
diff --git a/src/transformers/tokenization_transfo_xl.py b/src/transformers/tokenization_transfo_xl.py
index c09cc5acd6..a26d9b6371 100644
--- a/src/transformers/tokenization_transfo_xl.py
+++ b/src/transformers/tokenization_transfo_xl.py
@@ -72,6 +72,9 @@ CORPUS_NAME = "corpus.bin"
 class TransfoXLTokenizer(PreTrainedTokenizer):
     """
     Transformer-XL tokenizer adapted from Vocab class in https://github.com/kimiyoung/transformer-xl
+
+    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
+    should refer to the superclass for more information regarding methods.
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
@@ -189,7 +192,16 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
             raise ValueError("No <unkown> token in vocabulary")
 
     def save_vocabulary(self, vocab_path):
-        """Save the tokenizer vocabulary to a directory or file."""
+        """
+        Save the vocabulary and special tokens file to a directory.
+
+        Args:
+            vocab_path (:obj:`str`):
+                The directory in which to save the vocabulary.
+
+        Returns:
+            :obj:`Tuple(str)`: Paths to the files saved.
+        """
 
         logger.warning(
             "Please note you will not be able to load the save vocabulary in"
diff --git a/src/transformers/tokenization_xlm.py b/src/transformers/tokenization_xlm.py
index 93b8092abc..5afe1d29f0 100644
--- a/src/transformers/tokenization_xlm.py
+++ b/src/transformers/tokenization_xlm.py
@@ -21,6 +21,7 @@ import os
 import re
 import sys
 import unicodedata
+from typing import List, Optional
 
 import sacremoses as sm
 
@@ -530,20 +531,59 @@ class XLMTokenizer(PreTrainedTokenizer):
     """
     BPE tokenizer for XLM
 
-        - Moses preprocessing & tokenization for most supported languages
+    - Moses preprocessing & tokenization for most supported languages
+    - Language specific tokenization for Chinese (Jieba), Japanese (KyTea) and Thai (PyThaiNLP)
+    - (optionally) lower case & normalize all input text
+    - argument ``special_tokens`` and function ``set_special_tokens``, can be used to add additional symbols \
+      (ex: "__classify__") to a vocabulary
+    - `lang2id` attribute maps the languages supported by the model with their ids if provided (automatically set for pretrained vocabularies)
+    - `id2lang` attribute does the reverse mapping if provided (automatically set for pretrained vocabularies)
 
-        - Language specific tokenization for Chinese (Jieba), Japanese (KyTea) and Thai (PyThaiNLP)
+    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
+    should refer to the superclass for more information regarding methods.
 
-        - (optionally) lower case & normalize all inputs text
+    Args:
+        vocab_file (:obj:`string`):
+            Vocabulary file.
+        merges_file (:obj:`string`):
+            Merges file.
+        do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            Whether to lowercase the input when tokenizing.
+        remove_space (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            Whether to strip the text when tokenizing (removing excess spaces before and after the string).
+        keep_accents (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            Whether to keep accents when tokenizing.
+        unk_token (:obj:`string`, `optional`, defaults to "<unk>"):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        bos_token (:obj:`string`, `optional`, defaults to "<s>"):
+            The beginning of sequence token that was used during pre-training. Can be used as a sequence classifier token.
 
-        - argument ``special_tokens`` and function ``set_special_tokens``, can be used to add additional symbols \
-        (ex: "__classify__") to a vocabulary
+            .. note::
 
-        - `lang2id` attribute maps the languages supported by the model with their ids if provided (automatically set for pretrained vocabularies)
-
-        - `id2lang` attributes does reverse mapping if provided (automatically set for pretrained vocabularies)
-
-        - `do_lowercase_and_remove_accent` controle lower casing and accent (automatically set for pretrained vocabularies)
+                When building a sequence using special tokens, this is not the token that is used for the beginning
+                of sequence. The token used is the :obj:`cls_token`.
+        sep_token (:obj:`string`, `optional`, defaults to "</s>"):
+            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
+            for sequence classification or for a text and a question for question answering.
+            It is also used as the last token of a sequence built with special tokens.
+        pad_token (:obj:`string`, `optional`, defaults to "<pad>"):
+            The token used for padding, for example when batching sequences of different lengths.
+        cls_token (:obj:`string`, `optional`, defaults to "</s>"):
+            The classifier token which is used when doing sequence classification (classification of the whole
+            sequence instead of per-token classification). It is the first token of the sequence when built with
+            special tokens.
+        mask_token (:obj:`string`, `optional`, defaults to "<special1>"):
+            The token used for masking values. This is the token used when training this model with masked language
+            modeling. This is the token which the model will try to predict.
+        additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<special0>","<special1>","<special2>","<special3>","<special4>","<special5>","<special6>","<special7>","<special8>","<special9>"]`):
+            List of additional special tokens.
+        lang2id (:obj:`Dict[str, int]`, `optional`, defaults to :obj:`None`):
+            Dictionary mapping languages string identifiers to their IDs.
+        id2lang (:obj:`Dict[int, str]`, `optional`, defaults to :obj:`None`):
+            Dictionary mapping language IDs to their string identifiers.
+        do_lowercase_and_remove_accent (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            Whether to lowercase and remove accents when tokenizing.
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
@@ -812,13 +852,26 @@ class XLMTokenizer(PreTrainedTokenizer):
         out_string = "".join(tokens).replace("</w>", " ").strip()
         return out_string
 
-    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+    def build_inputs_with_special_tokens(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+    ) -> List[int]:
         """
         Build model inputs from a sequence or a pair of sequence for sequence classification tasks
         by concatenating and adding special tokens.
         A XLM sequence has the following format:
-            single sequence: <s> X </s>
-            pair of sequences: <s> A </s> B </s>
+
+        - single sequence: ``<s> X </s>``
+        - pair of sequences: ``<s> A </s> B </s>``
+
+        Args:
+            token_ids_0 (:obj:`List[int]`):
+                List of IDs to which the special tokens will be added
+            token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+
         """
         if token_ids_1 is None:
             return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
@@ -826,20 +879,23 @@ class XLMTokenizer(PreTrainedTokenizer):
         cls = [self.cls_token_id]
         return cls + token_ids_0 + sep + token_ids_1 + sep
 
-    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
+    def get_special_tokens_mask(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
+    ) -> List[int]:
         """
         Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
         special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
 
         Args:
-            token_ids_0: list of ids (must not contain special tokens)
-            token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
-                for sequence pairs
-            already_has_special_tokens: (default False) Set to True if the token list is already formated with
-                special tokens for the model
+            token_ids_0 (:obj:`List[int]`):
+                List of ids.
+            token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
+                Optional second list of IDs for sequence pairs.
+            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+                Set to True if the token list is already formatted with special tokens for the model
 
         Returns:
-            A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
         """
 
         if already_has_special_tokens:
@@ -854,14 +910,29 @@ class XLMTokenizer(PreTrainedTokenizer):
             return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
         return [1] + ([0] * len(token_ids_0)) + [1]
 
-    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
+    def create_token_type_ids_from_sequences(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+    ) -> List[int]:
         """
         Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
         An XLM sequence pair mask has the following format:
-        0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
-        | first sequence    | second sequence
 
-        if token_ids_1 is None, only returns the first portion of the mask (0's).
+        ::
+
+            0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
+            | first sequence    | second sequence |
+
+        if token_ids_1 is None, only returns the first portion of the mask (0s).
+
+        Args:
+            token_ids_0 (:obj:`List[int]`):
+                List of ids.
+            token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
+            sequence(s).
         """
         sep = [self.sep_token_id]
         cls = [self.cls_token_id]
@@ -870,7 +941,16 @@ class XLMTokenizer(PreTrainedTokenizer):
         return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
 
     def save_vocabulary(self, save_directory):
-        """Save the tokenizer vocabulary and merge files to a directory."""
+        """
+        Save the vocabulary and merges files to a directory.
+
+        Args:
+            save_directory (:obj:`str`):
+                The directory in which to save the vocabulary.
+
+        Returns:
+            :obj:`Tuple(str)`: Paths to the files saved.
+        """
         if not os.path.isdir(save_directory):
             logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
             return
diff --git a/src/transformers/tokenization_xlm_roberta.py b/src/transformers/tokenization_xlm_roberta.py
index 1e903d8a2b..810ef6c4a7 100644
--- a/src/transformers/tokenization_xlm_roberta.py
+++ b/src/transformers/tokenization_xlm_roberta.py
@@ -18,6 +18,7 @@
 import logging
 import os
 from shutil import copyfile
+from typing import List, Optional
 
 from transformers.tokenization_utils import PreTrainedTokenizer
 
@@ -54,7 +55,50 @@ class XLMRobertaTokenizer(PreTrainedTokenizer):
         Adapted from RobertaTokenizer and XLNetTokenizer
         SentencePiece based tokenizer. Peculiarities:
 
-            - requires `SentencePiece <https://github.com/google/sentencepiece>`_
+        - requires `SentencePiece <https://github.com/google/sentencepiece>`_
+
+    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
+    should refer to the superclass for more information regarding methods.
+
+    Args:
+        vocab_file (:obj:`str`):
+            Path to the vocabulary file.
+        bos_token (:obj:`string`, `optional`, defaults to "<s>"):
+            The beginning of sequence token that was used during pre-training. Can be used as a sequence classifier token.
+
+            .. note::
+
+                When building a sequence using special tokens, this is not the token that is used for the beginning
+                of sequence. The token used is the :obj:`cls_token`.
+        eos_token (:obj:`string`, `optional`, defaults to "</s>"):
+            The end of sequence token.
+
+            .. note::
+
+                When building a sequence using special tokens, this is not the token that is used for the end
+                of sequence. The token used is the :obj:`sep_token`.
+        sep_token (:obj:`string`, `optional`, defaults to "</s>"):
+            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
+            for sequence classification or for a text and a question for question answering.
+            It is also used as the last token of a sequence built with special tokens.
+        cls_token (:obj:`string`, `optional`, defaults to "<s>"):
+            The classifier token which is used when doing sequence classification (classification of the whole
+            sequence instead of per-token classification). It is the first token of the sequence when built with
+            special tokens.
+        unk_token (:obj:`string`, `optional`, defaults to "<unk>"):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        pad_token (:obj:`string`, `optional`, defaults to "<pad>"):
+            The token used for padding, for example when batching sequences of different lengths.
+        mask_token (:obj:`string`, `optional`, defaults to "<mask>"):
+            The token used for masking values. This is the token used when training this model with masked language
+            modeling. This is the token which the model will try to predict.
+        additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
+            Additional special tokens used by the tokenizer.
+
+    Attributes:
+        sp_model (:obj:`SentencePieceProcessor`):
+            The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
@@ -132,35 +176,52 @@ class XLMRobertaTokenizer(PreTrainedTokenizer):
         self.sp_model = spm.SentencePieceProcessor()
         self.sp_model.Load(self.vocab_file)
 
-    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+    def build_inputs_with_special_tokens(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+    ) -> List[int]:
         """
         Build model inputs from a sequence or a pair of sequence for sequence classification tasks
         by concatenating and adding special tokens.
-        A RoBERTa sequence has the following format:
-            single sequence: <s> X </s>
-            pair of sequences: <s> A </s></s> B </s>
+        An XLM-R sequence has the following format:
+
+        - single sequence: ``<s> X </s>``
+        - pair of sequences: ``<s> A </s></s> B </s>``
+
+        Args:
+            token_ids_0 (:obj:`List[int]`):
+                List of IDs to which the special tokens will be added
+            token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
         """
+
         if token_ids_1 is None:
             return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
         cls = [self.cls_token_id]
         sep = [self.sep_token_id]
         return cls + token_ids_0 + sep + sep + token_ids_1 + sep
 
-    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
+    def get_special_tokens_mask(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
+    ) -> List[int]:
         """
         Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
         special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
 
         Args:
-            token_ids_0: list of ids (must not contain special tokens)
-            token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
-                for sequence pairs
-            already_has_special_tokens: (default False) Set to True if the token list is already formated with
-                special tokens for the model
+            token_ids_0 (:obj:`List[int]`):
+                List of ids.
+            token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
+                Optional second list of IDs for sequence pairs.
+            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+                Set to True if the token list is already formatted with special tokens for the model
 
         Returns:
-            A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
         """
+
         if already_has_special_tokens:
             if token_ids_1 is not None:
                 raise ValueError(
@@ -173,12 +234,24 @@ class XLMRobertaTokenizer(PreTrainedTokenizer):
             return [1] + ([0] * len(token_ids_0)) + [1]
         return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
 
-    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
+    def create_token_type_ids_from_sequences(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+    ) -> List[int]:
         """
         Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
-        RoBERTa does not make use of token type ids, therefore a list of zeros is returned.
-        if token_ids_1 is None, only returns the first portion of the mask (0's).
+        XLM-R does not make use of token type ids, therefore a list of zeros is returned.
+
+        Args:
+            token_ids_0 (:obj:`List[int]`):
+                List of ids.
+            token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            :obj:`List[int]`: List of zeros.
+
         """
+
         sep = [self.sep_token_id]
         cls = [self.cls_token_id]
 
@@ -216,8 +289,15 @@ class XLMRobertaTokenizer(PreTrainedTokenizer):
         return out_string
 
     def save_vocabulary(self, save_directory):
-        """ Save the sentencepiece vocabulary (copy original file) and special tokens file
-            to a directory.
+        """
+        Save the sentencepiece vocabulary (copy original file) and special tokens file to a directory.
+
+        Args:
+            save_directory (:obj:`str`):
+                The directory in which to save the vocabulary.
+
+        Returns:
+            :obj:`Tuple(str)`: Paths to the files saved.
         """
         if not os.path.isdir(save_directory):
             logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
diff --git a/src/transformers/tokenization_xlnet.py b/src/transformers/tokenization_xlnet.py
index 30fdfda22e..800ef09c99 100644
--- a/src/transformers/tokenization_xlnet.py
+++ b/src/transformers/tokenization_xlnet.py
@@ -19,6 +19,7 @@ import logging
 import os
 import unicodedata
 from shutil import copyfile
+from typing import List, Optional
 
 from .tokenization_utils import PreTrainedTokenizer
 
@@ -51,9 +52,57 @@ SEG_ID_PAD = 4
 
 class XLNetTokenizer(PreTrainedTokenizer):
     """
-        SentencePiece based tokenizer. Peculiarities:
+    Constructs an XLNet tokenizer. Based on `SentencePiece <https://github.com/google/sentencepiece>`__
 
-            - requires `SentencePiece <https://github.com/google/sentencepiece>`_
+    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
+    should refer to the superclass for more information regarding methods.
+
+    Args:
+        vocab_file (:obj:`string`):
+            `SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a .spm extension) that
+            contains the vocabulary necessary to instantiate a tokenizer.
+        do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            Whether to lowercase the input when tokenizing.
+        remove_space (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            Whether to strip the text when tokenizing (removing excess spaces before and after the string).
+        keep_accents (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            Whether to keep accents when tokenizing.
+        bos_token (:obj:`string`, `optional`, defaults to "<s>"):
+            The beginning of sequence token that was used during pre-training. Can be used as a sequence classifier token.
+
+            .. note::
+
+                When building a sequence using special tokens, this is not the token that is used for the beginning
+                of sequence. The token used is the :obj:`cls_token`.
+        eos_token (:obj:`string`, `optional`, defaults to "</s>"):
+            The end of sequence token.
+
+            .. note::
+
+                When building a sequence using special tokens, this is not the token that is used for the end
+                of sequence. The token used is the :obj:`sep_token`.
+        unk_token (:obj:`string`, `optional`, defaults to "<unk>"):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        sep_token (:obj:`string`, `optional`, defaults to "<sep>"):
+            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
+            for sequence classification or for a text and a question for question answering.
+            It is also used as the last token of a sequence built with special tokens.
+        pad_token (:obj:`string`, `optional`, defaults to "<pad>"):
+            The token used for padding, for example when batching sequences of different lengths.
+        cls_token (:obj:`string`, `optional`, defaults to "<cls>"):
+            The classifier token which is used when doing sequence classification (classification of the whole
+            sequence instead of per-token classification). It is the first token of the sequence when built with
+            special tokens.
+        mask_token (:obj:`string`, `optional`, defaults to "<mask>"):
+            The token used for masking values. This is the token used when training this model with masked language
+            modeling. This is the token which the model will try to predict.
+        additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<eop>", "<eod>"]`):
+            Additional special tokens used by the tokenizer.
+
+    Attributes:
+        sp_model (:obj:`SentencePieceProcessor`):
+            The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
@@ -189,13 +238,25 @@ class XLNetTokenizer(PreTrainedTokenizer):
         out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
         return out_string
 
-    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+    def build_inputs_with_special_tokens(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+    ) -> List[int]:
         """
         Build model inputs from a sequence or a pair of sequence for sequence classification tasks
         by concatenating and adding special tokens.
         An XLNet sequence has the following format:
-            single sequence: X <sep> <cls>
-            pair of sequences: A <sep> B <sep> <cls>
+
+        - single sequence: ``X <sep> <cls>``
+        - pair of sequences: ``A <sep> B <sep> <cls>``
+
+        Args:
+            token_ids_0 (:obj:`List[int]`):
+                List of IDs to which the special tokens will be added
+            token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
         """
         sep = [self.sep_token_id]
         cls = [self.cls_token_id]
@@ -203,20 +264,23 @@ class XLNetTokenizer(PreTrainedTokenizer):
             return token_ids_0 + sep + cls
         return token_ids_0 + sep + token_ids_1 + sep + cls
 
-    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
+    def get_special_tokens_mask(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
+    ) -> List[int]:
         """
         Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
         special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
 
         Args:
-            token_ids_0: list of ids (must not contain special tokens)
-            token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
-                for sequence pairs
-            already_has_special_tokens: (default False) Set to True if the token list is already formated with
-                special tokens for the model
+            token_ids_0 (:obj:`List[int]`):
+                List of ids.
+            token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
+                Optional second list of IDs for sequence pairs.
+            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+                Set to True if the token list is already formatted with special tokens for the model
 
         Returns:
-            A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
         """
 
         if already_has_special_tokens:
@@ -231,7 +295,9 @@ class XLNetTokenizer(PreTrainedTokenizer):
             return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1, 1]
         return ([0] * len(token_ids_0)) + [1, 1]
 
-    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
+    def create_token_type_ids_from_sequences(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+    ) -> List[int]:
         """
         Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
         An XLNet sequence pair mask has the following format:
@@ -239,6 +305,16 @@ class XLNetTokenizer(PreTrainedTokenizer):
         | first sequence    | second sequence     | CLS segment ID
 
         if token_ids_1 is None, only returns the first portion of the mask (0's).
+
+        Args:
+            token_ids_0 (:obj:`List[int]`):
+                List of ids.
+            token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
+            sequence(s).
         """
         sep = [self.sep_token_id]
         cls_segment_id = [2]
@@ -248,8 +324,15 @@ class XLNetTokenizer(PreTrainedTokenizer):
         return len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] + cls_segment_id
 
     def save_vocabulary(self, save_directory):
-        """ Save the sentencepiece vocabulary (copy original file) and special tokens file
-            to a directory.
+        """
+        Save the sentencepiece vocabulary (copy original file) and special tokens file to a directory.
+
+        Args:
+            save_directory (:obj:`str`):
+                The directory in which to save the vocabulary.
+
+        Returns:
+            :obj:`Tuple(str)`: Paths to the files saved.
         """
         if not os.path.isdir(save_directory):
             logger.error("Vocabulary path ({}) should be a directory".format(save_directory))

From 9df74b8bc42eedc496f7148b9370728054ca3b6a Mon Sep 17 00:00:00 2001
From: Sam Shleifer <sshleifer@gmail.com>
Date: Wed, 26 Feb 2020 11:36:27 -0500
Subject: [PATCH 09/80] Delete all mentions of Model2Model (#3019)

---
 docs/source/quickstart.md                    | 93 --------------------
 src/transformers/__init__.py                 |  2 +-
 src/transformers/modeling_encoder_decoder.py | 59 -------------
 tests/test_modeling_encoder_decoder.py       | 50 -----------
 4 files changed, 1 insertion(+), 203 deletions(-)
 delete mode 100644 tests/test_modeling_encoder_decoder.py

diff --git a/docs/source/quickstart.md b/docs/source/quickstart.md
index 438f78ebcf..30a4401244 100644
--- a/docs/source/quickstart.md
+++ b/docs/source/quickstart.md
@@ -220,96 +220,3 @@ print(sequence)
 ```
 
 The model only requires a single token as input as all the previous tokens' key/value pairs are contained in the `past`.
-
-### Model2Model example
-
-Encoder-decoder architectures require two tokenized inputs: one for the encoder and the other one for the decoder. Let's assume that we want to use `Model2Model` for generative question answering, and start by tokenizing the question and answer that will be fed to the model.
-
-```python
-import torch
-from transformers import BertTokenizer, Model2Model
-
-# OPTIONAL: if you want to have more information on what's happening under the hood, activate the logger as follows
-import logging
-logging.basicConfig(level=logging.INFO)
-
-# Load pre-trained model tokenizer (vocabulary)
-tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-
-# Encode the input to the encoder (the question)
-question = "Who was Jim Henson?"
-encoded_question = tokenizer.encode(question)
-
-# Encode the input to the decoder (the answer)
-answer = "Jim Henson was a puppeteer"
-encoded_answer = tokenizer.encode(answer)
-
-# Convert inputs to PyTorch tensors
-question_tensor = torch.tensor([encoded_question])
-answer_tensor = torch.tensor([encoded_answer])
-```
-
-Let's see how we can use `Model2Model` to get the value of the loss associated with this (question, answer) pair:
-
-```python
-# In order to compute the loss we need to provide language model
-# labels (the token ids that the model should have produced) to
-# the decoder.
-lm_labels =  encoded_answer
-labels_tensor = torch.tensor([lm_labels])
-
-# Load pre-trained model (weights)
-model = Model2Model.from_pretrained('bert-base-uncased')
-
-# Set the model in evaluation mode to deactivate the DropOut modules
-# This is IMPORTANT to have reproducible results during evaluation!
-model.eval()
-
-# If you have a GPU, put everything on cuda
-question_tensor = question_tensor.to('cuda')
-answer_tensor = answer_tensor.to('cuda')
-labels_tensor = labels_tensor.to('cuda')
-model.to('cuda')
-
-# Predict hidden states features for each layer
-with torch.no_grad():
-    # See the models docstrings for the detail of the inputs
-    outputs = model(question_tensor, answer_tensor, decoder_lm_labels=labels_tensor)
-    # Transformers models always output tuples.
-    # See the models docstrings for the detail of all the outputs
-    # In our case, the first element is the value of the LM loss 
-    lm_loss = outputs[0]
-```
-
-This loss can be used to fine-tune `Model2Model` on the question answering task. Assuming that we fine-tuned the model, let us now see how to generate an answer:
-
-```python
-# Let's re-use the previous question
-question = "Who was Jim Henson?"
-encoded_question = tokenizer.encode(question)
-question_tensor = torch.tensor([encoded_question])
-
-# This time we try to generate the answer, so we start with an empty sequence
-answer = "[CLS]"
-encoded_answer = tokenizer.encode(answer, add_special_tokens=False)
-answer_tensor = torch.tensor([encoded_answer])
-
-# Load pre-trained model (weights)
-model = Model2Model.from_pretrained('fine-tuned-weights')
-model.eval()
-
-# If you have a GPU, put everything on cuda
-question_tensor = question_tensor.to('cuda')
-answer_tensor = answer_tensor.to('cuda')
-model.to('cuda')
-
-# Predict all tokens
-with torch.no_grad():
-    outputs = model(question_tensor, answer_tensor)
-    predictions = outputs[0]
-
-# confirm we were able to predict 'jim'
-predicted_index = torch.argmax(predictions[0, -1]).item()
-predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
-assert predicted_token == 'jim'
-```
diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index ac283ff7c8..ad6869f4c4 100755
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -241,7 +241,7 @@ if is_torch_available():
         CamembertForTokenClassification,
         CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
     )
-    from .modeling_encoder_decoder import PreTrainedEncoderDecoder, Model2Model
+    from .modeling_encoder_decoder import PreTrainedEncoderDecoder
     from .modeling_t5 import (
         T5PreTrainedModel,
         T5Model,
diff --git a/src/transformers/modeling_encoder_decoder.py b/src/transformers/modeling_encoder_decoder.py
index 4c5603b217..649d1e858f 100644
--- a/src/transformers/modeling_encoder_decoder.py
+++ b/src/transformers/modeling_encoder_decoder.py
@@ -234,62 +234,3 @@ class PreTrainedEncoderDecoder(nn.Module):
         decoder_outputs = self.decoder(decoder_input_ids, **kwargs_decoder)
 
         return decoder_outputs + encoder_outputs
-
-
-class Model2Model(PreTrainedEncoderDecoder):
-    r"""
-        :class:`~transformers.Model2Model` instantiates a Seq2Seq2 model
-        where both of the encoder and decoder are of the same family. If the
-        name of or that path to a pretrained model is specified the encoder and
-        the decoder will be initialized with the pretrained weight (the
-        cross-attention will be intialized randomly if its weights are not
-        present).
-
-        It is possible to override this behavior and initialize, say, the decoder randomly
-        by creating it beforehand as follows
-
-            config = BertConfig.from_pretrained()
-            decoder = BertForMaskedLM(config)
-            model = Model2Model.from_pretrained('bert-base-uncased', decoder_model=decoder)
-    """
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.tie_weights()
-
-    def tie_weights(self):
-        """ Tying the encoder and decoders' embeddings together.
-
-       We need for each to get down to the embedding weights. However the
-        different model classes are inconsistent to that respect:
-        - BertModel: embeddings.word_embeddings
-        - RoBERTa: embeddings.word_embeddings
-        - XLMModel: embeddings
-        - GPT2: wte
-        - BertForMaskedLM: bert.embeddings.word_embeddings
-        - RobertaForMaskedLM: roberta.embeddings.word_embeddings
-
-        argument of the XEmbedding layer for each model, but it is "blocked"
-        by a model-specific keyword (bert, )...
-        """
-        # self._tie_or_clone_weights(self.encoder, self.decoder)
-        pass
-
-    @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
-
-        if (
-            "bert" not in pretrained_model_name_or_path
-            or "roberta" in pretrained_model_name_or_path
-            or "distilbert" in pretrained_model_name_or_path
-        ):
-            raise ValueError("Only the Bert model is currently supported.")
-
-        model = super().from_pretrained(
-            encoder_pretrained_model_name_or_path=pretrained_model_name_or_path,
-            decoder_pretrained_model_name_or_path=pretrained_model_name_or_path,
-            *args,
-            **kwargs,
-        )
-
-        return model
diff --git a/tests/test_modeling_encoder_decoder.py b/tests/test_modeling_encoder_decoder.py
deleted file mode 100644
index ac01e7b561..0000000000
--- a/tests/test_modeling_encoder_decoder.py
+++ /dev/null
@@ -1,50 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Hugging Face Inc. Team
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import logging
-import unittest
-
-from transformers import is_torch_available
-
-from .utils import require_torch, slow
-
-
-if is_torch_available():
-    from transformers import BertModel, BertForMaskedLM, Model2Model
-    from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP
-
-
-@require_torch
-class EncoderDecoderModelTest(unittest.TestCase):
-    @slow
-    def test_model2model_from_pretrained(self):
-        logging.basicConfig(level=logging.INFO)
-        for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-            model = Model2Model.from_pretrained(model_name)
-            self.assertIsInstance(model.encoder, BertModel)
-            self.assertIsInstance(model.decoder, BertForMaskedLM)
-            self.assertEqual(model.decoder.config.is_decoder, True)
-            self.assertEqual(model.encoder.config.is_decoder, False)
-
-    def test_model2model_from_pretrained_not_bert(self):
-        logging.basicConfig(level=logging.INFO)
-        with self.assertRaises(ValueError):
-            _ = Model2Model.from_pretrained("roberta")
-
-        with self.assertRaises(ValueError):
-            _ = Model2Model.from_pretrained("distilbert")
-
-        with self.assertRaises(ValueError):
-            _ = Model2Model.from_pretrained("does-not-exist")

From 9cda3620b69e47f585fe3d27ba93783150c2ecdc Mon Sep 17 00:00:00 2001
From: Julien Chaumond <chaumond@gmail.com>
Date: Wed, 26 Feb 2020 11:59:25 -0500
Subject: [PATCH 10/80] Fix (non-slow) tests on GPU (torch) (#3024)

* Fix tests on GPU (torch)

* Fix bart slow tests

Co-authored-by: Sam Shleifer <sshleifer@gmail.com>
---
 src/transformers/modeling_bart.py |  2 +-
 tests/test_modeling_bart.py       | 30 ++++++++++++++++++++----------
 tests/test_modeling_common.py     |  3 ++-
 tests/test_modeling_t5.py         |  4 +++-
 4 files changed, 26 insertions(+), 13 deletions(-)

diff --git a/src/transformers/modeling_bart.py b/src/transformers/modeling_bart.py
index f329eb6842..b0188d4d88 100644
--- a/src/transformers/modeling_bart.py
+++ b/src/transformers/modeling_bart.py
@@ -86,7 +86,7 @@ def _prepare_bart_decoder_inputs(
             causal_lm_mask = None
         new_shape = (bsz, tgt_len, tgt_len)
         # make it broadcastable so can just be added to the attention coefficients
-        decoder_attn_mask = _combine_masks(decoder_padding_mask, causal_lm_mask, new_shape)
+        decoder_attn_mask = _combine_masks(decoder_padding_mask, causal_lm_mask, new_shape).to(device=input_ids.device)
     assert decoder_attn_mask is None or decoder_attn_mask.shape == (bsz, 1, tgt_len, tgt_len)
     return decoder_input_ids, decoder_attn_mask
 
diff --git a/tests/test_modeling_bart.py b/tests/test_modeling_bart.py
index 927c37eadf..ef8932618a 100644
--- a/tests/test_modeling_bart.py
+++ b/tests/test_modeling_bart.py
@@ -172,7 +172,7 @@ class BartHeadTests(unittest.TestCase):
     vocab_size = 99
 
     def test_lm_forward(self):
-        input_ids = torch.Tensor(
+        input_ids = torch.tensor(
             [
                 [71, 82, 18, 33, 46, 91, 2],
                 [68, 34, 26, 58, 30, 82, 2],
@@ -187,8 +187,10 @@ class BartHeadTests(unittest.TestCase):
                 [21, 5, 62, 28, 14, 76, 2],
                 [45, 98, 37, 86, 59, 48, 2],
                 [70, 70, 50, 9, 28, 0, 2],
-            ]
-        ).long()
+            ],
+            dtype=torch.long,
+            device=torch_device,
+        )
         batch_size = input_ids.shape[0]
         decoder_lm_labels = ids_tensor([batch_size, input_ids.shape[1]], self.vocab_size)
 
@@ -204,12 +206,14 @@ class BartHeadTests(unittest.TestCase):
             max_position_embeddings=48,
         )
         model = BartForSequenceClassification(config)
+        model.to(torch_device)
         outputs = model.forward(input_ids=input_ids, decoder_input_ids=input_ids)
         logits = outputs[0]
         expected_shape = torch.Size((batch_size, config.num_labels))
         self.assertEqual(logits.shape, expected_shape)
 
         lm_model = BartForMaskedLM(config)
+        lm_model.to(torch_device)
         loss, logits, enc_features = lm_model.forward(
             input_ids=input_ids, lm_labels=decoder_lm_labels, decoder_input_ids=input_ids
         )
@@ -292,6 +296,10 @@ def _assert_tensors_equal(a, b, atol=1e-12, prefix=""):
         raise AssertionError(msg)
 
 
+def _long_tensor(tok_lst):
+    return torch.tensor(tok_lst, dtype=torch.long, device=torch_device,)
+
+
 TOLERANCE = 1e-4
 
 
@@ -299,15 +307,15 @@ TOLERANCE = 1e-4
 class BartModelIntegrationTest(unittest.TestCase):
     @slow
     def test_inference_no_head(self):
-        model = BartModel.from_pretrained("bart-large")
-        input_ids = torch.Tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]).long()
+        model = BartModel.from_pretrained("bart-large").to(torch_device)
+        input_ids = _long_tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
         inputs_dict = prepare_bart_inputs_dict(model.config, input_ids)
         with torch.no_grad():
             output = model.forward(**inputs_dict)[0]
         expected_shape = torch.Size((1, 11, 1024))
         self.assertEqual(output.shape, expected_shape)
         expected_slice = torch.Tensor(
-            [[0.7144, 0.8143, -1.2813], [0.7144, 0.8143, -1.2813], [-0.0467, 2.5911, -2.1845]]
+            [[0.7144, 0.8143, -1.2813], [0.7144, 0.8143, -1.2813], [-0.0467, 2.5911, -2.1845]], device=torch_device
         )
         self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=TOLERANCE))
 
@@ -315,20 +323,22 @@ class BartModelIntegrationTest(unittest.TestCase):
     def test_mnli_inference(self):
 
         example_b = [0, 31414, 232, 328, 740, 1140, 69, 46078, 1588, 2, 1]
-        input_ids = torch.Tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2], example_b]).long()
+        input_ids = _long_tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2], example_b])
 
-        model = AutoModelForSequenceClassification.from_pretrained("bart-large-mnli")  # eval called in from_pre
+        model = AutoModelForSequenceClassification.from_pretrained("bart-large-mnli").to(
+            torch_device
+        )  # eval called in from_pre
         inputs_dict = prepare_bart_inputs_dict(model.config, input_ids)
         # Test that model hasn't changed
         with torch.no_grad():
             batched_logits, features = model.forward(**inputs_dict)
         expected_shape = torch.Size((2, 3))
         self.assertEqual(batched_logits.shape, expected_shape)
-        expected_slice = torch.Tensor([[0.1907, 1.4342, -1.0289]])
+        expected_slice = torch.Tensor([[0.1907, 1.4342, -1.0289]]).to(torch_device)
         logits_arr = batched_logits[0].detach()
 
         # Test that padding does not change results
-        input_ids_no_pad = torch.Tensor([example_b[:-1]]).long()
+        input_ids_no_pad = _long_tensor([example_b[:-1]])
 
         inputs_dict = prepare_bart_inputs_dict(model.config, input_ids=input_ids_no_pad)
         with torch.no_grad():
diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py
index 669a494cf2..4e5202a65a 100644
--- a/tests/test_modeling_common.py
+++ b/tests/test_modeling_common.py
@@ -68,7 +68,7 @@ class ModelTesterMixin:
             model.eval()
             with torch.no_grad():
                 outputs = model(**inputs_dict)
-            out_2 = outputs[0].numpy()
+            out_2 = outputs[0].cpu().numpy()
             out_2[np.isnan(out_2)] = 0
 
             with tempfile.TemporaryDirectory() as tmpdirname:
@@ -472,6 +472,7 @@ class ModelTesterMixin:
         for model_class in self.all_model_classes:
             config = copy.deepcopy(original_config)
             model = model_class(config)
+            model.to(torch_device)
 
             model_vocab_size = config.vocab_size
             # Retrieve the embeddings and clone theme
diff --git a/tests/test_modeling_t5.py b/tests/test_modeling_t5.py
index d62ba2bd79..1d7738b64b 100644
--- a/tests/test_modeling_t5.py
+++ b/tests/test_modeling_t5.py
@@ -20,7 +20,7 @@ from transformers import is_torch_available
 
 from .test_configuration_common import ConfigTester
 from .test_modeling_common import ModelTesterMixin, ids_tensor
-from .utils import CACHE_DIR, require_torch, slow
+from .utils import CACHE_DIR, require_torch, slow, torch_device
 
 
 if is_torch_available():
@@ -125,6 +125,7 @@ class T5ModelTest(ModelTesterMixin, unittest.TestCase):
             decoder_lm_labels,
         ):
             model = T5Model(config=config)
+            model.to(torch_device)
             model.eval()
             decoder_output, encoder_output = model(
                 encoder_input_ids=encoder_input_ids,
@@ -157,6 +158,7 @@ class T5ModelTest(ModelTesterMixin, unittest.TestCase):
             decoder_lm_labels,
         ):
             model = T5WithLMHeadModel(config=config)
+            model.to(torch_device)
             model.eval()
             outputs = model(
                 encoder_input_ids=encoder_input_ids,

From fdd61b19928e87a5354c36923182e801bfedb31b Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Wed, 26 Feb 2020 18:04:37 +0100
Subject: [PATCH 11/80] Fix attn mask gpt2 when using past (#3033)

* fix issue and add some tests

* fix issue and add some tests

* updated doc string gpt2
---
 src/transformers/modeling_gpt2.py | 12 +++--
 tests/test_modeling_common.py!    |  0
 tests/test_modeling_gpt2.py       | 74 +++++++++++++++++++++++++++++++
 3 files changed, 83 insertions(+), 3 deletions(-)
 create mode 100644 tests/test_modeling_common.py!

diff --git a/src/transformers/modeling_gpt2.py b/src/transformers/modeling_gpt2.py
index b72d11af92..479f459d2c 100644
--- a/src/transformers/modeling_gpt2.py
+++ b/src/transformers/modeling_gpt2.py
@@ -276,14 +276,17 @@ GPT2_START_DOCSTRING = r"""
 
 GPT2_INPUTS_DOCSTRING = r"""
     Args:
-        input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
+        input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, input_ids_length)`):
+            `input_ids_length` = `sequence_length` if `past` is None else 1
             Indices of input sequence tokens in the vocabulary.
+            If using `past` as an input make sure that `input_ids` are those of the last position.
 
             Indices can be obtained using :class:`transformers.GPT2Tokenizer`.
             See :func:`transformers.PreTrainedTokenizer.encode` and
             :func:`transformers.PreTrainedTokenizer.encode_plus` for details.
 
             `What are input IDs? <../glossary.html#input-ids>`__
+
         past (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`):
             Contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
             (see `past` output below). Can be used to speed up sequential decoding. The token ids which have their past given to this model
@@ -294,10 +297,12 @@ GPT2_INPUTS_DOCSTRING = r"""
             ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
 
             `What are attention masks? <../glossary.html#attention-mask>`__
-        token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
+        token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, input_ids_length)`, `optional`, defaults to :obj:`None`):
+            `input_ids_length` = `sequence_length` if `past` is None else 1
             Segment token indices to indicate first and second portions of the inputs.
             Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
             corresponds to a `sentence B` token
+            If using `past` as an input make sure that `token_type_ids` correspond to the `input_ids` of the last position.
 
             `What are token type IDs? <../glossary.html#token-type-ids>`_
         position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
@@ -419,7 +424,8 @@ class GPT2Model(GPT2PreTrainedModel):
 
         # Attention mask.
         if attention_mask is not None:
-            attention_mask = attention_mask.view(-1, input_shape[-1])
+            batch_size = input_ids.shape[0]
+            attention_mask = attention_mask.view(batch_size, -1)
             # We create a 3D attention mask from a 2D tensor mask.
             # Sizes are [batch_size, 1, 1, to_seq_length]
             # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
diff --git a/tests/test_modeling_common.py! b/tests/test_modeling_common.py!
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/tests/test_modeling_gpt2.py b/tests/test_modeling_gpt2.py
index 2f6f1dfdbb..3a8a9c541a 100644
--- a/tests/test_modeling_gpt2.py
+++ b/tests/test_modeling_gpt2.py
@@ -170,6 +170,72 @@ class GPT2ModelTest(ModelTesterMixin, unittest.TestCase):
             )
             self.parent.assertEqual(len(result["presents"]), config.n_layer)
 
+        def create_and_check_gpt2_model_past(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
+            model = GPT2Model(config=config)
+            model.to(torch_device)
+            model.eval()
+
+            # first forward pass
+            output, past = model(input_ids, token_type_ids=token_type_ids)
+
+            # create hypothetical next token and extend to next_input_ids
+            next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)
+            next_token_types = ids_tensor([self.batch_size, 1], self.type_vocab_size)
+
+            # append to next input_ids and token_type_ids
+            next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
+            next_token_type_ids = torch.cat([token_type_ids, next_token_types], dim=-1)
+
+            output_from_no_past, _ = model(next_input_ids, token_type_ids=next_token_type_ids)
+            output_from_past, _ = model(next_tokens, token_type_ids=next_token_types, past=past)
+
+            # select random slice
+            random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
+            output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx].detach()
+            output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach()
+
+            # test that outputs are equal for slice
+            self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3))
+
+        def create_and_check_gpt2_model_attention_mask_past(
+            self, config, input_ids, input_mask, head_mask, token_type_ids, *args
+        ):
+            model = GPT2Model(config=config)
+            model.to(torch_device)
+            model.eval()
+
+            # create attention mask
+            attn_mask = torch.ones(input_ids.shape).long()
+            half_seq_length = self.seq_length // 2
+            attn_mask[:, half_seq_length:] = 0
+
+            # first forward pass
+            output, past = model(input_ids, attention_mask=attn_mask)
+
+            # create hypothetical next token and extend to next_input_ids
+            next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)
+
+            # change a random masked slice from input_ids
+            random_seq_idx_to_change = ids_tensor((1,), half_seq_length).item() + 1
+            random_other_next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size).squeeze(-1)
+            input_ids[:, -random_seq_idx_to_change] = random_other_next_tokens
+
+            # append to next input_ids and attn_mask
+            next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
+            attn_mask = torch.cat([attn_mask, torch.ones((attn_mask.shape[0], 1)).long()], dim=1)
+
+            # get two different outputs
+            output_from_no_past, _ = model(next_input_ids, attention_mask=attn_mask)
+            output_from_past, _ = model(next_tokens, past=past, attention_mask=attn_mask)
+
+            # select random slice
+            random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
+            output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx].detach()
+            output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach()
+
+            # test that outputs are equal for slice
+            self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3))
+
         def create_and_check_lm_head_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
             model = GPT2LMHeadModel(config)
             model.to(torch_device)
@@ -248,6 +314,14 @@ class GPT2ModelTest(ModelTesterMixin, unittest.TestCase):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_gpt2_model(*config_and_inputs)
 
+    def test_gpt2_model_past(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_gpt2_model_past(*config_and_inputs)
+
+    def test_gpt2_model_att_mask_past(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_gpt2_model_attention_mask_past(*config_and_inputs)
+
     def test_gpt2_lm_head_model(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_lm_head_model(*config_and_inputs)

From 5bc99e7f33c83b23b88740877283098ef7964b73 Mon Sep 17 00:00:00 2001
From: Andrew Walker <awalker88@me.com>
Date: Wed, 26 Feb 2020 11:39:54 -0600
Subject: [PATCH 12/80] fix several typos in Distil* readme (#3034)

---
 examples/distillation/README.md | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/examples/distillation/README.md b/examples/distillation/README.md
index 42732389a5..930edf94fd 100644
--- a/examples/distillation/README.md
+++ b/examples/distillation/README.md
@@ -10,7 +10,7 @@ This folder contains the original code used to train Distil* as well as examples
 
 **October 23, 2019 - Update** We release **DistilRoBERTa**: 95% of `RoBERTa-base`'s performance on GLUE, twice as fast as RoBERTa while being 35% smaller.
 
-**October 3, 2019 - Update** We release our [NeurIPS workshop paper](https://arxiv.org/abs/1910.01108) explaining our approach on **DistilBERT**. It includes updated results and further experiments. We applied the same method to GPT2 and release the weights of **DistilGPT2**. DistilGPT2 is two times faster and 33% smaller than GPT2. **The paper superseeds our [previous blogpost](https://medium.com/huggingface/distilbert-8cf3380435b5) with a different distillation loss and better performances. Please use the paper as a reference when comparing/reporting results on DistilBERT.**
+**October 3, 2019 - Update** We release our [NeurIPS workshop paper](https://arxiv.org/abs/1910.01108) explaining our approach on **DistilBERT**. It includes updated results and further experiments. We applied the same method to GPT2 and release the weights of **DistilGPT2**. DistilGPT2 is two times faster and 33% smaller than GPT2. **The paper supersedes our [previous blogpost](https://medium.com/huggingface/distilbert-8cf3380435b5) with a different distillation loss and better performances. Please use the paper as a reference when comparing/reporting results on DistilBERT.**
 
 **September 19, 2019 - Update:** We fixed bugs in the code and released an upadted version of the weights trained with a modification of the distillation loss. DistilBERT now reaches 99% of `BERT-base`'s performance on GLUE, and 86.9 F1 score on SQuAD v1.1 dev set (compared to 88.5 for `BERT-base`). We will publish a formal write-up of our approach in the near future!
 
@@ -39,7 +39,7 @@ Here are the results on the dev sets of GLUE:
 | RoBERTa-base (reported)   |  **83.2**/**86.4**<sup>2</sup> | 63.6 | 87.6 | 90.2 | 92.8 | 91.9 | 78.7 | 94.8 | 91.2 | 57.7<sup>3</sup>  |
 | DistilRoBERTa<sup>1</sup> |  **79.0**/**82.3**<sup>2</sup> | 59.3 | 84.0 | 86.6 | 90.8 | 89.4 | 67.9 | 92.5 | 88.3 | 52.1              |
 
-<sup>1</sup> We did not use the MNLI checkpoint for fine-tuning but directy perform transfer learning on the pre-trained DistilRoBERTa.
+<sup>1</sup> We did not use the MNLI checkpoint for fine-tuning but directly perform transfer learning on the pre-trained DistilRoBERTa.
 
 <sup>2</sup> Macro-score computed without WNLI.
 
@@ -65,9 +65,9 @@ This part of the library has only be tested with Python3.6+. There are few speci
 Transformers includes five pre-trained Distil* models, currently only provided for English and German (we are investigating the possibility to train and release a multilingual version of DistilBERT):
 
 - `distilbert-base-uncased`: DistilBERT English language model pretrained on the same data used to pretrain Bert (concatenation of the Toronto Book Corpus and full English Wikipedia) using distillation with the supervision of the `bert-base-uncased` version of Bert. The model has 6 layers, 768 dimension and 12 heads, totalizing 66M parameters.
-- `distilbert-base-uncased-distilled-squad`: A finetuned version of `distilbert-base-uncased` finetuned using (a second step of) knwoledge distillation on SQuAD 1.0. This model reaches a F1 score of 79.8 on the dev set (for comparison, Bert `bert-base-uncased` version reaches a 82.3 F1 score).
+- `distilbert-base-uncased-distilled-squad`: A finetuned version of `distilbert-base-uncased` finetuned using (a second step of) knowledge distillation on SQuAD 1.0. This model reaches a F1 score of 79.8 on the dev set (for comparison, Bert `bert-base-uncased` version reaches a 82.3 F1 score).
 - `distilbert-base-cased`: DistilBERT English language model pretrained on the same data used to pretrain Bert (concatenation of the Toronto Book Corpus and full English Wikipedia) using distillation with the supervision of the `bert-base-cased` version of Bert. The model has 6 layers, 768 dimension and 12 heads, totalizing 65M parameters.
-- `distilbert-base-cased-distilled-squad`: A finetuned version of `distilbert-base-cased` finetuned using (a second step of) knwoledge distillation on SQuAD 1.0. This model reaches a F1 score of 87.1 on the dev set (for comparison, Bert `bert-base-cased` version reaches a 88.7 F1 score).
+- `distilbert-base-cased-distilled-squad`: A finetuned version of `distilbert-base-cased` finetuned using (a second step of) knowledge distillation on SQuAD 1.0. This model reaches a F1 score of 87.1 on the dev set (for comparison, Bert `bert-base-cased` version reaches a 88.7 F1 score).
 - `distilbert-base-german-cased`: DistilBERT German language model pretrained on 1/2 of the data used to pretrain Bert using distillation with the supervision of the `bert-base-german-dbmdz-cased` version of German DBMDZ Bert. For NER tasks the model reaches a F1 score of 83.49 on the CoNLL-2003 test set (for comparison, `bert-base-german-dbmdz-cased` reaches a 84.52 F1 score), and a F1 score of 85.23 on the GermEval 2014 test set (`bert-base-german-dbmdz-cased` reaches a 86.89 F1 score).
 - `distilgpt2`: DistilGPT2 English language model pretrained with the supervision of `gpt2` (the smallest version of GPT2) on [OpenWebTextCorpus](https://skylion007.github.io/OpenWebTextCorpus/), a reproduction of OpenAI's WebText dataset. The model has 6 layers, 768 dimension and 12 heads, totalizing 82M parameters (compared to 124M parameters for GPT2). On average, DistilGPT2 is two times faster than GPT2.
 - `distilroberta-base`: DistilRoBERTa English language model pretrained with the supervision of `roberta-base` solely on [OpenWebTextCorpus](https://skylion007.github.io/OpenWebTextCorpus/), a reproduction of OpenAI's WebText dataset (it is ~4 times less training data than the teacher RoBERTa). The model has 6 layers, 768 dimension and 12 heads, totalizing 82M parameters (compared to 125M parameters for RoBERTa-base). On average DistilRoBERTa is twice as fast as Roberta-base.
@@ -111,7 +111,7 @@ python scripts/binarized_data.py \
     --dump_file data/binarized_text
 ```
 
-Our implementation of masked language modeling loss follows [XLM](https://github.com/facebookresearch/XLM)'s one and smoothes the probability of masking with a factor that put more emphasis on rare words. Thus we count the occurences of each tokens in the data:
+Our implementation of masked language modeling loss follows [XLM](https://github.com/facebookresearch/XLM)'s one and smooths the probability of masking with a factor that puts more emphasis on rare words. Thus we count the occurrences of each token in the data:
 
 ```bash
 python scripts/token_counts.py \

From f5516805c2d0ea39797d46d9433dab43f769bea1 Mon Sep 17 00:00:00 2001
From: Julien Chaumond <chaumond@gmail.com>
Date: Wed, 26 Feb 2020 20:47:49 +0000
Subject: [PATCH 13/80] Fix bart slow test

---
 tests/test_modeling_bart.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_modeling_bart.py b/tests/test_modeling_bart.py
index ef8932618a..29b459fd8d 100644
--- a/tests/test_modeling_bart.py
+++ b/tests/test_modeling_bart.py
@@ -314,7 +314,7 @@ class BartModelIntegrationTest(unittest.TestCase):
             output = model.forward(**inputs_dict)[0]
         expected_shape = torch.Size((1, 11, 1024))
         self.assertEqual(output.shape, expected_shape)
-        expected_slice = torch.Tensor(
+        expected_slice = torch.tensor(
             [[0.7144, 0.8143, -1.2813], [0.7144, 0.8143, -1.2813], [-0.0467, 2.5911, -2.1845]], device=torch_device
         )
         self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=TOLERANCE))

From b370cc7e99c5b8c7436154d4694c33b461ea0f08 Mon Sep 17 00:00:00 2001
From: Julien Chaumond <chaumond@gmail.com>
Date: Wed, 26 Feb 2020 21:48:49 +0000
Subject: [PATCH 14/80] [gpu] Fixup fdd61b19928e87a5354c36923182e801bfedb31b

---
 tests/test_modeling_gpt2.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tests/test_modeling_gpt2.py b/tests/test_modeling_gpt2.py
index 3a8a9c541a..21fc873234 100644
--- a/tests/test_modeling_gpt2.py
+++ b/tests/test_modeling_gpt2.py
@@ -205,7 +205,7 @@ class GPT2ModelTest(ModelTesterMixin, unittest.TestCase):
             model.eval()
 
             # create attention mask
-            attn_mask = torch.ones(input_ids.shape).long()
+            attn_mask = torch.ones(input_ids.shape, dtype=torch.long, device=torch_device)
             half_seq_length = self.seq_length // 2
             attn_mask[:, half_seq_length:] = 0
 
@@ -222,7 +222,9 @@ class GPT2ModelTest(ModelTesterMixin, unittest.TestCase):
 
             # append to next input_ids and attn_mask
             next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
-            attn_mask = torch.cat([attn_mask, torch.ones((attn_mask.shape[0], 1)).long()], dim=1)
+            attn_mask = torch.cat(
+                [attn_mask, torch.ones((attn_mask.shape[0], 1), dtype=torch.long, device=torch_device)], dim=1
+            )
 
             # get two different outputs
             output_from_no_past, _ = model(next_input_ids, attention_mask=attn_mask)

From 9495d38b0db435e55f05d581e41b273119c2bd68 Mon Sep 17 00:00:00 2001
From: Martin Malmsten <martin.malmsten@kb.se>
Date: Wed, 26 Feb 2020 23:36:39 +0100
Subject: [PATCH 15/80] Changes from reviews.

---
 examples/ner/run_ner.py             | 10 +---------
 src/transformers/modeling_albert.py |  2 +-
 2 files changed, 2 insertions(+), 10 deletions(-)

diff --git a/examples/ner/run_ner.py b/examples/ner/run_ner.py
index 08330dba7f..442fa97109 100644
--- a/examples/ner/run_ner.py
+++ b/examples/ner/run_ner.py
@@ -468,23 +468,15 @@ def main():
     parser.add_argument(
         "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model."
     )
-
     parser.add_argument(
         "--keep_accents", action="store_const", const=True, help="Set this flag if model is trained with accents."
     )
-
     parser.add_argument(
         "--strip_accents", action="store_const", const=True, help="Set this flag if model is trained without accents."
     )
-
     parser.add_argument(
-        "--nouse_fast",
-        action="store_const",
-        dest="use_fast",
-        const=False,
-        help="Set this flag to not use fast tokenization.",
+        "--use_fast", action="store_const", const=True, help="Set this flag to use fast tokenization."
     )
-
     parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.")
     parser.add_argument(
         "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation."
diff --git a/src/transformers/modeling_albert.py b/src/transformers/modeling_albert.py
index dbaa50f565..2a1269b4a9 100644
--- a/src/transformers/modeling_albert.py
+++ b/src/transformers/modeling_albert.py
@@ -600,7 +600,7 @@ class AlbertMLMHead(nn.Module):
         hidden_states = self.LayerNorm(hidden_states)
         hidden_states = self.decoder(hidden_states)
 
-        prediction_scores = hidden_states + self.bias
+        prediction_scores = hidden_states
 
         return prediction_scores
 

From d762d4289cc0c60e484d6c4976cfe17da0c24870 Mon Sep 17 00:00:00 2001
From: Martin Malmsten <martin.malmsten@kb.se>
Date: Wed, 26 Feb 2020 23:50:40 +0100
Subject: [PATCH 16/80] Code now passes style enforcement

---
 examples/ner/run_ner.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/examples/ner/run_ner.py b/examples/ner/run_ner.py
index 442fa97109..73b43aee97 100644
--- a/examples/ner/run_ner.py
+++ b/examples/ner/run_ner.py
@@ -474,9 +474,7 @@ def main():
     parser.add_argument(
         "--strip_accents", action="store_const", const=True, help="Set this flag if model is trained without accents."
     )
-    parser.add_argument(
-        "--use_fast", action="store_const", const=True, help="Set this flag to use fast tokenization."
-    )
+    parser.add_argument("--use_fast", action="store_const", const=True, help="Set this flag to use fast tokenization.")
     parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.")
     parser.add_argument(
         "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation."

From aceb6a0907eff20100f7e136c18aa4b8e1453efa Mon Sep 17 00:00:00 2001
From: Martin Malmsten <martin.malmsten@kb.se>
Date: Thu, 27 Feb 2020 11:52:46 +0100
Subject: [PATCH 17/80] Added test for AlbertForTokenClassification

---
 tests/test_modeling_albert.py | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/tests/test_modeling_albert.py b/tests/test_modeling_albert.py
index 05d7aaefb5..389887c47c 100644
--- a/tests/test_modeling_albert.py
+++ b/tests/test_modeling_albert.py
@@ -207,6 +207,25 @@ class AlbertModelTest(ModelTesterMixin, unittest.TestCase):
             self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.num_labels])
             self.check_loss_output(result)
 
+        def create_and_check_albert_for_token_classification(
+            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+        ):
+            config.num_labels = self.num_labels
+            model = AlbertForTokenClassification(config=config)
+            model.to(torch_device)
+            model.eval()
+            loss, logits = model(
+                input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels
+            )
+            result = {
+                "loss": loss,
+                "logits": logits,
+            }
+            self.parent.assertListEqual(
+                list(result["logits"].size()), [self.batch_size, self.seq_length, self.num_labels]
+            )
+            self.check_loss_output(result)
+
         def prepare_config_and_inputs_for_common(self):
             config_and_inputs = self.prepare_config_and_inputs()
             (

From f71157529e865f0195f226c9a5648d9a3f3e0f41 Mon Sep 17 00:00:00 2001
From: Martin Malmsten <martin.malmsten@kb.se>
Date: Thu, 27 Feb 2020 12:24:20 +0100
Subject: [PATCH 18/80] Added test for AlbertForTokenClassification

---
 tests/test_modeling_albert.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/test_modeling_albert.py b/tests/test_modeling_albert.py
index 389887c47c..b1fe4bb64e 100644
--- a/tests/test_modeling_albert.py
+++ b/tests/test_modeling_albert.py
@@ -29,6 +29,7 @@ if is_torch_available():
         AlbertModel,
         AlbertForMaskedLM,
         AlbertForSequenceClassification,
+        AlbertForTokenClassification,
         AlbertForQuestionAnswering,
     )
     from transformers.modeling_albert import ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP

From f4ff44a6d9a4646ccd88388efeba7c9d673f4290 Mon Sep 17 00:00:00 2001
From: Cola <43774355+Colanim@users.noreply.github.com>
Date: Thu, 27 Feb 2020 23:56:47 +0900
Subject: [PATCH 19/80] Fix batch_encode_plus (#3041)

---
 src/transformers/tokenization_utils.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/transformers/tokenization_utils.py b/src/transformers/tokenization_utils.py
index 901952798e..e2ae20c7da 100644
--- a/src/transformers/tokenization_utils.py
+++ b/src/transformers/tokenization_utils.py
@@ -1126,8 +1126,7 @@ class PreTrainedTokenizer(object):
 
         input_ids = []
         for ids_or_pair_ids in batch_text_or_text_pairs:
-            if isinstance(ids_or_pair_ids, (list, tuple)):
-                assert len(ids_or_pair_ids) == 2
+            if isinstance(ids_or_pair_ids, (list, tuple)) and len(ids_or_pair_ids) == 2:
                 ids, pair_ids = ids_or_pair_ids
             else:
                 ids, pair_ids = ids_or_pair_ids, None

From 6a3758804131196b804ca6abf94c90988763a00f Mon Sep 17 00:00:00 2001
From: Sam Shleifer <sshleifer@gmail.com>
Date: Thu, 27 Feb 2020 10:22:35 -0500
Subject: [PATCH 20/80] spelling: strictly (#3042)

---
 src/transformers/modeling_utils.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py
index c48bcec17d..7c61e7fdc7 100644
--- a/src/transformers/modeling_utils.py
+++ b/src/transformers/modeling_utils.py
@@ -626,7 +626,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin):
                 Number of beams for beam search. Must be between 1 and infinity. 1 means no beam search. Default to 1.
 
             temperature: (`optional`) float
-                The value used to module the next token probabilities. Must be strictely positive. Default to 1.0.
+                The value used to modulate the next token probabilities. Must be strictly positive. Default to 1.0.
 
             top_k: (`optional`) int
                 The number of highest probability vocabulary tokens to keep for top-k-filtering. Between 1 and infinity. Default to 50.
@@ -714,10 +714,10 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin):
         if isinstance(eos_token_ids, int):
             eos_token_ids = [eos_token_ids]
 
-        assert isinstance(max_length, int) and max_length > 0, "`max_length` should be a strictely positive integer."
+        assert isinstance(max_length, int) and max_length > 0, "`max_length` should be a strictly positive integer."
         assert isinstance(do_sample, bool), "`do_sample` should be a boolean."
-        assert isinstance(num_beams, int) and num_beams > 0, "`num_beams` should be a strictely positive integer."
-        assert temperature > 0, "`temperature` should be strictely positive."
+        assert isinstance(num_beams, int) and num_beams > 0, "`num_beams` should be a strictly positive integer."
+        assert temperature > 0, "`temperature` should be strictly positive."
         assert isinstance(top_k, int) and top_k >= 0, "`top_k` should be a positive integer."
         assert 0 <= top_p <= 1, "`top_p` should be between 0 and 1."
         assert repetition_penalty >= 1.0, "`repetition_penalty` should be >= 1."
@@ -730,10 +730,10 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin):
         assert (eos_token_ids is None) or (
             isinstance(eos_token_ids, (list, tuple)) and ((isinstance(e, int) and e >= 0) for e in eos_token_ids)
         ), "`eos_token_ids` should be a positive integer or a list/tuple of positive integers."
-        assert length_penalty > 0, "`length_penalty` should be strictely positive."
+        assert length_penalty > 0, "`length_penalty` should be strictly positive."
         assert (
             isinstance(num_return_sequences, int) and num_return_sequences > 0
-        ), "`num_return_sequences` should be a strictely positive integer."
+        ), "`num_return_sequences` should be a strictly positive integer."
 
         if input_ids is None:
             assert isinstance(bos_token_id, int) and bos_token_id >= 0, (

From 908fa43b543cf52a3238129624f502240725a6a6 Mon Sep 17 00:00:00 2001
From: srush <sasha.rush@gmail.com>
Date: Thu, 27 Feb 2020 16:45:33 -0500
Subject: [PATCH 21/80] Changes to NER examples for PLT and TPU (#3053)

* changes to allow for tpu training

* black

* tpu

* tpu
---
 examples/ner/run_pl.sh           |  16 +++-
 examples/ner/run_pl_ner.py       | 124 ++++++++++++++-----------------
 examples/ner/transformer_base.py |  59 +++++++++------
 3 files changed, 104 insertions(+), 95 deletions(-)

diff --git a/examples/ner/run_pl.sh b/examples/ner/run_pl.sh
index 2cf8a8cfec..8165286dd9 100644
--- a/examples/ner/run_pl.sh
+++ b/examples/ner/run_pl.sh
@@ -1,6 +1,20 @@
-# Require pytorch-lightning=0.6
+# Install the latest pytorch-lightning (ptl) from its git repository.
+pip install -U git+http://github.com/PyTorchLightning/pytorch-lightning/
+
+
+curl -L 'https://sites.google.com/site/germeval2014ner/data/NER-de-train.tsv?attredirects=0&d=1' \
+| grep -v "^#" | cut -f 2,3 | tr '\t' ' ' > train.txt.tmp
+curl -L 'https://sites.google.com/site/germeval2014ner/data/NER-de-dev.tsv?attredirects=0&d=1' \
+| grep -v "^#" | cut -f 2,3 | tr '\t' ' ' > dev.txt.tmp
+curl -L 'https://sites.google.com/site/germeval2014ner/data/NER-de-test.tsv?attredirects=0&d=1' \
+| grep -v "^#" | cut -f 2,3 | tr '\t' ' ' > test.txt.tmp
+ wget "https://raw.githubusercontent.com/stefan-it/fine-tuned-berts-seq/master/scripts/preprocess.py"
 export MAX_LENGTH=128
 export BERT_MODEL=bert-base-multilingual-cased
+python3 preprocess.py train.txt.tmp $BERT_MODEL $MAX_LENGTH > train.txt
+python3 preprocess.py dev.txt.tmp $BERT_MODEL $MAX_LENGTH > dev.txt
+python3 preprocess.py test.txt.tmp $BERT_MODEL $MAX_LENGTH > test.txt
+cat train.txt dev.txt test.txt | cut -d " " -f 2 | grep -v "^$"| sort | uniq > labels.txt
 export OUTPUT_DIR=germeval-model
 export BATCH_SIZE=32
 export NUM_EPOCHS=3
diff --git a/examples/ner/run_pl_ner.py b/examples/ner/run_pl_ner.py
index 0ba4e1248a..1747045e43 100644
--- a/examples/ner/run_pl_ner.py
+++ b/examples/ner/run_pl_ner.py
@@ -7,8 +7,7 @@ import numpy as np
 import torch
 from seqeval.metrics import f1_score, precision_score, recall_score
 from torch.nn import CrossEntropyLoss
-from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
-from torch.utils.data.distributed import DistributedSampler
+from torch.utils.data import DataLoader, TensorDataset
 
 from transformer_base import BaseTransformer, add_generic_args, generic_train
 from utils_ner import convert_examples_to_features, get_labels, read_examples_from_file
@@ -25,13 +24,14 @@ class NERTransformer(BaseTransformer):
     def __init__(self, hparams):
         self.labels = get_labels(hparams.labels)
         num_labels = len(self.labels)
+        self.pad_token_label_id = CrossEntropyLoss().ignore_index
         super(NERTransformer, self).__init__(hparams, num_labels)
 
     def forward(self, **inputs):
         return self.model(**inputs)
 
     def training_step(self, batch, batch_num):
-        "Compute loss"
+        "Compute loss and log."
         inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
         if self.hparams.model_type != "distilbert":
             inputs["token_type_ids"] = (
@@ -40,25 +40,61 @@ class NERTransformer(BaseTransformer):
 
         outputs = self.forward(**inputs)
         loss = outputs[0]
-
         tensorboard_logs = {"loss": loss, "rate": self.lr_scheduler.get_last_lr()[-1]}
         return {"loss": loss, "log": tensorboard_logs}
 
+    def _feature_file(self, mode):
+        return os.path.join(
+            self.hparams.data_dir,
+            "cached_{}_{}_{}".format(
+                mode,
+                list(filter(None, self.hparams.model_name_or_path.split("/"))).pop(),
+                str(self.hparams.max_seq_length),
+            ),
+        )
+
+    def prepare_data(self):
+        "Called to initialize data. Use the call to construct features"
+        args = self.hparams
+        for mode in ["train", "dev", "test"]:
+            cached_features_file = self._feature_file(mode)
+            if not os.path.exists(cached_features_file):
+                logger.info("Creating features from dataset file at %s", args.data_dir)
+                examples = read_examples_from_file(args.data_dir, mode)
+                features = convert_examples_to_features(
+                    examples,
+                    self.labels,
+                    args.max_seq_length,
+                    self.tokenizer,
+                    cls_token_at_end=bool(args.model_type in ["xlnet"]),
+                    cls_token=self.tokenizer.cls_token,
+                    cls_token_segment_id=2 if args.model_type in ["xlnet"] else 0,
+                    sep_token=self.tokenizer.sep_token,
+                    sep_token_extra=bool(args.model_type in ["roberta"]),
+                    pad_on_left=bool(args.model_type in ["xlnet"]),
+                    pad_token=self.tokenizer.convert_tokens_to_ids([self.tokenizer.pad_token])[0],
+                    pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0,
+                    pad_token_label_id=self.pad_token_label_id,
+                )
+                logger.info("Saving features into cached file %s", cached_features_file)
+                torch.save(features, cached_features_file)
+
     def load_dataset(self, mode, batch_size):
-        labels = get_labels(self.hparams.labels)
-        self.pad_token_label_id = CrossEntropyLoss().ignore_index
-        dataset = self.load_and_cache_examples(labels, self.pad_token_label_id, mode)
-        if mode == "train":
-            if self.hparams.n_gpu > 1:
-                sampler = DistributedSampler(dataset)
-            else:
-                sampler = RandomSampler(dataset)
-        else:
-            sampler = SequentialSampler(dataset)
-        dataloader = DataLoader(dataset, sampler=sampler, batch_size=batch_size)
-        return dataloader
+        "Load datasets. Called after prepare data."
+        cached_features_file = self._feature_file(mode)
+        logger.info("Loading features from cached file %s", cached_features_file)
+        features = torch.load(cached_features_file)
+        all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
+        all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
+        all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
+        all_label_ids = torch.tensor([f.label_ids for f in features], dtype=torch.long)
+        return DataLoader(
+            TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids), batch_size=batch_size
+        )
 
     def validation_step(self, batch, batch_nb):
+        "Compute validation"
+
         inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
         if self.hparams.model_type != "distilbert":
             inputs["token_type_ids"] = (
@@ -68,11 +104,10 @@ class NERTransformer(BaseTransformer):
         tmp_eval_loss, logits = outputs[:2]
         preds = logits.detach().cpu().numpy()
         out_label_ids = inputs["labels"].detach().cpu().numpy()
-
-        return {"val_loss": tmp_eval_loss, "pred": preds, "target": out_label_ids}
+        return {"val_loss": tmp_eval_loss.detach().cpu(), "pred": preds, "target": out_label_ids}
 
     def _eval_end(self, outputs):
-        "Task specific validation"
+        "Evaluation called for both Val and Test"
         val_loss_mean = torch.stack([x["val_loss"] for x in outputs]).mean()
         preds = np.concatenate([x["pred"] for x in outputs], axis=0)
         preds = np.argmax(preds, axis=2)
@@ -96,7 +131,6 @@ class NERTransformer(BaseTransformer):
         }
 
         if self.is_logger():
-            logger.info(self.proc_rank)
             logger.info("***** Eval results *****")
             for key in sorted(results.keys()):
                 logger.info("  %s = %s", key, str(results[key]))
@@ -140,56 +174,6 @@ class NERTransformer(BaseTransformer):
                             )
         return ret
 
-    def load_and_cache_examples(self, labels, pad_token_label_id, mode):
-        args = self.hparams
-        tokenizer = self.tokenizer
-        if self.proc_rank not in [-1, 0] and mode == "train":
-            torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache
-
-        # Load data features from cache or dataset file
-        cached_features_file = os.path.join(
-            args.data_dir,
-            "cached_{}_{}_{}".format(
-                mode, list(filter(None, args.model_name_or_path.split("/"))).pop(), str(args.max_seq_length)
-            ),
-        )
-        if os.path.exists(cached_features_file) and not args.overwrite_cache:
-            logger.info("Loading features from cached file %s", cached_features_file)
-            features = torch.load(cached_features_file)
-        else:
-            logger.info("Creating features from dataset file at %s", args.data_dir)
-            examples = read_examples_from_file(args.data_dir, mode)
-            features = convert_examples_to_features(
-                examples,
-                labels,
-                args.max_seq_length,
-                tokenizer,
-                cls_token_at_end=bool(args.model_type in ["xlnet"]),
-                cls_token=tokenizer.cls_token,
-                cls_token_segment_id=2 if args.model_type in ["xlnet"] else 0,
-                sep_token=tokenizer.sep_token,
-                sep_token_extra=bool(args.model_type in ["roberta"]),
-                pad_on_left=bool(args.model_type in ["xlnet"]),
-                pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
-                pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0,
-                pad_token_label_id=pad_token_label_id,
-            )
-            if self.proc_rank in [-1, 0]:
-                logger.info("Saving features into cached file %s", cached_features_file)
-                torch.save(features, cached_features_file)
-
-        if self.proc_rank == 0 and mode == "train":
-            torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache
-
-        # Convert to Tensors and build dataset
-        all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
-        all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
-        all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
-        all_label_ids = torch.tensor([f.label_ids for f in features], dtype=torch.long)
-
-        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
-        return dataset
-
     @staticmethod
     def add_model_specific_args(parser, root_dir):
         # Add NER specific options
diff --git a/examples/ner/transformer_base.py b/examples/ner/transformer_base.py
index fd119821fa..132711e2ff 100644
--- a/examples/ner/transformer_base.py
+++ b/examples/ner/transformer_base.py
@@ -1,3 +1,4 @@
+import logging
 import os
 import random
 
@@ -26,6 +27,9 @@ from transformers import (
 )
 
 
+logger = logging.getLogger(__name__)
+
+
 ALL_MODELS = sum(
     (
         tuple(conf.pretrained_config_archive_map.keys())
@@ -77,20 +81,14 @@ class BaseTransformer(pl.LightningModule):
             cache_dir=self.hparams.cache_dir if self.hparams.cache_dir else None,
         )
         self.config, self.tokenizer, self.model = config, tokenizer, model
-        self.proc_rank = -1
 
     def is_logger(self):
-        return self.proc_rank <= 0
+        return self.trainer.proc_rank <= 0
 
     def configure_optimizers(self):
         "Prepare optimizer and schedule (linear warmup and decay)"
-        model = self.model
 
-        t_total = (
-            len(self.train_dataloader())
-            // self.hparams.gradient_accumulation_steps
-            * float(self.hparams.num_train_epochs)
-        )
+        model = self.model
         no_decay = ["bias", "LayerNorm.weight"]
         optimizer_grouped_parameters = [
             {
@@ -103,18 +101,16 @@ class BaseTransformer(pl.LightningModule):
             },
         ]
         optimizer = AdamW(optimizer_grouped_parameters, lr=self.hparams.learning_rate, eps=self.hparams.adam_epsilon)
-        scheduler = get_linear_schedule_with_warmup(
-            optimizer, num_warmup_steps=self.hparams.warmup_steps, num_training_steps=t_total
-        )
-        self.lr_scheduler = scheduler
+        self.opt = optimizer
         return [optimizer]
 
     def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, second_order_closure=None):
-
-        # Step each time.
-        optimizer.step()
-        self.lr_scheduler.step()
+        if self.trainer.use_tpu:
+            xm.optimizer_step(optimizer)
+        else:
+            optimizer.step()
         optimizer.zero_grad()
+        self.lr_scheduler.step()
 
     def get_tqdm_dict(self):
         tqdm_dict = {"loss": "{:.3f}".format(self.trainer.avg_loss), "lr": self.lr_scheduler.get_last_lr()[-1]}
@@ -127,22 +123,27 @@ class BaseTransformer(pl.LightningModule):
     def test_end(self, outputs):
         return self.validation_end(outputs)
 
-    @pl.data_loader
     def train_dataloader(self):
-        return self.load_dataset("train", self.hparams.train_batch_size)
+        train_batch_size = self.hparams.train_batch_size
+        dataloader = self.load_dataset("train", train_batch_size)
+
+        t_total = (
+            (len(dataloader.dataset) // (train_batch_size * max(1, self.hparams.n_gpu)))
+            // self.hparams.gradient_accumulation_steps
+            * float(self.hparams.num_train_epochs)
+        )
+        scheduler = get_linear_schedule_with_warmup(
+            self.opt, num_warmup_steps=self.hparams.warmup_steps, num_training_steps=t_total
+        )
+        self.lr_scheduler = scheduler
+        return dataloader
 
-    @pl.data_loader
     def val_dataloader(self):
         return self.load_dataset("dev", self.hparams.eval_batch_size)
 
-    @pl.data_loader
     def test_dataloader(self):
         return self.load_dataset("test", self.hparams.eval_batch_size)
 
-    def init_ddp_connection(self, proc_rank, world_size):
-        self.proc_rank = proc_rank
-        super(BaseTransformer, self).init_ddp_connection(proc_rank, world_size)
-
     @staticmethod
     def add_model_specific_args(parser, root_dir):
         parser.add_argument(
@@ -213,6 +214,7 @@ def add_generic_args(parser, root_dir):
     )
 
     parser.add_argument("--n_gpu", type=int, default=1)
+    parser.add_argument("--n_tpu_cores", type=int, default=0)
     parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
     parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
     parser.add_argument("--do_predict", action="store_true", help="Whether to run predictions on the test set.")
@@ -252,13 +254,22 @@ def generic_train(model, args):
         accumulate_grad_batches=args.gradient_accumulation_steps,
         gpus=args.n_gpu,
         max_epochs=args.num_train_epochs,
+        early_stop_callback=False,
         gradient_clip_val=args.max_grad_norm,
         checkpoint_callback=checkpoint_callback,
     )
+
     if args.fp16:
         train_params["use_amp"] = args.fp16
         train_params["amp_level"] = args.fp16_opt_level
 
+    if args.n_tpu_cores > 0:
+        global xm
+        import torch_xla.core.xla_model as xm
+
+        train_params["num_tpu_cores"] = args.n_tpu_cores
+        train_params["gpus"] = 0
+
     if args.n_gpu > 1:
         train_params["distributed_backend"] = "ddp"
 

From e36bd94345af6045108a391f9ac7f4dc557548de Mon Sep 17 00:00:00 2001
From: Julien Chaumond <chaumond@gmail.com>
Date: Fri, 28 Feb 2020 21:11:08 -0500
Subject: [PATCH 22/80] [ci] Run all tests on (self-hosted) GPU (#3020)

* Create self-hosted.yml

* Update self-hosted.yml

* Update self-hosted.yml

* Update self-hosted.yml

* Update self-hosted.yml

* Update self-hosted.yml

* do not run slow tests, for now

* [ci] For comparison with circleci, let's also run CPU-tests

* [ci] reorganize

* clearer filenames

* [ci] Final tweaks before merging

* rm slow tests on circle ci

* Trigger CI

* On GPU this concurrency was way too high
---
 .circleci/config.yml                 | 16 ---------
 .github/workflows/github-push.yml    | 19 +++++++++++
 .github/workflows/self-push.yml      | 47 ++++++++++++++++++++++++++
 .github/workflows/self-scheduled.yml | 50 ++++++++++++++++++++++++++++
 4 files changed, 116 insertions(+), 16 deletions(-)
 create mode 100644 .github/workflows/github-push.yml
 create mode 100644 .github/workflows/self-push.yml
 create mode 100644 .github/workflows/self-scheduled.yml

diff --git a/.circleci/config.yml b/.circleci/config.yml
index c80430ea68..2ee83e6701 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -14,22 +14,6 @@ jobs:
             - run: sudo pip install codecov pytest-cov
             - run: python -m pytest -n 8 --dist=loadfile -s -v ./tests/ --cov
             - run: codecov
-    run_all_tests_torch_and_tf:
-        working_directory: ~/transformers
-        docker:
-            - image: circleci/python:3.5
-        environment:
-            OMP_NUM_THREADS: 1
-            RUN_SLOW: yes
-            RUN_CUSTOM_TOKENIZERS: yes
-        resource_class: xlarge
-        parallelism: 1
-        steps:
-            - checkout
-            - run: sudo pip install .[mecab,sklearn,tf-cpu,torch,testing]
-            - run:
-                command: python -m pytest -n 8 --dist=loadfile -s -v ./tests/
-                no_output_timeout: 4h
 
     run_tests_torch:
         working_directory: ~/transformers
diff --git a/.github/workflows/github-push.yml b/.github/workflows/github-push.yml
new file mode 100644
index 0000000000..59d3dc5158
--- /dev/null
+++ b/.github/workflows/github-push.yml
@@ -0,0 +1,19 @@
+name: GitHub-hosted runner
+
+on: push
+
+jobs:
+  check_code_quality:
+    runs-on: ubuntu-18.04
+    steps:
+    - uses: actions/checkout@v2
+    - name: Set up Python
+      uses: actions/setup-python@v1
+      with:
+        python-version: 3.7
+    - name: Install dependencies
+      run: |
+        pip install .[tf,torch,quality]
+
+
+
diff --git a/.github/workflows/self-push.yml b/.github/workflows/self-push.yml
new file mode 100644
index 0000000000..cfbe999699
--- /dev/null
+++ b/.github/workflows/self-push.yml
@@ -0,0 +1,47 @@
+name: Self-hosted runner (push)
+
+on: 
+  push:
+    branches:
+      - master
+  pull_request:
+
+
+jobs:
+  run_tests_torch_and_tf_gpu:
+    runs-on: self-hosted
+    steps:
+    - uses: actions/checkout@v2
+    - name: Python version
+      run: |
+        which python
+        python --version
+        pip --version
+    - name: Current dir
+      run: pwd
+    - run: nvidia-smi
+    - name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
+      run: |
+        python -m venv .env
+        source .env/bin/activate
+        which python
+        python --version
+        pip --version
+    - name: Install dependencies
+      run: |
+        source .env/bin/activate
+        pip install .[sklearn,tf,torch,testing]
+
+    - name: Are GPUs recognized by our DL frameworks
+      run: |
+        source .env/bin/activate
+        python -c "import torch; print(torch.cuda.is_available())"
+        python -c "import tensorflow as tf; print(tf.test.is_built_with_cuda(), tf.config.list_physical_devices('GPU'))"
+
+    - name: Run all non-slow tests on GPU
+      env:
+        OMP_NUM_THREADS: 1
+        USE_CUDA: yes
+      run: |
+        source .env/bin/activate
+        python -m pytest -n 2 --dist=loadfile -s -v ./tests/
diff --git a/.github/workflows/self-scheduled.yml b/.github/workflows/self-scheduled.yml
new file mode 100644
index 0000000000..7c33d5dfcb
--- /dev/null
+++ b/.github/workflows/self-scheduled.yml
@@ -0,0 +1,50 @@
+name: Self-hosted runner (scheduled)
+
+on:
+  push:
+    branches:
+      - ci_*
+  repository_dispatch:
+  schedule:
+    - cron: "0 0 * * *"
+
+jobs:
+  run_all_tests_torch_and_tf_gpu:
+    runs-on: self-hosted
+    steps:
+    - uses: actions/checkout@v2
+    - name: Python version
+      run: |
+        which python
+        python --version
+        pip --version
+    - name: Current dir
+      run: pwd
+    - run: nvidia-smi
+    - name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
+      run: |
+        python -m venv .env
+        source .env/bin/activate
+        which python
+        python --version
+        pip --version
+    - name: Install dependencies
+      run: |
+        source .env/bin/activate
+        pip install .[sklearn,tf,torch,testing]
+
+    - name: Are GPUs recognized by our DL frameworks
+      run: |
+        source .env/bin/activate
+        python -c "import torch; print(torch.cuda.is_available())"
+        python -c "import tensorflow as tf; print(tf.test.is_built_with_cuda(), tf.config.list_physical_devices('GPU'))"
+
+    - name: Run all tests on GPU
+      env:
+        OMP_NUM_THREADS: 1
+        RUN_SLOW: yes
+        USE_CUDA: yes
+      run: |
+        source .env/bin/activate
+        python -m pytest -n 1 --dist=loadfile -s -v ./tests/
+        

From d6ef587a10e0d8836376a2314d8aeae36ad63263 Mon Sep 17 00:00:00 2001
From: Julien Chaumond <chaumond@gmail.com>
Date: Fri, 28 Feb 2020 23:19:17 -0500
Subject: [PATCH 23/80] [ci] Fixup e36bd94345af6045108a391f9ac7f4dc557548de

---
 .circleci/config.yml | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index 2ee83e6701..ff7c021b6f 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -118,13 +118,3 @@ workflows:
             - run_tests_torch
             - run_tests_tf
             - deploy_doc: *workflow_filters
-    run_slow_tests:
-        triggers:
-            - schedule:
-                cron: "0 4 * * *"
-                filters:
-                    branches:
-                        only:
-                            - master
-        jobs:
-            - run_all_tests_torch_and_tf

From b5509abb3682608cd564655e593e23d432f2c871 Mon Sep 17 00:00:00 2001
From: VictorSanh <victorsanh@gmail.com>
Date: Sun, 1 Mar 2020 01:39:24 +0000
Subject: [PATCH 24/80] --do_lower_case will always trick me...

---
 examples/distillation/README.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/examples/distillation/README.md b/examples/distillation/README.md
index 930edf94fd..0df41ff63e 100644
--- a/examples/distillation/README.md
+++ b/examples/distillation/README.md
@@ -17,7 +17,7 @@ This folder contains the original code used to train Distil* as well as examples
 
 ## What is Distil*
 
-Distil* is a class of compressed models that started with DistilBERT. DistilBERT stands for Distillated-BERT. DistilBERT is a small, fast, cheap and light Transformer model based on Bert architecture. It has 40% less parameters than `bert-base-uncased`, runs 60% faster while preserving 99% of BERT's performances as measured on the GLUE language understanding benchmark. DistilBERT is trained using knowledge distillation, a technique to compress a large model called the teacher into a smaller model called the student. By distillating Bert, we obtain a smaller Transformer model that bears a lot of similarities with the original BERT model while being lighter, smaller and faster to run. DistilBERT is thus an interesting option to put large-scaled trained Transformer model into production.
+Distil* is a class of compressed models that started with DistilBERT. DistilBERT stands for Distillated-BERT. DistilBERT is a small, fast, cheap and light Transformer model based on Bert architecture. It has 40% fewer parameters than `bert-base-uncased`, runs 60% faster while preserving 97% of BERT's performance as measured on the GLUE language understanding benchmark. DistilBERT is trained using knowledge distillation, a technique to compress a large model called the teacher into a smaller model called the student. By distilling Bert, we obtain a smaller Transformer model that bears a lot of similarities with the original BERT model while being lighter, smaller and faster to run. DistilBERT is thus an interesting option to put large-scale trained Transformer models into production.
 
 We have applied the same method to other Transformer architectures and released the weights:
 - GPT2: on the [WikiText-103](https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/) benchmark, GPT2 reaches a perplexity on the test set of 16.3 compared to 21.1 for **DistilGPT2** (after fine-tuning on the train set).
@@ -31,8 +31,8 @@ Here are the results on the dev sets of GLUE:
 
 | Model                     | Macro-score                    | CoLA | MNLI | MRPC | QNLI | QQP  | RTE  | SST-2| STS-B| WNLI              |
 | :---:                     |    :---:                       | :---:| :---:| :---:| :---:| :---:| :---:| :---:| :---:| :---:             |
-| BERT-base-uncased         |  **74.9**                      | 49.2 | 80.8 | 87.4 | 87.5 | 86.4 | 61.7 | 92.0 | 83.8 | 45.1              |
-| DistilBERT-base-uncased   |  **74.3**                      | 43.6 | 79.0 | 87.5 | 85.3 | 84.9 | 59.9 | 90.7 | 81.2 | 56.3              |
+| BERT-base-uncased         |  **79.5**                      | 56.3 | 84.7 | 88.6 | 91.8 | 89.6 | 69.3 | 92.7 | 89.0 | 53.5              |
+| DistilBERT-base-uncased   |  **77.0**                      | 51.3 | 82.1 | 87.5 | 89.2 | 88.5 | 59.9 | 91.3 | 86.9 | 56.3              |
 | BERT-base-cased           |  **78.2**                      | 58.2 | 83.9 | 87.8 | 91.0 | 89.2 | 66.1 | 91.7 | 89.2 | 46.5              |
 | DistilBERT-base-cased     |  **75.9**                      | 47.2 | 81.5 | 85.6 | 88.2 | 87.8 | 60.6 | 90.4 | 85.5 | 56.3              |
 | ---                       |    ---                         |  --- |  --- |  --- |  --- |  --- |  --- |  --- |  --- |  ---              |
@@ -65,7 +65,7 @@ This part of the library has only be tested with Python3.6+. There are few speci
 Transformers includes five pre-trained Distil* models, currently only provided for English and German (we are investigating the possibility to train and release a multilingual version of DistilBERT):
 
 - `distilbert-base-uncased`: DistilBERT English language model pretrained on the same data used to pretrain Bert (concatenation of the Toronto Book Corpus and full English Wikipedia) using distillation with the supervision of the `bert-base-uncased` version of Bert. The model has 6 layers, 768 dimension and 12 heads, totalizing 66M parameters.
-- `distilbert-base-uncased-distilled-squad`: A finetuned version of `distilbert-base-uncased` finetuned using (a second step of) knowledge distillation on SQuAD 1.0. This model reaches a F1 score of 79.8 on the dev set (for comparison, Bert `bert-base-uncased` version reaches a 82.3 F1 score).
+- `distilbert-base-uncased-distilled-squad`: A finetuned version of `distilbert-base-uncased` finetuned using (a second step of) knowledge distillation on SQuAD 1.0. This model reaches a F1 score of 86.9 on the dev set (for comparison, Bert `bert-base-uncased` version reaches a 88.5 F1 score).
 - `distilbert-base-cased`: DistilBERT English language model pretrained on the same data used to pretrain Bert (concatenation of the Toronto Book Corpus and full English Wikipedia) using distillation with the supervision of the `bert-base-cased` version of Bert. The model has 6 layers, 768 dimension and 12 heads, totalizing 65M parameters.
 - `distilbert-base-cased-distilled-squad`: A finetuned version of `distilbert-base-cased` finetuned using (a second step of) knowledge distillation on SQuAD 1.0. This model reaches a F1 score of 87.1 on the dev set (for comparison, Bert `bert-base-cased` version reaches a 88.7 F1 score).
 - `distilbert-base-german-cased`: DistilBERT German language model pretrained on 1/2 of the data used to pretrain Bert using distillation with the supervision of the `bert-base-german-dbmdz-cased` version of German DBMDZ Bert. For NER tasks the model reaches a F1 score of 83.49 on the CoNLL-2003 test set (for comparison, `bert-base-german-dbmdz-cased` reaches a 84.52 F1 score), and a F1 score of 85.23 on the GermEval 2014 test set (`bert-base-german-dbmdz-cased` reaches a 86.89 F1 score).

From 852e032ca6505f8ddd9881a7ed67ea0dd9fc7603 Mon Sep 17 00:00:00 2001
From: VictorSanh <victorsanh@gmail.com>
Date: Sun, 1 Mar 2020 01:56:50 +0000
Subject: [PATCH 25/80] include roberta in run_squad_w_distillation - cc
 @graviraja

---
 examples/distillation/run_squad_w_distillation.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/examples/distillation/run_squad_w_distillation.py b/examples/distillation/run_squad_w_distillation.py
index 4900f19ead..3bbfaf482d 100644
--- a/examples/distillation/run_squad_w_distillation.py
+++ b/examples/distillation/run_squad_w_distillation.py
@@ -45,6 +45,9 @@ from transformers import (
     XLNetConfig,
     XLNetForQuestionAnswering,
     XLNetTokenizer,
+    RobertaConfig,
+    RobertaForQuestionAnswering,
+    RobertaTokenizer,
     get_linear_schedule_with_warmup,
     squad_convert_examples_to_features,
 )
@@ -73,6 +76,7 @@ MODEL_CLASSES = {
     "xlnet": (XLNetConfig, XLNetForQuestionAnswering, XLNetTokenizer),
     "xlm": (XLMConfig, XLMForQuestionAnswering, XLMTokenizer),
     "distilbert": (DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer),
+    "roberta": (RobertaConfig, RobertaForQuestionAnswering, RobertaTokenizer),
 }
 
 

From 298bed16a841fae3608d334441ccae4d9043611f Mon Sep 17 00:00:00 2001
From: Julien Chaumond <chaumond@gmail.com>
Date: Sun, 1 Mar 2020 14:08:01 -0500
Subject: [PATCH 26/80] make style

---
 examples/distillation/run_squad_w_distillation.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/examples/distillation/run_squad_w_distillation.py b/examples/distillation/run_squad_w_distillation.py
index 3bbfaf482d..c32af61bcb 100644
--- a/examples/distillation/run_squad_w_distillation.py
+++ b/examples/distillation/run_squad_w_distillation.py
@@ -39,15 +39,15 @@ from transformers import (
     DistilBertConfig,
     DistilBertForQuestionAnswering,
     DistilBertTokenizer,
+    RobertaConfig,
+    RobertaForQuestionAnswering,
+    RobertaTokenizer,
     XLMConfig,
     XLMForQuestionAnswering,
     XLMTokenizer,
     XLNetConfig,
     XLNetForQuestionAnswering,
     XLNetTokenizer,
-    RobertaConfig,
-    RobertaForQuestionAnswering,
-    RobertaTokenizer,
     get_linear_schedule_with_warmup,
     squad_convert_examples_to_features,
 )

From 6b1ff250842f52136d5159bb67a26b50ba01485d Mon Sep 17 00:00:00 2001
From: Victor SANH <victorsanh@gmail.com>
Date: Mon, 2 Mar 2020 10:20:21 -0500
Subject: [PATCH 27/80] fix n_gpu count when no_cuda flag is activated (#3077)

* fix n_gpu count when no_cuda flag is activated

* someone was left behind
---
 examples/contrib/run_swag.py                      | 2 +-
 examples/distillation/run_squad_w_distillation.py | 2 +-
 examples/hans/test_hans.py                        | 2 +-
 examples/mm-imdb/run_mmimdb.py                    | 2 +-
 examples/ner/run_ner.py                           | 2 +-
 examples/run_bertology.py                         | 2 +-
 examples/run_generation.py                        | 2 +-
 examples/run_glue.py                              | 2 +-
 examples/run_language_modeling.py                 | 2 +-
 examples/run_multiple_choice.py                   | 2 +-
 examples/run_squad.py                             | 2 +-
 examples/run_xnli.py                              | 2 +-
 templates/adding_a_new_example_script/run_xxx.py  | 2 +-
 13 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/examples/contrib/run_swag.py b/examples/contrib/run_swag.py
index 497ddeca9d..96a16d8df5 100644
--- a/examples/contrib/run_swag.py
+++ b/examples/contrib/run_swag.py
@@ -622,7 +622,7 @@ def main():
     # Setup CUDA, GPU & distributed training
     if args.local_rank == -1 or args.no_cuda:
         device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
-        args.n_gpu = torch.cuda.device_count()
+        args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
     else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
         torch.cuda.set_device(args.local_rank)
         device = torch.device("cuda", args.local_rank)
diff --git a/examples/distillation/run_squad_w_distillation.py b/examples/distillation/run_squad_w_distillation.py
index c32af61bcb..3fabe23ba2 100644
--- a/examples/distillation/run_squad_w_distillation.py
+++ b/examples/distillation/run_squad_w_distillation.py
@@ -720,7 +720,7 @@ def main():
     # Setup CUDA, GPU & distributed training
     if args.local_rank == -1 or args.no_cuda:
         device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
-        args.n_gpu = torch.cuda.device_count()
+        args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
     else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
         torch.cuda.set_device(args.local_rank)
         device = torch.device("cuda", args.local_rank)
diff --git a/examples/hans/test_hans.py b/examples/hans/test_hans.py
index 40c2a1bd3a..a5d4e76149 100644
--- a/examples/hans/test_hans.py
+++ b/examples/hans/test_hans.py
@@ -520,7 +520,7 @@ def main():
     # Setup CUDA, GPU & distributed training
     if args.local_rank == -1 or args.no_cuda:
         device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
-        args.n_gpu = torch.cuda.device_count()
+        args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
     else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
         torch.cuda.set_device(args.local_rank)
         device = torch.device("cuda", args.local_rank)
diff --git a/examples/mm-imdb/run_mmimdb.py b/examples/mm-imdb/run_mmimdb.py
index c7e9f7b47e..ab60b40951 100644
--- a/examples/mm-imdb/run_mmimdb.py
+++ b/examples/mm-imdb/run_mmimdb.py
@@ -492,7 +492,7 @@ def main():
     # Setup CUDA, GPU & distributed training
     if args.local_rank == -1 or args.no_cuda:
         device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
-        args.n_gpu = torch.cuda.device_count()
+        args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
     else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
         torch.cuda.set_device(args.local_rank)
         device = torch.device("cuda", args.local_rank)
diff --git a/examples/ner/run_ner.py b/examples/ner/run_ner.py
index 73b43aee97..c32b3af226 100644
--- a/examples/ner/run_ner.py
+++ b/examples/ner/run_ner.py
@@ -557,7 +557,7 @@ def main():
     # Setup CUDA, GPU & distributed training
     if args.local_rank == -1 or args.no_cuda:
         device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
-        args.n_gpu = torch.cuda.device_count()
+        args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
     else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
         torch.cuda.set_device(args.local_rank)
         device = torch.device("cuda", args.local_rank)
diff --git a/examples/run_bertology.py b/examples/run_bertology.py
index acac56128a..d18b8bc3a2 100644
--- a/examples/run_bertology.py
+++ b/examples/run_bertology.py
@@ -338,7 +338,7 @@ def main():
     # Setup devices and distributed training
     if args.local_rank == -1 or args.no_cuda:
         args.device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
-        args.n_gpu = torch.cuda.device_count()
+        args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
     else:
         torch.cuda.set_device(args.local_rank)
         args.device = torch.device("cuda", args.local_rank)
diff --git a/examples/run_generation.py b/examples/run_generation.py
index 0652567b6b..3f90ee5833 100644
--- a/examples/run_generation.py
+++ b/examples/run_generation.py
@@ -189,7 +189,7 @@ def main():
     args = parser.parse_args()
 
     args.device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
-    args.n_gpu = torch.cuda.device_count()
+    args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
 
     set_seed(args)
 
diff --git a/examples/run_glue.py b/examples/run_glue.py
index f3c31b0c06..f5bbde9031 100644
--- a/examples/run_glue.py
+++ b/examples/run_glue.py
@@ -575,7 +575,7 @@ def main():
     # Setup CUDA, GPU & distributed training
     if args.local_rank == -1 or args.no_cuda:
         device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
-        args.n_gpu = torch.cuda.device_count()
+        args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
     else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
         torch.cuda.set_device(args.local_rank)
         device = torch.device("cuda", args.local_rank)
diff --git a/examples/run_language_modeling.py b/examples/run_language_modeling.py
index e0a8929927..c66cc8978f 100644
--- a/examples/run_language_modeling.py
+++ b/examples/run_language_modeling.py
@@ -663,7 +663,7 @@ def main():
     # Setup CUDA, GPU & distributed training
     if args.local_rank == -1 or args.no_cuda:
         device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
-        args.n_gpu = torch.cuda.device_count()
+        args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
     else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
         torch.cuda.set_device(args.local_rank)
         device = torch.device("cuda", args.local_rank)
diff --git a/examples/run_multiple_choice.py b/examples/run_multiple_choice.py
index 72337c110f..c4f90bbad7 100644
--- a/examples/run_multiple_choice.py
+++ b/examples/run_multiple_choice.py
@@ -535,7 +535,7 @@ def main():
     # Setup CUDA, GPU & distributed training
     if args.local_rank == -1 or args.no_cuda:
         device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
-        args.n_gpu = torch.cuda.device_count()
+        args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
     else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
         torch.cuda.set_device(args.local_rank)
         device = torch.device("cuda", args.local_rank)
diff --git a/examples/run_squad.py b/examples/run_squad.py
index f94fb22098..523093e1bb 100644
--- a/examples/run_squad.py
+++ b/examples/run_squad.py
@@ -725,7 +725,7 @@ def main():
     # Setup CUDA, GPU & distributed training
     if args.local_rank == -1 or args.no_cuda:
         device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
-        args.n_gpu = torch.cuda.device_count()
+        args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
     else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
         torch.cuda.set_device(args.local_rank)
         device = torch.device("cuda", args.local_rank)
diff --git a/examples/run_xnli.py b/examples/run_xnli.py
index 0b9e559f1b..9dcae8568f 100644
--- a/examples/run_xnli.py
+++ b/examples/run_xnli.py
@@ -530,7 +530,7 @@ def main():
     # Setup CUDA, GPU & distributed training
     if args.local_rank == -1 or args.no_cuda:
         device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
-        args.n_gpu = torch.cuda.device_count()
+        args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
     else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
         torch.cuda.set_device(args.local_rank)
         device = torch.device("cuda", args.local_rank)
diff --git a/templates/adding_a_new_example_script/run_xxx.py b/templates/adding_a_new_example_script/run_xxx.py
index 6de065ce65..20f4b7360b 100644
--- a/templates/adding_a_new_example_script/run_xxx.py
+++ b/templates/adding_a_new_example_script/run_xxx.py
@@ -594,7 +594,7 @@ def main():
     # Setup CUDA, GPU & distributed training
     if args.local_rank == -1 or args.no_cuda:
         device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
-        args.n_gpu = torch.cuda.device_count()
+        args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
     else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
         torch.cuda.set_device(args.local_rank)
         device = torch.device("cuda", args.local_rank)

From b54ef78d0c30045bb3f9ecc8b178eab0dfdbeaec Mon Sep 17 00:00:00 2001
From: Sam Shleifer <sshleifer@gmail.com>
Date: Mon, 2 Mar 2020 10:35:53 -0500
Subject: [PATCH 28/80] Bart-CNN (#3059)

`generate` code that produces 99% identical summarizations to fairseq on CNN test data, with caching.
---
 docs/source/model_doc/bart.rst                |  23 +-
 docs/source/pretrained_models.rst             |   5 +-
 src/transformers/configuration_bart.py        |  13 +-
 ..._original_pytorch_checkpoint_to_pytorch.py |  56 ++-
 src/transformers/modeling_bart.py             | 450 +++++++++++++++---
 src/transformers/modeling_utils.py            |  81 ++--
 src/transformers/tokenization_bart.py         |   6 +-
 tests/test_modeling_bart.py                   |  62 ++-
 8 files changed, 544 insertions(+), 152 deletions(-)

diff --git a/docs/source/model_doc/bart.rst b/docs/source/model_doc/bart.rst
index a034f3b57a..c618b97df6 100644
--- a/docs/source/model_doc/bart.rst
+++ b/docs/source/model_doc/bart.rst
@@ -4,20 +4,27 @@ Bart
 file a `Github Issue <https://github.com/huggingface/transformers/issues/new?assignees=&labels=&template=bug-report.md&title>`__ and assign
 @sshleifer
 
-The Bart model was `proposed <https://arxiv.org/abs/1910.13461>`_ by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov, Luke Zettlemoyer on 29 Oct, 2019.
-It is a sequence to sequence model where both encoder and decoder are transformers. The paper also introduces a novel pretraining objective, and demonstrates excellent summarization results.
-The authors released their code `here <https://github.com/pytorch/fairseq/tree/master/examples/bart>`_
+Paper
+~~~~~
+The Bart model was `proposed <https://arxiv.org/abs/1910.13461>`_ by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer on 29 Oct, 2019.
+According to the abstract:
 
-**Abstract:**
+- Bart uses a standard seq2seq/machine translation architecture with a bidirectional encoder (like BERT) and a left-to-right decoder (like GPT).
+- The pretraining task involves randomly shuffling the order of the original sentences and a novel in-filling scheme, where spans of text are replaced with a single mask token.
+- BART is particularly effective when fine tuned for text generation but also works well for comprehension tasks. It matches the performance of RoBERTa with comparable training resources on GLUE and SQuAD, achieves new state-of-the-art results on a range of abstractive dialogue, question answering, and summarization tasks, with gains of up to 6 ROUGE.
 
-*We present BART, a denoising autoencoder for pretraining sequence-to-sequence models. BART is trained by (1) corrupting text with an arbitrary noising function, and (2) learning a model to reconstruct the original text. It uses a standard Tranformer-based neural machine translation architecture which, despite its simplicity, can be seen as generalizing BERT (due to the bidirectional encoder), GPT (with the left-to-right decoder), and many other more recent pretraining schemes. We evaluate a number of noising approaches, finding the best performance by both randomly shuffling the order of the original sentences and using a novel in-filling scheme, where spans of text are replaced with a single mask token. BART is particularly effective when fine tuned for text generation but also works well for comprehension tasks. It matches the performance of RoBERTa with comparable training resources on GLUE and SQuAD, achieves new state-of-the-art results on a range of abstractive dialogue, question answering, and summarization tasks, with gains of up to 6 ROUGE. BART also provides a 1.1 BLEU increase over a back-translation system for machine translation, with only target language pretraining. We also report ablation experiments that replicate other pretraining schemes within the BART framework, to better measure which factors most influence end-task performance.*
-`BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension`
+The authors' code can be found `here <https://github.com/pytorch/fairseq/tree/master/examples/bart>`_.
 
 
-Notes:
+Implementation Notes
+~~~~~~~~~~~~~~~~~~~~
 - Bart doesn't use :obj:`token_type_ids`, for sequence classification just use BartTokenizer.encode to get the proper splitting.
 - Inputs to the decoder are created by BartModel.forward if they are not passed. This is different than some other model APIs.
 - Model predictions are intended to be identical to the original implementation. This only works, however, if the string you pass to fairseq.encode starts with a space.
+- Decoder inputs are created automatically by the helper function ``transformers.modeling_bart._prepare_bart_decoder_inputs``
+  when they are not passed to ``BartModel.forward``.
+- ``BartForMaskedLM.generate`` should be used for summarization; see the example in its docstring.
+
 
 BartModel
 ~~~~~~~~~~~~~~~~~~~~
@@ -30,7 +37,7 @@ BartForMaskedLM
 ~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 .. autoclass:: transformers.BartForMaskedLM
-    :members: forward
+    :members: forward, generate
 
 
 BartForSequenceClassification
diff --git a/docs/source/pretrained_models.rst b/docs/source/pretrained_models.rst
index 4120f88dc1..565c861ccd 100644
--- a/docs/source/pretrained_models.rst
+++ b/docs/source/pretrained_models.rst
@@ -280,7 +280,10 @@ For a list that includes community-uploaded models, refer to `https://huggingfac
 |                   |                                                            | (see `details <https://github.com/pytorch/fairseq/tree/master/examples/bart>`_)                                                       |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                   | ``bart-large-mnli``                                        | | Adds a 2 layer classification head with 1 million parameters                                                                        |
-|                   |                                                            | | bart-large base architecture with a classification head                                                                             |
+|                   |                                                            | | bart-large base architecture with a classification head, finetuned on MNLI                                                          |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``bart-large-cnn``                                         | | 12-layer, 1024-hidden, 16-heads, 406M parameters       (same as base)                                                               |
+|                   |                                                            | | bart-large base architecture finetuned on cnn summarization task                                                                    |
 +-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 
 
diff --git a/src/transformers/configuration_bart.py b/src/transformers/configuration_bart.py
index 2e096c5501..7eb3bd7fe8 100644
--- a/src/transformers/configuration_bart.py
+++ b/src/transformers/configuration_bart.py
@@ -26,7 +26,7 @@ _bart_large_url = "https://s3.amazonaws.com/models.huggingface.co/bert/facebook/
 BART_PRETRAINED_CONFIG_ARCHIVE_MAP = {
     "bart-large": _bart_large_url,
     "bart-large-mnli": _bart_large_url,  # fine as same
-    "bart-cnn": None,  # not done
+    "bart-large-cnn": "https://s3.amazonaws.com/models.huggingface.co/bert/facebook/bart-large-cnn/config.json",
 }
 
 
@@ -59,6 +59,7 @@ class BartConfig(PretrainedConfig):
         classifier_dropout=0.0,
         output_past=False,
         num_labels=3,
+        bos_token_id=0,
         **common_kwargs
     ):
         r"""
@@ -67,12 +68,16 @@ class BartConfig(PretrainedConfig):
                 config = BartConfig.from_pretrained('bart-large')
                 model = BartModel(config)
         """
-        super().__init__(num_labels=num_labels, output_past=output_past, pad_token_id=pad_token_id, **common_kwargs)
-
+        super().__init__(
+            num_labels=num_labels,
+            output_past=output_past,
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            **common_kwargs,
+        )
         self.vocab_size = vocab_size
         self.d_model = d_model  # encoder_embed_dim and decoder_embed_dim
         self.eos_token_id = eos_token_id
-
         self.encoder_ffn_dim = encoder_ffn_dim
         self.encoder_layers = self.num_hidden_layers = encoder_layers
         self.encoder_attention_heads = encoder_attention_heads
diff --git a/src/transformers/convert_bart_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/convert_bart_original_pytorch_checkpoint_to_pytorch.py
index 6a9403aea4..27e4b974f0 100644
--- a/src/transformers/convert_bart_original_pytorch_checkpoint_to_pytorch.py
+++ b/src/transformers/convert_bart_original_pytorch_checkpoint_to_pytorch.py
@@ -23,9 +23,11 @@ import fairseq
 import torch
 from packaging import version
 
-from transformers import BartConfig, BartForSequenceClassification, BartModel, BartTokenizer
+from transformers import BartConfig, BartForMaskedLM, BartForSequenceClassification, BartModel, BartTokenizer
 
 
+FAIRSEQ_MODELS = ["bart.large", "bart.large.mnli", "bart.large.cnn"]
+
 if version.parse(fairseq.__version__) < version.parse("0.9.0"):
     raise Exception("requires fairseq >= 0.9.0")
 
@@ -33,7 +35,7 @@ if version.parse(fairseq.__version__) < version.parse("0.9.0"):
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
-SAMPLE_TEXT = "Hello world! cécé herlolip"
+SAMPLE_TEXT = " Hello world! cécé herlolip"
 
 rename_keys = [
     ("model.classification_heads.mnli.dense.weight", "classification_head.dense.weight"),
@@ -41,7 +43,7 @@ rename_keys = [
     ("model.classification_heads.mnli.out_proj.weight", "classification_head.out_proj.weight"),
     ("model.classification_heads.mnli.out_proj.bias", "classification_head.out_proj.bias"),
 ]
-IGNORE_KEYS = ["encoder.version", "decoder.version", "model.encoder.version", "model.decoder.version"]
+IGNORE_KEYS = ["encoder.version", "decoder.version", "model.encoder.version", "model.decoder.version", "_float_tensor"]
 
 
 def rename_key(dct, old, new):
@@ -53,36 +55,45 @@ def convert_bart_checkpoint(checkpoint_path, pytorch_dump_folder_path):
     """
     Copy/paste/tweak model's weights to our BERT structure.
     """
-    b2 = torch.hub.load("pytorch/fairseq", checkpoint_path)
-    b2.eval()  # disable dropout
-    b2.model.upgrade_state_dict(b2.model.state_dict())
-    config = BartConfig()
-    tokens = b2.encode(SAMPLE_TEXT).unsqueeze(0)
-    tokens2 = BartTokenizer.from_pretrained("bart-large").encode(SAMPLE_TEXT).unsqueeze(0)
+    bart = torch.hub.load("pytorch/fairseq", checkpoint_path)
+    bart.eval()  # disable dropout
+    bart.model.upgrade_state_dict(bart.model.state_dict())
+    hf_model_name = checkpoint_path.replace(".", "-")
+    config = BartConfig.from_pretrained(hf_model_name)
+    tokens = bart.encode(SAMPLE_TEXT).unsqueeze(0)
+    tokens2 = BartTokenizer.from_pretrained(hf_model_name).encode(SAMPLE_TEXT, return_tensors="pt").unsqueeze(0)
     assert torch.eq(tokens, tokens2).all()
 
-    # assert their_output.size() == (1, 11, 1024)
-
-    if checkpoint_path == "bart.large":
-        state_dict = b2.model.state_dict()
+    if checkpoint_path in ["bart.large", "bart.large.cnn"]:
+        state_dict = bart.model.state_dict()
+        for k in IGNORE_KEYS:
+            state_dict.pop(k, None)
         state_dict["shared.weight"] = state_dict["decoder.embed_tokens.weight"]
         model = BartModel(config)
-        their_output = b2.extract_features(tokens)
-
+        their_output = bart.extract_features(tokens)
     else:  # MNLI Case
-        state_dict = b2.state_dict()
+        state_dict = bart.state_dict()
+        for k in IGNORE_KEYS:
+            state_dict.pop(k, None)
         state_dict["model.shared.weight"] = state_dict["model.decoder.embed_tokens.weight"]
         for src, dest in rename_keys:
             rename_key(state_dict, src, dest)
-        state_dict.pop("_float_tensor", None)
         model = BartForSequenceClassification(config)
-        their_output = b2.predict("mnli", tokens, return_logits=True)
-    for k in IGNORE_KEYS:
-        state_dict.pop(k, None)
+        their_output = bart.predict("mnli", tokens, return_logits=True)
+
+    # Load state dict
     model.load_state_dict(state_dict)
     model.eval()
-    our_outputs = model.forward(tokens)[0]
+    # Check results
 
+    if checkpoint_path == "bart.large.cnn":  # generate doesnt work yet
+        model = BartForMaskedLM(config, base_model=model)
+        assert "lm_head.weight" in model.state_dict()
+        assert model.lm_head.out_features == config.max_position_embeddings
+        model.eval()
+        our_outputs = model.model.forward(tokens)[0]
+    else:
+        our_outputs = model.forward(tokens)[0]
     assert their_output.shape == our_outputs.shape
     assert (their_output == our_outputs).all().item()
     Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
@@ -92,7 +103,8 @@ def convert_bart_checkpoint(checkpoint_path, pytorch_dump_folder_path):
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     # Required parameters
-    parser.add_argument("fairseq_path", choices=["bart.large", "bart.large.mnli"], type=str, help="")
+    parser.add_argument("fairseq_path", choices=FAIRSEQ_MODELS, type=str, help="")
+
     parser.add_argument("pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.")
     args = parser.parse_args()
     convert_bart_checkpoint(
diff --git a/src/transformers/modeling_bart.py b/src/transformers/modeling_bart.py
index b0188d4d88..f832d88575 100644
--- a/src/transformers/modeling_bart.py
+++ b/src/transformers/modeling_bart.py
@@ -13,8 +13,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """PyTorch BART model, ported from the fairseq repo."""
-
 import logging
+import math
 import random
 from typing import Dict, List, Optional, Tuple
 
@@ -24,7 +24,7 @@ from torch import Tensor, nn
 
 from .configuration_bart import BartConfig
 from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
-from .modeling_utils import PreTrainedModel, create_position_ids_from_input_ids
+from .modeling_utils import BeamHypotheses, PreTrainedModel, create_position_ids_from_input_ids
 
 
 logger = logging.getLogger(__name__)
@@ -33,6 +33,7 @@ logger = logging.getLogger(__name__)
 BART_PRETRAINED_MODEL_ARCHIVE_MAP = {
     "bart-large": "https://s3.amazonaws.com/models.huggingface.co/bert/facebook/bart-large/pytorch_model.bin",
     "bart-large-mnli": "https://s3.amazonaws.com/models.huggingface.co/bert/facebook/bart-large-mnli/pytorch_model.bin",
+    "bart-large-cnn": "https://s3.amazonaws.com/models.huggingface.co/bert/facebook/bart-large-cnn/pytorch_model.bin",
 }
 
 BART_START_DOCSTRING = r"""
@@ -332,7 +333,7 @@ class DecoderLayer(nn.Module):
         x,
         encoder_hidden_states,
         encoder_attn_mask=None,
-        decoder_cached_states=None,
+        layer_state=None,
         attention_mask=None,
         need_attn_weights=False,
     ):
@@ -348,43 +349,28 @@ class DecoderLayer(nn.Module):
         Returns:
             encoded output of shape `(seq_len, batch, embed_dim)`
         """
-        if decoder_cached_states is None:
-            prev_self_attn_state, prev_attn_state = (None, None)
-        else:
-            assert len(decoder_cached_states) == 3
-            prev_self_attn_state, prev_attn_state = (
-                decoder_cached_states["self"],
-                decoder_cached_states["encoder_decoder"],
-            )
 
         residual = x
-        if prev_self_attn_state is not None:
-            saved_state = prev_self_attn_state
-            decoder_cached_states["self"] = saved_state
         y = x  # TODO(SS): figure out why fairseq did this, then hopefully delete it
 
+        if layer_state is None:
+            layer_state = {}
+        # next line mutates layer state
         x, self_attn_weights = self.self_attn.forward(
-            query=x,
-            key=y,
-            value=y,
-            decoder_cached_states=decoder_cached_states,
-            need_weights=need_attn_weights,
-            attn_mask=attention_mask,
+            query=x, key=y, value=y, layer_state=layer_state, need_weights=need_attn_weights, attn_mask=attention_mask,
         )
         x = F.dropout(x, p=self.dropout, training=self.training)
         x = residual + x
         x = self.self_attn_layer_norm(x)
         residual = x
         assert self.encoder_attn.cache_key != self.self_attn.cache_key
-        if prev_attn_state is not None:
-            saved_state = prev_attn_state
-            decoder_cached_states["encoder_decoder"] = saved_state
+
         x, encoder_attn_weights = self.encoder_attn.forward(
             query=x,
             key=encoder_hidden_states,  # could be None
             value=encoder_hidden_states,
             key_padding_mask=encoder_attn_mask,
-            decoder_cached_states=decoder_cached_states,
+            layer_state=layer_state,  # mutates layer state
             static_kv=True,
             need_weights=False,  # not returning it so why compute it
         )
@@ -403,15 +389,8 @@ class DecoderLayer(nn.Module):
         return (
             x,
             self_attn_weights,
-            decoder_cached_states,
-        )  # just self_attn weights for now, following t5, decoder_cached_states = cache for decoding
-
-    def _past_to_dict(self, prev_attn_state):
-        prev_key, prev_value = prev_attn_state[:2]
-        saved_state = {"prev_key": prev_key, "prev_value": prev_value}
-        if len(prev_attn_state) >= 3:
-            saved_state["prev_key_padding_mask"] = prev_attn_state[2]
-        return saved_state
+            layer_state,
+        )  # just self_attn weights for now, following t5, layer_state = cache for decoding
 
 
 class BartDecoder(nn.Module):
@@ -440,6 +419,7 @@ class BartDecoder(nn.Module):
             [DecoderLayer(config) for _ in range(config.decoder_layers)]
         )  # type: List[DecoderLayer]
         self.layernorm_embedding = LayerNorm(config.d_model)
+        self.generation_mode = False
 
     def forward(
         self,
@@ -469,11 +449,15 @@ class BartDecoder(nn.Module):
                 - attentions
         """
         # embed positions
-        positions = self.embed_positions(input_ids)
-        x = self.embed_tokens(input_ids)
+        positions = self.embed_positions.forward(input_ids, generation_mode=self.generation_mode)
 
-        if positions is not None:
-            x += positions
+        if self.generation_mode:
+            input_ids = input_ids[:, -1:]
+            positions = positions[:, -1:]  # happens after we embed them
+            assert input_ids.ne(self.padding_idx).any()
+
+        x = self.embed_tokens(input_ids)
+        x += positions
 
         x = self.layernorm_embedding(x)
         x = F.dropout(x, p=self.dropout, training=self.training)
@@ -489,17 +473,19 @@ class BartDecoder(nn.Module):
             dropout_probability = random.uniform(0, 1)
             if self.training and (dropout_probability < self.layerdrop):
                 continue
+
             layer_state = decoder_cached_states[i] if decoder_cached_states is not None else None
             x, layer_self_attn, layer_past = decoder_layer.forward(
                 x,
                 encoder_hidden_states,
                 encoder_padding_mask,
-                decoder_cached_states=layer_state,
+                layer_state=layer_state,
                 attention_mask=combined_mask,
                 need_attn_weights=self.output_attentions,
             )
+
             if self.output_past:
-                next_decoder_cache.append(layer_past)
+                next_decoder_cache.append(layer_past.copy())
             if self.output_hidden_states:
                 all_hidden_states += (x,)
             if self.output_attentions:
@@ -509,7 +495,22 @@ class BartDecoder(nn.Module):
         all_hidden_states = [hidden_state.transpose(0, 1) for hidden_state in all_hidden_states]
         x = x.transpose(0, 1)
 
-        return x, next_decoder_cache, all_hidden_states, list(all_self_attns)
+        if self.output_past:
+            next_cache = ((encoder_hidden_states, encoder_padding_mask), next_decoder_cache)
+        else:
+            next_cache = None
+        return x, next_cache, all_hidden_states, list(all_self_attns)
+
+
+def reorder_attn_buffer(input_buffer, new_order):
+    """Reorder buffered internal state (for incremental generation)."""
+    # input_buffer = self._get_input_buffer(incremental_state)
+    for k in input_buffer.keys():
+        input_buffer_k = input_buffer[k]
+        if input_buffer_k is not None:
+            input_buffer[k] = input_buffer_k.index_select(0, new_order)
+        # incremental_state = self._set_input_buffer(incremental_state, input_buffer)
+    return input_buffer
 
 
 class SelfAttention(nn.Module):
@@ -557,7 +558,7 @@ class SelfAttention(nn.Module):
         key: Optional[Tensor],
         value: Optional[Tensor],
         key_padding_mask: Optional[Tensor] = None,
-        decoder_cached_states: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
+        layer_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
         need_weights: bool = False,
         static_kv: bool = False,
         attn_mask: Optional[Tensor] = None,
@@ -579,8 +580,8 @@ class SelfAttention(nn.Module):
         assert embed_dim == self.embed_dim
         assert list(query.size()) == [tgt_len, bsz, embed_dim]
         # get here for encoder decoder cause of static_kv
-        if decoder_cached_states is not None:  # get the last k,v and mask for reuse
-            saved_state = decoder_cached_states.get(self.cache_key, {})
+        if layer_state is not None:  # get the last k,v and mask for reuse
+            saved_state = layer_state.get(self.cache_key, {})
             if "prev_key" in saved_state:
                 # previous time steps are cached - no need to recompute key and value if they are static
                 if static_kv:
@@ -588,6 +589,7 @@ class SelfAttention(nn.Module):
                     key = value = None
         else:
             saved_state = None
+            layer_state = {}
 
         q = self.q_proj(query) * self.scaling
         if self.encoder_decoder_attention:
@@ -608,17 +610,16 @@ class SelfAttention(nn.Module):
             v = self._shape(v, -1, bsz)
 
         if saved_state is not None:
-            k, v, key_padding_mask, new_state = self._use_and_update_saved_state(
-                k, v, saved_state, key_padding_mask, static_kv, bsz
-            )
-            saved_state.update(
-                {
-                    "prev_key": k.view(bsz, self.num_heads, -1, self.head_dim),
-                    "prev_value": v.view(bsz, self.num_heads, -1, self.head_dim),
-                    "prev_key_padding_mask": key_padding_mask,
-                }
-            )
-            decoder_cached_states[self.cache_key] = saved_state  # Update cache
+            k, v, key_padding_mask = self._use_saved_state(k, v, saved_state, key_padding_mask, static_kv, bsz)
+        # assert self.cache_key != 'encoder_decoder' or key_padding_mask is None
+
+        # Update cache
+        layer_state[self.cache_key] = {
+            "prev_key": k.view(bsz, self.num_heads, -1, self.head_dim),
+            "prev_value": v.view(bsz, self.num_heads, -1, self.head_dim),
+            "prev_key_padding_mask": key_padding_mask if not static_kv else None,
+        }
+
         assert k is not None
         src_len = k.size(1)
         attn_weights = torch.bmm(q, k.transpose(1, 2))
@@ -632,7 +633,7 @@ class SelfAttention(nn.Module):
         # This is part of a workaround to get around fork/join parallelism not supporting Optional types.
         if key_padding_mask is not None and key_padding_mask.dim() == 0:
             key_padding_mask = None
-        assert key_padding_mask is None or key_padding_mask.size()[:2] == (bsz, src_len)
+        assert key_padding_mask is None or key_padding_mask.size()[:2] == (bsz, src_len,)
 
         if key_padding_mask is not None:  # don't attend to padding symbols
             attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
@@ -650,7 +651,7 @@ class SelfAttention(nn.Module):
         attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
         return attn_output, attn_weights
 
-    def _use_and_update_saved_state(self, k, v, saved_state, key_padding_mask, static_kv, bsz):
+    def _use_saved_state(self, k, v, saved_state, key_padding_mask, static_kv, bsz):
         # saved states are stored with shape (bsz, num_heads, seq_len, head_dim)
         if "prev_key" in saved_state:
             _prev_key = saved_state["prev_key"]
@@ -675,7 +676,7 @@ class SelfAttention(nn.Module):
         key_padding_mask = self._cat_prev_key_padding_mask(
             key_padding_mask, prev_key_padding_mask, bsz, k.size(1), static_kv
         )
-        return k, v, key_padding_mask, saved_state
+        return k, v, key_padding_mask
 
     @staticmethod
     def _cat_prev_key_padding_mask(
@@ -693,7 +694,6 @@ class SelfAttention(nn.Module):
         # During incremental decoding, as the padding token enters and
         # leaves the frame, there will be a time when prev or current is None
         elif prev_key_padding_mask is not None:
-
             filler = torch.zeros(batch_size, src_len - prev_key_padding_mask.size(1))
             if prev_key_padding_mask.is_cuda:
                 filler = filler.cuda()
@@ -747,9 +747,13 @@ class LearnedPositionalEmbedding(nn.Embedding):
         num_embeddings += padding_idx + 1  # WHY?
         super().__init__(num_embeddings, embedding_dim, padding_idx=padding_idx)
 
-    def forward(self, input):
+    def forward(self, input, generation_mode=False):
         """Input is expected to be of size [bsz x seqlen]."""
-        positions = create_position_ids_from_input_ids(input, self.padding_idx)
+        if generation_mode:  # the position is our current step in the decoded sequence
+            pos = int(self.padding_idx + input.size(1))
+            positions = input.data.new(1, 1).fill_(pos)
+        else:
+            positions = create_position_ids_from_input_ids(input, self.padding_idx)
         return super().forward(positions)
 
 
@@ -826,21 +830,20 @@ class BartModel(PretrainedBartModel):
             assert attention_mask.max() <= 0
 
         # make masks if user doesn't supply
-        decoder_input_ids, decoder_attn_mask = _prepare_bart_decoder_inputs(
-            self.config, input_ids, decoder_input_ids=decoder_input_ids, decoder_attn_mask=decoder_attention_mask,
-        )
-
+        if not self.decoder.generation_mode:
+            decoder_input_ids, decoder_attention_mask = _prepare_bart_decoder_inputs(
+                self.config, input_ids, decoder_input_ids=decoder_input_ids, decoder_attn_mask=decoder_attention_mask,
+            )
         assert decoder_input_ids is not None
         if encoder_outputs is None:
-            # TODO(SS): make this caching more usable when overwrite generate
             encoder_outputs = self.encoder.forward(input_ids=input_ids, attention_mask=attention_mask)
         assert isinstance(encoder_outputs, tuple)
-        # dec_features, decoder_cached_states, dec_hidden, dec_attn
+        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
         decoder_outputs = self.decoder.forward(
             decoder_input_ids,
             encoder_outputs[0],
             attention_mask,
-            decoder_attn_mask,
+            decoder_attention_mask,
             decoder_cached_states=decoder_cached_states,
         )
         # Attention and hidden_states will be [] or None if they aren't needed
@@ -856,20 +859,26 @@ class BartModel(PretrainedBartModel):
         self.shared = value
 
     def get_output_embeddings(self):
-        return _make_linear_from_emb(self.shared)
+        return _make_linear_from_emb(self.shared)  # make it on the fly
 
 
 @add_start_docstrings(
-    "The bare BART Model with a language modeling head", BART_START_DOCSTRING,
+    "The bare BART Model with a language modeling head. This is the model used for summarization.",
+    BART_START_DOCSTRING,
 )
 class BartForMaskedLM(PretrainedBartModel):
     base_model_prefix = "model"
 
     def __init__(self, config: BartConfig):
         super().__init__(config)
-        self.model = BartModel(config)
+        # if base_model is None:
+        base_model = BartModel(config)
+        self.model = base_model
         self.lm_head = _make_linear_from_emb(self.model.shared)
 
+    def tie_weights(self):
+        pass  # hack to prevent changing lm_head.out_features. The input and output embeddings are still the same.
+
     @add_start_docstrings_to_callable(BART_INPUTS_DOCSTRING)
     def forward(
         self,
@@ -935,12 +944,309 @@ class BartForMaskedLM(PretrainedBartModel):
         return outputs
 
     @staticmethod
-    def prepare_inputs_for_generation(input_ids, past, **kwargs):
-        return {"input_ids": input_ids, "decoder_cached_states": past, "decoder_input_ids": input_ids[:, -1:]}
+    def prepare_inputs_for_generation(input_ids, past, decoder_input_ids, attention_mask):
+        if past is None:  # first step
+            encoder_outputs, decoder_cached_states = None, None
+        else:
+            encoder_outputs, decoder_cached_states = past
+        return {
+            "input_ids": input_ids,  # ignored after first pass
+            "decoder_cached_states": decoder_cached_states,
+            "decoder_input_ids": decoder_input_ids,
+            "encoder_outputs": encoder_outputs,
+            "attention_mask": attention_mask,
+            # "decoder_attention_mask": decoder_attention_mask,
+        }
+
+    @staticmethod
+    def _reorder_cache(past, beam_idx):
+        ((enc_out, enc_mask), decoder_cached_states) = past
+        reordered_past = []
+        for layer_past in decoder_cached_states:
+            # get the correct batch idx from decoder layer's batch dim for cross and self-attn
+            layer_past_new = {
+                attn_key: reorder_attn_buffer(attn_cache, beam_idx) for attn_key, attn_cache in layer_past.items()
+            }
+            # reordered_layer_past = [layer_past[:, i].unsqueeze(1).clone().detach() for i in beam_idx]
+            # reordered_layer_past = torch.cat(reordered_layer_past, dim=1)
+            reordered_past.append(layer_past_new)
+        new_enc_out = enc_out if enc_out is None else enc_out.index_select(1, beam_idx)
+        new_enc_mask = enc_mask if enc_mask is None else enc_mask.index_select(0, beam_idx)
+
+        past = ((new_enc_out, new_enc_mask), reordered_past)
+        return past
 
     def get_output_embeddings(self):
         return self.lm_head
 
+    @torch.no_grad()
+    def generate(
+        self,
+        input_ids,
+        attention_mask=None,
+        max_length=20,
+        num_beams=1,
+        repetition_penalty=1.0,
+        length_penalty=1.0,
+        num_return_sequences=1,
+        min_len=0,
+        no_repeat_ngram_size=0,
+    ):
+        r""" Generates sequences for models with a LM head using beam search (``num_beams``), with an optional repetition penalty
+        and n-gram blocking (``no_repeat_ngram_size``). Sampling with top-k or nucleus sampling is not exposed by this method.
+
+        Adapted in part from Facebook's `XLM beam search code`_ and `Fairseq beam search code`_.
+
+        .. _`XLM beam search code`:
+           https://github.com/facebookresearch/XLM/blob/9e6f6814d17be4fe5b15f2e6c43eb2b2d76daeb4/src/model/transformer.py#L529
+        .. _`Fairseq beam search code`:
+           https://github.com/pytorch/fairseq/blob/master/fairseq/sequence_generator.py
+
+
+        Parameters:
+
+            input_ids: `torch.LongTensor` of shape `(batch_size, sequence_length)`
+                The sequence used as a prompt for the generation. Required: the method reads its shape
+                and asserts that it is not `None`.
+
+            max_length: (`optional`) int
+                The max length of the sequence to be generated. Does not include tokens in input_ids.
+
+            num_beams: (`optional`) int
+                Number of beams for beam search. Must be between 1 and infinity. 1 means no beam search. Default to 1.
+
+            repetition_penalty: (`optional`) float
+                The parameter for repetition penalty. Between 1.0 and infinity. 1.0 means no penalty. Default to 1.0.
+
+            length_penalty: (`optional`) float
+                Exponential penalty to the length. Default to 1.
+
+            num_return_sequences: (`optional`) int
+                The number of independently computed returned sequences for each element in the batch. Default to 1.
+
+            min_len: (`optional`) int
+
+        Returns:
+            `torch.LongTensor` of shape `(batch_size * num_return_sequences, sequence_length)`
+                sequence_length is <= max_length (examples can finish early)
+
+        Examples::
+
+            config = BartConfig(vocab_size=50264, output_past=True)
+            model = AutoModelWithLMHead.from_pretrained('bart-large-cnn', config=config)
+            tokenizer = AutoTokenizer.from_pretrained('bart-large-cnn')
+            ARTICLE_TO_SUMMARIZE = "My friends are cool but they eat too many carbs."
+            inputs = tokenizer.batch_encode_plus([ARTICLE_TO_SUMMARIZE], max_length=1024, return_tensors='pt')
+            # Generate Summary
+            generated_ids = model.generate(inputs['input_ids'], attention_mask=inputs['attention_mask'], num_beams=4, max_length=5)
+            print([tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in generated_ids])
+
+        """
+        bos_token_id = self.config.bos_token_id
+        pad_token_id = self.config.pad_token_id
+        eos_token_id = self.config.eos_token_id
+        batch_size, cur_len = input_ids.shape
+        assert input_ids is not None
+        assert self.config.output_past, "Generating with bart requires instantiating a config with output_past=True"
+        assert isinstance(max_length, int) and max_length > 0, "`max_length` should be a strictly positive integer."
+        assert isinstance(num_beams, int) and num_beams > 0, "`num_beams` should be a strictly positive integer."
+        assert repetition_penalty >= 1.0, "`repetition_penalty` should be >= 1."
+        assert isinstance(pad_token_id, int)
+        assert bos_token_id == 0, "configurable bos_token_id not yet supported"
+        assert length_penalty > 0, "`length_penalty` should be strictly positive."
+        assert (
+            isinstance(num_return_sequences, int) and num_return_sequences > 0
+        ), "`num_return_sequences` should be a positive integer."
+
+        # current position and vocab size
+        cur_len = input_ids.shape[1]
+        vocab_size = self.config.vocab_size
+
+        if num_return_sequences != 1:
+            # Expand input to num return sequences
+            input_ids = input_ids.unsqueeze(1).expand(batch_size, num_return_sequences, cur_len)
+            input_ids = input_ids.contiguous().view(
+                batch_size * num_return_sequences, cur_len
+            )  # shape: (batch_size * num_return_sequences, cur_len)
+            batch_size *= num_return_sequences
+
+        # Below here somewhat similar to PretrainedModel._generate_beam_search
+        # Expand input to num beams
+        input_ids = input_ids.unsqueeze(1).expand(batch_size, num_beams, cur_len)
+
+        input_ids = input_ids.contiguous().view(batch_size * num_beams, cur_len)  # (batch_size * num_beams, cur_len)
+        if attention_mask is not None:
+            attention_mask = (
+                attention_mask.unsqueeze(1)
+                .expand(batch_size, num_beams, cur_len)
+                .contiguous()
+                .view(batch_size * num_beams, cur_len)
+            )  # RESHAPE
+
+        # generated hypotheses
+        finalized_hyps = [  # they end in EOS and we wont work on them more!
+            BeamHypotheses(num_beams, max_length, length_penalty, early_stopping=True) for _ in range(batch_size)
+        ]
+
+        # scores for each sentence in the beam
+        beam_scores = torch.zeros((batch_size, num_beams), dtype=torch.float, device=input_ids.device)
+        beam_scores[:, 1:] = -1e9  # avoid ties in first step
+        beam_scores = beam_scores.view(-1)  # shape (batch_size * num_beams,)
+
+        # decoder tokens
+        prev_output_tokens = input_ids.new(batch_size * num_beams, 1).long().fill_(-1)
+        prev_output_tokens[:, 0] = 2  # HARDCODED EOS, which will be removed at the end.
+        decoder_cache = None
+        done = [False for _ in range(batch_size)]  # done sentences
+
+        self.model.decoder.generation_mode = True  # tells decoder not to use causal mask
+        for step in range(max_length + 1):
+            decoder_input_ids = prev_output_tokens.clone()
+            model_inputs = self.prepare_inputs_for_generation(
+                input_ids, decoder_cache, decoder_input_ids, attention_mask,
+            )
+            outputs = self(**model_inputs)
+            lprobs = F.log_softmax(outputs[0][:, -1, :], dim=-1)
+
+            lprobs[lprobs != lprobs] = -math.inf  # block nans
+            lprobs[:, pad_token_id] = -math.inf
+            # TODO(SS): fairseq also takes out <unk> every step, and has unk at slot 3
+
+            if step == 0:  # Force BOS to be chosen
+                lprobs[:, bos_token_id + 1 :] = -math.inf
+            elif step < min_len:  # Prevent EOS from being chosen
+                lprobs[:, eos_token_id] = -math.inf
+            elif step == max_length:  # FORCE EOS to be chosen
+                lprobs[:, :eos_token_id] = -math.inf
+                lprobs[:, eos_token_id + 1 :] = -math.inf
+            assert self._do_output_past(outputs)
+            decoder_cache = outputs[1]
+            if repetition_penalty != 1.0:
+                self.enforce_repetition_penalty_(lprobs, batch_size, num_beams, prev_output_tokens, repetition_penalty)
+            num_hypos = batch_size * num_beams
+            if no_repeat_ngram_size > 0:  # copied from fairseq
+                # for each sentence, calculate a list of banned tokens to prevent repetitively generating the same ngrams
+                banned_tokens = self.calc_banned_tokens(prev_output_tokens, num_hypos, no_repeat_ngram_size, step)
+                # then set their probabilities to -inf
+                for idx in range(num_hypos):
+                    lprobs[idx, banned_tokens[idx]] = -math.inf
+            assert lprobs.size() == (batch_size * num_beams, vocab_size)
+            _scores = lprobs + beam_scores[:, None].expand_as(lprobs)  # (batch_size * num_beams, vocab_size)
+
+            # re-organize to group the beam together (we are keeping top hypothesis across beams)
+            _scores = _scores.view(batch_size, num_beams * vocab_size)  # (batch_size, num_beams * vocab_size)
+            # Take the best 2 x beam_size predictions for each example, we'll choose the first beam_size of these which don't predict eos to continue with.
+            next_scores, next_words = torch.topk(_scores, 2 * num_beams)
+            assert next_scores.size() == next_words.size() == (batch_size, 2 * num_beams)
+
+            # list of (batch_size * num_beams)
+            next_batch_beam = []  # Tuple(next score, next word, current position in the batch)
+            for batch_idx in range(batch_size):
+                # if we are done with this sentence (because we can't improve)
+                if done[batch_idx]:  # then pad all associated hypotheses
+                    assert (
+                        len(finalized_hyps[batch_idx]) >= num_beams
+                    ), "Example can only be done if at least {} beams have been generated".format(num_beams)
+                    next_batch_beam.extend([(0, pad_token_id, 0)] * num_beams)  # pad the batch
+                    continue
+
+                # Otherwise generate some next word choices
+                next_sent_beam = []
+                # add next words for this sentence
+                for i, (idx, score) in enumerate(zip(next_words[batch_idx], next_scores[batch_idx])):
+                    beam_id = idx // vocab_size
+                    word_id = idx % vocab_size
+                    assert prev_output_tokens.shape[1] == (step + 1)
+                    if word_id.item() == eos_token_id:
+                        if i >= num_beams:
+                            continue
+                        finalized_hyps[batch_idx].add(
+                            prev_output_tokens[batch_idx * num_beams + beam_id].clone(), score.item(),
+                        )
+                    else:
+                        next_sent_beam.append((score, word_id, batch_idx * num_beams + beam_id))
+
+                    if len(next_sent_beam) == num_beams:  # TODO(SS): can we delete this?
+                        break
+                # Check if we're done so that we can save a pad step if all(done)
+                done[batch_idx] = done[batch_idx] or finalized_hyps[batch_idx].is_done(
+                    next_scores[batch_idx].max().item(), cur_len=step + 1,
+                )
+                assert len(next_sent_beam) == num_beams, "Beam should always be full"
+                next_batch_beam.extend(next_sent_beam)
+                assert len(next_batch_beam) == num_beams * (batch_idx + 1)
+
+            if all(done):
+                break
+
+            # sanity check / prepare next batch
+            assert len(next_batch_beam) == batch_size * num_beams
+            beam_scores = beam_scores.new([x[0] for x in next_batch_beam])
+            beam_words = input_ids.new([x[1] for x in next_batch_beam])
+            beam_idx = input_ids.new([x[2] for x in next_batch_beam])
+            # re-order decoder inputs to [beam_idx]
+            prev_output_tokens = prev_output_tokens[beam_idx]
+            prev_output_tokens = torch.cat([prev_output_tokens, beam_words.unsqueeze(1)], dim=-1)
+
+            # re-order internal states
+            decoder_cache = self._reorder_cache(decoder_cache, beam_idx)
+
+        for batch_idx in range(batch_size):
+            # Add all open beam hypotheses to finalized_hyps
+            if done[batch_idx]:
+                continue
+            offset = batch_idx * num_beams
+            for i in range(num_beams):
+                score = beam_scores[offset + i]
+                final_tokens = prev_output_tokens[offset + i]
+                finalized_hyps[batch_idx].add(final_tokens, score.item())
+
+        # select the best hypotheses
+        sent_lengths = input_ids.new(batch_size)
+        best = []
+        for i, hypotheses in enumerate(finalized_hyps):
+            best_hyp = max(hypotheses.beams, key=lambda x: x[0])[1]
+            sent_lengths[i] = len(best_hyp)
+            best.append(best_hyp)
+
+        # shorter batches are filled with pad_token
+        if sent_lengths.min().item() != sent_lengths.max().item():
+            # TODO(SS): decoded = torch.rnn.utils.pad_sequence(best, batch_first=True, padding_value=pad_token_id)
+            sent_max_len = min(sent_lengths.max().item() + 1, max_length + 1)  # TODO(SS): same as step?
+            decoded = input_ids.new(batch_size, sent_max_len).fill_(pad_token_id)
+            # fill with hypothesis and eos_token_id if necessary
+            for i, hypo in enumerate(best):
+                decoded[i, : sent_lengths[i]] = hypo
+                if sent_lengths[i] < max_length:
+                    decoded[i, sent_lengths[i]] = eos_token_id
+        else:
+            assert (len(hypo) == max_length for hypo in best)
+            decoded = torch.stack(best).type(torch.long).to(next(self.parameters()).device)
+        return decoded[:, 1:]  # get rid of starting EOS
+
+    @staticmethod
+    def calc_banned_tokens(prev_output_tokens, num_hypos, no_repeat_ngram_size, step):
+        """Per hypothesis, list tokens that would complete an already generated n-gram (copied from fairseq)."""
+        # TODO(SS): this can go on parent if there is demand
+        if step + 2 < no_repeat_ngram_size:
+            return [
+                [] for _ in range(num_hypos)
+            ]  # nothing to ban yet: too few tokens to form the (no_repeat_ngram_size - 1)-token prefix
+        gen_ngrams = [{} for _ in range(num_hypos)]  # per hypothesis: prefix tuple -> list of tokens seen after it
+        for idx in range(num_hypos):
+            gen_tokens = prev_output_tokens[idx].tolist()
+            for ngram in zip(*[gen_tokens[i:] for i in range(no_repeat_ngram_size)]):
+                k = tuple(ngram[:-1])  # all but the last token of the n-gram
+                gen_ngrams[idx][k] = gen_ngrams[idx].get(k, []) + [ngram[-1]]
+
+        def _get_generated_ngrams(hypo_idx):
+            """Return the tokens that previously followed the current last (no_repeat_ngram_size - 1) tokens."""
+            ngram_index = tuple(prev_output_tokens[hypo_idx, step + 2 - no_repeat_ngram_size : step + 1].tolist())
+            return gen_ngrams[hypo_idx].get(ngram_index, [])
+
+        banned_tokens = [_get_generated_ngrams(hypo_idx) for hypo_idx in range(num_hypos)]
+        return banned_tokens
+
 
 @add_start_docstrings(
     """Bart model with a sequence classification/head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. """,
diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py
index 7c61e7fdc7..efd622656e 100644
--- a/src/transformers/modeling_utils.py
+++ b/src/transformers/modeling_utils.py
@@ -171,7 +171,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin):
         else:
             output_embeddings.weight = input_embeddings.weight
 
-        if hasattr(output_embeddings, "bias") and output_embeddings.bias is not None:
+        if getattr(output_embeddings, "bias", None) is not None:
             output_embeddings.bias.data = torch.nn.functional.pad(
                 output_embeddings.bias.data,
                 (0, output_embeddings.weight.shape[0] - output_embeddings.bias.shape[0]),
@@ -558,7 +558,6 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin):
                         model.__class__.__name__, "\n\t".join(error_msgs)
                     )
                 )
-
         model.tie_weights()  # make sure word embedding weights are still tied if needed
 
         # Set model in evaluation mode to desactivate DropOut modules by default
@@ -574,16 +573,25 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin):
         return {"input_ids": input_ids}
 
     def _do_output_past(self, outputs):
-        has_output_past = hasattr(self.config, "output_past") and self.config.output_past
-        has_mem_len = hasattr(self.config, "mem_len") and self.config.mem_len
-
-        if has_output_past and not has_mem_len and len(outputs) > 1:
+        """During generation, decide whether to pass the `past` variable to the next forward pass."""
+        has_output_past = getattr(self.config, "output_past", False)  # missing attribute counts as False
+        mem_len = getattr(self.config, "mem_len", 0)  # NOTE(review): `> 0` below raises if mem_len is None
+        if len(outputs) <= 1:  # only one output: there is nothing to pass on
+            return False
+        if mem_len > 0 or has_output_past:
             return True
-        elif has_mem_len and self.config.mem_len > 0 and len(outputs) > 1:
-            return True
-
         return False
 
+    def enforce_repetition_penalty_(self, lprobs, batch_size, num_beams, prev_output_tokens, repetition_penalty):
+        """Apply the CTRL repetition penalty (https://arxiv.org/abs/1909.05858) to `lprobs` in place."""
+        for i in range(batch_size * num_beams):  # rows 0 .. batch_size * num_beams - 1 of lprobs
+            for previous_token in set(prev_output_tokens[i].tolist()):  # each distinct token is penalized once
+                # negative scores are multiplied, others divided: both lower the score when repetition_penalty > 1
+                if lprobs[i, previous_token] < 0:
+                    lprobs[i, previous_token] *= repetition_penalty
+                else:
+                    lprobs[i, previous_token] /= repetition_penalty
+
     @torch.no_grad()
     def generate(
         self,
@@ -761,7 +769,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin):
             input_ids = input_ids.unsqueeze(1).expand(batch_size, num_return_sequences, cur_len)
             input_ids = input_ids.contiguous().view(
                 batch_size * num_return_sequences, cur_len
-            )  # (batch_size * num_return_sequences, cur_len)
+            )  # shape: (batch_size * num_return_sequences, cur_len)
             effective_batch_size = batch_size * num_return_sequences
         else:
             effective_batch_size = batch_size
@@ -822,9 +830,9 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin):
         sent_lengths = input_ids.new(batch_size).fill_(max_length)
 
         past = None
-
         while cur_len < max_length:
             model_inputs = self.prepare_inputs_for_generation(input_ids, past=past)
+
             outputs = self(**model_inputs)
             next_token_logits = outputs[0][:, -1, :]
 
@@ -834,13 +842,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin):
 
             # repetition penalty from CTRL paper (https://arxiv.org/abs/1909.05858)
             if repetition_penalty != 1.0:
-                for i in range(batch_size):
-                    for previous_token in set(input_ids[i].tolist()):
-                        # if score < 0 then repetition penalty has to multiplied to reduce the previous token probability
-                        if next_token_logits[i, previous_token] < 0:
-                            next_token_logits[i, previous_token] *= repetition_penalty
-                        else:
-                            next_token_logits[i, previous_token] /= repetition_penalty
+                self.enforce_repetition_penalty_(next_token_logits, batch_size, 1, input_ids, repetition_penalty)
 
             if do_sample:
                 # Temperature (higher temperature => more likely to sample low probability tokens)
@@ -911,6 +913,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin):
         """ Generate sequences for each example with beam search.
         """
         # Expand input to num beams
+        # assert input_ids.shape == (batch_size * num_beams, cur_len)
         input_ids = input_ids.unsqueeze(1).expand(batch_size, num_beams, cur_len)
         input_ids = input_ids.contiguous().view(batch_size * num_beams, cur_len)  # (batch_size * num_beams, cur_len)
 
@@ -941,13 +944,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin):
 
             # repetition penalty (from CTRL paper https://arxiv.org/abs/1909.05858)
             if repetition_penalty != 1.0:
-                for i in range(batch_size * num_beams):
-                    for previous_token in set(input_ids[i].tolist()):
-                        # if score < 0 then repetition penalty has to multiplied to reduce the previous token probability
-                        if scores[i, previous_token] < 0:
-                            scores[i, previous_token] *= repetition_penalty
-                        else:
-                            scores[i, previous_token] /= repetition_penalty
+                self.enforce_repetition_penalty_(scores, batch_size, num_beams, input_ids, repetition_penalty)
 
             if do_sample:
                 # Temperature (higher temperature => more likely to sample low probability tokens)
@@ -1039,16 +1036,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin):
 
             # re-order internal states
             if past:
-                reordered_past = []
-                for layer_past in past:
-                    # get the correct batch idx from layer past batch dim
-                    # batch dim of `past` and `mems` is at 2nd position
-                    reordered_layer_past = [layer_past[:, i].unsqueeze(1).clone().detach() for i in beam_idx]
-                    reordered_layer_past = torch.cat(reordered_layer_past, dim=1)
-                    # check that shape matches
-                    assert reordered_layer_past.shape == layer_past.shape
-                    reordered_past.append(reordered_layer_past)
-                past = tuple(reordered_past)
+                past = self._reorder_cache(past, beam_idx)
 
             # update current length
             cur_len = cur_len + 1
@@ -1096,6 +1084,20 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin):
 
         return decoded
 
+    @staticmethod
+    def _reorder_cache(past, beam_idx):
+        """Return `past` as a tuple whose entries are re-gathered along dim 1 in the order given by `beam_idx`."""
+        reordered_past = []
+        for layer_past in past:
+            # pick slice `i` of dim 1 (batch dim of `past` and `mems` is at 2nd position) for every i in beam_idx
+            reordered_layer_past = [layer_past[:, i].unsqueeze(1).clone().detach() for i in beam_idx]
+            reordered_layer_past = torch.cat(reordered_layer_past, dim=1)
+            # check that shape matches, i.e. beam_idx has as many entries as dim 1 of layer_past
+            assert reordered_layer_past.shape == layer_past.shape
+            reordered_past.append(reordered_layer_past)
+        past = tuple(reordered_past)
+        return past
+
 
 def top_k_top_p_filtering(logits, top_k=0, top_p=1.0, filter_value=-float("Inf"), min_tokens_to_keep=1):
     """ Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
@@ -1164,17 +1166,22 @@ class BeamHypotheses(object):
             else:
                 self.worst_score = min(score, self.worst_score)
 
-    def is_done(self, best_sum_logprobs):
+    def is_done(self, best_sum_logprobs, cur_len=None):
         """
         If there are enough hypotheses and that none of the hypotheses being generated
         can become better than the worst one in the heap, then we are done with this sentence.
         """
+
         if len(self) < self.num_beams:
             return False
         elif self.early_stopping:
             return True
         else:
-            return self.worst_score >= best_sum_logprobs / self.max_length ** self.length_penalty
+            if cur_len is None:  # fall back to max_length, the normalization used before cur_len existed
+                cur_len = self.max_length
+            cur_score = best_sum_logprobs / cur_len ** self.length_penalty  # length-normalized score
+            ret = self.worst_score >= cur_score  # done when the worst kept score is at least cur_score
+            return ret
 
 
 class Conv1D(nn.Module):
diff --git a/src/transformers/tokenization_bart.py b/src/transformers/tokenization_bart.py
index ef2631a352..f5c0d8f1dd 100644
--- a/src/transformers/tokenization_bart.py
+++ b/src/transformers/tokenization_bart.py
@@ -19,11 +19,7 @@ from .tokenization_roberta import RobertaTokenizer
 # vocab and merges same as roberta
 vocab_url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-vocab.json"
 merges_url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-merges.txt"
-_all_bart_models = [
-    "bart-large",
-    "bart-large-mnli",
-    # "bart-large-cnn"
-]
+_all_bart_models = ["bart-large", "bart-large-mnli", "bart-large-cnn"]
 
 
 class BartTokenizer(RobertaTokenizer):
diff --git a/tests/test_modeling_bart.py b/tests/test_modeling_bart.py
index 29b459fd8d..89e41c79ad 100644
--- a/tests/test_modeling_bart.py
+++ b/tests/test_modeling_bart.py
@@ -240,7 +240,7 @@ class BartHeadTests(unittest.TestCase):
         expected_shape = (*summary.shape, config.vocab_size)
         self.assertEqual(logits.shape, expected_shape)
 
-    def test_generate(self):
+    def test_generate_beam_search(self):
         input_ids = torch.Tensor([[71, 82, 2], [68, 34, 2]]).long()
         config = BartConfig(
             vocab_size=self.vocab_size,
@@ -256,8 +256,12 @@ class BartHeadTests(unittest.TestCase):
         )
         lm_model = BartForMaskedLM(config)
         lm_model.eval()
-        new_input_ids = lm_model.generate(input_ids)
-        self.assertEqual(new_input_ids.shape, (input_ids.shape[0], 20))
+
+        new_input_ids = lm_model.generate(
+            input_ids.clone(), num_return_sequences=1, num_beams=2, no_repeat_ngram_size=3, max_length=5
+        )
+        self.assertEqual(new_input_ids.shape, (input_ids.shape[0], 5))
+        # TODO(SS): uneven length batches, empty inputs
 
     def test_shift_tokens_right(self):
         input_ids = torch.Tensor([[71, 82, 18, 33, 2, 1, 1], [68, 34, 26, 58, 30, 82, 2]]).long()
@@ -352,3 +356,55 @@ class BartModelIntegrationTest(unittest.TestCase):
         for model_name in list(BART_PRETRAINED_MODEL_ARCHIVE_MAP.keys()):
             model = BartModel.from_pretrained(model_name, cache_dir=CACHE_DIR)
             self.assertIsNotNone(model)
+
+    @slow
+    def test_cnn_summarization_same_as_fairseq(self):
+        hf = BartForMaskedLM.from_pretrained("bart-large-cnn", output_past=True,).to(torch_device)
+        tok = BartTokenizer.from_pretrained("bart-large")
+        text = " (CNN)The Palestinian Authority officially became the 123rd member of the International Criminal Court on Wednesday, a step that gives the court jurisdiction over alleged crimes in Palestinian"
+        tokens = tok.encode(text, return_tensors="pt").to(torch_device)
+        extra_len = 20
+        gen_tokens = hf.generate(tokens, num_beams=4, max_length=extra_len,)  # repetition_penalty=10.,
+        expected_result = "<s>The Palestinian Authority officially became the 123rd member of the International Criminal Court on Wednesday."
+        generated = [tok.decode(g,) for g in gen_tokens]
+        self.assertEqual(expected_result, generated[0])
+
+        # Harder cases with batching
+        FRANCE_ARTICLE = ' Marseille, France (CNN)The French prosecutor leading an investigation into the crash of Germanwings Flight 9525 insisted Wednesday that he was not aware of any video footage from on board the plane. Marseille prosecutor Brice Robin told CNN that "so far no videos were used in the crash investigation." He added, "A person who has such a video needs to immediately give it to the investigators." Robin\'s comments follow claims by two magazines, German daily Bild and French Paris Match, of a cell phone video showing the harrowing final seconds from on board Germanwings Flight 9525 as it crashed into the French Alps. All 150 on board were killed. Paris Match and Bild reported that the video was recovered from a phone at the wreckage site. The two publications described the supposed video, but did not post it on their websites. The publications said that they watched the video, which was found by a source close to the investigation. "One can hear cries of \'My God\' in several languages," Paris Match reported. "Metallic banging can also be heard more than three times, perhaps of the pilot trying to open the cockpit door with a heavy object.  Towards the end, after a heavy shake, stronger than the others, the screaming intensifies. Then nothing." "It is a very disturbing scene," said Julian Reichelt, editor-in-chief of Bild online. An official with France\'s accident investigation agency, the BEA, said the agency is not aware of any such video. Lt. Col. Jean-Marc Menichini, a French Gendarmerie spokesman in charge of communications on rescue efforts around the Germanwings crash site, told CNN that the reports were "completely wrong" and "unwarranted." Cell phones have been collected at the site, he said, but that they "hadn\'t been exploited yet." 
Menichini said he believed the cell phones would need to be sent to the Criminal Research Institute in Rosny sous-Bois, near Paris, in order to be analyzed by specialized technicians working hand-in-hand with investigators. But none of the cell phones found so far have been sent to the institute, Menichini said. Asked whether staff involved in the search could have leaked a memory card to the media, Menichini answered with a categorical "no." Reichelt told "Erin Burnett: Outfront" that he had watched the video and stood by the report, saying Bild and Paris Match are "very confident" that the clip is real. He noted that investigators only revealed they\'d recovered cell phones from the crash site after Bild and Paris Match published their reports. "That is something we did not know before. ... Overall we can say many things of the investigation weren\'t revealed by the investigation at the beginning," he said. What was mental state of Germanwings co-pilot? German airline Lufthansa confirmed Tuesday that co-pilot Andreas Lubitz had battled depression years before he took the controls of Germanwings Flight 9525, which he\'s accused of deliberately crashing last week in the French Alps. Lubitz told his Lufthansa flight training school in 2009 that he had a "previous episode of severe depression," the airline said Tuesday. Email correspondence between Lubitz and the school discovered in an internal investigation, Lufthansa said, included medical documents he submitted in connection with resuming his flight training. The announcement indicates that Lufthansa, the parent company of Germanwings, knew of Lubitz\'s battle with depression, allowed him to continue training and ultimately put him in the cockpit. 
Lufthansa, whose CEO Carsten Spohr previously said Lubitz was 100% fit to fly, described its statement Tuesday as a "swift and seamless clarification" and said it was sharing the information and documents -- including training and medical records -- with public prosecutors. Spohr traveled to the crash site Wednesday, where recovery teams have been working for the past week to recover human remains and plane debris scattered across a steep mountainside. He saw the crisis center set up in Seyne-les-Alpes, laid a wreath in the village of Le Vernet, closer to the crash site, where grieving families have left flowers at a simple stone memorial. Menichini told CNN late Tuesday that no visible human remains were left at the site but recovery teams would keep searching. French President Francois Hollande, speaking Tuesday, said that it should be possible to identify all the victims using DNA analysis by the end of the week, sooner than authorities had previously suggested. In the meantime, the recovery of the victims\' personal belongings will start Wednesday, Menichini said. Among those personal belongings could be more cell phones belonging to the 144 passengers and six crew on board. Check out the latest from our correspondents . The details about Lubitz\'s correspondence with the flight school during his training were among several developments as investigators continued to delve into what caused the crash and Lubitz\'s possible motive for downing the jet. A Lufthansa spokesperson told CNN on Tuesday that Lubitz had a valid medical certificate, had passed all his examinations and "held all the licenses required." Earlier, a spokesman for the prosecutor\'s office in Dusseldorf, Christoph Kumpa, said medical records reveal Lubitz suffered from suicidal tendencies at some point before his aviation career and underwent psychotherapy before he got his pilot\'s license. 
Kumpa emphasized there\'s no evidence suggesting Lubitz was suicidal or acting aggressively before the crash. Investigators are looking into whether Lubitz feared his medical condition would cause him to lose his pilot\'s license, a European government official briefed on the investigation told CNN on Tuesday. While flying was "a big part of his life," the source said, it\'s only one theory being considered. Another source, a law enforcement official briefed on the investigation, also told CNN that authorities believe the primary motive for Lubitz to bring down the plane was that he feared he would not be allowed to fly because of his medical problems. Lubitz\'s girlfriend told investigators he had seen an eye doctor and a neuropsychologist, both of whom deemed him unfit to work recently and concluded he had psychological issues, the European government official said. But no matter what details emerge about his previous mental health struggles, there\'s more to the story, said Brian Russell, a forensic psychologist. "Psychology can explain why somebody would turn rage inward on themselves about the fact that maybe they weren\'t going to keep doing their job and they\'re upset about that and so they\'re suicidal," he said. "But there is no mental illness that explains why somebody then feels entitled to also take that rage and turn it outward on 149 other people who had nothing to do with the person\'s problems." Germanwings crash compensation: What we know . Who was the captain of Germanwings Flight 9525? CNN\'s Margot Haddad reported from Marseille and Pamela Brown from Dusseldorf, while Laura Smith-Spark wrote from London. CNN\'s Frederik Pleitgen, Pamela Boykoff, Antonia Mortensen, Sandrine Amiel and Anna-Maja Rappard contributed to this report.'  # @noqa
+        EXPECTED_SUMMARY_FRANCE = 'French prosecutor says he\'s not aware of any video footage from on board the plane. German daily Bild and French Paris Match claim to have found a cell phone video of the crash. A French Gendarmerie spokesman calls the reports "completely wrong" and "unwarranted" German airline Lufthansa confirms co-pilot Andreas Lubitz had battled depression.'
+
+        SHORTER_ARTICLE = ' (CNN)The Palestinian Authority officially became the 123rd member of the International Criminal Court on Wednesday, a step that gives the court jurisdiction over alleged crimes in Palestinian territories. The formal accession was marked with a ceremony at The Hague, in the Netherlands, where the court is based. The Palestinians signed the ICC\'s founding Rome Statute in January, when they also accepted its jurisdiction over alleged crimes committed "in the occupied Palestinian territory, including East Jerusalem, since June 13, 2014." Later that month, the ICC opened a preliminary examination into the situation in Palestinian territories, paving the way for possible war crimes investigations against Israelis. As members of the court, Palestinians may be subject to counter-charges as well. Israel and the United States, neither of which is an ICC member, opposed the Palestinians\' efforts to join the body. But Palestinian Foreign Minister Riad al-Malki, speaking at Wednesday\'s ceremony, said it was a move toward greater justice. "As Palestine formally becomes a State Party to the Rome Statute today, the world is also a step closer to ending a long era of impunity and injustice," he said, according to an ICC news release. "Indeed, today brings us closer to our shared goals of justice and peace." Judge Kuniko Ozaki, a vice president of the ICC, said acceding to the treaty was just the first step for the Palestinians. "As the Rome Statute today enters into force for the State of Palestine, Palestine acquires all the rights as well as responsibilities that come with being a State Party to the Statute. These are substantive commitments, which cannot be taken lightly," she said. Rights group Human Rights Watch welcomed the development. 
"Governments seeking to penalize Palestine for joining the ICC should immediately end their pressure, and countries that support universal acceptance of the court\'s treaty should speak out to welcome its membership," said Balkees Jarrah, international justice counsel for the group. "What\'s objectionable is the attempts to undermine international justice, not Palestine\'s decision to join a treaty to which over 100 countries around the world are members." In January, when the preliminary ICC examination was opened, Israeli Prime Minister Benjamin Netanyahu described it as an outrage, saying the court was overstepping its boundaries. The United States also said it "strongly" disagreed with the court\'s decision. "As we have said repeatedly, we do not believe that Palestine is a state and therefore we do not believe that it is eligible to join the ICC," the State Department said in a statement. It urged the warring sides to resolve their differences through direct negotiations. "We will continue to oppose actions against Israel at the ICC as counterproductive to the cause of peace," it said. But the ICC begs to differ with the definition of a state for its purposes and refers to the territories as "Palestine." While a preliminary examination is not a formal investigation, it allows the court to review evidence and determine whether to investigate suspects on both sides. Prosecutor Fatou Bensouda said her office would "conduct its analysis in full independence and impartiality." The war between Israel and Hamas militants in Gaza last summer left more than 2,000 people dead. The inquiry will include alleged war crimes committed since June. The International Criminal Court was set up in 2002 to prosecute genocide, crimes against humanity and war crimes. CNN\'s Vasco Cotovio, Kareem Khadder and Faith Karimi contributed to this report.'
+        EXPECTED_SUMMARY_SHORTER = "The Palestinian Authority becomes the 123rd member of the International Criminal Court. The move gives the court jurisdiction over alleged crimes in Palestinian territories. Israel and the United States opposed the Palestinians' efforts to join the body. But Palestinian Foreign Minister Riad al-Malki said it was a move toward greater justice."
+
+        # The below article tests that we don't add any hypotheses outside of the top n_beams
+        IRAN_ARTICLE = " (CNN)The United States and its negotiating partners reached a very strong framework agreement with Iran in Lausanne, Switzerland, on Thursday that limits Iran's nuclear program in such a way as to effectively block it from building a nuclear weapon. Expect pushback anyway, if the recent past is any harbinger. Just last month, in an attempt to head off such an agreement, House Speaker John Boehner invited Israeli Prime Minister Benjamin Netanyahu to preemptively blast it before Congress, and 47 senators sent a letter to the Iranian leadership warning them away from a deal. The debate that has already begun since the announcement of the new framework will likely result in more heat than light. It will not be helped by the gathering swirl of dubious assumptions and doubtful assertions. Let us address some of these: . The most misleading assertion, despite universal rejection by experts, is that the negotiations' objective at the outset was the total elimination of any nuclear program in Iran. That is the position of Netanyahu and his acolytes in the U.S. Congress. But that is not and never was the objective. If it had been, there would have been no Iranian team at the negotiating table. Rather, the objective has always been to structure an agreement or series of agreements so that Iran could not covertly develop a nuclear arsenal before the United States and its allies could respond. The new framework has exceeded expectations in achieving that goal. It would reduce Iran's low-enriched uranium stockpile, cut by two-thirds its number of installed centrifuges and implement a rigorous inspection regime. Another dubious assumption of opponents is that the Iranian nuclear program is a covert weapons program. Despite sharp accusations by some in the United States and its allies, Iran denies having such a program, and U.S. intelligence contends that Iran has not yet made the decision to build a nuclear weapon. 
Iran's continued cooperation with International Atomic Energy Agency inspections is further evidence on this point, and we'll know even more about Iran's program in the coming months and years because of the deal. In fact, the inspections provisions that are part of this agreement are designed to protect against any covert action by the Iranians. What's more, the rhetoric of some members of Congress has implied that the negotiations have been between only the United States and Iran (i.e., the 47 senators' letter warning that a deal might be killed by Congress or a future president). This of course is not the case. The talks were between Iran and the five permanent members of the U.N. Security Council (United States, United Kingdom, France, China and Russia) plus Germany, dubbed the P5+1. While the United States has played a leading role in the effort, it negotiated the terms alongside its partners. If the agreement reached by the P5+1 is rejected by Congress, it could result in an unraveling of the sanctions on Iran and threaten NATO cohesion in other areas. Another questionable assertion is that this agreement contains a sunset clause, after which Iran will be free to do as it pleases. Again, this is not the case. Some of the restrictions on Iran's nuclear activities, such as uranium enrichment, will be eased or eliminated over time, as long as 15 years. But most importantly, the framework agreement includes Iran's ratification of the Additional Protocol, which allows IAEA inspectors expanded access to nuclear sites both declared and nondeclared. This provision will be permanent. It does not sunset. Thus, going forward, if Iran decides to enrich uranium to weapons-grade levels, monitors will be able to detect such a move in a matter of days and alert the U.N. Security Council. Many in Congress have said that the agreement should be a formal treaty requiring the Senate to \"advise and consent.\" But the issue is not suited for a treaty. 
Treaties impose equivalent obligations on all signatories. For example, the New START treaty limits Russia and the United States to 1,550 deployed strategic warheads. But any agreement with Iran will not be so balanced.  The restrictions and obligations in the final framework agreement will be imposed almost exclusively on Iran. The P5+1 are obligated only to ease and eventually remove most but not all economic sanctions, which were imposed as leverage to gain this final deal. Finally some insist that any agreement must address Iranian missile programs, human rights violations or support for Hamas or Hezbollah.  As important as these issues are, and they must indeed be addressed, they are unrelated to the most important aim of a nuclear deal: preventing a nuclear Iran.  To include them in the negotiations would be a poison pill. This agreement should be judged on its merits and on how it affects the security of our negotiating partners and allies, including Israel. Those judgments should be fact-based, not based on questionable assertions or dubious assumptions."
+        EXPECTED_SUMMARY_IRAN = "The U.S. and its negotiating partners reached a very strong framework agreement with Iran. Peter Bergen: The debate that has already begun will likely result in more heat than light. He says the agreement limits Iran's nuclear program in such a way as to effectively block it from building a nuclear weapon. Bergen says the most important aim of a nuclear deal is preventing a nuclear Iran."
+
+        ARTICLE_SUBWAY = ' New York (CNN)When Liana Barrientos was 23 years old, she got married in Westchester County, New York. A year later, she got married again in Westchester County, but to a different man and without divorcing her first husband.  Only 18 days after that marriage, she got hitched yet again. Then, Barrientos declared "I do" five more times, sometimes only within two weeks of each other. In 2010, she married once more, this time in the Bronx. In an application for a marriage license, she stated it was her "first and only" marriage. Barrientos, now 39, is facing two criminal counts of "offering a false instrument for filing in the first degree," referring to her false statements on the 2010 marriage license application, according to court documents. Prosecutors said the marriages were part of an immigration scam. On Friday, she pleaded not guilty at State Supreme Court in the Bronx, according to her attorney, Christopher Wright, who declined to comment further. After leaving court, Barrientos was arrested and charged with theft of service and criminal trespass for allegedly sneaking into the New York subway through an emergency exit, said Detective Annette Markowski, a police spokeswoman. In total, Barrientos has been married 10 times, with nine of her marriages occurring between 1999 and 2002.  All occurred either in Westchester County, Long Island, New Jersey or the Bronx. She is believed to still be married to four men, and at one time, she was married to eight men at once, prosecutors say. Prosecutors said the immigration scam involved some of her husbands, who filed for permanent residence status shortly after the marriages.  Any divorces happened only after such filings were approved. It was unclear whether any of the men will be prosecuted. The case was referred to the Bronx District Attorney\'s Office by Immigration and Customs Enforcement and the Department of Homeland Security\'s Investigation Division. 
Seven of the men are from so-called "red-flagged" countries, including Egypt, Turkey, Georgia, Pakistan and Mali. Her eighth husband, Rashid Rajput, was deported in 2006 to his native Pakistan after an investigation by the Joint Terrorism Task Force. If convicted, Barrientos faces up to four years in prison.  Her next court appearance is scheduled for May 18.'
+        EXPECTED_SUMMARY_SUBWAY = "Liana Barrientos has been married 10 times, sometimes within two weeks of each other. Prosecutors say the marriages were part of an immigration scam. On Friday, she pleaded not guilty at State Supreme Court in the Bronx. She was arrested and charged with theft of service and criminal trespass for allegedly sneaking into the subway."
+
+        dct = tok.batch_encode_plus(
+            [FRANCE_ARTICLE, SHORTER_ARTICLE, IRAN_ARTICLE, ARTICLE_SUBWAY],
+            max_length=1024,
+            pad_to_max_length=True,
+            return_tensors="pt",
+        )
+        self.assertEqual(1024, dct["input_ids"].shape[1])
+        hypotheses_batch = hf.generate(
+            input_ids=dct["input_ids"].to(torch_device),
+            attention_mask=dct["attention_mask"].to(torch_device),
+            num_beams=4,
+            length_penalty=2.0,
+            max_length=140,
+            min_len=55,
+            no_repeat_ngram_size=3,
+        )
+        decoded = [
+            tok.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in hypotheses_batch
+        ]
+        self.assertListEqual(
+            [EXPECTED_SUMMARY_FRANCE, EXPECTED_SUMMARY_SHORTER, EXPECTED_SUMMARY_IRAN, EXPECTED_SUMMARY_SUBWAY],
+            decoded,
+        )
+        # TODO(SS): run fairseq again with num_beams=2, min_len=20.
+        # TODO(SS): add test case that hits max_length

From c0135194ebc5de4b1bbef98b31f9c457a0bf746a Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Mon, 2 Mar 2020 16:53:55 +0100
Subject: [PATCH 29/80] Force pad_token_id to be set before padding for
 standard tokenizer (#3035)

* force pad_token_id to be set before padding

* fix tests and forbid padding without having a padding_token_id set
---
 src/transformers/tokenization_utils.py | 14 ++++++++++-
 tests/test_tokenization_common.py      | 34 ++++++++++++++++++++++++++
 2 files changed, 47 insertions(+), 1 deletion(-)

diff --git a/src/transformers/tokenization_utils.py b/src/transformers/tokenization_utils.py
index e2ae20c7da..75119e9285 100644
--- a/src/transformers/tokenization_utils.py
+++ b/src/transformers/tokenization_utils.py
@@ -1012,6 +1012,12 @@ class PreTrainedTokenizer(object):
                 "https://github.com/huggingface/transformers/pull/2674"
             )
 
+        # Throw an error if we cannot pad because there is no padding token
+        if pad_to_max_length and self.pad_token_id is None:
+            raise ValueError(
+                "Unable to set proper padding strategy as the tokenizer does not have a padding token. In this case please set the `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` or add a new pad token via the function add_special_tokens if you want to use a padding strategy"
+            )
+
         first_ids = get_input_ids(text)
         second_ids = get_input_ids(text_pair) if text_pair is not None else None
 
@@ -1115,6 +1121,12 @@ class PreTrainedTokenizer(object):
                     "Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers."
                 )
 
+        # Throw an error if we cannot pad because there is no padding token
+        if pad_to_max_length and self.pad_token_id is None:
+            raise ValueError(
+                "Unable to set proper padding strategy as the tokenizer does not have a padding token. In this case please set the `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` or add a new pad token via the function add_special_tokens if you want to use a padding strategy"
+            )
+
         if return_offsets_mapping:
             raise NotImplementedError(
                 "return_offset_mapping is not available when using Python tokenizers."
@@ -1788,7 +1800,7 @@ class PreTrainedTokenizerFast(PreTrainedTokenizer):
 
         # Throw an error if we can pad because there is no padding token
         if pad_to_max_length and self.pad_token_id is None:
-            raise ValueError("Unable to set proper padding strategy as the tokenizer does have padding token")
+            raise ValueError("Unable to set proper padding strategy as the tokenizer does not have a padding token")
 
         # Set the truncation and padding strategy and restore the initial configuration
         with truncate_and_pad(
diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py
index 1ca830004b..b1f69fbfc1 100644
--- a/tests/test_tokenization_common.py
+++ b/tests/test_tokenization_common.py
@@ -449,6 +449,10 @@ class TokenizerTesterMixin:
 
         sequence = "Sequence"
         padding_size = 10
+
+        # if the tokenizer has no pad_token_id, check that padding raises an error, then add a pad token
+        self._check_no_pad_token_padding(tokenizer, sequence)
+
         padding_idx = tokenizer.pad_token_id
 
         # RIGHT PADDING - Check that it correctly pads when a maximum length is specified along with the padding flag set to True
@@ -490,6 +494,10 @@ class TokenizerTesterMixin:
         tokenizer = self.get_tokenizer()
 
         sequence = "Sequence"
+
+        # if the tokenizer has no pad_token_id, check that padding raises an error, then add a pad token
+        self._check_no_pad_token_padding(tokenizer, sequence)
+
         padding_size = 10
         padding_idx = tokenizer.pad_token_id
         token_type_padding_idx = tokenizer.pad_token_type_id
@@ -503,6 +511,7 @@ class TokenizerTesterMixin:
 
         # Test right padding
         tokenizer.padding_side = "right"
+
         padded_sequence = tokenizer.encode_plus(
             sequence,
             max_length=sequence_length + padding_size,
@@ -588,10 +597,14 @@ class TokenizerTesterMixin:
 
         maximum_length = len(max([encoded_sequence["input_ids"] for encoded_sequence in encoded_sequences], key=len))
 
+        # if the tokenizer has no pad_token_id, check that padding raises an error, then add a pad token
+        self._check_no_pad_token_padding(tokenizer, sequences)
+
         encoded_sequences_padded = [
             tokenizer.encode_plus(sequence, pad_to_max_length=True, max_length=maximum_length)
             for sequence in sequences
         ]
+
         encoded_sequences_batch_padded = tokenizer.batch_encode_plus(sequences, pad_to_max_length=True)
         self.assertListEqual(
             encoded_sequences_padded,
@@ -610,6 +623,10 @@ class TokenizerTesterMixin:
         ]
 
         max_length = 100
+
+        # if the tokenizer has no pad_token_id, check that padding raises an error, then add a pad token
+        self._check_no_pad_token_padding(tokenizer, sequences)
+
         encoded_sequences = [
             tokenizer.encode_plus(sequence, pad_to_max_length=True, max_length=max_length) for sequence in sequences
         ]
@@ -620,6 +637,7 @@ class TokenizerTesterMixin:
 
         # Left padding tests
         tokenizer = self.get_tokenizer()
+
         tokenizer.padding_side = "left"
         sequences = [
             "Testing batch encode plus",
@@ -628,6 +646,10 @@ class TokenizerTesterMixin:
         ]
 
         max_length = 100
+
+        # if the tokenizer has no pad_token_id, check that padding raises an error, then add a pad token
+        self._check_no_pad_token_padding(tokenizer, sequences)
+
         encoded_sequences = [
             tokenizer.encode_plus(sequence, pad_to_max_length=True, max_length=max_length) for sequence in sequences
         ]
@@ -668,3 +690,15 @@ class TokenizerTesterMixin:
                 encoded_value = encoded_sequences[key]
 
                 self.assertEqual(pytorch_value, tensorflow_value, encoded_value)
+
+    def _check_no_pad_token_padding(self, tokenizer, sequences):
+        # if tokenizer does not have pad_token_id, an error should be thrown
+        if tokenizer.pad_token_id is None:
+            with self.assertRaises(ValueError):
+                if isinstance(sequences, list):
+                    tokenizer.batch_encode_plus(sequences, pad_to_max_length=True)
+                else:
+                    tokenizer.encode_plus(sequences, pad_to_max_length=True)
+
+            # add pad_token_id to pass subsequent tests
+            tokenizer.add_special_tokens({"pad_token": "<PAD>"})

From 13afb71208a985960fe8eec8eac0804158c54e85 Mon Sep 17 00:00:00 2001
From: Julien Chaumond <chaumond@gmail.com>
Date: Mon, 2 Mar 2020 11:56:45 -0500
Subject: [PATCH 30/80] [ci] Ensure that TF does not preempt all GPU memory for
 itself

see https://www.tensorflow.org/guide/gpu#limiting_gpu_memory_growth

Co-Authored-By: Funtowicz Morgan <mfuntowicz@users.noreply.github.com>
Co-Authored-By: Lysandre Debut <lysandre.debut@reseau.eseo.fr>
---
 .github/workflows/self-push.yml      | 1 +
 .github/workflows/self-scheduled.yml | 1 +
 2 files changed, 2 insertions(+)

diff --git a/.github/workflows/self-push.yml b/.github/workflows/self-push.yml
index cfbe999699..e21520d0e8 100644
--- a/.github/workflows/self-push.yml
+++ b/.github/workflows/self-push.yml
@@ -40,6 +40,7 @@ jobs:
 
     - name: Run all non-slow tests on GPU
       env:
+        TF_FORCE_GPU_ALLOW_GROWTH: yes
         OMP_NUM_THREADS: 1
         USE_CUDA: yes
       run: |
diff --git a/.github/workflows/self-scheduled.yml b/.github/workflows/self-scheduled.yml
index 7c33d5dfcb..d963ab5920 100644
--- a/.github/workflows/self-scheduled.yml
+++ b/.github/workflows/self-scheduled.yml
@@ -41,6 +41,7 @@ jobs:
 
     - name: Run all tests on GPU
       env:
+        TF_FORCE_GPU_ALLOW_GROWTH: yes
         OMP_NUM_THREADS: 1
         RUN_SLOW: yes
         USE_CUDA: yes

From 2fdc7f6ce8e15793568645b46e4badf7dbe4ecd8 Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Mon, 2 Mar 2020 18:00:09 +0100
Subject: [PATCH 31/80] correct greedy generation when doing beam search
 (#3078)

* correct greedy generation when doing beam search

* improve comment
---
 src/transformers/modeling_utils.py | 37 +++++++++++++++++++++++++-----
 tests/test_modeling_common.py      | 11 ++++++++-
 2 files changed, 41 insertions(+), 7 deletions(-)

diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py
index efd622656e..e771fd5cc9 100644
--- a/src/transformers/modeling_utils.py
+++ b/src/transformers/modeling_utils.py
@@ -754,6 +754,19 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin):
         else:
             assert input_ids.dim() == 2, "Input prompt should be of shape (batch_size, sequence length)."
 
+        if do_sample is False:
+            if num_beams == 1:
+                # no_beam_search greedy generation conditions
+                assert (
+                    num_return_sequences == 1
+                ), "Greedy decoding will always produce the same output for num_beams == 1 and num_return_sequences > 1. Please set num_return_sequences = 1"
+
+            else:
+                # beam_search greedy generation conditions
+                assert (
+                    num_beams >= num_return_sequences
+                ), "Greedy beam search decoding cannot return more sequences than it has beams. Please set num_beams >= num_return_sequences"
+
         if pad_token_id is None and eos_token_ids is not None:
             logger.warning(
                 "Setting `pad_token_id` to {} (first `eos_token_id`) to generate sequence".format(eos_token_ids[0])
@@ -764,7 +777,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin):
         cur_len = input_ids.shape[1]
         vocab_size = self.config.vocab_size
 
-        if num_return_sequences != 1:
+        if num_return_sequences != 1 and do_sample:
             # Expand input to num return sequences
             input_ids = input_ids.unsqueeze(1).expand(batch_size, num_return_sequences, cur_len)
             input_ids = input_ids.contiguous().view(
@@ -787,6 +800,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin):
                 pad_token_id,
                 eos_token_ids,
                 effective_batch_size,
+                num_return_sequences,
                 length_penalty,
                 num_beams,
                 vocab_size,
@@ -826,6 +840,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin):
             All returned sequence are generated independantly.
         """
         # current position / max lengths / length of generated sentences / unfinished sentences
+
         unfinished_sents = input_ids.new(batch_size).fill_(1)
         sent_lengths = input_ids.new(batch_size).fill_(max_length)
 
@@ -906,12 +921,14 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin):
         pad_token_id,
         eos_token_ids,
         batch_size,
+        num_return_sequences,
         length_penalty,
         num_beams,
         vocab_size,
     ):
         """ Generate sequences for each example with beam search.
         """
+
         # Expand input to num beams
         # assert input_ids.shape == (batch_size * num_beams, cur_len)
         input_ids = input_ids.unsqueeze(1).expand(batch_size, num_beams, cur_len)
@@ -1057,20 +1074,28 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin):
                         input_ids[batch_idx * num_beams + beam_id, :cur_len].clone(), score.item()
                     )
 
+        # depending on whether greedy generation is wanted or not define different output_batch_size and output_num_return_sequences_per_batch
+        output_batch_size = batch_size if do_sample else batch_size * num_return_sequences
+        output_num_return_sequences_per_batch = 1 if do_sample else num_return_sequences
+
         # select the best hypotheses
-        sent_lengths = input_ids.new(batch_size)
+        sent_lengths = input_ids.new(output_batch_size)
         best = []
 
+        # retrieve best hypotheses
         for i, hypotheses in enumerate(generated_hyps):
-            best_hyp = max(hypotheses.beams, key=lambda x: x[0])[1]
-            sent_lengths[i] = len(best_hyp)
-            best.append(best_hyp)
+            sorted_hyps = sorted(hypotheses.beams, key=lambda x: x[0])
+            for j in range(output_num_return_sequences_per_batch):
+                effective_batch_idx = output_num_return_sequences_per_batch * i + j
+                best_hyp = sorted_hyps.pop()[1]
+                sent_lengths[effective_batch_idx] = len(best_hyp)
+                best.append(best_hyp)
 
         # shorter batches are filled with pad_token
         if sent_lengths.min().item() != sent_lengths.max().item():
             assert pad_token_id is not None, "`Pad_token_id` has to be defined"
             sent_max_len = min(sent_lengths.max().item() + 1, max_length)
-            decoded = input_ids.new(batch_size, sent_max_len).fill_(pad_token_id)
+            decoded = input_ids.new(output_batch_size, sent_max_len).fill_(pad_token_id)
 
             # fill with hypothesis and eos_token_id if necessary
             for i, hypo in enumerate(best):
diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py
index 4e5202a65a..5277864eca 100644
--- a/tests/test_modeling_common.py
+++ b/tests/test_modeling_common.py
@@ -621,10 +621,19 @@ class ModelTesterMixin:
                 # batch_size = 1, num_beams > 1
                 self._check_generated_tokens(model.generate(max_length=5, num_beams=3))
 
+            with self.assertRaises(AssertionError):
+                # generating multiple sequences when greedy no beam generation
+                # is not allowed as it would always generate the same sequences
+                model.generate(input_ids, do_sample=False, num_return_sequences=2)
+
+            with self.assertRaises(AssertionError):
+                # generating more sequences than there are beams is not possible
+                model.generate(input_ids, do_sample=False, num_return_sequences=3, num_beams=2)
+
             # batch_size > 1, sample
             self._check_generated_tokens(model.generate(input_ids, num_return_sequences=3))
             # batch_size > 1, greedy
-            self._check_generated_tokens(model.generate(input_ids, do_sample=False, num_return_sequences=3))
+            self._check_generated_tokens(model.generate(input_ids, do_sample=False))
             # batch_size > 1, num_beams > 1, sample
             self._check_generated_tokens(model.generate(input_ids, num_beams=3, num_return_sequences=3,))
             # batch_size > 1, num_beams > 1, greedy

From 0e56b37e805279ecb61670159fa8c71487214e0a Mon Sep 17 00:00:00 2001
From: Julien Chaumond <chaumond@gmail.com>
Date: Mon, 2 Mar 2020 12:23:58 -0500
Subject: [PATCH 32/80] rm bogus file

cc @patrickvonplaten
---
 tests/test_modeling_common.py! | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 delete mode 100644 tests/test_modeling_common.py!

diff --git a/tests/test_modeling_common.py! b/tests/test_modeling_common.py!
deleted file mode 100644
index e69de29bb2..0000000000

From 2c7749784c2581cff57fdfbecfc16fb4a11a45e0 Mon Sep 17 00:00:00 2001
From: Manuel Romero <mrm8488@gmail.com>
Date: Mon, 2 Mar 2020 19:31:58 +0100
Subject: [PATCH 33/80] Update README.md

- Add example of usage
- Update metrics
---
 .../README.md                                 | 56 ++++++++++++-------
 1 file changed, 37 insertions(+), 19 deletions(-)

diff --git a/model_cards/mrm8488/bert-spanish-cased-finetuned-ner/README.md b/model_cards/mrm8488/bert-spanish-cased-finetuned-ner/README.md
index aa32ca6fde..e50b8cfd0c 100644
--- a/model_cards/mrm8488/bert-spanish-cased-finetuned-ner/README.md
+++ b/model_cards/mrm8488/bert-spanish-cased-finetuned-ner/README.md
@@ -19,22 +19,29 @@ I preprocessed the dataset and splitted it as train / dev (80/20)
 | Dev                    | 2.2 K |
 
 
-- [Fine-tune on NER script](https://github.com/huggingface/transformers/blob/master/examples/run_ner.py)
+- [Fine-tune on NER script provided by Huggingface](https://github.com/huggingface/transformers/blob/master/examples/run_ner.py)
 
-```bash
-!export NER_DIR='/content/ner_dataset'
-!python /content/transformers/examples/run_ner.py \
-  --model_type bert \
-  --model_name_or_path dccuchile/bert-base-spanish-wwm-cased \
-  --do_train \
-  --do_eval \
-  --data_dir '/content/ner_dataset' \
-  --num_train_epochs 15.0 \
-  --max_seq_length 384 \
-  --output_dir /content/model_output \
-  --save_steps 5000 \
+- Labels covered:
 
 ```
+B-LOC
+B-MISC
+B-ORG
+B-PER
+I-LOC
+I-MISC
+I-ORG
+I-PER
+O
+```
+
+## Metrics on evaluation set:
+
+|                                                      Metric                                                       |  # score  |
+| :------------------------------------------------------------------------------------: | :-------: |
+| F1                                       | **90.17** |
+| Precision                                | **89.86** | 
+| Recall                                   | **90.47** |    
 
 ## Comparison:
 
@@ -44,13 +51,24 @@ I preprocessed the dataset and splitted it as train / dev (80/20)
 | [bert-spanish-cased-finetuned-ner (this one)](https://huggingface.co/mrm8488/bert-spanish-cased-finetuned-ner) | **89.65** |
 |                                              Best Multilingual BERT                                              |   87.38   |
 
-```
- ***** All metrics on Eval results  *****
+## Model in action
 
-f1 = 0.8965040489828165
-loss = 0.11504213575173258
-precision = 0.893679858239811
-recall = 0.8993461462254805
+Fast usage with **pipelines**:
+
+```python
+from transformers import pipeline
+
+nlp_ner = pipeline(
+    "ner",
+    model="mrm8488/bert-spanish-cased-finetuned-ner",
+    tokenizer=(
+        'mrm8488/bert-spanish-cased-finetuned-ner',  
+        {"use_fast": False}
+))
+
+nlp_ner(text)
+
+#Output: [{'entity': 'B-LOC', 'score': 0.9998720288276672, 'word': 'Londres'}]
 ```
 
 > Created by [Manuel Romero/@mrm8488](https://twitter.com/mrm8488)

From d3eb7d23a46c1877880e3b791edc63aed3e746a5 Mon Sep 17 00:00:00 2001
From: Lysandre Debut <lysandre@huggingface.co>
Date: Mon, 2 Mar 2020 14:07:10 -0500
Subject: [PATCH 34/80] Pipeline doc (#3055)

* Pipeline doc initial commit

* pipeline abstraction

* Remove modelcard argument from pipeline

* Task-specific pipelines can be instantiated with no model or tokenizer

* All pipelines doc
---
 docs/source/index.rst                  |   1 +
 docs/source/main_classes/pipelines.rst |  63 +++++
 src/transformers/pipelines.py          | 374 ++++++++++++++++++++++---
 tests/test_pipelines.py                |  38 ++-
 4 files changed, 432 insertions(+), 44 deletions(-)
 create mode 100644 docs/source/main_classes/pipelines.rst

diff --git a/docs/source/index.rst b/docs/source/index.rst
index 5c593eacf4..5180ae9af9 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -80,6 +80,7 @@ The library currently contains PyTorch and Tensorflow implementations, pre-train
     main_classes/configuration
     main_classes/model
     main_classes/tokenizer
+    main_classes/pipelines
     main_classes/optimizer_schedules
     main_classes/processors
 
diff --git a/docs/source/main_classes/pipelines.rst b/docs/source/main_classes/pipelines.rst
new file mode 100644
index 0000000000..9e8bfa8af8
--- /dev/null
+++ b/docs/source/main_classes/pipelines.rst
@@ -0,0 +1,63 @@
+Pipelines
+----------------------------------------------------
+
+The pipelines are a great and easy way to use models for inference. These pipelines are objects that abstract most
+of the complex code from the library, offering a simple API dedicated to several tasks, including Named Entity
+Recognition, Masked Language Modeling, Sentiment Analysis, Feature Extraction and Question Answering.
+
+There are two categories of pipeline abstractions to be aware of:
+
+- The :class:`~transformers.pipeline` which is the most powerful object encapsulating all other pipelines
+- The other task-specific pipelines, such as :class:`~transformers.NerPipeline`
+  or :class:`~transformers.QuestionAnsweringPipeline`
+
+The pipeline abstraction
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The `pipeline` abstraction is a wrapper around all the other available pipelines. It is instantiated as any
+other pipeline but requires an additional argument which is the `task`.
+
+.. autoclass:: transformers.pipeline
+    :members:
+
+
+The task specific pipelines
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Parent class: Pipeline
+=========================================
+
+.. autoclass:: transformers.Pipeline
+    :members: predict, transform, save_pretrained
+
+NerPipeline
+==========================================
+
+.. autoclass:: transformers.NerPipeline
+
+TokenClassificationPipeline
+==========================================
+
+This class is an alias of the :class:`~transformers.NerPipeline` defined above. Please refer to that pipeline for
+documentation and usage examples.
+
+FillMaskPipeline
+==========================================
+
+.. autoclass:: transformers.FillMaskPipeline
+
+FeatureExtractionPipeline
+==========================================
+
+.. autoclass:: transformers.FeatureExtractionPipeline
+
+TextClassificationPipeline
+==========================================
+
+.. autoclass:: transformers.TextClassificationPipeline
+
+QuestionAnsweringPipeline
+==========================================
+
+.. autoclass:: transformers.QuestionAnsweringPipeline
+
diff --git a/src/transformers/pipelines.py b/src/transformers/pipelines.py
index 904666ddea..cd7b9ca55d 100755
--- a/src/transformers/pipelines.py
+++ b/src/transformers/pipelines.py
@@ -279,6 +279,9 @@ class _ScikitCompat(ABC):
 
 class Pipeline(_ScikitCompat):
     """
+    The Pipeline class is the class from which all pipelines inherit. Refer to this class for methods shared across
+    different pipelines.
+
     Base class implementing pipelined operations.
     Pipeline workflow is defined as a sequence of the following operations:
         Input -> Tokenization -> Model Inference -> Post-Processing (Task dependent) -> Output
@@ -292,39 +295,49 @@ class Pipeline(_ScikitCompat):
     pickle format.
 
     Arguments:
-        **model**: ``(str, PretrainedModel, TFPretrainedModel)``:
-            Reference to the model to use through this pipeline.
+        model (:obj:`str` or :obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`, `optional`, defaults to :obj:`None`):
+            The model that will be used by the pipeline to make predictions. This can be :obj:`None`, a string
+            checkpoint identifier or an actual pre-trained model inheriting from
+            :class:`~transformers.PreTrainedModel` for PyTorch and :class:`~transformers.TFPreTrainedModel` for
+            TensorFlow.
 
-        **tokenizer**: ``(str, PreTrainedTokenizer)``:
-            Reference to the tokenizer to use through this pipeline.
+            If :obj:`None`, the default of the pipeline will be loaded.
+        tokenizer (:obj:`str` or :obj:`~transformers.PreTrainedTokenizer`, `optional`, defaults to :obj:`None`):
+            The tokenizer that will be used by the pipeline to encode data for the model. This can be :obj:`None`,
+            a string checkpoint identifier or an actual pre-trained tokenizer inheriting from
+            :class:`~transformers.PreTrainedTokenizer`.
 
-        **args_parser**: ``ArgumentHandler``:
+            If :obj:`None`, the default of the pipeline will be loaded.
+        modelcard (:obj:`str` or :class:`~transformers.ModelCard`, `optional`, defaults to :obj:`None`):
+            Model card attributed to the model for this pipeline.
+        framework (:obj:`str`, `optional`, defaults to :obj:`None`):
+            The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be
+            installed.
+
+            If no framework is specified, will default to the one currently installed. If no framework is specified
+            and both frameworks are installed, will default to PyTorch.
+        args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`):
             Reference to the object in charge of parsing supplied pipeline parameters.
-
-        **device**: ``int``:
+        device (:obj:`int`, `optional`, defaults to :obj:`-1`):
             Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, >=0 will run the model
             on the associated CUDA device id.
-
-        **binary_output** ``bool`` (default: False):
+        binary_output (:obj:`bool`, `optional`, defaults to :obj:`False`):
             Flag indicating if the output the pipeline should happen in a binary format (i.e. pickle) or as raw text.
 
     Return:
+        :obj:`List` or :obj:`Dict`:
         Pipeline returns list or dictionary depending on:
-         - Does the user provided multiple sample
-         - The pipeline expose multiple fields in the output object
 
-    Examples:
-        nlp = pipeline('ner')
-        nlp = pipeline('ner', model='...', config='...', tokenizer='...')
-        nlp = NerPipeline(model='...', config='...', tokenizer='...')
-        nlp = QuestionAnsweringPipeline(model=AutoModel.from_pretrained('...'), tokenizer='...')
+         - Whether the user supplied multiple samples
+         - Whether the pipeline exposes multiple fields in the output object
     """
 
     default_input_names = None
+    task = None
 
     def __init__(
         self,
-        model,
+        model: Optional = None,
         tokenizer: PreTrainedTokenizer = None,
         modelcard: Optional[ModelCard] = None,
         framework: Optional[str] = None,
@@ -336,6 +349,8 @@ class Pipeline(_ScikitCompat):
         if framework is None:
             framework = get_framework()
 
+        model, tokenizer = self.get_defaults(model, tokenizer, framework)
+
         self.model = model
         self.tokenizer = tokenizer
         self.modelcard = modelcard
@@ -467,15 +482,74 @@ class Pipeline(_ScikitCompat):
         else:
             return predictions.numpy()
 
+    def get_defaults(self, model, tokenizer, framework):
+        task_defaults = SUPPORTED_TASKS[self.task]
+        if model is None:
+            if framework == "tf":
+                model = task_defaults["tf"].from_pretrained(task_defaults["default"]["model"]["tf"])
+            elif framework == "pt":
+                model = task_defaults["pt"].from_pretrained(task_defaults["default"]["model"]["pt"])
+            else:
+                raise ValueError("Provided framework should be either 'tf' for TensorFlow or 'pt' for PyTorch.")
+
+        if tokenizer is None:
+            default_tokenizer = task_defaults["default"]["tokenizer"]
+            if isinstance(default_tokenizer, tuple):
+                # For tuple we have (tokenizer name, {kwargs})
+                tokenizer = AutoTokenizer.from_pretrained(default_tokenizer[0], **default_tokenizer[1])
+            else:
+                tokenizer = AutoTokenizer.from_pretrained(default_tokenizer)
+
+        return model, tokenizer
+
 
 class FeatureExtractionPipeline(Pipeline):
     """
-    Feature extraction pipeline using Model head.
+    Feature extraction pipeline using Model head. This pipeline extracts the hidden states from the base transformer,
+    which can be used as features in downstream tasks.
+
+    This feature extraction pipeline can currently be loaded from the :func:`~transformers.pipeline` method using
+    the following task identifier(s):
+
+    - "feature-extraction", for extracting features of a sequence.
+
+    All models may be used for this pipeline. See a list of all models, including community-contributed models on
+    `huggingface.co/models <https://huggingface.co/models>`__.
+
+    Arguments:
+        model (:obj:`str` or :obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`, `optional`, defaults to :obj:`None`):
+            The model that will be used by the pipeline to make predictions. This can be :obj:`None`, a string
+            checkpoint identifier or an actual pre-trained model inheriting from
+            :class:`~transformers.PreTrainedModel` for PyTorch and :class:`~transformers.TFPreTrainedModel` for
+            TensorFlow.
+
+            If :obj:`None`, the default of the pipeline will be loaded.
+        tokenizer (:obj:`str` or :obj:`~transformers.PreTrainedTokenizer`, `optional`, defaults to :obj:`None`):
+            The tokenizer that will be used by the pipeline to encode data for the model. This can be :obj:`None`,
+            a string checkpoint identifier or an actual pre-trained tokenizer inheriting from
+            :class:`~transformers.PreTrainedTokenizer`.
+
+            If :obj:`None`, the default of the pipeline will be loaded.
+        modelcard (:obj:`str` or :class:`~transformers.ModelCard`, `optional`, defaults to :obj:`None`):
+            Model card attributed to the model for this pipeline.
+        framework (:obj:`str`, `optional`, defaults to :obj:`None`):
+            The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be
+            installed.
+
+            If no framework is specified, will default to the one currently installed. If no framework is specified
+            and both frameworks are installed, will default to PyTorch.
+        args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`):
+            Reference to the object in charge of parsing supplied pipeline parameters.
+        device (:obj:`int`, `optional`, defaults to :obj:`-1`):
+            Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, >=0 will run the model
+            on the associated CUDA device id.
     """
 
+    task = "feature-extraction"
+
     def __init__(
         self,
-        model,
+        model: Optional = None,
         tokenizer: PreTrainedTokenizer = None,
         modelcard: Optional[ModelCard] = None,
         framework: Optional[str] = None,
@@ -498,9 +572,49 @@ class FeatureExtractionPipeline(Pipeline):
 
 class TextClassificationPipeline(Pipeline):
     """
-    Text classification pipeline using ModelForTextClassification head.
+    Text classification pipeline using ModelForSequenceClassification head. See the
+    `sequence classification usage <../usage.html#sequence-classification>`__ examples for more information.
+
+    This text classification pipeline can currently be loaded from the :func:`~transformers.pipeline` method using
+    the following task identifier(s):
+
+    - "sentiment-analysis", for classifying sequences according to positive or negative sentiments.
+
+    The models that this pipeline can use are models that have been fine-tuned on a sequence classification task.
+    See the list of available community models fine-tuned on such a task on
+    `huggingface.co/models <https://huggingface.co/models?search=&filter=text-classification>`__.
+
+    Arguments:
+        model (:obj:`str` or :obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`, `optional`, defaults to :obj:`None`):
+            The model that will be used by the pipeline to make predictions. This can be :obj:`None`, a string
+            checkpoint identifier or an actual pre-trained model inheriting from
+            :class:`~transformers.PreTrainedModel` for PyTorch and :class:`~transformers.TFPreTrainedModel` for
+            TensorFlow.
+
+            If :obj:`None`, the default of the pipeline will be loaded.
+        tokenizer (:obj:`str` or :obj:`~transformers.PreTrainedTokenizer`, `optional`, defaults to :obj:`None`):
+            The tokenizer that will be used by the pipeline to encode data for the model. This can be :obj:`None`,
+            a string checkpoint identifier or an actual pre-trained tokenizer inheriting from
+            :class:`~transformers.PreTrainedTokenizer`.
+
+            If :obj:`None`, the default of the pipeline will be loaded.
+        modelcard (:obj:`str` or :class:`~transformers.ModelCard`, `optional`, defaults to :obj:`None`):
+            Model card attributed to the model for this pipeline.
+        framework (:obj:`str`, `optional`, defaults to :obj:`None`):
+            The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be
+            installed.
+
+            If no framework is specified, will default to the one currently installed. If no framework is specified
+            and both frameworks are installed, will default to PyTorch.
+        args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`):
+            Reference to the object in charge of parsing supplied pipeline parameters.
+        device (:obj:`int`, `optional`, defaults to :obj:`-1`):
+            Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, >=0 will run the model
+            on the associated CUDA device id.
     """
 
+    task = "sentiment-analysis"
+
     def __call__(self, *args, **kwargs):
         outputs = super().__call__(*args, **kwargs)
         scores = np.exp(outputs) / np.exp(outputs).sum(-1)
@@ -509,12 +623,53 @@ class TextClassificationPipeline(Pipeline):
 
 class FillMaskPipeline(Pipeline):
     """
-    Masked language modeling prediction pipeline using ModelWithLMHead head.
+    Masked language modeling prediction pipeline using ModelWithLMHead head. See the
+    `masked language modeling usage <../usage.html#masked-language-modeling>`__ examples for more information.
+
+    This mask filling pipeline can currently be loaded from the :func:`~transformers.pipeline` method using
+    the following task identifier(s):
+
+    - "fill-mask", for predicting masked tokens in a sequence.
+
+    The models that this pipeline can use are models that have been trained with a masked language modeling objective,
+    which includes the bi-directional models in the library.
+    See the list of available community models on
+    `huggingface.co/models <https://huggingface.co/models?search=&filter=lm-head>`__.
+
+    Arguments:
+        model (:obj:`str` or :obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`, `optional`, defaults to :obj:`None`):
+            The model that will be used by the pipeline to make predictions. This can be :obj:`None`, a string
+            checkpoint identifier or an actual pre-trained model inheriting from
+            :class:`~transformers.PreTrainedModel` for PyTorch and :class:`~transformers.TFPreTrainedModel` for
+            TensorFlow.
+
+            If :obj:`None`, the default of the pipeline will be loaded.
+        tokenizer (:obj:`str` or :obj:`~transformers.PreTrainedTokenizer`, `optional`, defaults to :obj:`None`):
+            The tokenizer that will be used by the pipeline to encode data for the model. This can be :obj:`None`,
+            a string checkpoint identifier or an actual pre-trained tokenizer inheriting from
+            :class:`~transformers.PreTrainedTokenizer`.
+
+            If :obj:`None`, the default of the pipeline will be loaded.
+        modelcard (:obj:`str` or :class:`~transformers.ModelCard`, `optional`, defaults to :obj:`None`):
+            Model card attributed to the model for this pipeline.
+        framework (:obj:`str`, `optional`, defaults to :obj:`None`):
+            The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be
+            installed.
+
+            If no framework is specified, will default to the one currently installed. If no framework is specified
+            and both frameworks are installed, will default to PyTorch.
+        args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`):
+            Reference to the object in charge of parsing supplied pipeline parameters.
+        device (:obj:`int`, `optional`, defaults to :obj:`-1`):
+            Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, >=0 will run the model
+            on the associated CUDA device id.
     """
 
+    task = "fill-mask"
+
     def __init__(
         self,
-        model,
+        model: Optional = None,
         tokenizer: PreTrainedTokenizer = None,
         modelcard: Optional[ModelCard] = None,
         framework: Optional[str] = None,
@@ -574,14 +729,57 @@ class FillMaskPipeline(Pipeline):
 
 class NerPipeline(Pipeline):
     """
-    Named Entity Recognition pipeline using ModelForTokenClassification head.
+    Named Entity Recognition pipeline using ModelForTokenClassification head. See the
+    `named entity recognition usage <../usage.html#named-entity-recognition>`__ examples for more information.
+
+    This token recognition pipeline can currently be loaded from the :func:`~transformers.pipeline` method using
+    the following task identifier(s):
+
+    - "ner", for predicting the classes of tokens in a sequence: person, organisation, location or miscellaneous.
+
+    The models that this pipeline can use are models that have been fine-tuned on a token classification task.
+    See the list of available community models fine-tuned on such a task on
+    `huggingface.co/models <https://huggingface.co/models?search=&filter=token-classification>`__.
+
+    Arguments:
+        model (:obj:`str` or :obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`, `optional`, defaults to :obj:`None`):
+            The model that will be used by the pipeline to make predictions. This can be :obj:`None`, a string
+            checkpoint identifier or an actual pre-trained model inheriting from
+            :class:`~transformers.PreTrainedModel` for PyTorch and :class:`~transformers.TFPreTrainedModel` for
+            TensorFlow.
+
+            If :obj:`None`, the default of the pipeline will be loaded.
+        tokenizer (:obj:`str` or :obj:`~transformers.PreTrainedTokenizer`, `optional`, defaults to :obj:`None`):
+            The tokenizer that will be used by the pipeline to encode data for the model. This can be :obj:`None`,
+            a string checkpoint identifier or an actual pre-trained tokenizer inheriting from
+            :class:`~transformers.PreTrainedTokenizer`.
+
+            If :obj:`None`, the default of the pipeline will be loaded.
+        modelcard (:obj:`str` or :class:`~transformers.ModelCard`, `optional`, defaults to :obj:`None`):
+            Model card attributed to the model for this pipeline.
+        framework (:obj:`str`, `optional`, defaults to :obj:`None`):
+            The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be
+            installed.
+
+            If no framework is specified, will default to the one currently installed. If no framework is specified
+            and both frameworks are installed, will default to PyTorch.
+        args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`):
+            Reference to the object in charge of parsing supplied pipeline parameters.
+        device (:obj:`int`, `optional`, defaults to :obj:`-1`):
+            Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, >=0 will run the model
+            on the associated CUDA device id.
+
+    Example::
+
+        from transformers import pipeline
     """
 
     default_input_names = "sequences"
+    task = "ner"
 
     def __init__(
         self,
-        model,
+        model: Optional = None,
         tokenizer: PreTrainedTokenizer = None,
         modelcard: Optional[ModelCard] = None,
         framework: Optional[str] = None,
@@ -716,15 +914,54 @@ class QuestionAnsweringArgumentHandler(ArgumentHandler):
 
 class QuestionAnsweringPipeline(Pipeline):
     """
-    Question Answering pipeline using ModelForQuestionAnswering head.
+    Question Answering pipeline using ModelForQuestionAnswering head. See the
+    `question answering usage <../usage.html#question-answering>`__ examples for more information.
+
+    This question answering pipeline can currently be loaded from the :func:`~transformers.pipeline` method using
+    the following task identifier(s):
+
+    - "question-answering", for answering questions given a context.
+
+    The models that this pipeline can use are models that have been fine-tuned on a question answering task.
+    See the list of available community models fine-tuned on such a task on
+    `huggingface.co/models <https://huggingface.co/models?search=&filter=question-answering>`__.
+
+    Arguments:
+        model (:obj:`str` or :obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`, `optional`, defaults to :obj:`None`):
+            The model that will be used by the pipeline to make predictions. This can be :obj:`None`, a string
+            checkpoint identifier or an actual pre-trained model inheriting from
+            :class:`~transformers.PreTrainedModel` for PyTorch and :class:`~transformers.TFPreTrainedModel` for
+            TensorFlow.
+
+            If :obj:`None`, the default of the pipeline will be loaded.
+        tokenizer (:obj:`str` or :obj:`~transformers.PreTrainedTokenizer`, `optional`, defaults to :obj:`None`):
+            The tokenizer that will be used by the pipeline to encode data for the model. This can be :obj:`None`,
+            a string checkpoint identifier or an actual pre-trained tokenizer inheriting from
+            :class:`~transformers.PreTrainedTokenizer`.
+
+            If :obj:`None`, the default of the pipeline will be loaded.
+        modelcard (:obj:`str` or :class:`~transformers.ModelCard`, `optional`, defaults to :obj:`None`):
+            Model card attributed to the model for this pipeline.
+        framework (:obj:`str`, `optional`, defaults to :obj:`None`):
+            The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be
+            installed.
+
+            If no framework is specified, will default to the one currently installed. If no framework is specified
+            and both frameworks are installed, will default to PyTorch.
+        args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`):
+            Reference to the object in charge of parsing supplied pipeline parameters.
+        device (:obj:`int`, `optional`, defaults to :obj:`-1`):
+            Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, >=0 will run the model
+            on the associated CUDA device id.
     """
 
     default_input_names = "question,context"
+    task = "question-answering"
 
     def __init__(
         self,
-        model,
-        tokenizer: Optional[PreTrainedTokenizer],
+        model: Optional = None,
+        tokenizer: Optional[PreTrainedTokenizer] = None,
         modelcard: Optional[ModelCard] = None,
         framework: Optional[str] = None,
         device: int = -1,
@@ -1003,23 +1240,77 @@ def pipeline(
     model: Optional = None,
     config: Optional[Union[str, PretrainedConfig]] = None,
     tokenizer: Optional[Union[str, PreTrainedTokenizer]] = None,
-    modelcard: Optional[Union[str, ModelCard]] = None,
     framework: Optional[str] = None,
     **kwargs
 ) -> Pipeline:
     """
     Utility factory method to build a pipeline.
-    Pipeline are made of:
-        A Tokenizer instance in charge of mapping raw textual input to token
-        A Model instance
-        Some (optional) post processing for enhancing model's output
 
-    Examples:
+    Pipelines are made of:
+
+        - A Tokenizer instance in charge of mapping raw textual input to tokens
+        - A Model instance
+        - Some (optional) post processing for enhancing model's output
+
+
+    Args:
+        task (:obj:`str`):
+            The task defining which pipeline will be returned. Currently accepted tasks are:
+
+            - "feature-extraction": will return a :class:`~transformers.FeatureExtractionPipeline`
+            - "sentiment-analysis": will return a :class:`~transformers.TextClassificationPipeline`
+            - "ner": will return a :class:`~transformers.NerPipeline`
+            - "question-answering": will return a :class:`~transformers.QuestionAnsweringPipeline`
+            - "fill-mask": will return a :class:`~transformers.FillMaskPipeline`
+        model (:obj:`str` or :obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`, `optional`, defaults to :obj:`None`):
+            The model that will be used by the pipeline to make predictions. This can be :obj:`None`, a string
+            checkpoint identifier or an actual pre-trained model inheriting from
+            :class:`~transformers.PreTrainedModel` for PyTorch and :class:`~transformers.TFPreTrainedModel` for
+            TensorFlow.
+
+            If :obj:`None`, the default of the pipeline will be loaded.
+        config (:obj:`str` or :obj:`~transformers.PretrainedConfig`, `optional`, defaults to :obj:`None`):
+            The configuration that will be used by the pipeline to instantiate the model. This can be :obj:`None`,
+            a string checkpoint identifier or an actual pre-trained model configuration inheriting from
+            :class:`~transformers.PretrainedConfig`.
+
+            If :obj:`None`, the default of the pipeline will be loaded.
+        tokenizer (:obj:`str` or :obj:`~transformers.PreTrainedTokenizer`, `optional`, defaults to :obj:`None`):
+            The tokenizer that will be used by the pipeline to encode data for the model. This can be :obj:`None`,
+            a string checkpoint identifier or an actual pre-trained tokenizer inheriting from
+            :class:`~transformers.PreTrainedTokenizer`.
+
+            If :obj:`None`, the default of the pipeline will be loaded.
+        framework (:obj:`str`, `optional`, defaults to :obj:`None`):
+            The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be
+            installed.
+
+            If no framework is specified, will default to the one currently installed. If no framework is specified
+            and both frameworks are installed, will default to PyTorch.
+
+    Returns:
+        :class:`~transformers.Pipeline`: Class inheriting from :class:`~transformers.Pipeline`, according to
+        the task.
+
+    Examples::
+
+        from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer
+
+        # Sentiment analysis pipeline
         pipeline('sentiment-analysis')
+
+        # Question answering pipeline, specifying the checkpoint identifier
         pipeline('question-answering', model='distilbert-base-cased-distilled-squad', tokenizer='bert-base-cased')
-        pipeline('ner', model=AutoModel.from_pretrained(...), tokenizer=AutoTokenizer.from_pretrained(...)
-        pipeline('ner', model='dbmdz/bert-large-cased-finetuned-conll03-english', tokenizer='bert-base-cased')
-        pipeline('ner', model='https://...pytorch-model.bin', config='https://...config.json', tokenizer='bert-base-cased')
+
+        # Named entity recognition pipeline, passing in a specific model and tokenizer
+        model = AutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
+        tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
+        pipeline('ner', model=model, tokenizer=tokenizer)
+
+        # Named entity recognition pipeline, passing a model and configuration with an HTTPS URL.
+        model_url = "https://s3.amazonaws.com/models.huggingface.co/bert/dbmdz/bert-large-cased-finetuned-conll03-english/pytorch_model.bin"
+        config_url = "https://s3.amazonaws.com/models.huggingface.co/bert/dbmdz/bert-large-cased-finetuned-conll03-english/config.json"
+        pipeline('ner', model=model_url, config=config_url, tokenizer='bert-base-cased')
     """
     # Retrieve the task
     if task not in SUPPORTED_TASKS:
@@ -1048,13 +1339,12 @@ def pipeline(
                 "Please provided a PretrainedTokenizer class or a path/url/shortcut name to a pretrained tokenizer."
             )
 
+    modelcard = None
     # Try to infer modelcard from model or config name (if provided as str)
-    if modelcard is None:
-        # Try to fallback on one of the provided string for model or config (will replace the suffix)
-        if isinstance(model, str):
-            modelcard = model
-        elif isinstance(config, str):
-            modelcard = config
+    if isinstance(model, str):
+        modelcard = model
+    elif isinstance(config, str):
+        modelcard = config
 
     # Instantiate tokenizer if needed
     if isinstance(tokenizer, (str, tuple)):
diff --git a/tests/test_pipelines.py b/tests/test_pipelines.py
index dd84ca229e..cd3c7a7699 100644
--- a/tests/test_pipelines.py
+++ b/tests/test_pipelines.py
@@ -2,9 +2,16 @@ import unittest
 from typing import Iterable, List, Optional
 
 from transformers import pipeline
-from transformers.pipelines import Pipeline
+from transformers.pipelines import (
+    FeatureExtractionPipeline,
+    FillMaskPipeline,
+    NerPipeline,
+    Pipeline,
+    QuestionAnsweringPipeline,
+    TextClassificationPipeline,
+)
 
-from .utils import require_tf, require_torch
+from .utils import require_tf, require_torch, slow
 
 
 QA_FINETUNED_MODELS = [
@@ -304,3 +311,30 @@ class MultiColumnInputTestCase(unittest.TestCase):
         for tokenizer, model, config in TF_QA_FINETUNED_MODELS:
             nlp = pipeline(task="question-answering", model=model, config=config, tokenizer=tokenizer, framework="tf")
             self._test_multicolumn_pipeline(nlp, valid_samples, invalid_samples, mandatory_output_keys)
+
+
+class PipelineCommonTests(unittest.TestCase):
+
+    pipelines = (
+        NerPipeline,
+        FeatureExtractionPipeline,
+        QuestionAnsweringPipeline,
+        FillMaskPipeline,
+        TextClassificationPipeline,
+    )
+
+    @slow
+    @require_tf
+    def test_tf_defaults(self):
+        # Every pipeline class must be constructible from its defaults when only the TF framework is given
+        for default_pipeline in self.pipelines:
+            with self.subTest(msg="Testing TF defaults with TensorFlow and {}".format(default_pipeline.task)):
+                default_pipeline(framework="tf")
+
+    @slow
+    @require_torch
+    def test_pt_defaults(self):
+        # Test that pipelines can be correctly loaded without any argument
+        for default_pipeline in self.pipelines:
+            with self.subTest(msg="Testing Torch defaults with PyTorch and {}".format(default_pipeline.task)):
+                default_pipeline(framework="pt")

From f169957d0cf17b110f27cacc1b1fb43efaa01218 Mon Sep 17 00:00:00 2001
From: Julien Chaumond <chaumond@gmail.com>
Date: Mon, 2 Mar 2020 15:45:25 -0500
Subject: [PATCH 35/80] TF GPU CI (#3085)

* debug env

* Restrict TF GPU memory

* Fixup

* One more test

* rm debug logs

* Fixup
---
 .github/workflows/self-push.yml      |  3 ++-
 .github/workflows/self-scheduled.yml |  2 +-
 tests/test_modeling_tf_common.py     | 16 ++++++++++++++--
 tests/utils.py                       | 14 ++++++++++++++
 utils/link_tester.py                 |  5 ++++-
 5 files changed, 35 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/self-push.yml b/.github/workflows/self-push.yml
index e21520d0e8..0bb5dbe87c 100644
--- a/.github/workflows/self-push.yml
+++ b/.github/workflows/self-push.yml
@@ -40,7 +40,8 @@ jobs:
 
     - name: Run all non-slow tests on GPU
       env:
-        TF_FORCE_GPU_ALLOW_GROWTH: yes
+        TF_FORCE_GPU_ALLOW_GROWTH: "true"
+        # TF_GPU_MEMORY_LIMIT: 4096
         OMP_NUM_THREADS: 1
         USE_CUDA: yes
       run: |
diff --git a/.github/workflows/self-scheduled.yml b/.github/workflows/self-scheduled.yml
index d963ab5920..0473949058 100644
--- a/.github/workflows/self-scheduled.yml
+++ b/.github/workflows/self-scheduled.yml
@@ -41,7 +41,7 @@ jobs:
 
     - name: Run all tests on GPU
       env:
-        TF_FORCE_GPU_ALLOW_GROWTH: yes
+        TF_FORCE_GPU_ALLOW_GROWTH: "true"
         OMP_NUM_THREADS: 1
         RUN_SLOW: yes
         USE_CUDA: yes
diff --git a/tests/test_modeling_tf_common.py b/tests/test_modeling_tf_common.py
index bcfb6bfe5d..e6f70d6bfa 100644
--- a/tests/test_modeling_tf_common.py
+++ b/tests/test_modeling_tf_common.py
@@ -21,14 +21,26 @@ import tempfile
 
 from transformers import is_tf_available, is_torch_available
 
-from .utils import require_tf
+from .utils import _tf_gpu_memory_limit, require_tf
 
 
 if is_tf_available():
     import tensorflow as tf
     import numpy as np
 
-    # from transformers.modeling_bert import BertModel, BertConfig, BERT_PRETRAINED_MODEL_ARCHIVE_MAP
+    if _tf_gpu_memory_limit is not None:
+        gpus = tf.config.list_physical_devices("GPU")
+        for gpu in gpus:
+            # Restrict TensorFlow to only allocate x GB of memory on the GPUs
+            try:
+                tf.config.experimental.set_virtual_device_configuration(
+                    gpu, [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=_tf_gpu_memory_limit)]
+                )
+                logical_gpus = tf.config.experimental.list_logical_devices("GPU")
+                print("Logical GPUs", logical_gpus)
+            except RuntimeError as e:
+                # Virtual devices must be set before GPUs have been initialized
+                print(e)
 
 
 def _config_zero_init(config):
diff --git a/tests/utils.py b/tests/utils.py
index 163628d3a7..6036cefd9d 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -29,8 +29,22 @@ def parse_flag_from_env(key, default=False):
     return _value
 
 
+def parse_int_from_env(key, default=None):
+    try:
+        value = os.environ[key]
+    except KeyError:
+        _value = default
+    else:
+        try:
+            _value = int(value)
+        except ValueError:
+            raise ValueError("If set, {} must be a int.".format(key))
+    return _value
+
+
 _run_slow_tests = parse_flag_from_env("RUN_SLOW", default=False)
 _run_custom_tokenizers = parse_flag_from_env("RUN_CUSTOM_TOKENIZERS", default=False)
+_tf_gpu_memory_limit = parse_int_from_env("TF_GPU_MEMORY_LIMIT", default=None)
 
 
 def slow(test_case):
diff --git a/utils/link_tester.py b/utils/link_tester.py
index 0ef165c401..ff53eb7049 100644
--- a/utils/link_tester.py
+++ b/utils/link_tester.py
@@ -14,6 +14,9 @@ import requests
 REGEXP_FIND_S3_LINKS = r"""([\"'])(https:\/\/s3)(.*)?\1"""
 
 
+S3_BUCKET_PREFIX = "https://s3.amazonaws.com/models.huggingface.co/bert"
+
+
 def list_python_files_in_repository():
     """ List all python files in the repository.
 
@@ -36,7 +39,7 @@ def find_all_links(file_paths):
     for path in file_paths:
         links += scan_code_for_links(path)
 
-    return links
+    return [link for link in links if link != S3_BUCKET_PREFIX]
 
 
 def scan_code_for_links(source):

From 6b1558bad877af56b41c4b72874ea683441eafc7 Mon Sep 17 00:00:00 2001
From: Felix MIKAELIAN <39884124+fmikaelian@users.noreply.github.com>
Date: Mon, 2 Mar 2020 23:07:13 +0100
Subject: [PATCH 36/80] add models cards for camembert-base-fquad
 camembert-base-squad (#3089)

* add models cards for camembert-base-fquad camembert-base-squad

* typo fix
---
 .../fmikaelian/camembert-base-fquad/README.md | 49 +++++++++++++++++++
 .../fmikaelian/camembert-base-squad/README.md | 49 +++++++++++++++++++
 2 files changed, 98 insertions(+)
 create mode 100644 model_cards/fmikaelian/camembert-base-fquad/README.md
 create mode 100644 model_cards/fmikaelian/camembert-base-squad/README.md

diff --git a/model_cards/fmikaelian/camembert-base-fquad/README.md b/model_cards/fmikaelian/camembert-base-fquad/README.md
new file mode 100644
index 0000000000..e37e3fa4a3
--- /dev/null
+++ b/model_cards/fmikaelian/camembert-base-fquad/README.md
@@ -0,0 +1,49 @@
+---
+language: french
+---
+
+# camembert-base-fquad
+
+## Description
+
+A baseline model for question-answering in French ([CamemBERT](https://camembert-model.fr/) model fine-tuned on [FQuAD](https://fquad.illuin.tech/))
+
+## Training hyperparameters
+
+```shell
+python3 ./examples/run_squad.py \
+--model_type camembert \
+--model_name_or_path camembert-base \
+--do_train \
+--do_eval \
+--do_lower_case \
+--train_file train.json \
+--predict_file valid.json \
+--learning_rate 3e-5 \
+--num_train_epochs 2 \
+--max_seq_length 384 \
+--doc_stride 128 \
+--output_dir output \
+--per_gpu_eval_batch_size=3 \
+--per_gpu_train_batch_size=3 \
+--save_steps 10000
+``` 
+
+## Evaluation results
+
+```shell
+{"f1": 77.24515316052342, "exact_match": 52.82308657465496}
+```
+
+## Usage
+
+```python
+from transformers import pipeline
+
+nlp = pipeline('question-answering', model='fmikaelian/camembert-base-fquad', tokenizer='fmikaelian/camembert-base-fquad')
+
+nlp({
+    'question': "Qui est Claude Monet?",
+    'context': "Claude Monet, né le 14 novembre 1840 à Paris et mort le 5 décembre 1926 à Giverny, est un peintre français et l’un des fondateurs de l'impressionnisme."
+})
+```
\ No newline at end of file
diff --git a/model_cards/fmikaelian/camembert-base-squad/README.md b/model_cards/fmikaelian/camembert-base-squad/README.md
new file mode 100644
index 0000000000..a6c9f54374
--- /dev/null
+++ b/model_cards/fmikaelian/camembert-base-squad/README.md
@@ -0,0 +1,49 @@
+---
+language: french
+---
+
+# camembert-base-squad
+
+## Description
+
+A baseline model for question-answering in French ([CamemBERT](https://camembert-model.fr/) model fine-tuned on [French-translated SQuAD 1.1 dataset](https://github.com/Alikabbadj/French-SQuAD))
+
+## Training hyperparameters
+
+```shell
+python3 ./examples/run_squad.py \
+--model_type camembert \
+--model_name_or_path camembert-base \
+--do_train \
+--do_eval \
+--do_lower_case \
+--train_file SQuAD-v1.1-train_fr_ss999_awstart2_net.json \
+--predict_file SQuAD-v1.1-dev_fr_ss999_awstart2_net.json \
+--learning_rate 3e-5 \
+--num_train_epochs 2 \
+--max_seq_length 384 \
+--doc_stride 128 \
+--output_dir output3 \
+--per_gpu_eval_batch_size=3 \
+--per_gpu_train_batch_size=3 \
+--save_steps 10000
+``` 
+
+## Evaluation results
+
+```shell
+{"f1": 79.8570684959745, "exact_match": 59.21327108373895}
+```
+
+## Usage
+
+```python
+from transformers import pipeline
+
+nlp = pipeline('question-answering', model='fmikaelian/camembert-base-squad', tokenizer='fmikaelian/camembert-base-squad')
+
+nlp({
+    'question': "Qui est Claude Monet?",
+    'context': "Claude Monet, né le 14 novembre 1840 à Paris et mort le 5 décembre 1926 à Giverny, est un peintre français et l’un des fondateurs de l'impressionnisme."
+})
+```
\ No newline at end of file

From eec5ec807135ae61fa2266f3c7ad947cc207abda Mon Sep 17 00:00:00 2001
From: Julien Chaumond <chaumond@gmail.com>
Date: Mon, 2 Mar 2020 18:56:17 -0500
Subject: [PATCH 37/80] [BART] to each its own config + make BART compatible w/
 Pipelines

cc @sshleifer
---
 src/transformers/configuration_bart.py | 5 ++---
 src/transformers/pipelines.py          | 3 ++-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/transformers/configuration_bart.py b/src/transformers/configuration_bart.py
index 7eb3bd7fe8..a80e743c42 100644
--- a/src/transformers/configuration_bart.py
+++ b/src/transformers/configuration_bart.py
@@ -22,10 +22,9 @@ from .configuration_utils import PretrainedConfig
 
 logger = logging.getLogger(__name__)
 
-_bart_large_url = "https://s3.amazonaws.com/models.huggingface.co/bert/facebook/bart-large/config.json"
 BART_PRETRAINED_CONFIG_ARCHIVE_MAP = {
-    "bart-large": _bart_large_url,
-    "bart-large-mnli": _bart_large_url,  # fine as same
+    "bart-large": "https://s3.amazonaws.com/models.huggingface.co/bert/facebook/bart-large/config.json",
+    "bart-large-mnli": "https://s3.amazonaws.com/models.huggingface.co/bert/facebook/bart-large-mnli/config.json",
     "bart-large-cnn": "https://s3.amazonaws.com/models.huggingface.co/bert/facebook/bart-large-cnn/config.json",
 }
 
diff --git a/src/transformers/pipelines.py b/src/transformers/pipelines.py
index cd7b9ca55d..7b73a70d4d 100755
--- a/src/transformers/pipelines.py
+++ b/src/transformers/pipelines.py
@@ -28,6 +28,7 @@ from typing import Dict, List, Optional, Tuple, Union
 import numpy as np
 
 from .configuration_auto import ALL_PRETRAINED_CONFIG_ARCHIVE_MAP, AutoConfig
+from .configuration_bart import BartConfig
 from .configuration_distilbert import DistilBertConfig
 from .configuration_roberta import RobertaConfig
 from .configuration_utils import PretrainedConfig
@@ -427,7 +428,7 @@ class Pipeline(_ScikitCompat):
         """
         args = ["input_ids", "attention_mask"]
 
-        if not isinstance(self.model.config, (DistilBertConfig, XLMConfig, RobertaConfig)):
+        if not isinstance(self.model.config, (DistilBertConfig, XLMConfig, RobertaConfig, BartConfig)):
             args += ["token_type_ids"]
 
         # PR #1548 (CLI) There is an issue with attention_mask

From c0c7ec3458f95f41c82737b14cd2637c06bca2e3 Mon Sep 17 00:00:00 2001
From: Davide Fiocco <davidefiocco@users.noreply.github.com>
Date: Tue, 3 Mar 2020 14:59:47 +0100
Subject: [PATCH 38/80] Don't crash if fine-tuned model doesn't end with a
 number (#3099)

That's the same fix applied in https://github.com/huggingface/transformers/issues/2258 , but for the GLUE example
---
 examples/run_glue.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/examples/run_glue.py b/examples/run_glue.py
index f5bbde9031..57d1c56ac1 100644
--- a/examples/run_glue.py
+++ b/examples/run_glue.py
@@ -183,8 +183,11 @@ def train(args, train_dataset, model, tokenizer):
     steps_trained_in_current_epoch = 0
     # Check if continuing training from a checkpoint
     if os.path.exists(args.model_name_or_path):
-        # set global_step to gobal_step of last saved checkpoint from model path
-        global_step = int(args.model_name_or_path.split("-")[-1].split("/")[0])
+        # set global_step to global_step of last saved checkpoint from model path
+        try:
+            global_step = int(args.model_name_or_path.split("-")[-1].split("/")[0])
+        except ValueError:
+            global_step = 0
         epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
         steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)
 

From b31f7150190cdf13950607f8ee1efe11b352c909 Mon Sep 17 00:00:00 2001
From: ali safaya <alisafaya@gmail.com>
Date: Tue, 3 Mar 2020 16:30:10 +0300
Subject: [PATCH 39/80] bert-base-arabic model card

---
 .../asafaya/base-bert-arabic/README.md        | 44 +++++++++++++++++++
 1 file changed, 44 insertions(+)
 create mode 100644 model_cards/asafaya/base-bert-arabic/README.md

diff --git a/model_cards/asafaya/base-bert-arabic/README.md b/model_cards/asafaya/base-bert-arabic/README.md
new file mode 100644
index 0000000000..fc071dfae0
--- /dev/null
+++ b/model_cards/asafaya/base-bert-arabic/README.md
@@ -0,0 +1,44 @@
+# Arabic BERT Model
+
+Pretrained BERT base language model for Arabic
+
+## Pretraining Corpus
+
+`arabic-bert-base` model was pretrained on ~8.2 Billion words:
+
+- Arabic version of [OSCAR](https://traces1.inria.fr/oscar/) - filtered from [Common Crawl](http://commoncrawl.org/)
+- Recent dump of Arabic [Wikipedia](https://dumps.wikimedia.org/backup-index.html)
+
+and other Arabic resources which sum up to ~95GB of text.
+
+__Notes on training data:__
+
+- The final version of the corpus contains some inline non-Arabic words, which we did not remove from sentences since that would affect some tasks like NER.
+- Although non-Arabic characters were lowercased as a preprocessing step, Arabic characters do not have upper or lower case, so there are no separate cased and uncased versions of the model.
+- The corpus and vocabulary set are not restricted to Modern Standard Arabic; they contain some dialectal Arabic too.
+
+## Pretraining details
+
+- This model was trained using Google BERT's github [repository](https://github.com/google-research/bert) on a single TPU v3-8 provided for free from [TFRC](https://www.tensorflow.org/tfrc).
+- Our pretraining procedure follows the training settings of BERT with some changes: the model was trained for 3M steps with a batch size of 128, instead of 1M steps with a batch size of 256.
+
+## Load Pretrained Model
+
+You can use this model by installing `torch` or `tensorflow` together with the Hugging Face `transformers` library, and then load it directly like this:
+
+```python
+from transformers import AutoTokenizer, AutoModel
+
+tokenizer = AutoTokenizer.from_pretrained("asafaya/bert-base-arabic")
+model = AutoModel.from_pretrained("asafaya/bert-base-arabic")
+```
+
+## Results
+
+For further details on the model's performance or any other queries, please refer to [Arabic-BERT](https://github.com/alisafaya/Arabic-BERT).
+
+## Acknowledgement
+
+Thanks to Google for providing a free TPU for the training process and to Hugging Face for hosting this model on their servers 😊
+
+

From 4134100363e878693aa41f4a25a667ca46d80a9e Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Tue, 3 Mar 2020 15:42:15 +0100
Subject: [PATCH 40/80] Add generate() functionality to TF 2.0 (#3063)

* add first copy-paste test to tf 2 generate

* add tf top_k_top_p_filter fn

* add generate function for TF

* add generate function for TF

* implemented generate for all models except transfoXL

* implemented generate for all models except transfoXL

* implemented generate for all models except transfoXL

* make style

* change permission of test file to correct ones

* delete ipdb

* delete ipdb

* fix bug and finish simple gpt2 integration test

* clean test file

* clean test file

* make style

* make style

* make style

* make style

* change import style

* change import style

* make style

* make style

* add decorators

* add decorators

* fix tf ctrl bug dim => axis in TF

* make style

* make style

* refactored test file

* refactored test file

* take out test_torch_tf_conversion if nothing is defined

* take out test_torch_tf_conversion if nothing is defined

* remove useless files

* remove useless files

* fix conflicts

* fix conflicts

* fix conflicts

* fix conflicts

* fix conflicts

* solve conflicts

* solve conflicts

* fix conflicts

* fix conflicts

* merge conflicts

* delete ipdb

* exposed top_k_top_p_filtering fns

* delete weirdly created w! file

* add comment to test tf common modeling

* fix conflicts

* fix conflicts

* make style

* merge conflicts

* make style

* change tf.tensor.shape to shape_list(tensor)
---
 src/transformers/__init__.py               |  10 +-
 src/transformers/modeling_ctrl.py          |   8 +-
 src/transformers/modeling_gpt2.py          |   8 +-
 src/transformers/modeling_tf_ctrl.py       |  11 +-
 src/transformers/modeling_tf_gpt2.py       |   7 +
 src/transformers/modeling_tf_transfo_xl.py |   9 +
 src/transformers/modeling_tf_utils.py      | 426 +++++++++++++++++++++
 src/transformers/modeling_tf_xlm.py        |  14 +
 src/transformers/modeling_tf_xlnet.py      |  26 ++
 src/transformers/modeling_transfo_xl.py    |   6 +-
 src/transformers/modeling_xlnet.py         |   6 +-
 tests/test_modeling_common.py              | 136 ++++++-
 tests/test_modeling_gpt2.py                |  40 +-
 tests/test_modeling_tf_common.py           | 143 ++++++-
 tests/test_modeling_tf_ctrl.py             |   1 +
 tests/test_modeling_tf_gpt2.py             |  77 +++-
 tests/test_modeling_tf_openai_gpt.py       |   3 +
 tests/test_modeling_tf_transfo_xl.py       |   5 +
 tests/test_modeling_tf_xlm.py              |   6 +
 tests/test_modeling_tf_xlnet.py            |  12 +
 20 files changed, 892 insertions(+), 62 deletions(-)

diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index 1959b254d3..b338ad8515 100755
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -136,7 +136,7 @@ if is_sklearn_available():
 
 # Modeling
 if is_torch_available():
-    from .modeling_utils import PreTrainedModel, prune_layer, Conv1D
+    from .modeling_utils import PreTrainedModel, prune_layer, Conv1D, top_k_top_p_filtering
     from .modeling_auto import (
         AutoModel,
         AutoModelForPreTraining,
@@ -291,7 +291,13 @@ if is_torch_available():
 
 # TensorFlow
 if is_tf_available():
-    from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, TFSequenceSummary, shape_list
+    from .modeling_tf_utils import (
+        TFPreTrainedModel,
+        TFSharedEmbeddings,
+        TFSequenceSummary,
+        shape_list,
+        tf_top_k_top_p_filtering,
+    )
     from .modeling_tf_auto import (
         TFAutoModel,
         TFAutoModelForPreTraining,
diff --git a/src/transformers/modeling_ctrl.py b/src/transformers/modeling_ctrl.py
index 40e076a498..f9c6202861 100644
--- a/src/transformers/modeling_ctrl.py
+++ b/src/transformers/modeling_ctrl.py
@@ -454,14 +454,12 @@ class CTRLLMHeadModel(CTRLPreTrainedModel):
     def get_output_embeddings(self):
         return self.lm_head
 
-    def prepare_inputs_for_generation(self, input_ids, **kwargs):
+    def prepare_inputs_for_generation(self, input_ids, past, **kwargs):
         # only last token for inputs_ids if past is defined in kwargs
-        if "past" in kwargs and kwargs["past"]:
+        if past:
             input_ids = input_ids[:, -1].unsqueeze(-1)
 
-        inputs = {"input_ids": input_ids}
-        inputs.update(kwargs)
-        return inputs
+        return {"input_ids": input_ids, "past": past}
 
     @add_start_docstrings_to_callable(CTRL_INPUTS_DOCSTRING)
     def forward(
diff --git a/src/transformers/modeling_gpt2.py b/src/transformers/modeling_gpt2.py
index 479f459d2c..b492d7fc37 100644
--- a/src/transformers/modeling_gpt2.py
+++ b/src/transformers/modeling_gpt2.py
@@ -525,14 +525,12 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
     def get_output_embeddings(self):
         return self.lm_head
 
-    def prepare_inputs_for_generation(self, input_ids, **kwargs):
+    def prepare_inputs_for_generation(self, input_ids, past, **kwargs):
         # only last token for inputs_ids if past is defined in kwargs
-        if "past" in kwargs and kwargs["past"]:
+        if past:
             input_ids = input_ids[:, -1].unsqueeze(-1)
 
-        inputs = {"input_ids": input_ids}
-        inputs.update(kwargs)
-        return inputs
+        return {"input_ids": input_ids, "past": past}
 
     @add_start_docstrings_to_callable(GPT2_INPUTS_DOCSTRING)
     def forward(
diff --git a/src/transformers/modeling_tf_ctrl.py b/src/transformers/modeling_tf_ctrl.py
index 78e0c1113a..335421979c 100644
--- a/src/transformers/modeling_tf_ctrl.py
+++ b/src/transformers/modeling_tf_ctrl.py
@@ -105,8 +105,8 @@ class TFMultiHeadAttention(tf.keras.layers.Layer):
         v = self.split_into_heads(v, batch_size)
         if layer_past is not None:
             past_key, past_value = tf.unstack(layer_past, axis=1)
-            k = tf.concat((past_key, k), dim=-2)
-            v = tf.concat((past_value, v), dim=-2)
+            k = tf.concat((past_key, k), axis=-2)
+            v = tf.concat((past_value, v), axis=-2)
         present = tf.stack((k, v), axis=1)
 
         output = scaled_dot_product_attention(q, k, v, mask, attention_mask, head_mask)
@@ -505,6 +505,13 @@ class TFCTRLLMHeadModel(TFCTRLPreTrainedModel):
     def get_output_embeddings(self):
         return self.lm_head.input_embeddings
 
+    def prepare_inputs_for_generation(self, inputs, past, **kwargs):
+        # when `past` is set, keep only the last token of `inputs` (as a single column)
+        if past:
+            inputs = tf.expand_dims(inputs[:, -1], -1)
+
+        return {"inputs": inputs, "past": past}
+
     @add_start_docstrings_to_callable(CTRL_INPUTS_DOCSTRING)
     def call(self, inputs, **kwargs):
         r"""
diff --git a/src/transformers/modeling_tf_gpt2.py b/src/transformers/modeling_tf_gpt2.py
index 96a064a332..7e9b102b6d 100644
--- a/src/transformers/modeling_tf_gpt2.py
+++ b/src/transformers/modeling_tf_gpt2.py
@@ -500,6 +500,13 @@ class TFGPT2LMHeadModel(TFGPT2PreTrainedModel):
     def get_output_embeddings(self):
         return self.transformer.wte
 
+    def prepare_inputs_for_generation(self, inputs, past, **kwargs):
+        # when `past` is set, keep only the last token of `inputs` (as a single column)
+        if past:
+            inputs = tf.expand_dims(inputs[:, -1], -1)
+
+        return {"inputs": inputs, "past": past}
+
     @add_start_docstrings_to_callable(GPT2_INPUTS_DOCSTRING)
     def call(self, inputs, **kwargs):
         r"""
diff --git a/src/transformers/modeling_tf_transfo_xl.py b/src/transformers/modeling_tf_transfo_xl.py
index 659685388e..098a4c9143 100644
--- a/src/transformers/modeling_tf_transfo_xl.py
+++ b/src/transformers/modeling_tf_transfo_xl.py
@@ -826,3 +826,12 @@ class TFTransfoXLLMHeadModel(TFTransfoXLPreTrainedModel):
             outputs = [softmax_output] + outputs
 
         return outputs  # logits, new_mems, (all hidden states), (all attentions)
+
+    def prepare_inputs_for_generation(self, inputs, past, **model_kwargs):
+        inputs = {"inputs": inputs}
+
+        # `past` is an explicit argument: when it is set, hand it to the model as `mems`
+        if past:
+            inputs["mems"] = past
+
+        return inputs
diff --git a/src/transformers/modeling_tf_utils.py b/src/transformers/modeling_tf_utils.py
index 452d377cd5..43abdd9499 100644
--- a/src/transformers/modeling_tf_utils.py
+++ b/src/transformers/modeling_tf_utils.py
@@ -384,6 +384,432 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin):
 
         return model
 
+    def prepare_inputs_for_generation(self, inputs, **kwargs):
+        return {"inputs": inputs}
+
+    def _do_output_past(self, outputs):
+        """Tell whether `outputs` carries cached states (`output_past` or a positive `mem_len`) to reuse."""
+        config = self.config
+        uses_past = getattr(config, "output_past", False)
+        mem_len = getattr(config, "mem_len", None)
+        if len(outputs) <= 1:
+            return False
+        if mem_len:
+            return mem_len > 0
+        return bool(uses_past)
+
+    def generate(
+        self,
+        input_ids=None,
+        max_length=None,
+        do_sample=True,
+        num_beams=None,
+        temperature=None,
+        top_k=None,
+        top_p=None,
+        repetition_penalty=None,
+        bos_token_id=None,
+        pad_token_id=None,
+        eos_token_ids=None,
+        length_penalty=None,
+        num_return_sequences=None,
+    ):
+        r""" Generates sequences for models with a LM head. The method currently supports greedy or penalized greedy decoding, sampling with top-k or nucleus sampling
+        and beam-search.
+
+        Adapted in part from `Facebook's XLM beam search code`_.
+
+        .. _`Facebook's XLM beam search code`:
+           https://github.com/facebookresearch/XLM/blob/9e6f6814d17be4fe5b15f2e6c43eb2b2d76daeb4/src/model/transformer.py#L529
+
+
+        Parameters:
+
+            input_ids: (`optional`) `tf.Tensor` of dtype `tf.int32` and shape `(batch_size, sequence_length)`
+                The sequence used as a prompt for the generation. If `None` the method initializes
+                it as a tensor of shape `(1, 1)` filled with `bos_token_id`.
+
+            max_length: (`optional`) int
+                The max length of the sequence to be generated.  Between 1 and infinity. Default to 20.
+
+            do_sample: (`optional`) bool
+                If set to `False` greedy decoding is used. Otherwise sampling is used. Defaults to `True`.
+
+            num_beams: (`optional`) int
+                Number of beams for beam search. Must be between 1 and infinity. 1 means no beam search. Default to 1.
+
+            temperature: (`optional`) float
+                The value used to modulate the next token probabilities. Must be strictly positive. Default to 1.0.
+
+            top_k: (`optional`) int
+                The number of highest probability vocabulary tokens to keep for top-k-filtering. Between 1 and infinity. Default to 50.
+
+            top_p: (`optional`) float
+                The cumulative probability of the highest probability vocabulary tokens to keep for nucleus sampling. Must be between 0 and 1. Default to 1.
+
+            repetition_penalty: (`optional`) float
+                The parameter for repetition penalty. Between 1.0 and infinity. 1.0 means no penalty. Default to 1.0.
+
+            bos_token_id: (`optional`) int
+                Beginning of sentence token if no prompt is provided. Default to 0.
+
+            eos_token_ids: (`optional`) int or list of int
+                End of sequence token or list of tokens to stop the generation. Default to 0.
+            length_penalty: (`optional`) float
+                Exponential penalty to the length. Default to 1.
+
+            num_return_sequences: (`optional`) int
+                The number of independently computed returned sequences for each element in the batch. Default to 1.
+
+        Return:
+
+            output: `tf.Tensor` of dtype `tf.int32` and shape `(batch_size * num_return_sequences, sequence_length)`
+                sequence_length is either equal to max_length or shorter if all batches finished early due to the `eos_token_id`
+
+        Examples::
+
+            tokenizer = AutoTokenizer.from_pretrained('distilgpt2')   # Initialize tokenizer
+            model = TFAutoModelWithLMHead.from_pretrained('distilgpt2')    # Download model and configuration from S3 and cache.
+            outputs = model.generate(max_length=40, bos_token_id=tokenizer.bos_token_id, eos_token_ids=tokenizer.eos_token_id, do_sample=False)  # do greedy decoding
+            print('Generated: {}'.format(tokenizer.decode(outputs[0], skip_special_tokens=True)))
+
+            tokenizer = AutoTokenizer.from_pretrained('openai-gpt')   # Initialize tokenizer
+            model = TFAutoModelWithLMHead.from_pretrained('openai-gpt')    # Download model and configuration from S3 and cache.
+            input_context = 'The dog'
+            input_ids = tf.constant([tokenizer.encode(input_context)])  # encode input context
+            outputs = model.generate(input_ids=input_ids, num_beams=5, num_return_sequences=3, temperature=1.5)  # generate 3 independent sequences using beam search decoding (5 beams) with sampling from initial context 'The dog'
+            for i in range(3): #  3 output sequences were generated
+                print('Generated {}: {}'.format(i, tokenizer.decode(outputs[i], skip_special_tokens=True)))
+
+            tokenizer = AutoTokenizer.from_pretrained('distilgpt2')   # Initialize tokenizer
+            model = TFAutoModelWithLMHead.from_pretrained('distilgpt2')    # Download model and configuration from S3 and cache.
+            input_context = 'The dog'
+            input_ids = tf.constant([tokenizer.encode(input_context)])  # encode input context
+            outputs = model.generate(input_ids=input_ids, max_length=40, temperature=0.7, bos_token_id=tokenizer.bos_token_id, pad_token_id=tokenizer.pad_token_id, eos_token_ids=tokenizer.eos_token_id, num_return_sequences=3)  # generate 3 sequences by sampling
+            for i in range(3): #  3 output sequences were generated
+                print('Generated {}: {}'.format(i, tokenizer.decode(outputs[i], skip_special_tokens=True)))
+
+            tokenizer = AutoTokenizer.from_pretrained('ctrl')   # Initialize tokenizer
+            model = TFAutoModelWithLMHead.from_pretrained('ctrl')    # Download model and configuration from S3 and cache.
+            input_context = 'Legal My neighbor is'  # "Legal" is one of the control codes for ctrl
+            input_ids = tf.constant([tokenizer.encode(input_context)])  # encode input context
+            outputs = model.generate(input_ids=input_ids, max_length=50, temperature=0.7, repetition_penalty=1.2)  # generate sequences
+            print('Generated: {}'.format(tokenizer.decode(outputs[0], skip_special_tokens=True)))
+
+        """
+
+        # We cannot generate if the model does not have a LM head
+        if self.get_output_embeddings() is None:
+            raise AttributeError(
+                "You tried to generate sequences with a model that does not have a LM Head."
+                "Please use another model class (e.g. `OpenAIGPTLMHeadModel`, `XLNetLMHeadModel`, `GPT2LMHeadModel`, `CTRLLMHeadModel`, `T5WithLMHeadModel`, `TransfoXLLMHeadModel`)"
+            )
+
+        max_length = max_length if max_length is not None else self.config.max_length
+        do_sample = do_sample if do_sample is not None else self.config.do_sample
+        num_beams = num_beams if num_beams is not None else self.config.num_beams
+        temperature = temperature if temperature is not None else self.config.temperature
+        top_k = top_k if top_k is not None else self.config.top_k
+        top_p = top_p if top_p is not None else self.config.top_p
+        repetition_penalty = repetition_penalty if repetition_penalty is not None else self.config.repetition_penalty
+        bos_token_id = bos_token_id if bos_token_id is not None else self.config.bos_token_id
+        pad_token_id = pad_token_id if pad_token_id is not None else self.config.pad_token_id
+        eos_token_ids = eos_token_ids if eos_token_ids is not None else self.config.eos_token_ids
+        length_penalty = length_penalty if length_penalty is not None else self.config.length_penalty
+        num_return_sequences = (
+            num_return_sequences if num_return_sequences is not None else self.config.num_return_sequences
+        )
+
+        if input_ids is not None:
+            batch_size = shape_list(input_ids)[0]  # overridden by the input batch_size
+        else:
+            batch_size = 1
+        if isinstance(eos_token_ids, int):
+            eos_token_ids = [eos_token_ids]
+
+        assert isinstance(max_length, int) and max_length > 0, "`max_length` should be a strictely positive integer."
+        assert isinstance(do_sample, bool), "`do_sample` should be a boolean."
+        assert isinstance(num_beams, int) and num_beams > 0, "`num_beams` should be a strictely positive integer."
+        assert temperature > 0, "`temperature` should be strictely positive."
+        assert isinstance(top_k, int) and top_k >= 0, "`top_k` should be a positive integer."
+        assert 0 <= top_p <= 1, "`top_p` should be between 0 and 1."
+        assert repetition_penalty >= 1.0, "`repetition_penalty` should be >= 1."
+        assert input_ids is not None or (
+            isinstance(bos_token_id, int) and bos_token_id >= 0
+        ), "If input_ids is not defined, `bos_token_id` should be a positive integer."
+        assert pad_token_id is None or (
+            isinstance(pad_token_id, int) and (pad_token_id >= 0)
+        ), "`pad_token_id` should be a positive integer."
+        assert (eos_token_ids is None) or (
+            isinstance(eos_token_ids, (list, tuple)) and all(isinstance(e, int) and e >= 0 for e in eos_token_ids)
+        ), "`eos_token_ids` should be a positive integer or a list/tuple of positive integers."
+        assert length_penalty > 0, "`length_penalty` should be strictely positive."
+        assert (
+            isinstance(num_return_sequences, int) and num_return_sequences > 0
+        ), "`num_return_sequences` should be a strictely positive integer."
+
+        if input_ids is None:
+            assert isinstance(bos_token_id, int) and bos_token_id >= 0, (
+                "you should either supply a context to complete as `input_ids` input "
+                "or a `bos_token_id` (integer >= 0) as a first token to start the generation."
+            )
+            input_ids = tf.fill((batch_size, 1), bos_token_id)
+        else:
+            assert len(shape_list(input_ids)) == 2, "Input prompt should be of shape (batch_size, sequence length)."
+
+        if pad_token_id is None and eos_token_ids is not None:
+            logger.warning(
+                "Setting `pad_token_id` to {} (first `eos_token_id`) to generate sequence".format(eos_token_ids[0])
+            )
+            pad_token_id = eos_token_ids[0]
+
+        # current position and vocab size
+        cur_len = shape_list(input_ids)[1]
+        vocab_size = self.config.vocab_size
+
+        if num_return_sequences != 1:
+            # Expand input to num return sequences
+            input_ids = tf.broadcast_to(tf.expand_dims(input_ids, 1), (batch_size, num_return_sequences, cur_len))
+            effective_batch_size = batch_size * num_return_sequences
+            input_ids = tf.reshape(input_ids, (effective_batch_size, cur_len))
+        else:
+            effective_batch_size = batch_size
+
+        if num_beams > 1:
+            output = self._generate_beam_search(
+                input_ids,
+                cur_len,
+                max_length,
+                do_sample,
+                temperature,
+                top_k,
+                top_p,
+                repetition_penalty,
+                pad_token_id,
+                eos_token_ids,
+                effective_batch_size,
+                length_penalty,
+                num_beams,
+                vocab_size,
+            )
+        else:
+            output = self._generate_no_beam_search(
+                input_ids,
+                cur_len,
+                max_length,
+                do_sample,
+                temperature,
+                top_k,
+                top_p,
+                repetition_penalty,
+                pad_token_id,
+                eos_token_ids,
+                effective_batch_size,
+            )
+
+        return output
+
+    def _generate_no_beam_search(
+        self,
+        input_ids,
+        cur_len,
+        max_length,
+        do_sample,
+        temperature,
+        top_k,
+        top_p,
+        repetition_penalty,
+        pad_token_id,
+        eos_token_ids,
+        batch_size,
+    ):
+        """ Generate sequences for each example without beam search (num_beams == 1).
+            All returned sequence are generated independantly.
+        """
+
+        def _create_next_token_logits_penalties(input_ids, logits):
+            # build multiplicative penalties (same shape as `logits`) for the token ids already in `input_ids`
+            token_penalties = np.ones(shape_list(logits))
+            prev_input_ids = [np.unique(input_id) for input_id in input_ids.numpy()]
+            for i, prev_input_id in enumerate(prev_input_ids):
+                logit_penalized = logits[i].numpy()[prev_input_id]
+                # negative logits are multiplied by the penalty, the others divided by it; both factors are derived
+                # from the untouched logits so that writing one factor can never be re-matched by the other mask
+                logit_penalties = np.where(logit_penalized < 0, repetition_penalty, 1 / repetition_penalty)
+                np.put(token_penalties[i], prev_input_id, logit_penalties)
+            return tf.convert_to_tensor(token_penalties, dtype=tf.float32)
+
+        # current position / max lengths / length of generated sentences / unfinished sentences
+        unfinished_sents = tf.ones_like(input_ids[:, 0])
+        sent_lengths = tf.ones_like(input_ids[:, 0]) * max_length
+
+        past = None
+
+        while cur_len < max_length:
+            model_inputs = self.prepare_inputs_for_generation(input_ids, past=past)
+            outputs = self(**model_inputs)
+            next_token_logits = outputs[0][:, -1, :]
+
+            # if model has past, then set the past variable to speed up decoding
+            if self._do_output_past(outputs):
+                past = outputs[1]
+
+            # repetition penalty from CTRL paper (https://arxiv.org/abs/1909.05858)
+            if repetition_penalty != 1.0:
+                next_token_logits_penalties = _create_next_token_logits_penalties(input_ids, next_token_logits)
+                next_token_logits = tf.math.multiply(next_token_logits, next_token_logits_penalties)
+
+            if do_sample:
+                # Temperature (higher temperature => more likely to sample low probability tokens)
+                if temperature != 1.0:
+                    next_token_logits = next_token_logits / temperature
+                # Top-p/top-k filtering
+                next_token_logits = tf_top_k_top_p_filtering(next_token_logits, top_k=top_k, top_p=top_p)
+                # Sample
+                next_token = tf.squeeze(
+                    tf.random.categorical(next_token_logits, dtype=tf.int32, num_samples=1), axis=1
+                )
+            else:
+                # Greedy decoding
+                next_token = tf.math.argmax(next_token_logits, axis=-1, output_type=tf.int32)
+
+            # update generations and finished sentences
+            if eos_token_ids is not None:
+                # pad finished sentences if eos_token_ids exist
+                tokens_to_add = next_token * unfinished_sents + (pad_token_id) * (1 - unfinished_sents)
+            else:
+                tokens_to_add = next_token
+
+            input_ids = tf.concat([input_ids, tf.expand_dims(tokens_to_add, -1)], 1)
+
+            if eos_token_ids is not None:
+                for eos_token_id in eos_token_ids:
+                    eos_in_sents = tokens_to_add == eos_token_id
+                    # if sentence is unfinished and the token to add is eos, sent_lengths is filled with current length
+                    is_sents_unfinished_and_token_to_add_is_eos = tf.math.multiply(
+                        unfinished_sents, tf.cast(eos_in_sents, tf.int32)
+                    )
+                    sent_lengths = (
+                        sent_lengths * (1 - is_sents_unfinished_and_token_to_add_is_eos)
+                        + cur_len * is_sents_unfinished_and_token_to_add_is_eos
+                    )
+
+                    # unfinished_sents is set to zero if eos in sentence
+                    unfinished_sents -= is_sents_unfinished_and_token_to_add_is_eos
+
+            cur_len = cur_len + 1
+
+            # stop when there is a </s> in each sentence, or if we exceed the maximul length
+            if tf.math.reduce_max(unfinished_sents) == 0:
+                break
+
+        # if there are different sentences lengths in the batch, some batches have to be padded
+        min_sent_length = tf.math.reduce_min(sent_lengths)
+        max_sent_length = tf.math.reduce_max(sent_lengths)
+        if min_sent_length != max_sent_length:
+            assert pad_token_id is not None, "`Pad_token_id` has to be defined if batches have different lengths"
+            # finished sents are filled with pad_token
+            padding = tf.ones([batch_size, max_sent_length.numpy()], dtype=tf.int32) * pad_token_id
+
+            # create length masks for tf.where operation
+            broad_casted_sent_lengths = tf.broadcast_to(
+                tf.expand_dims(sent_lengths, -1), [batch_size, max_sent_length]
+            )
+            broad_casted_range = tf.transpose(
+                tf.broadcast_to(tf.expand_dims(tf.range(max_length), -1), [max_length, batch_size])
+            )
+
+            decoded = tf.where(broad_casted_range < broad_casted_sent_lengths, input_ids, padding)
+        else:
+            decoded = input_ids
+
+        return decoded
+
+    def _generate_beam_search(
+        self,
+        input_ids,
+        cur_len,
+        max_length,
+        do_sample,
+        temperature,
+        top_k,
+        top_p,
+        repetition_penalty,
+        pad_token_id,
+        eos_token_ids,
+        batch_size,
+        length_penalty,
+        num_beams,
+        vocab_size,
+    ):
+        raise NotImplementedError("Beam search (`num_beams` > 1) is not implemented yet for TensorFlow models.")
+
+
+def tf_top_k_top_p_filtering(logits, top_k=0, top_p=1.0, filter_value=-float("Inf"), min_tokens_to_keep=1):
+    """ Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
+        Args:
+            logits: logits distribution shape (batch size, vocabulary size)
+            if top_k > 0: keep only top k tokens with highest probability (top-k filtering).
+            if top_p < 1.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering).
+                Nucleus filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751)
+            Make sure we keep at least min_tokens_to_keep per batch example in the output
+        From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317
+    """
+    logits_shape = shape_list(logits)
+
+    if top_k > 0:
+        top_k = min(max(top_k, min_tokens_to_keep), logits_shape[-1])  # Safety check
+        # Remove all tokens with a probability less than the last token of the top-k
+        indices_to_remove = logits < tf.math.top_k(logits, k=top_k)[0][..., -1, None]
+        logits = set_tensor_by_indices_to_value(logits, indices_to_remove, filter_value)
+
+    if top_p < 1.0:
+        sorted_indices = tf.argsort(logits, direction="DESCENDING")
+        sorted_logits = tf.gather(
+            logits, sorted_indices, axis=-1, batch_dims=1
+        )  # expects logits to be of dim (batch_size, vocab_size)
+
+        cumulative_probs = tf.math.cumsum(tf.nn.softmax(sorted_logits, axis=-1), axis=-1)
+
+        # Remove tokens with cumulative probability above the threshold (token with 0 are kept)
+        sorted_indices_to_remove = cumulative_probs > top_p
+
+        if min_tokens_to_keep > 1:
+            # Never remove the first min_tokens_to_keep sorted tokens (the right shift below then keeps one more)
+            sorted_indices_to_remove = tf.concat(
+                [
+                    tf.zeros_like(sorted_indices_to_remove[:, :min_tokens_to_keep]),
+                    sorted_indices_to_remove[:, min_tokens_to_keep:],
+                ],
+                -1,
+            )
+
+        # Shift the indices to the right to keep also the first token above the threshold
+        sorted_indices_to_remove = tf.roll(sorted_indices_to_remove, 1, axis=-1)
+        sorted_indices_to_remove = tf.concat(
+            [tf.zeros_like(sorted_indices_to_remove[:, :1]), sorted_indices_to_remove[:, 1:]], -1,
+        )
+        # scatter sorted tensors to original indexing
+        indices_to_remove = scatter_values_on_batch_indices(sorted_indices_to_remove, sorted_indices)
+        logits = set_tensor_by_indices_to_value(logits, indices_to_remove, filter_value)
+    return logits
+
+
+def scatter_values_on_batch_indices(values, batch_indices):
+    """Scatter `values[b, j]` to position `[b, batch_indices[b, j]]` of the result (2-D inputs assumed)."""
+    target_shape = shape_list(batch_indices)
+    # row index of every entry, flattened into a single row
+    row_ids = tf.reshape(tf.broadcast_to(tf.expand_dims(tf.range(target_shape[0]), axis=-1), target_shape), [1, -1])
+    # stack row and column indices, then transpose them into (row, column) pairs
+    row_col_pairs = tf.transpose(tf.concat([row_ids, tf.reshape(batch_indices, [1, -1])], 0))
+    return tf.scatter_nd(row_col_pairs, tf.reshape(values, [-1]), target_shape)
+
+
+def set_tensor_by_indices_to_value(tensor, indices, value):
+    """Return `tensor` with `value` substituted wherever `indices` is True (`tf.where` selection, no in-place write)."""
+    filled = tf.zeros_like(tensor) + value  # tensor of `value` with the shape and dtype of `tensor`
+    return tf.where(indices, filled, tensor)
+
 
 class TFConv1D(tf.keras.layers.Layer):
     def __init__(self, nf, nx, initializer_range=0.02, **kwargs):
diff --git a/src/transformers/modeling_tf_xlm.py b/src/transformers/modeling_tf_xlm.py
index 44b991d08c..6e94a7206e 100644
--- a/src/transformers/modeling_tf_xlm.py
+++ b/src/transformers/modeling_tf_xlm.py
@@ -657,6 +657,20 @@ class TFXLMWithLMHeadModel(TFXLMPreTrainedModel):
     def get_output_embeddings(self):
         return self.pred_layer.input_embeddings
 
+    def prepare_inputs_for_generation(self, inputs, **kwargs):
+        """Append one mask token to every row of `inputs` and build the matching `langs` tensor, if any."""
+        config = self.config
+        num_rows = inputs.shape[0]
+
+        # a single column holding `mask_token_id`, concatenated after the current tokens
+        mask_column = tf.ones((num_rows, 1), dtype=tf.int32) * config.mask_token_id
+        inputs = tf.concat([inputs, mask_column], axis=1)
+
+        # `langs` has the shape of the extended `inputs`, filled with `lang_id`; None when no `lang_id` is set
+        language_id = config.lang_id
+        langs = None if language_id is None else tf.ones_like(inputs) * language_id
+        return {"inputs": inputs, "langs": langs}
+
     @add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
     def call(self, inputs, **kwargs):
         r"""
diff --git a/src/transformers/modeling_tf_xlnet.py b/src/transformers/modeling_tf_xlnet.py
index d9ced75384..87ebe16858 100644
--- a/src/transformers/modeling_tf_xlnet.py
+++ b/src/transformers/modeling_tf_xlnet.py
@@ -837,6 +837,32 @@ class TFXLNetLMHeadModel(TFXLNetPreTrainedModel):
     def get_output_embeddings(self):
         return self.lm_loss.input_embeddings
 
+    def prepare_inputs_for_generation(self, inputs, past, **model_kwargs):
+        # Add dummy token at the end (no attention on this one)
+
+        effective_batch_size = inputs.shape[0]
+        dummy_token = tf.zeros((effective_batch_size, 1), dtype=tf.int32)
+        inputs = tf.concat([inputs, dummy_token], axis=1)
+
+        # Build permutation mask so that previous tokens don't see last token
+        sequence_length = inputs.shape[1]
+        perm_mask = tf.zeros((effective_batch_size, sequence_length, sequence_length - 1), dtype=tf.float32)
+        perm_mask_seq_end = tf.ones((effective_batch_size, sequence_length, 1), dtype=tf.float32)
+        perm_mask = tf.concat([perm_mask, perm_mask_seq_end], axis=-1)
+
+        # We'll only predict the last token
+        target_mapping = tf.zeros((effective_batch_size, 1, sequence_length - 1), dtype=tf.float32)
+        target_mapping_seq_end = tf.ones((effective_batch_size, 1, 1), dtype=tf.float32)
+        target_mapping = tf.concat([target_mapping, target_mapping_seq_end], axis=-1)
+
+        inputs = {"inputs": inputs, "perm_mask": perm_mask, "target_mapping": target_mapping}
+
+        # `past` is an explicit argument: when it is set, hand it to the model as `mems`
+        if past:
+            inputs["mems"] = past
+
+        return inputs
+
     @add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING)
     def call(self, inputs, **kwargs):
         r"""
diff --git a/src/transformers/modeling_transfo_xl.py b/src/transformers/modeling_transfo_xl.py
index 3d95d6e70f..379b650bea 100644
--- a/src/transformers/modeling_transfo_xl.py
+++ b/src/transformers/modeling_transfo_xl.py
@@ -935,11 +935,11 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
         else:
             return self.crit.out_layers[-1]
 
-    def prepare_inputs_for_generation(self, input_ids, **model_kwargs):
+    def prepare_inputs_for_generation(self, input_ids, past, **model_kwargs):
         inputs = {"input_ids": input_ids}
 
         # if past is defined in model kwargs then use it for faster decoding
-        if "past" in model_kwargs and model_kwargs["past"]:
-            inputs["mems"] = model_kwargs["past"]
+        if past:
+            inputs["mems"] = past
 
         return inputs
diff --git a/src/transformers/modeling_xlnet.py b/src/transformers/modeling_xlnet.py
index 531b0f9a4c..7d34e7ef2b 100644
--- a/src/transformers/modeling_xlnet.py
+++ b/src/transformers/modeling_xlnet.py
@@ -935,7 +935,7 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
     def get_output_embeddings(self):
         return self.lm_loss
 
-    def prepare_inputs_for_generation(self, input_ids, **model_kwargs):
+    def prepare_inputs_for_generation(self, input_ids, past, **model_kwargs):
         # Add dummy token at the end (no attention on this one)
 
         effective_batch_size = input_ids.shape[0]
@@ -958,8 +958,8 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
         inputs = {"input_ids": input_ids, "perm_mask": perm_mask, "target_mapping": target_mapping}
 
         # if past is defined in model kwargs then use it for faster decoding
-        if "past" in model_kwargs and model_kwargs["past"]:
-            inputs["mems"] = model_kwargs["past"]
+        if past:
+            inputs["mems"] = past
 
         return inputs
 
diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py
index 5277864eca..9ba00d2421 100644
--- a/tests/test_modeling_common.py
+++ b/tests/test_modeling_common.py
@@ -36,6 +36,7 @@ if is_torch_available():
         BertModel,
         BertConfig,
         BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+        top_k_top_p_filtering,
     )
 
 
@@ -263,7 +264,7 @@ class ModelTesterMixin:
             # Prepare head_mask
             # Set require_grad after having prepared the tensor to avoid error (leaf variable has been moved into the graph interior)
             head_mask = torch.ones(
-                self.model_tester.num_hidden_layers, self.model_tester.num_attention_heads, device=torch_device
+                self.model_tester.num_hidden_layers, self.model_tester.num_attention_heads, device=torch_device,
             )
             head_mask[0, 0] = 0
             head_mask[-1, :-1] = 0
@@ -303,7 +304,7 @@ class ModelTesterMixin:
             return
 
         for model_class in self.all_model_classes:
-            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+            (config, inputs_dict,) = self.model_tester.prepare_config_and_inputs_for_common()
 
             if "head_mask" in inputs_dict:
                 del inputs_dict["head_mask"]
@@ -313,7 +314,10 @@ class ModelTesterMixin:
             model = model_class(config=config)
             model.to(torch_device)
             model.eval()
-            heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)), -1: [0]}
+            heads_to_prune = {
+                0: list(range(1, self.model_tester.num_attention_heads)),
+                -1: [0],
+            }
             model.prune_heads(heads_to_prune)
             with torch.no_grad():
                 outputs = model(**inputs_dict)
@@ -329,7 +333,7 @@ class ModelTesterMixin:
             return
 
         for model_class in self.all_model_classes:
-            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+            (config, inputs_dict,) = self.model_tester.prepare_config_and_inputs_for_common()
 
             if "head_mask" in inputs_dict:
                 del inputs_dict["head_mask"]
@@ -339,7 +343,10 @@ class ModelTesterMixin:
             model = model_class(config=config)
             model.to(torch_device)
             model.eval()
-            heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)), -1: [0]}
+            heads_to_prune = {
+                0: list(range(1, self.model_tester.num_attention_heads)),
+                -1: [0],
+            }
             model.prune_heads(heads_to_prune)
 
             with tempfile.TemporaryDirectory() as temp_dir_name:
@@ -359,7 +366,7 @@ class ModelTesterMixin:
             return
 
         for model_class in self.all_model_classes:
-            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+            (config, inputs_dict,) = self.model_tester.prepare_config_and_inputs_for_common()
 
             if "head_mask" in inputs_dict:
                 del inputs_dict["head_mask"]
@@ -367,7 +374,10 @@ class ModelTesterMixin:
             config.output_attentions = True
             config.output_hidden_states = False
 
-            heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)), -1: [0]}
+            heads_to_prune = {
+                0: list(range(1, self.model_tester.num_attention_heads)),
+                -1: [0],
+            }
             config.pruned_heads = heads_to_prune
 
             model = model_class(config=config)
@@ -387,7 +397,7 @@ class ModelTesterMixin:
             return
 
         for model_class in self.all_model_classes:
-            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+            (config, inputs_dict,) = self.model_tester.prepare_config_and_inputs_for_common()
 
             if "head_mask" in inputs_dict:
                 del inputs_dict["head_mask"]
@@ -465,7 +475,7 @@ class ModelTesterMixin:
             )
 
     def test_resize_tokens_embeddings(self):
-        original_config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        (original_config, inputs_dict,) = self.model_tester.prepare_config_and_inputs_for_common()
         if not self.test_resize_embeddings:
             return
 
@@ -634,6 +644,7 @@ class ModelTesterMixin:
             self._check_generated_tokens(model.generate(input_ids, num_return_sequences=3))
             # batch_size > 1, greedy
             self._check_generated_tokens(model.generate(input_ids, do_sample=False))
+
             # batch_size > 1, num_beams > 1, sample
             self._check_generated_tokens(model.generate(input_ids, num_beams=3, num_return_sequences=3,))
             # batch_size > 1, num_beams > 1, greedy
@@ -704,3 +715,110 @@ class ModelUtilsTest(unittest.TestCase):
             self.assertEqual(model.config.output_attentions, True)
             self.assertEqual(model.config.output_hidden_states, True)
             self.assertEqual(model.config, config)
+
+
+@require_torch
+class UtilsFunctionsTest(unittest.TestCase):
+
+    # tests whether the top_k_top_p_filtering function behaves as expected
+    def test_top_k_top_p_filtering(self):
+        logits = torch.tensor(
+            [
+                [
+                    8.2220991,  # 3rd highest value; idx. 0
+                    -0.5620044,
+                    5.23229752,
+                    4.0386393,
+                    -6.8798378,
+                    -0.54785802,
+                    -3.2012153,
+                    2.92777176,
+                    1.88171953,
+                    7.35341276,  # 5th highest value; idx. 9
+                    8.43207833,  # 2nd highest value; idx. 10
+                    -9.85711836,
+                    -5.96209236,
+                    -1.13039161,
+                    -7.1115294,
+                    -0.8369633,
+                    -5.3186408,
+                    7.06427407,
+                    0.81369344,
+                    -0.82023817,
+                    -5.9179796,
+                    0.58813443,
+                    -6.99778438,
+                    4.71551189,
+                    -0.18771637,
+                    7.44020759,  # 4th highest value; idx. 25
+                    9.38450987,  # 1st highest value; idx. 26
+                    2.12662941,
+                    -9.32562038,
+                    2.35652522,
+                ],  # cumulative prob of 5 highest values <= 0.6
+                [
+                    0.58425518,
+                    4.53139238,
+                    -5.57510464,
+                    -6.28030699,
+                    -7.19529503,
+                    -4.02122551,
+                    1.39337037,
+                    -6.06707057,
+                    1.59480517,
+                    -9.643119,
+                    0.03907799,
+                    0.67231762,
+                    -8.88206726,
+                    6.27115922,  # 4th highest value; idx. 13
+                    2.28520723,
+                    4.82767506,
+                    4.30421368,
+                    8.8275313,  # 2nd highest value; idx. 17
+                    5.44029958,  # 5th highest value; idx. 18
+                    -4.4735794,
+                    7.38579536,  # 3rd highest value; idx. 20
+                    -2.91051663,
+                    2.61946077,
+                    -2.5674762,
+                    -9.48959302,
+                    -4.02922645,
+                    -1.35416918,
+                    9.67702323,  # 1st highest value; idx. 27
+                    -5.89478553,
+                    1.85370467,
+                ],  # cumulative prob of 5 highest values <= 0.6
+            ],
+            dtype=torch.float,
+            device=torch_device,
+        )
+
+        non_inf_expected_idx = torch.tensor(
+            [[0, 0], [0, 9], [0, 10], [0, 25], [0, 26], [1, 13], [1, 17], [1, 18], [1, 20], [1, 27]],
+            dtype=torch.long,
+            device=torch_device,
+        )  # expected non filtered idx as noted above
+
+        non_inf_expected_output = torch.tensor(
+            [
+                8.2221,
+                7.3534,
+                8.4321,
+                7.4402,
+                9.3845,
+                6.2712,
+                8.8275,
+                5.4403,
+                7.3858,
+                9.6770,
+            ],  # expected non filtered values as noted above
+            dtype=torch.float,
+            device=torch_device,
+        )
+
+        output = top_k_top_p_filtering(logits, top_k=10, top_p=0.6, min_tokens_to_keep=4)
+        non_inf_output = output[output != -float("inf")].to(device=torch_device)
+        non_inf_idx = (output != -float("inf")).nonzero().to(device=torch_device)
+
+        self.assertTrue(torch.allclose(non_inf_expected_output, non_inf_output, atol=1e-12))
+        self.assertTrue(torch.all(torch.eq(non_inf_expected_idx, non_inf_idx)))
diff --git a/tests/test_modeling_gpt2.py b/tests/test_modeling_gpt2.py
index 21fc873234..e705b80f8b 100644
--- a/tests/test_modeling_gpt2.py
+++ b/tests/test_modeling_gpt2.py
@@ -386,33 +386,33 @@ class GPT2ModelLanguageGenerationTest(unittest.TestCase):
     @slow
     def test_lm_generate_distilgpt2(self):
         model = GPT2LMHeadModel.from_pretrained("distilgpt2")
-        input_ids = torch.Tensor([[464, 3290, 318, 13779]]).long()  # The dog is cute
+        input_ids = torch.Tensor([[464, 1893]]).long()  # The president
         expected_output_ids = [
             464,
-            3290,
-            318,
-            13779,
-            996,
-            339,
-            460,
-            3360,
-            655,
-            2513,
+            1893,
+            286,
+            262,
+            1578,
+            1829,
+            11,
+            290,
+            262,
+            1893,
+            286,
+            262,
+            1578,
+            7526,
+            11,
+            423,
+            587,
             287,
             262,
-            3952,
-            13,
-            632,
-            318,
-            407,
-            845,
-            3621,
-            284,
-        ]  # The dog is cute though he can sometimes just walk in the park. It is not very nice to
-        torch.manual_seed(0)
+            2635,
+        ]  # The president of the United States, and the president of the United Kingdom, have been in the White
 
         output_ids = model.generate(
             input_ids,
+            do_sample=False,
             bos_token_id=self.special_tokens["bos_token_id"],
             eos_token_ids=self.special_tokens["eos_token_id"],
         )
diff --git a/tests/test_modeling_tf_common.py b/tests/test_modeling_tf_common.py
index e6f70d6bfa..8cd53dfe19 100644
--- a/tests/test_modeling_tf_common.py
+++ b/tests/test_modeling_tf_common.py
@@ -18,6 +18,7 @@ import copy
 import os
 import random
 import tempfile
+import unittest
 
 from transformers import is_tf_available, is_torch_available
 
@@ -28,6 +29,8 @@ if is_tf_available():
     import tensorflow as tf
     import numpy as np
 
+    from transformers import tf_top_k_top_p_filtering
+
     if _tf_gpu_memory_limit is not None:
         gpus = tf.config.list_physical_devices("GPU")
         for gpu in gpus:
@@ -56,6 +59,7 @@ class TFModelTesterMixin:
 
     model_tester = None
     all_model_classes = ()
+    all_generative_model_classes = ()
     test_torchscript = True
     test_pruning = True
     test_resize_embeddings = True
@@ -216,7 +220,7 @@ class TFModelTesterMixin:
             outputs_dict = model(inputs_dict)
 
             inputs_keywords = copy.deepcopy(inputs_dict)
-            input_ids = inputs_keywords.pop("input_ids" if not self.is_encoder_decoder else "decoder_input_ids", None)
+            input_ids = inputs_keywords.pop("input_ids" if not self.is_encoder_decoder else "decoder_input_ids", None,)
             outputs_keywords = model(input_ids, **inputs_keywords)
 
             output_dict = outputs_dict[0].numpy()
@@ -299,7 +303,7 @@ class TFModelTesterMixin:
             self.assertEqual(model.config.output_hidden_states, True)
             self.assertEqual(len(hidden_states), self.model_tester.num_hidden_layers + 1)
             self.assertListEqual(
-                list(hidden_states[0].shape[-2:]), [self.model_tester.seq_length, self.model_tester.hidden_size]
+                list(hidden_states[0].shape[-2:]), [self.model_tester.seq_length, self.model_tester.hidden_size],
             )
 
     def test_model_common_attributes(self):
@@ -316,7 +320,10 @@ class TFModelTesterMixin:
 
         for model_class in self.all_model_classes:
             model = model_class(config)
-            first, second = model(inputs_dict, training=False)[0], model(inputs_dict, training=False)[0]
+            first, second = (
+                model(inputs_dict, training=False)[0],
+                model(inputs_dict, training=False)[0],
+            )
             out_1 = first.numpy()
             out_2 = second.numpy()
             out_1 = out_1[~np.isnan(out_1)]
@@ -338,9 +345,9 @@ class TFModelTesterMixin:
                     x = wte([input_ids, None, None, None], mode="embedding")
                 except Exception:
                     if hasattr(self.model_tester, "embedding_size"):
-                        x = tf.ones(input_ids.shape + [self.model_tester.embedding_size], dtype=tf.dtypes.float32)
+                        x = tf.ones(input_ids.shape + [self.model_tester.embedding_size], dtype=tf.dtypes.float32,)
                     else:
-                        x = tf.ones(input_ids.shape + [self.model_tester.hidden_size], dtype=tf.dtypes.float32)
+                        x = tf.ones(input_ids.shape + [self.model_tester.hidden_size], dtype=tf.dtypes.float32,)
         return x
 
     def test_inputs_embeds(self):
@@ -366,6 +373,37 @@ class TFModelTesterMixin:
 
             model(inputs_dict)
 
+    def test_lm_head_model_random_generate(self):
+
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        input_ids = inputs_dict.get(
+            "input_ids", None
+        )  # TODO (PVP): ugly workaround to make code work for t5 for the moment - has to be changed when t5 is fixed.
+
+        for model_class in self.all_generative_model_classes:
+            # TODO (PVP): add beam search tests when beam search is implemented
+            model = model_class(config)
+
+            if config.bos_token_id is None:
+                with self.assertRaises(AssertionError):
+                    model.generate(max_length=5)
+                # batch_size = 1
+                self._check_generated_tokens(model.generate(input_ids))
+            else:
+                # batch_size = 1
+                self._check_generated_tokens(model.generate(max_length=5))
+                # batch_size = 1, num_beams > 1: not tested yet (see the beam search TODO above)
+
+            # batch_size > 1, sample
+            self._check_generated_tokens(model.generate(input_ids, num_return_sequences=3))
+            # batch_size > 1, greedy
+            self._check_generated_tokens(model.generate(input_ids, do_sample=False, num_return_sequences=3))
+
+    def _check_generated_tokens(self, output_ids):
+        for token_id in output_ids[0].numpy().tolist():
+            self.assertGreaterEqual(token_id, 0)
+            self.assertLess(token_id, self.model_tester.vocab_size)
+
 
 def ids_tensor(shape, vocab_size, rng=None, name=None, dtype=None):
     """Creates a random int32 tensor of the shape within the vocab size."""
@@ -383,3 +421,98 @@ def ids_tensor(shape, vocab_size, rng=None, name=None, dtype=None):
     output = tf.constant(values, shape=shape, dtype=dtype if dtype is not None else tf.int32)
 
     return output
+
+
+@require_tf
+class UtilsFunctionsTest(unittest.TestCase):
+
+    # tests whether the top_k_top_p_filtering function behaves as expected
+    def test_top_k_top_p_filtering(self):
+        logits = tf.convert_to_tensor(
+            [
+                [
+                    8.2220991,  # 3rd highest value; idx. 0
+                    -0.5620044,
+                    5.23229752,
+                    4.0386393,
+                    -6.8798378,
+                    -0.54785802,
+                    -3.2012153,
+                    2.92777176,
+                    1.88171953,
+                    7.35341276,  # 5th highest value; idx. 9
+                    8.43207833,  # 2nd highest value; idx. 10
+                    -9.85711836,
+                    -5.96209236,
+                    -1.13039161,
+                    -7.1115294,
+                    -0.8369633,
+                    -5.3186408,
+                    7.06427407,
+                    0.81369344,
+                    -0.82023817,
+                    -5.9179796,
+                    0.58813443,
+                    -6.99778438,
+                    4.71551189,
+                    -0.18771637,
+                    7.44020759,  # 4th highest value; idx. 25
+                    9.38450987,  # 1st highest value; idx. 26
+                    2.12662941,
+                    -9.32562038,
+                    2.35652522,
+                ],  # cumulative prob of 5 highest values <= 0.6
+                [
+                    0.58425518,
+                    4.53139238,
+                    -5.57510464,
+                    -6.28030699,
+                    -7.19529503,
+                    -4.02122551,
+                    1.39337037,
+                    -6.06707057,
+                    1.59480517,
+                    -9.643119,
+                    0.03907799,
+                    0.67231762,
+                    -8.88206726,
+                    6.27115922,  # 4th highest value; idx. 13
+                    2.28520723,
+                    4.82767506,
+                    4.30421368,
+                    8.8275313,  # 2nd highest value; idx. 17
+                    5.44029958,  # 5th highest value; idx. 18
+                    -4.4735794,
+                    7.38579536,  # 3rd highest value; idx. 20
+                    -2.91051663,
+                    2.61946077,
+                    -2.5674762,
+                    -9.48959302,
+                    -4.02922645,
+                    -1.35416918,
+                    9.67702323,  # 1st highest value; idx. 27
+                    -5.89478553,
+                    1.85370467,
+                ],  # cumulative prob of 5 highest values <= 0.6
+            ],
+            dtype=tf.float32,
+        )
+
+        non_inf_expected_idx = tf.convert_to_tensor(
+            [[0, 0], [0, 9], [0, 10], [0, 25], [0, 26], [1, 13], [1, 17], [1, 18], [1, 20], [1, 27]], dtype=tf.int32,
+        )  # expected non filtered idx as noted above
+
+        non_inf_expected_output = tf.convert_to_tensor(
+            [8.222099, 7.3534126, 8.432078, 7.4402075, 9.38451, 6.271159, 8.827531, 5.4402995, 7.3857956, 9.677023],
+            dtype=tf.float32,
+        )  # expected non filtered values as noted above
+
+        output = tf_top_k_top_p_filtering(logits, top_k=10, top_p=0.6, min_tokens_to_keep=4)
+
+        non_inf_output = output[output != -float("inf")]
+        non_inf_idx = tf.cast(
+            tf.where(tf.not_equal(output, tf.constant(-float("inf"), dtype=tf.float32))), dtype=tf.int32,
+        )
+
+        tf.debugging.assert_near(non_inf_output, non_inf_expected_output, rtol=1e-12)
+        tf.debugging.assert_equal(non_inf_idx, non_inf_expected_idx)
diff --git a/tests/test_modeling_tf_ctrl.py b/tests/test_modeling_tf_ctrl.py
index 4997c2a573..29a6eb5d43 100644
--- a/tests/test_modeling_tf_ctrl.py
+++ b/tests/test_modeling_tf_ctrl.py
@@ -31,6 +31,7 @@ if is_tf_available():
 class TFCTRLModelTest(TFModelTesterMixin, unittest.TestCase):
 
     all_model_classes = (TFCTRLModel, TFCTRLLMHeadModel) if is_tf_available() else ()
+    all_generative_model_classes = (TFCTRLLMHeadModel,) if is_tf_available() else ()
 
     class TFCTRLModelTester(object):
         def __init__(
diff --git a/tests/test_modeling_tf_gpt2.py b/tests/test_modeling_tf_gpt2.py
index d7b0809964..362f9e3162 100644
--- a/tests/test_modeling_tf_gpt2.py
+++ b/tests/test_modeling_tf_gpt2.py
@@ -37,7 +37,7 @@ if is_tf_available():
 class TFGPT2ModelTest(TFModelTesterMixin, unittest.TestCase):
 
     all_model_classes = (TFGPT2Model, TFGPT2LMHeadModel, TFGPT2DoubleHeadsModel) if is_tf_available() else ()
-    # all_model_classes = (TFGPT2Model, TFGPT2LMHeadModel) if is_tf_available() else ()
+    all_generative_model_classes = (TFGPT2LMHeadModel,) if is_tf_available() else ()
 
     class TFGPT2ModelTester(object):
         def __init__(
@@ -89,6 +89,8 @@ class TFGPT2ModelTest(TFModelTesterMixin, unittest.TestCase):
             self.num_labels = num_labels
             self.num_choices = num_choices
             self.scope = scope
+            self.bos_token_id = vocab_size - 1
+            self.eos_token_id = vocab_size - 1
 
         def prepare_config_and_inputs(self):
             input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
@@ -123,9 +125,11 @@ class TFGPT2ModelTest(TFModelTesterMixin, unittest.TestCase):
                 # hidden_dropout_prob=self.hidden_dropout_prob,
                 # attention_probs_dropout_prob=self.attention_probs_dropout_prob,
                 n_positions=self.max_position_embeddings,
-                n_ctx=self.max_position_embeddings
+                n_ctx=self.max_position_embeddings,
                 # type_vocab_size=self.type_vocab_size,
                 # initializer_range=self.initializer_range
+                bos_token_id=self.bos_token_id,
+                eos_token_ids=self.eos_token_id,
             )
 
             head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2)
@@ -144,7 +148,11 @@ class TFGPT2ModelTest(TFModelTesterMixin, unittest.TestCase):
 
         def create_and_check_gpt2_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
             model = TFGPT2Model(config=config)
-            inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
+            inputs = {
+                "input_ids": input_ids,
+                "attention_mask": input_mask,
+                "token_type_ids": token_type_ids,
+            }
             sequence_output = model(inputs)[0]
 
             inputs = [input_ids, None, input_mask]  # None is the input for 'past'
@@ -156,18 +164,22 @@ class TFGPT2ModelTest(TFModelTesterMixin, unittest.TestCase):
                 "sequence_output": sequence_output.numpy(),
             }
             self.parent.assertListEqual(
-                list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size]
+                list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size],
             )
 
         def create_and_check_gpt2_lm_head(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
             model = TFGPT2LMHeadModel(config=config)
-            inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
+            inputs = {
+                "input_ids": input_ids,
+                "attention_mask": input_mask,
+                "token_type_ids": token_type_ids,
+            }
             prediction_scores = model(inputs)[0]
             result = {
                 "prediction_scores": prediction_scores.numpy(),
             }
             self.parent.assertListEqual(
-                list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size]
+                list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size],
             )
 
         def create_and_check_gpt2_double_head(
@@ -188,7 +200,7 @@ class TFGPT2ModelTest(TFModelTesterMixin, unittest.TestCase):
             lm_logits, mc_logits = model(inputs)[:2]
             result = {"lm_logits": lm_logits.numpy(), "mc_logits": mc_logits.numpy()}
             self.parent.assertListEqual(
-                list(result["lm_logits"].shape), [self.batch_size, self.num_choices, self.seq_length, self.vocab_size]
+                list(result["lm_logits"].shape), [self.batch_size, self.num_choices, self.seq_length, self.vocab_size],
             )
             self.parent.assertListEqual(list(result["mc_logits"].shape), [self.batch_size, self.num_choices])
 
@@ -207,7 +219,11 @@ class TFGPT2ModelTest(TFModelTesterMixin, unittest.TestCase):
                 choice_labels,
             ) = config_and_inputs
 
-            inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
+            inputs_dict = {
+                "input_ids": input_ids,
+                "token_type_ids": token_type_ids,
+                "attention_mask": input_mask,
+            }
             return config, inputs_dict
 
     def setUp(self):
@@ -234,3 +250,48 @@ class TFGPT2ModelTest(TFModelTesterMixin, unittest.TestCase):
         for model_name in list(TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
             model = TFGPT2Model.from_pretrained(model_name, cache_dir=CACHE_DIR)
             self.assertIsNotNone(model)
+
+
+def prepare_generation_special_tokens():
+    return {"bos_token_id": 50256, "eos_token_id": 50256}
+
+
+class TFGPT2ModelLanguageGenerationTest(unittest.TestCase):
+
+    special_tokens = prepare_generation_special_tokens()
+
+    @slow
+    def test_lm_generate_distilgpt2(self):
+        model = TFGPT2LMHeadModel.from_pretrained("distilgpt2")
+        input_ids = tf.convert_to_tensor([[464, 1893]], dtype=tf.int32)  # The president
+        expected_output_ids = [
+            464,
+            1893,
+            286,
+            262,
+            1578,
+            1829,
+            11,
+            290,
+            262,
+            1893,
+            286,
+            262,
+            1578,
+            7526,
+            11,
+            423,
+            587,
+            287,
+            262,
+            2635,
+        ]  # The president of the United States, and the president of the United Kingdom, have been in the White
+
+        output_ids = model.generate(
+            input_ids,
+            do_sample=False,
+            bos_token_id=self.special_tokens["bos_token_id"],
+            eos_token_ids=self.special_tokens["eos_token_id"],
+        )
+
+        self.assertListEqual(output_ids[0].numpy().tolist(), expected_output_ids)
diff --git a/tests/test_modeling_tf_openai_gpt.py b/tests/test_modeling_tf_openai_gpt.py
index b825c94fca..b8bf74f88a 100644
--- a/tests/test_modeling_tf_openai_gpt.py
+++ b/tests/test_modeling_tf_openai_gpt.py
@@ -39,6 +39,9 @@ class TFOpenAIGPTModelTest(TFModelTesterMixin, unittest.TestCase):
     all_model_classes = (
         (TFOpenAIGPTModel, TFOpenAIGPTLMHeadModel, TFOpenAIGPTDoubleHeadsModel) if is_tf_available() else ()
     )
+    all_generative_model_classes = (
+        (TFOpenAIGPTLMHeadModel,) if is_tf_available() else ()
+    )  # TODO (PVP): Add Double HeadsModel when generate() function is changed accordingly
 
     class TFOpenAIGPTModelTester(object):
         def __init__(
diff --git a/tests/test_modeling_tf_transfo_xl.py b/tests/test_modeling_tf_transfo_xl.py
index f94f2032a2..f2d8e58362 100644
--- a/tests/test_modeling_tf_transfo_xl.py
+++ b/tests/test_modeling_tf_transfo_xl.py
@@ -37,6 +37,8 @@ if is_tf_available():
 class TFTransfoXLModelTest(TFModelTesterMixin, unittest.TestCase):
 
     all_model_classes = (TFTransfoXLModel, TFTransfoXLLMHeadModel) if is_tf_available() else ()
+    all_generative_model_classes = () if is_tf_available() else ()
+    # TODO: add this test when TFTransfoXLLMHead has a linear output layer implemented
     test_pruning = False
     test_torchscript = False
     test_resize_embeddings = False
@@ -62,6 +64,7 @@ class TFTransfoXLModelTest(TFModelTesterMixin, unittest.TestCase):
             num_hidden_layers=5,
             scope=None,
             seed=1,
+            eos_token_id=0,
         ):
             self.parent = parent
             self.batch_size = batch_size
@@ -82,6 +85,7 @@ class TFTransfoXLModelTest(TFModelTesterMixin, unittest.TestCase):
             self.num_hidden_layers = num_hidden_layers
             self.scope = scope
             self.seed = seed
+            self.eos_token_id = eos_token_id
 
         def prepare_config_and_inputs(self):
             input_ids_1 = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
@@ -103,6 +107,7 @@ class TFTransfoXLModelTest(TFModelTesterMixin, unittest.TestCase):
                 d_inner=self.d_inner,
                 div_val=self.div_val,
                 n_layer=self.num_hidden_layers,
+                eos_token_ids=self.eos_token_id,
             )
 
             return (config, input_ids_1, input_ids_2, lm_labels)
diff --git a/tests/test_modeling_tf_xlm.py b/tests/test_modeling_tf_xlm.py
index 53719f63f4..ebadd074e6 100644
--- a/tests/test_modeling_tf_xlm.py
+++ b/tests/test_modeling_tf_xlm.py
@@ -43,6 +43,9 @@ class TFXLMModelTest(TFModelTesterMixin, unittest.TestCase):
         if is_tf_available()
         else ()
     )
+    all_generative_model_classes = (
+        (TFXLMWithLMHeadModel,) if is_tf_available() else ()
+    )  # TODO (PVP): Check other models whether language generation is also applicable
 
     class TFXLMModelTester(object):
         def __init__(
@@ -75,6 +78,7 @@ class TFXLMModelTest(TFModelTesterMixin, unittest.TestCase):
             summary_type="last",
             use_proj=True,
             scope=None,
+            bos_token_id=0,
         ):
             self.parent = parent
             self.batch_size = batch_size
@@ -105,6 +109,7 @@ class TFXLMModelTest(TFModelTesterMixin, unittest.TestCase):
             self.num_labels = num_labels
             self.num_choices = num_choices
             self.scope = scope
+            self.bos_token_id = bos_token_id
 
         def prepare_config_and_inputs(self):
             input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
@@ -145,6 +150,7 @@ class TFXLMModelTest(TFModelTesterMixin, unittest.TestCase):
                 initializer_range=self.initializer_range,
                 summary_type=self.summary_type,
                 use_proj=self.use_proj,
+                bos_token_id=self.bos_token_id,
             )
 
             return (
diff --git a/tests/test_modeling_tf_xlnet.py b/tests/test_modeling_tf_xlnet.py
index 65c83395e5..687fe01575 100644
--- a/tests/test_modeling_tf_xlnet.py
+++ b/tests/test_modeling_tf_xlnet.py
@@ -51,6 +51,9 @@ class TFXLNetModelTest(TFModelTesterMixin, unittest.TestCase):
         if is_tf_available()
         else ()
     )
+    all_generative_model_classes = (
+        (TFXLNetLMHeadModel,) if is_tf_available() else ()
+    )  # TODO (PVP): Check other models whether language generation is also applicable
     test_pruning = False
 
     class TFXLNetModelTester(object):
@@ -77,6 +80,9 @@ class TFXLNetModelTest(TFModelTesterMixin, unittest.TestCase):
             initializer_range=0.05,
             seed=1,
             type_vocab_size=2,
+            bos_token_id=1,
+            eos_token_id=2,
+            pad_token_id=5,
         ):
             self.parent = parent
             self.batch_size = batch_size
@@ -100,6 +106,9 @@ class TFXLNetModelTest(TFModelTesterMixin, unittest.TestCase):
             self.seed = seed
             self.type_vocab_size = type_vocab_size
             self.type_sequence_label_size = type_sequence_label_size
+            self.bos_token_id = bos_token_id
+            self.pad_token_id = pad_token_id
+            self.eos_token_id = eos_token_id
 
         def prepare_config_and_inputs(self):
             input_ids_1 = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
@@ -139,6 +148,9 @@ class TFXLNetModelTest(TFModelTesterMixin, unittest.TestCase):
                 bi_data=self.bi_data,
                 initializer_range=self.initializer_range,
                 num_labels=self.type_sequence_label_size,
+                bos_token_id=self.bos_token_id,
+                pad_token_id=self.pad_token_id,
+                eos_token_id=self.eos_token_id,
             )
 
             return (

From a088d75e510d5641808ccd72f5dca4df36d95b8e Mon Sep 17 00:00:00 2001
From: Julien Chaumond <chaumond@gmail.com>
Date: Tue, 3 Mar 2020 09:52:32 -0500
Subject: [PATCH 41/80] [model_cards] Fix incorrect path

---
 .../asafaya/{base-bert-arabic => bert-base-arabic}/README.md      | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename model_cards/asafaya/{base-bert-arabic => bert-base-arabic}/README.md (100%)

diff --git a/model_cards/asafaya/base-bert-arabic/README.md b/model_cards/asafaya/bert-base-arabic/README.md
similarity index 100%
rename from model_cards/asafaya/base-bert-arabic/README.md
rename to model_cards/asafaya/bert-base-arabic/README.md

From 5c5af879b6d45c879c987154f66d4ea978925fb2 Mon Sep 17 00:00:00 2001
From: Sam Shleifer <sshleifer@gmail.com>
Date: Tue, 3 Mar 2020 15:14:12 -0500
Subject: [PATCH 42/80] [Bart] dont call .forward (#3094)

---
 src/transformers/modeling_bart.py | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/src/transformers/modeling_bart.py b/src/transformers/modeling_bart.py
index f832d88575..286d0f0ea4 100644
--- a/src/transformers/modeling_bart.py
+++ b/src/transformers/modeling_bart.py
@@ -208,7 +208,7 @@ class EncoderLayer(nn.Module):
             encoded output of shape `(seq_len, batch, embed_dim)`
         """
         residual = x
-        x, attn_weights = self.self_attn.forward(
+        x, attn_weights = self.self_attn(
             query=x, key=x, value=x, key_padding_mask=encoder_padding_mask, need_weights=self.output_attentions,
         )
         x = F.dropout(x, p=self.dropout, training=self.training)
@@ -292,7 +292,7 @@ class BartEncoder(nn.Module):
             if self.training and (dropout_probability < self.layerdrop):  # skip the layer
                 attn = None
             else:
-                x, attn = encoder_layer.forward(x, attention_mask)
+                x, attn = encoder_layer(x, attention_mask)
 
             if self.output_attentions:
                 all_attentions.append(attn)
@@ -356,7 +356,7 @@ class DecoderLayer(nn.Module):
         if layer_state is None:
             layer_state = {}
         # next line mutates layer state
-        x, self_attn_weights = self.self_attn.forward(
+        x, self_attn_weights = self.self_attn(
             query=x, key=y, value=y, layer_state=layer_state, need_weights=need_attn_weights, attn_mask=attention_mask,
         )
         x = F.dropout(x, p=self.dropout, training=self.training)
@@ -365,7 +365,7 @@ class DecoderLayer(nn.Module):
         residual = x
         assert self.encoder_attn.cache_key != self.self_attn.cache_key
 
-        x, encoder_attn_weights = self.encoder_attn.forward(
+        x, encoder_attn_weights = self.encoder_attn(
             query=x,
             key=encoder_hidden_states,  # could be None
             value=encoder_hidden_states,
@@ -449,7 +449,7 @@ class BartDecoder(nn.Module):
                 - attentions
         """
         # embed positions
-        positions = self.embed_positions.forward(input_ids, generation_mode=self.generation_mode)
+        positions = self.embed_positions(input_ids, generation_mode=self.generation_mode)
 
         if self.generation_mode:
             input_ids = input_ids[:, -1:]
@@ -475,7 +475,7 @@ class BartDecoder(nn.Module):
                 continue
 
             layer_state = decoder_cached_states[i] if decoder_cached_states is not None else None
-            x, layer_self_attn, layer_past = decoder_layer.forward(
+            x, layer_self_attn, layer_past = decoder_layer(
                 x,
                 encoder_hidden_states,
                 encoder_padding_mask,
@@ -836,10 +836,10 @@ class BartModel(PretrainedBartModel):
             )
         assert decoder_input_ids is not None
         if encoder_outputs is None:
-            encoder_outputs = self.encoder.forward(input_ids=input_ids, attention_mask=attention_mask)
+            encoder_outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
         assert isinstance(encoder_outputs, tuple)
         # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
-        decoder_outputs = self.decoder.forward(
+        decoder_outputs = self.decoder(
             decoder_input_ids,
             encoder_outputs[0],
             attention_mask,
@@ -925,7 +925,7 @@ class BartForMaskedLM(PretrainedBartModel):
             outputs = model(input_ids=input_ids, lm_labels=input_ids)
             loss, prediction_scores = outputs[:2]
         """
-        outputs = self.model.forward(
+        outputs = self.model(
             input_ids,
             attention_mask=attention_mask,
             decoder_input_ids=decoder_input_ids,
@@ -933,7 +933,7 @@ class BartForMaskedLM(PretrainedBartModel):
             decoder_attention_mask=decoder_attention_mask,
             decoder_cached_states=decoder_cached_states,
         )
-        lm_logits = self.lm_head.forward(outputs[0])
+        lm_logits = self.lm_head(outputs[0])
         outputs = (lm_logits,) + outputs[1:]  # Add hidden states and attention if they are here
         if lm_labels is not None:
             loss_fct = nn.CrossEntropyLoss()
@@ -1308,7 +1308,7 @@ class BartForSequenceClassification(PretrainedBartModel):
         loss, logits = outputs[:2]
 
         """
-        outputs = self.model.forward(
+        outputs = self.model(
             input_ids,
             attention_mask=attention_mask,
             decoder_input_ids=decoder_input_ids,

From 5b396457e5035a8b16ddee14b205c098598fe6bb Mon Sep 17 00:00:00 2001
From: Sam Shleifer <sshleifer@gmail.com>
Date: Tue, 3 Mar 2020 15:29:59 -0500
Subject: [PATCH 43/80] Summarization Examples: add Bart CNN Evaluation (#3082)

* Rename and improve example

* Add test

* slightly faster test

* style

* This breaks remy prolly

* shorter test string

* no slow

* newdir structure

* New tree

* Style

* shorter

* docs

* clean

* Attempt future import

* more import hax
---
 examples/summarization/__init__.py            |  0
 examples/summarization/bart/README.md         | 45 ++++++++++++++
 examples/summarization/bart/__init__.py       |  0
 examples/summarization/bart/evaluate_cnn.py   | 60 +++++++++++++++++++
 .../summarization/bart/test_bart_examples.py  | 28 +++++++++
 .../summarization/{ => bertabs}/README.md     |  2 +-
 examples/summarization/bertabs/__init__.py    |  0
 .../{ => bertabs}/configuration_bertabs.py    |  0
 ...ert_bertabs_original_pytorch_checkpoint.py |  0
 .../{ => bertabs}/modeling_bertabs.py         |  0
 .../{ => bertabs}/requirements.txt            |  0
 .../{ => bertabs}/run_summarization.py        | 11 ++--
 .../{ => bertabs}/test_utils_summarization.py |  8 +--
 .../{ => bertabs}/utils_summarization.py      |  8 +--
 14 files changed, 148 insertions(+), 14 deletions(-)
 create mode 100644 examples/summarization/__init__.py
 create mode 100644 examples/summarization/bart/README.md
 create mode 100644 examples/summarization/bart/__init__.py
 create mode 100644 examples/summarization/bart/evaluate_cnn.py
 create mode 100644 examples/summarization/bart/test_bart_examples.py
 rename examples/summarization/{ => bertabs}/README.md (98%)
 create mode 100644 examples/summarization/bertabs/__init__.py
 rename examples/summarization/{ => bertabs}/configuration_bertabs.py (100%)
 rename examples/summarization/{ => bertabs}/convert_bertabs_original_pytorch_checkpoint.py (100%)
 rename examples/summarization/{ => bertabs}/modeling_bertabs.py (100%)
 rename examples/summarization/{ => bertabs}/requirements.txt (100%)
 rename examples/summarization/{ => bertabs}/run_summarization.py (97%)
 rename examples/summarization/{ => bertabs}/test_utils_summarization.py (91%)
 rename examples/summarization/{ => bertabs}/utils_summarization.py (96%)

diff --git a/examples/summarization/__init__.py b/examples/summarization/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/examples/summarization/bart/README.md b/examples/summarization/bart/README.md
new file mode 100644
index 0000000000..094b0fd9a0
--- /dev/null
+++ b/examples/summarization/bart/README.md
@@ -0,0 +1,45 @@
+### Get the CNN/Daily Mail Data
+To be able to reproduce the authors' results on the CNN/Daily Mail dataset you first need to download both CNN and Daily Mail datasets [from Kyunghyun Cho's website](https://cs.nyu.edu/~kcho/DMQA/) (the links next to "Stories") in the same folder. Then uncompress the archives by running:
+
+```bash
+tar -xvf cnn_stories.tgz && tar -xvf dailymail_stories.tgz
+```
+This should make a directory called `cnn_dm/` with files like `test.source`.
+To use your own data, copy that file's format: each article to be summarized is on its own line.
+
+### Usage
+To create summaries for each article in the dataset, run:
+```bash
+python evaluate_cnn.py <path_to_test.source> cnn_test_summaries.txt
+```
+The default batch size, 8, fits in 16GB GPU memory, but may need to be adjusted to fit your system.
+
+### Where is the code?
+The core model is in `src/transformers/modeling_bart.py`. This directory only contains examples.
+
+### (WIP) Rouge Scores
+
+### Stanford CoreNLP Setup
+```
+ptb_tokenize () {
+    cat $1 | java edu.stanford.nlp.process.PTBTokenizer -ioFileList -preserveLines > $2
+}
+
+sudo apt install openjdk-8-jre-headless
+sudo apt-get install ant
+wget http://nlp.stanford.edu/software/stanford-corenlp-full-2018-10-05.zip
+unzip stanford-corenlp-full-2018-10-05.zip
+cd stanford-corenlp-full-2018-10-05
+export CLASSPATH=stanford-corenlp-3.9.2.jar:stanford-corenlp-3.9.2-models.jar
+```
+### Rouge Setup
+Install `files2rouge` following the instructions [here](https://github.com/pltrdy/files2rouge).
+I also needed to run `sudo apt-get install libxml-parser-perl`.
+
+```python
+from files2rouge import files2rouge
+from files2rouge import settings
+files2rouge.run(<path_to_tokenized_hypo>,
+                <path_to_tokenized_target>,
+               saveto='rouge_output.txt')
+```
diff --git a/examples/summarization/bart/__init__.py b/examples/summarization/bart/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/examples/summarization/bart/evaluate_cnn.py b/examples/summarization/bart/evaluate_cnn.py
new file mode 100644
index 0000000000..dbcf00b197
--- /dev/null
+++ b/examples/summarization/bart/evaluate_cnn.py
@@ -0,0 +1,60 @@
+import argparse
+from pathlib import Path
+
+import torch
+from tqdm import tqdm
+
+from transformers import BartForMaskedLM, BartTokenizer
+
+
+DEFAULT_DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+
+
+def chunks(lst, n):
+    """Yield successive n-sized chunks from lst."""
+    for i in range(0, len(lst), n):
+        yield lst[i : i + n]
+
+
+def generate_summaries(lns, out_file, batch_size=8, device=DEFAULT_DEVICE):
+    fout = Path(out_file).open("w")
+    model = BartForMaskedLM.from_pretrained("bart-large-cnn", output_past=True,)
+    tokenizer = BartTokenizer.from_pretrained("bart-large")
+    for batch in tqdm(list(chunks(lns, batch_size))):
+        dct = tokenizer.batch_encode_plus(batch, max_length=1024, return_tensors="pt", pad_to_max_length=True)
+        summaries = model.generate(
+            input_ids=dct["input_ids"].to(device),
+            attention_mask=dct["attention_mask"].to(device),
+            num_beams=4,
+            length_penalty=2.0,
+            max_length=140,
+            min_len=55,
+            no_repeat_ngram_size=3,
+        )
+        dec = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summaries]
+        for hypothesis in dec:
+            fout.write(hypothesis + "\n")
+            fout.flush()
+
+
+def _run_generate():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "source_path", type=str, help="like cnn_dm/test.source",
+    )
+    parser.add_argument(
+        "output_path", type=str, help="where to save summaries",
+    )
+    parser.add_argument(
+        "--device", type=str, required=False, default=DEFAULT_DEVICE, help="cuda, cuda:1, cpu etc.",
+    )
+    parser.add_argument(
+        "--bs", type=int, default=8, required=False, help="batch size: how many to summarize at a time",
+    )
+    args = parser.parse_args()
+    lns = [" " + x.rstrip() for x in open(args.source_path).readlines()]
+    generate_summaries(lns, args.output_path, batch_size=args.bs, device=args.device)
+
+
+if __name__ == "__main__":
+    _run_generate()
diff --git a/examples/summarization/bart/test_bart_examples.py b/examples/summarization/bart/test_bart_examples.py
new file mode 100644
index 0000000000..faa0725b55
--- /dev/null
+++ b/examples/summarization/bart/test_bart_examples.py
@@ -0,0 +1,28 @@
+import logging
+import sys
+import tempfile
+import unittest
+from pathlib import Path
+from unittest.mock import patch
+
+from .evaluate_cnn import _run_generate
+
+
+articles = [" New York (CNN)When Liana Barrientos was 23 years old, she got married in Westchester County."]
+
+logging.basicConfig(level=logging.DEBUG)
+
+logger = logging.getLogger()
+
+
+class TestBartExamples(unittest.TestCase):
+    def test_bart_cnn_cli(self):
+        stream_handler = logging.StreamHandler(sys.stdout)
+        logger.addHandler(stream_handler)
+        tmp = Path(tempfile.gettempdir()) / "utest_generations.hypo"
+        with tmp.open("w") as f:
+            f.write("\n".join(articles))
+        testargs = ["evaluate_cnn.py", str(tmp), "output.txt"]
+        with patch.object(sys, "argv", testargs):
+            _run_generate()
+            self.assertTrue(Path("output.txt").exists())
diff --git a/examples/summarization/README.md b/examples/summarization/bertabs/README.md
similarity index 98%
rename from examples/summarization/README.md
rename to examples/summarization/bertabs/README.md
index 250c4bcfe8..1307de6b3f 100644
--- a/examples/summarization/README.md
+++ b/examples/summarization/bertabs/README.md
@@ -15,7 +15,7 @@ pip install nltk py-rouge
 cd examples/summarization
 ```
 
-## Reproduce the authors' results on ROUGE
+## Reproduce the authors' ROUGE score
 
 To be able to reproduce the authors' results on the CNN/Daily Mail dataset you first need to download both CNN and Daily Mail datasets [from Kyunghyun Cho's website](https://cs.nyu.edu/~kcho/DMQA/) (the links next to "Stories") in the same folder. Then uncompress the archives by running:
 
diff --git a/examples/summarization/bertabs/__init__.py b/examples/summarization/bertabs/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/examples/summarization/configuration_bertabs.py b/examples/summarization/bertabs/configuration_bertabs.py
similarity index 100%
rename from examples/summarization/configuration_bertabs.py
rename to examples/summarization/bertabs/configuration_bertabs.py
diff --git a/examples/summarization/convert_bertabs_original_pytorch_checkpoint.py b/examples/summarization/bertabs/convert_bertabs_original_pytorch_checkpoint.py
similarity index 100%
rename from examples/summarization/convert_bertabs_original_pytorch_checkpoint.py
rename to examples/summarization/bertabs/convert_bertabs_original_pytorch_checkpoint.py
diff --git a/examples/summarization/modeling_bertabs.py b/examples/summarization/bertabs/modeling_bertabs.py
similarity index 100%
rename from examples/summarization/modeling_bertabs.py
rename to examples/summarization/bertabs/modeling_bertabs.py
diff --git a/examples/summarization/requirements.txt b/examples/summarization/bertabs/requirements.txt
similarity index 100%
rename from examples/summarization/requirements.txt
rename to examples/summarization/bertabs/requirements.txt
diff --git a/examples/summarization/run_summarization.py b/examples/summarization/bertabs/run_summarization.py
similarity index 97%
rename from examples/summarization/run_summarization.py
rename to examples/summarization/bertabs/run_summarization.py
index 4afa97b5a9..5dd8f22729 100644
--- a/examples/summarization/run_summarization.py
+++ b/examples/summarization/bertabs/run_summarization.py
@@ -11,12 +11,13 @@ from tqdm import tqdm
 
 from modeling_bertabs import BertAbs, build_predictor
 from transformers import BertTokenizer
-from utils_summarization import (
-    SummarizationDataset,
+
+from .utils_summarization import (
+    CNNDMDataset,
     build_mask,
     compute_token_type_ids,
     encode_for_summarization,
-    fit_to_block_size,
+    truncate_or_pad,
 )
 
 
@@ -194,7 +195,7 @@ def build_data_iterator(args, tokenizer):
 
 
 def load_and_cache_examples(args, tokenizer):
-    dataset = SummarizationDataset(args.documents_dir)
+    dataset = CNNDMDataset(args.documents_dir)
     return dataset
 
 
@@ -211,7 +212,7 @@ def collate(data, tokenizer, block_size, device):
 
     encoded_text = [encode_for_summarization(story, summary, tokenizer) for _, story, summary in data]
     encoded_stories = torch.tensor(
-        [fit_to_block_size(story, block_size, tokenizer.pad_token_id) for story, _ in encoded_text]
+        [truncate_or_pad(story, block_size, tokenizer.pad_token_id) for story, _ in encoded_text]
     )
     encoder_token_type_ids = compute_token_type_ids(encoded_stories, tokenizer.cls_token_id)
     encoder_mask = build_mask(encoded_stories, tokenizer.pad_token_id)
diff --git a/examples/summarization/test_utils_summarization.py b/examples/summarization/bertabs/test_utils_summarization.py
similarity index 91%
rename from examples/summarization/test_utils_summarization.py
rename to examples/summarization/bertabs/test_utils_summarization.py
index d562ad04b7..1205543d17 100644
--- a/examples/summarization/test_utils_summarization.py
+++ b/examples/summarization/bertabs/test_utils_summarization.py
@@ -17,7 +17,7 @@ import unittest
 import numpy as np
 import torch
 
-from utils_summarization import build_mask, compute_token_type_ids, fit_to_block_size, process_story
+from .utils_summarization import build_mask, compute_token_type_ids, process_story, truncate_or_pad
 
 
 class SummarizationDataProcessingTest(unittest.TestCase):
@@ -28,19 +28,19 @@ class SummarizationDataProcessingTest(unittest.TestCase):
         """ Pad the sequence with 0 if the sequence is smaller than the block size."""
         sequence = [1, 2, 3, 4]
         expected_output = [1, 2, 3, 4, 0, 0, 0, 0, 0, 0]
-        self.assertEqual(fit_to_block_size(sequence, self.block_size, 0), expected_output)
+        self.assertEqual(truncate_or_pad(sequence, self.block_size, 0), expected_output)
 
     def test_fit_to_block_sequence_fit_exactly(self):
         """ Do nothing if the sequence is the right size. """
         sequence = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
         expected_output = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
-        self.assertEqual(fit_to_block_size(sequence, self.block_size, 0), expected_output)
+        self.assertEqual(truncate_or_pad(sequence, self.block_size, 0), expected_output)
 
     def test_fit_to_block_sequence_too_big(self):
         """ Truncate the sequence if it is too long. """
         sequence = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]
         expected_output = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
-        self.assertEqual(fit_to_block_size(sequence, self.block_size, 0), expected_output)
+        self.assertEqual(truncate_or_pad(sequence, self.block_size, 0), expected_output)
 
     def test_process_story_no_highlights(self):
         """ Processing a story with no highlights returns an empty list for the summary.
diff --git a/examples/summarization/utils_summarization.py b/examples/summarization/bertabs/utils_summarization.py
similarity index 96%
rename from examples/summarization/utils_summarization.py
rename to examples/summarization/bertabs/utils_summarization.py
index 529eeb3efa..a6193339e5 100644
--- a/examples/summarization/utils_summarization.py
+++ b/examples/summarization/bertabs/utils_summarization.py
@@ -10,7 +10,7 @@ from torch.utils.data import Dataset
 # ------------
 
 
-class SummarizationDataset(Dataset):
+class CNNDMDataset(Dataset):
     """ Abstracts the dataset used to train seq2seq models.
 
     The class will process the documents that are located in the specified
@@ -62,11 +62,11 @@ class SummarizationDataset(Dataset):
 def process_story(raw_story):
     """ Extract the story and summary from a story file.
 
-    Attributes:
+    Arguments:
         raw_story (str): content of the story file as an utf-8 encoded string.
 
     Raises:
-        IndexError: If the stoy is empty or contains no highlights.
+        IndexError: If the story is empty or contains no highlights.
     """
     nonempty_lines = list(filter(lambda x: len(x) != 0, [line.strip() for line in raw_story.split("\n")]))
 
@@ -107,7 +107,7 @@ def _add_missing_period(line):
 # --------------------------
 
 
-def fit_to_block_size(sequence, block_size, pad_token_id):
+def truncate_or_pad(sequence, block_size, pad_token_id):
     """ Adapt the source and target sequences' lengths to the block size.
     If the sequence is shorter we append padding token to the right of the sequence.
     """

From f631e01d2c78614416655a85955f326636f69825 Mon Sep 17 00:00:00 2001
From: Julien Chaumond <chaumond@gmail.com>
Date: Tue, 3 Mar 2020 15:31:31 -0500
Subject: [PATCH 44/80] [ci] Re-run integration ground truth from fairseq

Adopted best practice set by @patrickvonplaten of commenting lines run on fairseq, for easy comparison

also see #3020
---
 ..._original_pytorch_checkpoint_to_pytorch.py |  4 ++-
 tests/test_modeling_roberta.py                | 31 ++++++++++++++-----
 tests/test_modeling_tf_roberta.py             | 10 +++---
 tests/test_modeling_xlm_roberta.py            | 18 +++++------
 4 files changed, 39 insertions(+), 24 deletions(-)

diff --git a/src/transformers/convert_roberta_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/convert_roberta_original_pytorch_checkpoint_to_pytorch.py
index df4c341436..39e4b82019 100644
--- a/src/transformers/convert_roberta_original_pytorch_checkpoint_to_pytorch.py
+++ b/src/transformers/convert_roberta_original_pytorch_checkpoint_to_pytorch.py
@@ -46,7 +46,9 @@ logger = logging.getLogger(__name__)
 SAMPLE_TEXT = "Hello world! cécé herlolip"
 
 
-def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_folder_path, classification_head):
+def convert_roberta_checkpoint_to_pytorch(
+    roberta_checkpoint_path: str, pytorch_dump_folder_path: str, classification_head: bool
+):
     """
     Copy/paste/tweak roberta's weights to our BERT structure.
     """
diff --git a/tests/test_modeling_roberta.py b/tests/test_modeling_roberta.py
index 9ea25a186b..3e58a0b597 100644
--- a/tests/test_modeling_roberta.py
+++ b/tests/test_modeling_roberta.py
@@ -329,10 +329,15 @@ class RobertaModelIntegrationTest(unittest.TestCase):
         expected_shape = torch.Size((1, 11, 50265))
         self.assertEqual(output.shape, expected_shape)
         # compare the actual values for a slice.
-        expected_slice = torch.Tensor(
-            [[[33.8843, -4.3107, 22.7779], [4.6533, -2.8099, 13.6252], [1.8222, -3.6898, 8.8600]]]
+        expected_slice = torch.tensor(
+            [[[33.8802, -4.3103, 22.7761], [4.6539, -2.8098, 13.6253], [1.8228, -3.6898, 8.8600]]]
         )
-        self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-3))
+
+        # roberta = torch.hub.load('pytorch/fairseq', 'roberta.base')
+        # roberta.eval()
+        # expected_slice = roberta.model.forward(input_ids)[0][:, :3, :3].detach()
+
+        self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-4))
 
     @slow
     def test_inference_no_head(self):
@@ -341,10 +346,15 @@ class RobertaModelIntegrationTest(unittest.TestCase):
         input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
         output = model(input_ids)[0]
         # compare the actual values for a slice.
-        expected_slice = torch.Tensor(
-            [[[-0.0231, 0.0782, 0.0074], [-0.1854, 0.0539, -0.0174], [0.0548, 0.0799, 0.1687]]]
+        expected_slice = torch.tensor(
+            [[[-0.0231, 0.0782, 0.0074], [-0.1854, 0.0540, -0.0175], [0.0548, 0.0799, 0.1687]]]
         )
-        self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-3))
+
+        # roberta = torch.hub.load('pytorch/fairseq', 'roberta.base')
+        # roberta.eval()
+        # expected_slice = roberta.extract_features(input_ids)[:, :3, :3].detach()
+
+        self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-4))
 
     @slow
     def test_inference_classification_head(self):
@@ -354,5 +364,10 @@ class RobertaModelIntegrationTest(unittest.TestCase):
         output = model(input_ids)[0]
         expected_shape = torch.Size((1, 3))
         self.assertEqual(output.shape, expected_shape)
-        expected_tensor = torch.Tensor([[-0.9469, 0.3913, 0.5118]])
-        self.assertTrue(torch.allclose(output, expected_tensor, atol=1e-3))
+        expected_tensor = torch.tensor([[-0.9469, 0.3913, 0.5118]])
+
+        # roberta = torch.hub.load('pytorch/fairseq', 'roberta.large.mnli')
+        # roberta.eval()
+        # expected_tensor = roberta.predict("mnli", input_ids, return_logits=True).detach()
+
+        self.assertTrue(torch.allclose(output, expected_tensor, atol=1e-4))
diff --git a/tests/test_modeling_tf_roberta.py b/tests/test_modeling_tf_roberta.py
index 5ee598b2dc..9bc837c4e3 100644
--- a/tests/test_modeling_tf_roberta.py
+++ b/tests/test_modeling_tf_roberta.py
@@ -222,9 +222,9 @@ class TFRobertaModelIntegrationTest(unittest.TestCase):
         self.assertEqual(list(output.numpy().shape), expected_shape)
         # compare the actual values for a slice.
         expected_slice = tf.constant(
-            [[[33.8843, -4.3107, 22.7779], [4.6533, -2.8099, 13.6252], [1.8222, -3.6898, 8.8600]]]
+            [[[33.8802, -4.3103, 22.7761], [4.6539, -2.8098, 13.6253], [1.8228, -3.6898, 8.8600]]]
         )
-        self.assertTrue(numpy.allclose(output[:, :3, :3].numpy(), expected_slice.numpy(), atol=1e-3))
+        self.assertTrue(numpy.allclose(output[:, :3, :3].numpy(), expected_slice.numpy(), atol=1e-4))
 
     @slow
     def test_inference_no_head(self):
@@ -234,9 +234,9 @@ class TFRobertaModelIntegrationTest(unittest.TestCase):
         output = model(input_ids)[0]
         # compare the actual values for a slice.
         expected_slice = tf.constant(
-            [[[-0.0231, 0.0782, 0.0074], [-0.1854, 0.0539, -0.0174], [0.0548, 0.0799, 0.1687]]]
+            [[[-0.0231, 0.0782, 0.0074], [-0.1854, 0.0540, -0.0175], [0.0548, 0.0799, 0.1687]]]
         )
-        self.assertTrue(numpy.allclose(output[:, :3, :3].numpy(), expected_slice.numpy(), atol=1e-3))
+        self.assertTrue(numpy.allclose(output[:, :3, :3].numpy(), expected_slice.numpy(), atol=1e-4))
 
     @slow
     def test_inference_classification_head(self):
@@ -247,4 +247,4 @@ class TFRobertaModelIntegrationTest(unittest.TestCase):
         expected_shape = [1, 3]
         self.assertEqual(list(output.numpy().shape), expected_shape)
         expected_tensor = tf.constant([[-0.9469, 0.3913, 0.5118]])
-        self.assertTrue(numpy.allclose(output.numpy(), expected_tensor.numpy(), atol=1e-3))
+        self.assertTrue(numpy.allclose(output.numpy(), expected_tensor.numpy(), atol=1e-4))
diff --git a/tests/test_modeling_xlm_roberta.py b/tests/test_modeling_xlm_roberta.py
index 8c6bd0069b..3d035f48fc 100644
--- a/tests/test_modeling_xlm_roberta.py
+++ b/tests/test_modeling_xlm_roberta.py
@@ -30,14 +30,13 @@ class XLMRobertaModelIntegrationTest(unittest.TestCase):
     @slow
     def test_xlm_roberta_base(self):
         model = XLMRobertaModel.from_pretrained("xlm-roberta-base")
-        input_ids = torch.tensor([0, 581, 10269, 83, 99942, 136, 60742, 23, 70, 80583, 18276, 2]).unsqueeze(
-            0
-        )  # The dog is cute and lives in the garden house
+        input_ids = torch.tensor([[0, 581, 10269, 83, 99942, 136, 60742, 23, 70, 80583, 18276, 2]])
+        # The dog is cute and lives in the garden house
 
         expected_output_shape = torch.Size((1, 12, 768))  # batch_size, sequence_length, embedding_vector_dim
         expected_output_values_last_dim = torch.tensor(
-            [-0.0101, 0.1218, -0.0803, 0.0801, 0.1327, 0.0776, -0.1215, 0.2383, 0.3338, 0.3106, 0.0300, 0.0252]
-        ).unsqueeze(0)
+            [[-0.0101, 0.1218, -0.0803, 0.0801, 0.1327, 0.0776, -0.1215, 0.2383, 0.3338, 0.3106, 0.0300, 0.0252]]
+        )
         #  xlmr = torch.hub.load('pytorch/fairseq', 'xlmr.base')
         #  xlmr.eval()
         #  expected_output_values_last_dim = xlmr.extract_features(input_ids[0])[:, :, -1]
@@ -50,14 +49,13 @@ class XLMRobertaModelIntegrationTest(unittest.TestCase):
     @slow
     def test_xlm_roberta_large(self):
         model = XLMRobertaModel.from_pretrained("xlm-roberta-large")
-        input_ids = torch.tensor([0, 581, 10269, 83, 99942, 136, 60742, 23, 70, 80583, 18276, 2]).unsqueeze(
-            0
-        )  # The dog is cute and lives in the garden house
+        input_ids = torch.tensor([[0, 581, 10269, 83, 99942, 136, 60742, 23, 70, 80583, 18276, 2]])
+        # The dog is cute and lives in the garden house
 
         expected_output_shape = torch.Size((1, 12, 1024))  # batch_size, sequence_length, embedding_vector_dim
         expected_output_values_last_dim = torch.tensor(
-            [-0.0699, -0.0318, 0.0705, -0.1241, 0.0999, -0.0520, 0.1004, -0.1838, -0.4704, 0.1437, 0.0821, 0.0126]
-        ).unsqueeze(0)
+            [[-0.0699, -0.0318, 0.0705, -0.1241, 0.0999, -0.0520, 0.1004, -0.1838, -0.4704, 0.1437, 0.0821, 0.0126]]
+        )
         #  xlmr = torch.hub.load('pytorch/fairseq', 'xlmr.large')
         #  xlmr.eval()
         #  expected_output_values_last_dim = xlmr.extract_features(input_ids[0])[:, :, -1]

From e9e6efdc452b74947d40a5a2e8af2fc444c63b5b Mon Sep 17 00:00:00 2001
From: Sam Shleifer <sshleifer@gmail.com>
Date: Tue, 3 Mar 2020 15:54:29 -0500
Subject: [PATCH 45/80] BartForSequenceClassification: fix num_labels, add test
 (#3110)

---
 src/transformers/modeling_bart.py |  2 +-
 tests/test_modeling_bart.py       | 20 +++++++++++++++-----
 2 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/src/transformers/modeling_bart.py b/src/transformers/modeling_bart.py
index 286d0f0ea4..21c51f971e 100644
--- a/src/transformers/modeling_bart.py
+++ b/src/transformers/modeling_bart.py
@@ -1324,7 +1324,7 @@ class BartForSequenceClassification(PretrainedBartModel):
         # Prepend logits
         outputs = (logits,) + outputs[1:]  # Add hidden states and attention if they are here
         if labels is not None:  # prepend loss to output,
-            loss = F.cross_entropy(logits.view(-1, self.num_labels), labels.view(-1))
+            loss = F.cross_entropy(logits.view(-1, self.config.num_labels), labels.view(-1))
             outputs = (loss,) + outputs
 
         return outputs
diff --git a/tests/test_modeling_bart.py b/tests/test_modeling_bart.py
index 89e41c79ad..559046f66b 100644
--- a/tests/test_modeling_bart.py
+++ b/tests/test_modeling_bart.py
@@ -171,7 +171,7 @@ class BartHeadTests(unittest.TestCase):
 
     vocab_size = 99
 
-    def test_lm_forward(self):
+    def _get_config_and_data(self, output_past=False):
         input_ids = torch.tensor(
             [
                 [71, 82, 18, 33, 46, 91, 2],
@@ -191,9 +191,8 @@ class BartHeadTests(unittest.TestCase):
             dtype=torch.long,
             device=torch_device,
         )
-        batch_size = input_ids.shape[0]
-        decoder_lm_labels = ids_tensor([batch_size, input_ids.shape[1]], self.vocab_size)
 
+        batch_size = input_ids.shape[0]
         config = BartConfig(
             vocab_size=self.vocab_size,
             d_model=24,
@@ -204,14 +203,25 @@ class BartHeadTests(unittest.TestCase):
             encoder_ffn_dim=32,
             decoder_ffn_dim=32,
             max_position_embeddings=48,
+            output_past=output_past,
         )
+        return config, input_ids, batch_size
+
+    def test_sequence_classification_forward(self):
+        config, input_ids, batch_size = self._get_config_and_data()
+        labels = _long_tensor([2] * batch_size).to(torch_device)
         model = BartForSequenceClassification(config)
         model.to(torch_device)
-        outputs = model.forward(input_ids=input_ids, decoder_input_ids=input_ids)
-        logits = outputs[0]
+        outputs = model(input_ids=input_ids, decoder_input_ids=input_ids, labels=labels)
+        logits = outputs[1]
         expected_shape = torch.Size((batch_size, config.num_labels))
         self.assertEqual(logits.shape, expected_shape)
+        loss = outputs[0]
+        self.assertIsInstance(loss.item(), float)
 
+    def test_lm_forward(self):
+        config, input_ids, batch_size = self._get_config_and_data(output_past=False)
+        decoder_lm_labels = ids_tensor([batch_size, input_ids.shape[1]], self.vocab_size)
         lm_model = BartForMaskedLM(config)
         lm_model.to(torch_device)
         loss, logits, enc_features = lm_model.forward(

From b1116fd6736bbd69bd10aacb1de3a0e98985fb2d Mon Sep 17 00:00:00 2001
From: Gunnlaugur Thor Briem <gunnlaugur@gmail.com>
Date: Tue, 3 Mar 2020 23:05:40 +0000
Subject: [PATCH 46/80] fix: passing config as Layer trainable param

Lurking bugs discovered while working on other stuff.
---
 src/transformers/modeling_tf_albert.py | 2 +-
 src/transformers/modeling_tf_openai.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/transformers/modeling_tf_albert.py b/src/transformers/modeling_tf_albert.py
index 64c9dad06a..75f785ede0 100644
--- a/src/transformers/modeling_tf_albert.py
+++ b/src/transformers/modeling_tf_albert.py
@@ -480,7 +480,7 @@ class TFAlbertMLMHead(tf.keras.layers.Layer):
 
 class TFAlbertMainLayer(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
-        super().__init__(config, **kwargs)
+        super().__init__(**kwargs)
         self.num_hidden_layers = config.num_hidden_layers
 
         self.embeddings = TFAlbertEmbeddings(config, name="embeddings")
diff --git a/src/transformers/modeling_tf_openai.py b/src/transformers/modeling_tf_openai.py
index f04104db83..6a97ae7786 100644
--- a/src/transformers/modeling_tf_openai.py
+++ b/src/transformers/modeling_tf_openai.py
@@ -199,7 +199,7 @@ class TFBlock(tf.keras.layers.Layer):
 
 class TFOpenAIGPTMainLayer(tf.keras.layers.Layer):
     def __init__(self, config, *inputs, **kwargs):
-        super().__init__(config, *inputs, **kwargs)
+        super().__init__(*inputs, **kwargs)
         self.output_hidden_states = config.output_hidden_states
         self.output_attentions = config.output_attentions
         self.num_hidden_layers = config.n_layer

From 6701fb7859797132a9c82f56ce34bde8ed0a768f Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Wed, 4 Mar 2020 15:30:51 +0100
Subject: [PATCH 47/80] fix beam_search behavior when sampling (#3106)

* fix beam_search behavior when sampling

* delete print

* make correct style
---
 src/transformers/modeling_utils.py | 40 +++++++++++++++++++++---------
 1 file changed, 28 insertions(+), 12 deletions(-)

diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py
index e771fd5cc9..3dc0f245c9 100644
--- a/src/transformers/modeling_utils.py
+++ b/src/transformers/modeling_utils.py
@@ -564,7 +564,11 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin):
         model.eval()
 
         if output_loading_info:
-            loading_info = {"missing_keys": missing_keys, "unexpected_keys": unexpected_keys, "error_msgs": error_msgs}
+            loading_info = {
+                "missing_keys": missing_keys,
+                "unexpected_keys": unexpected_keys,
+                "error_msgs": error_msgs,
+            }
             return model, loading_info
 
         return model
@@ -941,7 +945,10 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin):
 
         # scores for each sentence in the beam
         beam_scores = torch.zeros((batch_size, num_beams), dtype=torch.float, device=input_ids.device)
-        beam_scores[:, 1:] = -1e9
+
+        # Greedy decoding it is made sure that only words of the first beam are considered to avoid sampling the exact same words three times
+        if do_sample is False:
+            beam_scores[:, 1:] = -1e9
         beam_scores = beam_scores.view(-1)  # shape (batch_size * num_beams,)
 
         # cache compute states
@@ -967,19 +974,28 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin):
                 # Temperature (higher temperature => more likely to sample low probability tokens)
                 if temperature != 1.0:
                     scores = scores / temperature
+
+                scores = F.log_softmax(scores, dim=-1)  # (batch_size * num_beams, vocab_size)
+                _scores = scores + beam_scores[:, None].expand_as(scores)  # (batch_size * num_beams, vocab_size)
+
                 # Top-p/top-k filtering
-                scores = top_k_top_p_filtering(
-                    scores, top_k=top_k, top_p=top_p, min_tokens_to_keep=2
+                _scores = top_k_top_p_filtering(
+                    _scores, top_k=top_k, top_p=top_p, min_tokens_to_keep=2
                 )  # (batch_size * num_beams, vocab_size)
+
+                # re-organize to group the beam together to sample from all beam_idxs
+                _scores = _scores.contiguous().view(
+                    batch_size, num_beams * vocab_size
+                )  # (batch_size, num_beams * vocab_size)
+
                 # Sample 2 next words for each beam (so we have some spare tokens and match output of greedy beam search)
-                next_words = torch.multinomial(F.softmax(scores, dim=-1), num_samples=2)  # (batch_size * num_beams, 2)
+                next_words = torch.multinomial(
+                    F.softmax(_scores, dim=-1), num_samples=2 * num_beams
+                )  # (batch_size, num_beams * 2)
+
                 # Compute next scores
-                _scores = F.log_softmax(scores, dim=-1)  # (batch_size * num_beams, vocab_size)
-                _scores = torch.gather(_scores, -1, next_words)  # (batch_size * num_beams, 2)
-                next_scores = _scores + beam_scores[:, None].expand_as(_scores)  # (batch_size * num_beams, 2)
-                # Match shape of greedy beam search
-                next_words = next_words.view(batch_size, 2 * num_beams)  # (batch_size, 2 * num_beams)
-                next_scores = next_scores.view(batch_size, 2 * num_beams)  # (batch_size, 2 * num_beams)
+                next_scores = torch.gather(_scores, -1, next_words)  # (batch_size, num_beams * 2)
+
             else:
                 # do greedy beam search
                 scores = F.log_softmax(scores, dim=-1)  # (batch_size * num_beams, vocab_size)
@@ -1026,7 +1042,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin):
                     # add to generated hypotheses if end of sentence or last iteration
                     if eos_token_ids is not None and word_id.item() in eos_token_ids:
                         generated_hyps[batch_idx].add(
-                            input_ids[batch_idx * num_beams + beam_id, :cur_len].clone(), score.item()
+                            input_ids[batch_idx * num_beams + beam_id, :cur_len].clone(), score.item(),
                         )
                     else:
                         # add next predicted word if it is not eos_token

From 34de670dbe70a9ead31d0692ad9dc726d3ea4edb Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Wed, 4 Mar 2020 17:25:23 +0100
Subject: [PATCH 48/80] fix sklearn release circle ci [temporary] (#3123)

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 0b8fd25bef..eb9916d84c 100644
--- a/setup.py
+++ b/setup.py
@@ -64,7 +64,7 @@ if stale_egg_info.exists():
 extras = {}
 
 extras["mecab"] = ["mecab-python3"]
-extras["sklearn"] = ["scikit-learn"]
+extras["sklearn"] = ["scikit-learn==0.22.1"]
 extras["tf"] = ["tensorflow"]
 extras["tf-cpu"] = ["tensorflow-cpu"]
 extras["torch"] = ["torch"]

From 61fef6e957fd25574d41fbcd6440fcbc04ccaa11 Mon Sep 17 00:00:00 2001
From: patrickvonplaten <patrick.v.platen@gmail.com>
Date: Wed, 4 Mar 2020 00:32:07 +0100
Subject: [PATCH 49/80] added beam_search generation for tf 2.0

---
 src/transformers/modeling_tf_utils.py | 305 ++++++++++++++++++++++++--
 tests/test_modeling_tf_common.py      |  22 +-
 2 files changed, 307 insertions(+), 20 deletions(-)

diff --git a/src/transformers/modeling_tf_utils.py b/src/transformers/modeling_tf_utils.py
index 43abdd9499..bb1856308a 100644
--- a/src/transformers/modeling_tf_utils.py
+++ b/src/transformers/modeling_tf_utils.py
@@ -142,7 +142,7 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin):
         # # initialize all new embeddings (in particular added tokens)
         # self._init_weights(new_embeddings)
 
-        # # Copy word embeddings from the previous weights
+        # # Copy token embeddings from the previous weights
         # num_tokens_to_copy = min(old_num_tokens, new_num_tokens)
         # new_embeddings.weight.data[:num_tokens_to_copy, :] = old_embeddings.weight.data[:num_tokens_to_copy, :]
 
@@ -557,6 +557,19 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin):
         else:
             assert len(shape_list(input_ids)) == 2, "Input prompt should be of shape (batch_size, sequence length)."
 
+        if do_sample is False:
+            if num_beams == 1:
+                # no_beam_search greedy generation conditions
+                assert (
+                    num_return_sequences == 1
+                ), "Greedy decoding will always produce the same output for num_beams == 1 and num_return_sequences > 1. Please set num_return_sequences = 1"
+
+            else:
+                # beam_search greedy generation conditions
+                assert (
+                    num_beams >= num_return_sequences
+                ), "Greedy beam search decoding cannot return more sequences than it has beams. Please set num_beams >= num_return_sequences"
+
         if pad_token_id is None and eos_token_ids is not None:
             logger.warning(
                 "Setting `pad_token_id` to {} (first `eos_token_id`) to generate sequence".format(eos_token_ids[0])
@@ -567,7 +580,7 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin):
         cur_len = shape_list(input_ids)[1]
         vocab_size = self.config.vocab_size
 
-        if num_return_sequences != 1:
+        if num_return_sequences != 1 and do_sample:
             # Expand input to num return sequences
             input_ids = tf.broadcast_to(tf.expand_dims(input_ids, 1), (batch_size, num_return_sequences, cur_len))
             effective_batch_size = batch_size * num_return_sequences
@@ -588,6 +601,7 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin):
                 pad_token_id,
                 eos_token_ids,
                 effective_batch_size,
+                num_return_sequences,
                 length_penalty,
                 num_beams,
                 vocab_size,
@@ -627,19 +641,7 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin):
             All returned sequence are generated independantly.
         """
 
-        def _create_next_token_logits_penalties(input_ids, logits):
-            # create logit penalties for already seen input_ids
-            token_penalties = np.ones(shape_list(logits))
-            prev_input_ids = [np.unique(input_id) for input_id in input_ids.numpy()]
-            for i, prev_input_id in enumerate(prev_input_ids):
-                logit_penalized = logits[i].numpy()[prev_input_id]
-                # if previous logit score is < 0 then multiply repetition penalty else divide
-                logit_penalized[logit_penalized < 0] = repetition_penalty
-                logit_penalized[logit_penalized > 0] = 1 / repetition_penalty
-                np.put(token_penalties[i], prev_input_id, logit_penalized)
-            return tf.convert_to_tensor(token_penalties, dtype=tf.float32)
-
-        # current position / max lengths / length of generated sentences / unfinished sentences
+        # length of generated sentences / unfinished sentences
         unfinished_sents = tf.ones_like(input_ids[:, 0])
         sent_lengths = tf.ones_like(input_ids[:, 0]) * max_length
 
@@ -656,7 +658,7 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin):
 
             # repetition penalty from CTRL paper (https://arxiv.org/abs/1909.05858)
             if repetition_penalty != 1.0:
-                next_token_logits_penalties = _create_next_token_logits_penalties(input_ids, next_token_logits)
+                next_token_logits_penalties = _create_next_token_logits_penalties(input_ids, next_token_logits, repetition_penalty)
                 next_token_logits = tf.math.multiply(next_token_logits, next_token_logits_penalties)
 
             if do_sample:
@@ -738,11 +740,228 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin):
         pad_token_id,
         eos_token_ids,
         batch_size,
+        num_return_sequences,
         length_penalty,
         num_beams,
         vocab_size,
     ):
-        pass
+        """ Generate sequences for each example with beam search.
+        """
+
+        # Expand input to num beams
+        input_ids = tf.broadcast_to(tf.expand_dims(input_ids, 1), (batch_size, num_beams, cur_len))
+        input_ids = tf.reshape(input_ids, (batch_size * num_beams, cur_len))  # (batch_size * num_beams, cur_len)
+
+        # generated hypotheses
+        generated_hyps = [
+            BeamHypotheses(num_beams, max_length, length_penalty, early_stopping=False) for _ in range(batch_size)
+        ]
+
+        # scores for each sentence in the beam
+        beam_scores_begin = tf.zeros((batch_size, 1), dtype=tf.float32)
+        beam_scores_end = tf.zeros((batch_size, num_beams - 1), dtype=tf.float32) * 1e-9
+        beam_scores = tf.reshape(tf.concat([beam_scores_begin, beam_scores_end], -1), (batch_size * num_beams,))
+
+        # cache compute states
+        past = None
+
+        # done sentences
+        done = [False for _ in range(batch_size)]
+
+        while cur_len < max_length:
+            model_inputs = self.prepare_inputs_for_generation(input_ids, past=past)
+            outputs = self(**model_inputs)  # (batch_size * num_beams, cur_len, vocab_size)
+            next_token_logits = outputs[0][:, -1, :]  # (batch_size * num_beams, vocab_size)
+
+            # if model has past, then set the past variable to speed up decoding
+            if self._do_output_past(outputs):
+                past = outputs[1]
+
+            # repetition penalty (from CTRL paper https://arxiv.org/abs/1909.05858)
+            if repetition_penalty != 1.0:
+                next_token_logits_penalties = _create_next_token_logits_penalties(input_ids, next_token_logits, repetition_penalty)
+                next_token_logits = tf.math.multiply(next_token_logits, next_token_logits_penalties)
+
+            if do_sample:
+                # Temperature (higher temperature => more likely to sample low probability tokens)
+                if temperature != 1.0:
+                    next_token_logits = next_token_logits / temperature
+                # Top-p/top-k filtering
+                next_token_logits = tf_top_k_top_p_filtering(
+                    next_token_logits, top_k=top_k, top_p=top_p, min_tokens_to_keep=2
+                )  # (batch_size * num_beams, vocab_size)
+                # Sample 2 next tokens for each beam (so we have some spare tokens and match output of greedy beam search)
+                next_tokens = tf.random.categorical(next_token_logits, dtype=tf.int32, num_samples=2)  # (batch_size * num_beams, vocab_size)
+                # Compute next scores
+                scores = tf.nn.log_softmax(next_token_logits, axis=-1)  # (batch_size * num_beams, vocab_size)
+                _scores = tf.gather(scores, next_tokens, batch_dims=1)  # (batch_size * num_beams, 2)
+                next_scores = _scores + tf.broadcast_to(beam_scores[:, None], (batch_size * num_beams, 2))  # (batch_size * num_beams, 2)
+                # Match shape of greedy beam search
+                next_tokens = tf.reshape(next_tokens, (batch_size, 2 * num_beams))  # (batch_size, 2 * num_beams)
+                next_scores = tf.reshape(next_scores, (batch_size, 2 * num_beams))  # (batch_size, 2 * num_beams)
+            else:
+                # do greedy beam search
+                scores = tf.nn.log_softmax(next_token_logits, axis=-1)  # (batch_size * num_beams, vocab_size)
+                assert shape_list(scores) == [batch_size * num_beams, vocab_size]
+                # Add the log prob of the new beams to the log prob of the beginning of the sequence (sum of logs == log of the product)
+                next_scores = scores + tf.broadcast_to(beam_scores[:, None], (batch_size * num_beams, vocab_size))  # (batch_size * num_beams, vocab_size)
+
+                # re-organize to group the beam together (we are keeping top hypothesis accross beams)
+                next_scores = tf.reshape(next_scores, (batch_size, num_beams * vocab_size))  # (batch_size, num_beams * vocab_size)
+                next_scores, next_tokens = tf.math.top_k(next_scores, 2 * num_beams, sorted=True)
+
+            assert shape_list(next_scores) == shape_list(next_tokens) == [batch_size, 2 * num_beams]
+
+            # next batch beam content
+            # list of (batch_size * num_beams) tuple(next hypothesis score, next token, current position in the batch)
+            next_batch_beam = []
+
+            # for each sentence
+            for batch_idx in range(batch_size):
+
+                # if we are done with this sentence
+                done[batch_idx] = done[batch_idx] or generated_hyps[batch_idx].is_done(
+                    tf.reduce_max(next_scores[batch_idx]).numpy()
+                )
+                if done[batch_idx]:
+                    assert (
+                        len(generated_hyps[batch_idx]) >= num_beams
+                    ), "Batch can only be done if at least {} beams have been generated".format(num_beams)
+                    assert (
+                        eos_token_ids is not None and pad_token_id is not None
+                    ), "generated beams >= num_beams -> eos_token_id and pad_token have to be defined"
+                    next_batch_beam.extend([(0, pad_token_id, 0)] * num_beams)  # pad the batch
+                    continue
+
+                # next sentence beam content
+                next_sent_beam = []
+
+                # next tokens for this sentence
+                for idx, score in zip(next_tokens[batch_idx], next_scores[batch_idx]):
+
+                    # get beam and token IDs
+                    beam_id = idx // vocab_size
+                    token_id = idx % vocab_size
+
+                    # add to generated hypotheses if end of sentence or last iteration
+                    if eos_token_ids is not None and token_id.numpy() in eos_token_ids:
+                        generated_hyps[batch_idx].add(
+                            tf.identity(input_ids[batch_idx * num_beams + beam_id, :cur_len]), score.numpy()
+                        )
+                    else:
+                        # add next predicted token if it is not eos_token
+                        next_sent_beam.append((score, token_id, batch_idx * num_beams + beam_id))
+
+                    # the beam for next step is full
+                    if len(next_sent_beam) == num_beams:
+                        break
+
+                # update next beam content
+                assert len(next_sent_beam) == num_beams, "Beam should always be full"
+                next_batch_beam.extend(next_sent_beam)
+                assert len(next_batch_beam) == num_beams * (batch_idx + 1)
+
+            # sanity check / prepare next batch
+            assert len(next_batch_beam) == batch_size * num_beams
+            beam_scores = tf.convert_to_tensor([x[0] for x in next_batch_beam], dtype=tf.float32)
+            beam_tokens = tf.convert_to_tensor([x[1] for x in next_batch_beam], dtype=tf.int32)
+            beam_idx = tf.convert_to_tensor([x[2] for x in next_batch_beam], dtype=tf.int32)
+
+            # re-order batch
+            input_ids = tf.stack([tf.identity(input_ids[x, :]) for x in beam_idx])
+            input_ids = tf.concat([input_ids, tf.expand_dims(beam_tokens, 1)], axis=-1)
+
+            # re-order internal states
+            if past:
+                past = self._reorder_cache(past, beam_idx)
+
+            # update current length
+            cur_len = cur_len + 1
+
+            # stop when we are done with each sentence
+            if all(done):
+                break
+
+        for batch_idx in range(batch_size):
+            # Add all open beam hypothesis to generated_hyps
+            if not done[batch_idx]:
+                for idx, score in zip(next_tokens[batch_idx], next_scores[batch_idx]):
+
+                    # get beam and token IDs
+                    beam_id = idx // vocab_size
+                    token_id = idx % vocab_size
+                    generated_hyps[batch_idx].add(
+                        tf.identity(input_ids[batch_idx * num_beams + beam_id, :cur_len]), score.numpy()
+                    )
+
+        # depending on whether greedy generation is wanted or not define different output_batch_size and output_num_return_sequences_per_batch
+        output_batch_size = batch_size if do_sample else batch_size * num_return_sequences
+        output_num_return_sequences_per_batch = 1 if do_sample else num_return_sequences
+
+        # select the best hypotheses
+        sent_lengths_list = []
+        best = []
+
+        # retrieve best hypotheses
+        for i, hypotheses in enumerate(generated_hyps):
+            sorted_hyps = sorted(hypotheses.beams, key=lambda x: x[0])
+            for j in range(output_num_return_sequences_per_batch):
+                best_hyp = sorted_hyps.pop()[1]
+                sent_lengths_list.append(len(best_hyp))
+                best.append(best_hyp)
+        assert output_batch_size == len(best), "Output batch size {} must match output beam hypotheses {}".format(output_batch_size, len(best))
+
+        sent_lengths = tf.convert_to_tensor(sent_lengths_list, dtype=tf.int32)
+
+        # shorter batches are filled with pad_token
+        if tf.reduce_min(sent_lengths).numpy() != tf.reduce_max(sent_lengths).numpy():
+            assert pad_token_id is not None, "`Pad_token_id` has to be defined"
+            sent_max_len = min(tf.reduce_max(sent_lengths).numpy() + 1, max_length)
+            decoded_list = []
+
+            # fill with hypothesis and eos_token_id if necessary
+            for i, hypo in enumerate(best):
+                padding = tf.ones((sent_max_len - shape_list(hypo)[0],), dtype=tf.int32) * pad_token_id
+                decoded_hypo = tf.concat([hypo, padding], axis=0)
+
+                if sent_lengths[i] < max_length:
+                    decoded_hypo = tf.where(tf.range(max_length) == sent_lengths[i], eos_token_ids[0] * tf.ones((sent_max_len,), dtype=tf.int32), decoded_hypo)
+                decoded_list.append(decoded_hypo)
+            decoded = tf.stack(decoded_list)
+        else:
+            # none of the hypotheses have an eos_token
+            assert (len(hypo) == max_length for hypo in best)
+            decoded = tf.stack(best)
+
+        return decoded
+
+    @staticmethod
+    def _reorder_cache(past, beam_idx):
+        reordered_past = []
+        for layer_past in past:
+            # get the correct batch idx from layer past batch dim
+            # batch dim of `past` and `mems` is at 2nd position
+            reordered_layer_past = [tf.identity(tf.expand_dims(layer_past[i], 0)) for i in beam_idx]
+            # TODO: check whether it is an error that TF past.shape != Torch past.shape
+            reordered_layer_past = tf.concat(reordered_layer_past, axis=0)
+            # check that shape matches
+            assert shape_list(reordered_layer_past) == shape_list(layer_past)
+            reordered_past.append(reordered_layer_past)
+        past = tuple(reordered_past)
+        return past
+
+
+def _create_next_token_logits_penalties(input_ids, logits, repetition_penalty):
+    # create logit penalties for already seen input_ids
+    token_penalties = np.ones(shape_list(logits))
+    prev_input_ids = [np.unique(input_id) for input_id in input_ids.numpy()]
+    for i, prev_input_id in enumerate(prev_input_ids):
+        logit_penalized = logits[i].numpy()[prev_input_id]
+        # if previous logit score is < 0 then multiply repetition penalty else divide
+        logit_penalized[logit_penalized < 0] = repetition_penalty
+        logit_penalized[logit_penalized > 0] = 1 / repetition_penalty
+        np.put(token_penalties[i], prev_input_id, logit_penalized)
+    return tf.convert_to_tensor(token_penalties, dtype=tf.float32)
 
 
 def tf_top_k_top_p_filtering(logits, top_k=0, top_p=1.0, filter_value=-float("Inf"), min_tokens_to_keep=1):
@@ -811,6 +1030,56 @@ def set_tensor_by_indices_to_value(tensor, indices, value):
     return tf.where(indices, value_tensor, tensor)
 
 
+class BeamHypotheses(object):
+    def __init__(self, num_beams, max_length, length_penalty, early_stopping):
+        """
+        Initialize n-best list of hypotheses.
+        """
+        self.max_length = max_length - 1  # ignoring bos_token
+        self.length_penalty = length_penalty
+        self.early_stopping = early_stopping
+        self.num_beams = num_beams
+        self.beams = []
+        self.worst_score = 1e9
+
+    def __len__(self):
+        """
+        Number of hypotheses in the list.
+        """
+        return len(self.beams)
+
+    def add(self, hyp, sum_logprobs):
+        """
+        Add a new hypothesis to the list.
+        """
+        score = sum_logprobs / len(hyp) ** self.length_penalty
+        if len(self) < self.num_beams or score > self.worst_score:
+            self.beams.append((score, hyp))
+            if len(self) > self.num_beams:
+                sorted_scores = sorted([(s, idx) for idx, (s, _) in enumerate(self.beams)])
+                del self.beams[sorted_scores[0][1]]
+                self.worst_score = sorted_scores[1][0]
+            else:
+                self.worst_score = min(score, self.worst_score)
+
+    def is_done(self, best_sum_logprobs, cur_len=None):
+        """
+        If there are enough hypotheses and that none of the hypotheses being generated
+        can become better than the worst one in the heap, then we are done with this sentence.
+        """
+
+        if len(self) < self.num_beams:
+            return False
+        elif self.early_stopping:
+            return True
+        else:
+            if cur_len is None:
+                cur_len = self.max_length
+            cur_score = best_sum_logprobs / cur_len ** self.length_penalty
+            ret = self.worst_score >= cur_score
+            return ret
+
+
 class TFConv1D(tf.keras.layers.Layer):
     def __init__(self, nf, nx, initializer_range=0.02, **kwargs):
         """ TFConv1D layer as defined by Radford et al. for OpenAI GPT (and also used in GPT-2)
@@ -849,7 +1118,7 @@ class TFSharedEmbeddings(tf.keras.layers.Layer):
         self.initializer_range = hidden_size ** -0.5 if initializer_range is None else initializer_range
 
     def build(self, input_shape):
-        """Build shared word embedding layer
+        """Build shared token embedding layer
         Shared weights logic adapted from
             https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24
         """
diff --git a/tests/test_modeling_tf_common.py b/tests/test_modeling_tf_common.py
index 8cd53dfe19..a6d2e8e32f 100644
--- a/tests/test_modeling_tf_common.py
+++ b/tests/test_modeling_tf_common.py
@@ -381,7 +381,6 @@ class TFModelTesterMixin:
         )  # TODO (PVP): ugly workaround to make code work for t5 for the moment - has to changed when t5 is fixed.
 
         for model_class in self.all_generative_model_classes:
-            # TODO (PVP): add beam search tests when beam search is implemented
             model = model_class(config)
 
             if config.bos_token_id is None:
@@ -389,15 +388,34 @@ class TFModelTesterMixin:
                     model.generate(max_length=5)
                 # batch_size = 1
                 self._check_generated_tokens(model.generate(input_ids))
+                # batch_size = 1, num_beams > 1
+                self._check_generated_tokens(model.generate(input_ids, num_beams=3))
             else:
                 # batch_size = 1
                 self._check_generated_tokens(model.generate(max_length=5))
                 # batch_size = 1, num_beams > 1
+                self._check_generated_tokens(model.generate(max_length=5, num_beams=3))
+
+            with self.assertRaises(AssertionError):
+                # generating multiple sequences when greedy no beam generation
+                # is not allowed as it would always generate the same sequences
+                model.generate(input_ids, do_sample=False, num_return_sequences=2)
+
+            with self.assertRaises(AssertionError):
+                # generating more sequences than having beams leads is not possible
+                model.generate(input_ids, do_sample=False, num_return_sequences=3, num_beams=2)
 
             # batch_size > 1, sample
             self._check_generated_tokens(model.generate(input_ids, num_return_sequences=3))
             # batch_size > 1, greedy
-            self._check_generated_tokens(model.generate(input_ids, do_sample=False, num_return_sequences=3))
+            self._check_generated_tokens(model.generate(input_ids, do_sample=False))
+
+            # batch_size > 1, num_beams > 1, sample
+            self._check_generated_tokens(model.generate(input_ids, num_beams=3, num_return_sequences=3,))
+            # batch_size > 1, num_beams > 1, greedy
+            self._check_generated_tokens(
+                model.generate(input_ids, do_sample=False, num_beams=3, num_return_sequences=3)
+            )
 
     def _check_generated_tokens(self, output_ids):
         for token_id in output_ids[0].numpy().tolist():

From 2529b2d37ee234c7b100f3896ce1688fc60580bd Mon Sep 17 00:00:00 2001
From: patrickvonplaten <patrick.v.platen@gmail.com>
Date: Wed, 4 Mar 2020 00:41:05 +0100
Subject: [PATCH 50/80] set reorder past sort dimension to its default

---
 src/transformers/modeling_tf_utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/transformers/modeling_tf_utils.py b/src/transformers/modeling_tf_utils.py
index bb1856308a..e3083b6d20 100644
--- a/src/transformers/modeling_tf_utils.py
+++ b/src/transformers/modeling_tf_utils.py
@@ -941,9 +941,9 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin):
         for layer_past in past:
             # get the correct batch idx from layer past batch dim
             # batch dim of `past` and `mems` is at 2nd position
-            reordered_layer_past = [tf.identity(tf.expand_dims(layer_past[i], 0)) for i in beam_idx]
+            reordered_layer_past = [tf.identity(tf.expand_dims(layer_past[:, i], 1)) for i in beam_idx]
             # TODO: check whether it is an error that TF past.shape != Torch past.shape
-            reordered_layer_past = tf.concat(reordered_layer_past, axis=0)
+            reordered_layer_past = tf.concat(reordered_layer_past, axis=1)
             # check that shape matches
             assert shape_list(reordered_layer_past) == shape_list(layer_past)
             reordered_past.append(reordered_layer_past)

From c4c4c9998a2e335706dd7638bbd90513234077f3 Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Wed, 4 Mar 2020 11:09:45 +0100
Subject: [PATCH 51/80] make GPT2 and CTRL shape consistent between torch and
 TF

---
 src/transformers/modeling_tf_ctrl.py  |  4 +--
 src/transformers/modeling_tf_gpt2.py  |  4 +--
 src/transformers/modeling_tf_utils.py | 35 ++++++++++++++++++++-------
 3 files changed, 30 insertions(+), 13 deletions(-)

diff --git a/src/transformers/modeling_tf_ctrl.py b/src/transformers/modeling_tf_ctrl.py
index 335421979c..8a049bbce9 100644
--- a/src/transformers/modeling_tf_ctrl.py
+++ b/src/transformers/modeling_tf_ctrl.py
@@ -104,10 +104,10 @@ class TFMultiHeadAttention(tf.keras.layers.Layer):
         k = self.split_into_heads(k, batch_size)
         v = self.split_into_heads(v, batch_size)
         if layer_past is not None:
-            past_key, past_value = tf.unstack(layer_past, axis=1)
+            past_key, past_value = tf.unstack(layer_past, axis=0)
             k = tf.concat((past_key, k), axis=-2)
             v = tf.concat((past_value, v), axis=-2)
-        present = tf.stack((k, v), axis=1)
+        present = tf.stack((k, v), axis=0)
 
         output = scaled_dot_product_attention(q, k, v, mask, attention_mask, head_mask)
         scaled_attention = tf.transpose(output[0], perm=[0, 2, 1, 3])
diff --git a/src/transformers/modeling_tf_gpt2.py b/src/transformers/modeling_tf_gpt2.py
index 7e9b102b6d..3b79d58949 100644
--- a/src/transformers/modeling_tf_gpt2.py
+++ b/src/transformers/modeling_tf_gpt2.py
@@ -139,10 +139,10 @@ class TFAttention(tf.keras.layers.Layer):
         key = self.split_heads(key)
         value = self.split_heads(value)
         if layer_past is not None:
-            past_key, past_value = tf.unstack(layer_past, axis=1)
+            past_key, past_value = tf.unstack(layer_past, axis=0)
             key = tf.concat([past_key, key], axis=-2)
             value = tf.concat([past_value, value], axis=-2)
-        present = tf.stack([key, value], axis=1)
+        present = tf.stack([key, value], axis=0)
 
         attn_outputs = self._attn([query, key, value, attention_mask, head_mask], training=training)
         a = attn_outputs[0]
diff --git a/src/transformers/modeling_tf_utils.py b/src/transformers/modeling_tf_utils.py
index e3083b6d20..1dfeecdd8e 100644
--- a/src/transformers/modeling_tf_utils.py
+++ b/src/transformers/modeling_tf_utils.py
@@ -658,7 +658,9 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin):
 
             # repetition penalty from CTRL paper (https://arxiv.org/abs/1909.05858)
             if repetition_penalty != 1.0:
-                next_token_logits_penalties = _create_next_token_logits_penalties(input_ids, next_token_logits, repetition_penalty)
+                next_token_logits_penalties = _create_next_token_logits_penalties(
+                    input_ids, next_token_logits, repetition_penalty
+                )
                 next_token_logits = tf.math.multiply(next_token_logits, next_token_logits_penalties)
 
             if do_sample:
@@ -779,7 +781,9 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin):
 
             # repetition penalty (from CTRL paper https://arxiv.org/abs/1909.05858)
             if repetition_penalty != 1.0:
-                next_token_logits_penalties = _create_next_token_logits_penalties(input_ids, next_token_logits, repetition_penalty)
+                next_token_logits_penalties = _create_next_token_logits_penalties(
+                    input_ids, next_token_logits, repetition_penalty
+                )
                 next_token_logits = tf.math.multiply(next_token_logits, next_token_logits_penalties)
 
             if do_sample:
@@ -791,11 +795,15 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin):
                     next_token_logits, top_k=top_k, top_p=top_p, min_tokens_to_keep=2
                 )  # (batch_size * num_beams, vocab_size)
                 # Sample 2 next tokens for each beam (so we have some spare tokens and match output of greedy beam search)
-                next_tokens = tf.random.categorical(next_token_logits, dtype=tf.int32, num_samples=2)  # (batch_size * num_beams, vocab_size)
+                next_tokens = tf.random.categorical(
+                    next_token_logits, dtype=tf.int32, num_samples=2
+                )  # (batch_size * num_beams, vocab_size)
                 # Compute next scores
                 scores = tf.nn.log_softmax(next_token_logits, axis=-1)  # (batch_size * num_beams, vocab_size)
                 _scores = tf.gather(scores, next_tokens, batch_dims=1)  # (batch_size * num_beams, 2)
-                next_scores = _scores + tf.broadcast_to(beam_scores[:, None], (batch_size * num_beams, 2))  # (batch_size * num_beams, 2)
+                next_scores = _scores + tf.broadcast_to(
+                    beam_scores[:, None], (batch_size * num_beams, 2)
+                )  # (batch_size * num_beams, 2)
                 # Match shape of greedy beam search
                 next_tokens = tf.reshape(next_tokens, (batch_size, 2 * num_beams))  # (batch_size, 2 * num_beams)
                 next_scores = tf.reshape(next_scores, (batch_size, 2 * num_beams))  # (batch_size, 2 * num_beams)
@@ -804,10 +812,14 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin):
                 scores = tf.nn.log_softmax(next_token_logits, axis=-1)  # (batch_size * num_beams, vocab_size)
                 assert shape_list(scores) == [batch_size * num_beams, vocab_size]
                 # Add the log prob of the new beams to the log prob of the beginning of the sequence (sum of logs == log of the product)
-                next_scores = scores + tf.broadcast_to(beam_scores[:, None], (batch_size * num_beams, vocab_size))  # (batch_size * num_beams, vocab_size)
+                next_scores = scores + tf.broadcast_to(
+                    beam_scores[:, None], (batch_size * num_beams, vocab_size)
+                )  # (batch_size * num_beams, vocab_size)
 
                 # re-organize to group the beam together (we are keeping top hypothesis accross beams)
-                next_scores = tf.reshape(next_scores, (batch_size, num_beams * vocab_size))  # (batch_size, num_beams * vocab_size)
+                next_scores = tf.reshape(
+                    next_scores, (batch_size, num_beams * vocab_size)
+                )  # (batch_size, num_beams * vocab_size)
                 next_scores, next_tokens = tf.math.top_k(next_scores, 2 * num_beams, sorted=True)
 
             assert shape_list(next_scores) == shape_list(next_tokens) == [batch_size, 2 * num_beams]
@@ -909,7 +921,9 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin):
                 best_hyp = sorted_hyps.pop()[1]
                 sent_lengths_list.append(len(best_hyp))
                 best.append(best_hyp)
-        assert output_batch_size == len(best), "Output batch size {} must match output beam hypotheses {}".format(output_batch_size, len(best))
+        assert output_batch_size == len(best), "Output batch size {} must match output beam hypotheses {}".format(
+            output_batch_size, len(best)
+        )
 
         sent_lengths = tf.convert_to_tensor(sent_lengths_list, dtype=tf.int32)
 
@@ -925,7 +939,11 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin):
                 decoded_hypo = tf.concat([hypo, padding], axis=0)
 
                 if sent_lengths[i] < max_length:
-                    decoded_hypo = tf.where(tf.range(max_length) == sent_lengths[i], eos_token_ids[0] * tf.ones((sent_max_len,), dtype=tf.int32), decoded_hypo)
+                    decoded_hypo = tf.where(
+                        tf.range(max_length) == sent_lengths[i],
+                        eos_token_ids[0] * tf.ones((sent_max_len,), dtype=tf.int32),
+                        decoded_hypo,
+                    )
                 decoded_list.append(decoded_hypo)
             decoded = tf.stack(decoded_list)
         else:
@@ -942,7 +960,6 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin):
             # get the correct batch idx from layer past batch dim
             # batch dim of `past` and `mems` is at 2nd position
             reordered_layer_past = [tf.identity(tf.expand_dims(layer_past[:, i], 1)) for i in beam_idx]
-            # TODO: check whether it is an error that TF past.shape != Torch past.shape
             reordered_layer_past = tf.concat(reordered_layer_past, axis=1)
             # check that shape matches
             assert shape_list(reordered_layer_past) == shape_list(layer_past)

From 7a89a3e4935cdd7b46765c5737665b10bfed1e28 Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Wed, 4 Mar 2020 12:02:57 +0100
Subject: [PATCH 52/80] correct beam search sampling

---
 src/transformers/modeling_tf_utils.py | 36 ++++++++++++++++-----------
 1 file changed, 21 insertions(+), 15 deletions(-)

diff --git a/src/transformers/modeling_tf_utils.py b/src/transformers/modeling_tf_utils.py
index 1dfeecdd8e..68151d93c5 100644
--- a/src/transformers/modeling_tf_utils.py
+++ b/src/transformers/modeling_tf_utils.py
@@ -760,9 +760,14 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin):
         ]
 
         # scores for each sentence in the beam
-        beam_scores_begin = tf.zeros((batch_size, 1), dtype=tf.float32)
-        beam_scores_end = tf.zeros((batch_size, num_beams - 1), dtype=tf.float32) * 1e-9
-        beam_scores = tf.reshape(tf.concat([beam_scores_begin, beam_scores_end], -1), (batch_size * num_beams,))
+        if do_sample is False:
+            beam_scores_begin = tf.zeros((batch_size, 1), dtype=tf.float32)
+            beam_scores_end = tf.zeros((batch_size, num_beams - 1), dtype=tf.float32) * 1e-9
+            beam_scores = tf.concat([beam_scores_begin, beam_scores_end], -1)
+        else:
+            beam_scores = tf.zeros((batch_size, num_beams), dtype=tf.float32)
+
+        beam_scores = tf.reshape(beam_scores, (batch_size * num_beams,))
 
         # cache compute states
         past = None
@@ -790,23 +795,24 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin):
                 # Temperature (higher temperature => more likely to sample low probability tokens)
                 if temperature != 1.0:
                     next_token_logits = next_token_logits / temperature
+
+                scores = tf.nn.log_softmax(next_token_logits, axis=-1)  # (batch_size * num_beams, vocab_size)
+                _scores = scores + tf.broadcast_to(
+                    beam_scores[:, None], (batch_size * num_beams, vocab_size)
+                )  # (batch_size * num_beams, vocab_size)
+
                 # Top-p/top-k filtering
-                next_token_logits = tf_top_k_top_p_filtering(
-                    next_token_logits, top_k=top_k, top_p=top_p, min_tokens_to_keep=2
+                _scores = tf_top_k_top_p_filtering(
+                    _scores, top_k=top_k, top_p=top_p, min_tokens_to_keep=2
                 )  # (batch_size * num_beams, vocab_size)
                 # Sample 2 next tokens for each beam (so we have some spare tokens and match output of greedy beam search)
+                _scores = tf.reshape(_scores, (batch_size, num_beams * vocab_size))
+
                 next_tokens = tf.random.categorical(
-                    next_token_logits, dtype=tf.int32, num_samples=2
-                )  # (batch_size * num_beams, vocab_size)
+                    _scores, dtype=tf.int32, num_samples=2 * num_beams
+                )  # (batch_size, 2 * num_beams)
                 # Compute next scores
-                scores = tf.nn.log_softmax(next_token_logits, axis=-1)  # (batch_size * num_beams, vocab_size)
-                _scores = tf.gather(scores, next_tokens, batch_dims=1)  # (batch_size * num_beams, 2)
-                next_scores = _scores + tf.broadcast_to(
-                    beam_scores[:, None], (batch_size * num_beams, 2)
-                )  # (batch_size * num_beams, 2)
-                # Match shape of greedy beam search
-                next_tokens = tf.reshape(next_tokens, (batch_size, 2 * num_beams))  # (batch_size, 2 * num_beams)
-                next_scores = tf.reshape(next_scores, (batch_size, 2 * num_beams))  # (batch_size, 2 * num_beams)
+                next_scores = tf.gather(_scores, next_tokens, batch_dims=1)  # (batch_size, 2 * num_beams)
             else:
                 # do greedy beam search
                 scores = tf.nn.log_softmax(next_token_logits, axis=-1)  # (batch_size * num_beams, vocab_size)

From 71c87119708b6466574eb2cb11097c6b38b86fc9 Mon Sep 17 00:00:00 2001
From: Funtowicz Morgan <mfuntowicz@users.noreply.github.com>
Date: Wed, 4 Mar 2020 16:45:57 +0000
Subject: [PATCH 53/80] Adding Docker images for transformers + notebooks
 (#3051)

* Added transformers-pytorch-cpu and gpu Docker images

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Added automatic jupyter launch for Docker image.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Move image from alpine to Ubuntu to align with NVidia container images.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Added TRANSFORMERS_VERSION argument to Dockerfile.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Added Pytorch-GPU based Docker image

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Added Tensorflow images.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Use Python 3.7 as TensorFlow doesn't provide a 3.8-compatible wheel.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Remove double FROM instructions on transformers-pytorch-cpu image.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Added transformers-tensorflow-gpu Docker image.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* use the correct ubuntu version for tensorflow-gpu

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Added pipelines example notebook

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Added transformers-cpu and transformers-gpu (including both PyTorch and TensorFlow) images.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Docker images don't start Jupyter notebook by default.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Tokenizers notebook

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Update images links

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Update Docker images to python 3.7.6 and transformers 2.5.1

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Added 02-transformers notebook.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Trying to realign 02-transformers notebook ?

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Added Transformer image schema

* Some tweaks on tokenizers notebook

* Removed old notebooks.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Attempt to provide table of content for each notebooks

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Second attempt.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Reintroduce transformer image.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Keep trying

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* It's going to fly !

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Remaining of the Table of Content

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Fix inlined elements for the table of content

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Removed anaconda dependencies for Docker images.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Removing notebooks ToC

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Added LABEL to each docker image.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Removed old Dockerfile

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Directly use the context and include transformers from here.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Reduce overall size of compiled Docker images.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Install jupyter by default and use CMD for easier launching of the images.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Reduce number of layers in the images.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Added README.md for notebooks.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Fix notebooks link in README

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Fix some wording issues.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Added blog notebooks too.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Addressing spelling errors in review comments.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

Co-authored-by: MOI Anthony <xn1t0x@gmail.com>
---
 docker/Dockerfile                             |    7 -
 docker/transformers-cpu/Dockerfile            |   26 +
 docker/transformers-gpu/Dockerfile            |   26 +
 docker/transformers-pytorch-cpu/Dockerfile    |   25 +
 docker/transformers-pytorch-gpu/Dockerfile    |   25 +
 docker/transformers-tensorflow-cpu/Dockerfile |   25 +
 docker/transformers-tensorflow-gpu/Dockerfile |   25 +
 notebooks/01-training-tokenizers.ipynb        |  366 ++
 notebooks/02-transformers.ipynb               |  502 ++
 notebooks/03-pipelines.ipynb                  |  594 ++
 notebooks/Comparing-PT-and-TF-models.ipynb    | 1630 ------
 .../Comparing-TF-and-PT-models-MLM-NSP.ipynb  | 4815 -----------------
 .../Comparing-TF-and-PT-models-SQuAD.ipynb    | 1644 ------
 notebooks/Comparing-TF-and-PT-models.ipynb    | 1318 -----
 notebooks/README.md                           |   17 +
 15 files changed, 1631 insertions(+), 9414 deletions(-)
 delete mode 100644 docker/Dockerfile
 create mode 100644 docker/transformers-cpu/Dockerfile
 create mode 100644 docker/transformers-gpu/Dockerfile
 create mode 100644 docker/transformers-pytorch-cpu/Dockerfile
 create mode 100644 docker/transformers-pytorch-gpu/Dockerfile
 create mode 100644 docker/transformers-tensorflow-cpu/Dockerfile
 create mode 100644 docker/transformers-tensorflow-gpu/Dockerfile
 create mode 100644 notebooks/01-training-tokenizers.ipynb
 create mode 100644 notebooks/02-transformers.ipynb
 create mode 100644 notebooks/03-pipelines.ipynb
 delete mode 100644 notebooks/Comparing-PT-and-TF-models.ipynb
 delete mode 100644 notebooks/Comparing-TF-and-PT-models-MLM-NSP.ipynb
 delete mode 100644 notebooks/Comparing-TF-and-PT-models-SQuAD.ipynb
 delete mode 100644 notebooks/Comparing-TF-and-PT-models.ipynb
 create mode 100644 notebooks/README.md

diff --git a/docker/Dockerfile b/docker/Dockerfile
deleted file mode 100644
index fed834ff88..0000000000
--- a/docker/Dockerfile
+++ /dev/null
@@ -1,7 +0,0 @@
-FROM pytorch/pytorch:latest
-
-RUN git clone https://github.com/NVIDIA/apex.git && cd apex && python setup.py install --cuda_ext --cpp_ext
-
-RUN pip install transformers
-
-WORKDIR /workspace
\ No newline at end of file
diff --git a/docker/transformers-cpu/Dockerfile b/docker/transformers-cpu/Dockerfile
new file mode 100644
index 0000000000..0d22039a48
--- /dev/null
+++ b/docker/transformers-cpu/Dockerfile
@@ -0,0 +1,26 @@
+FROM ubuntu:18.04
+LABEL maintainer="Hugging Face"
+LABEL repository="transformers"
+
+RUN apt update && \
+    apt install -y bash \
+                   build-essential \
+                   git \
+                   curl \
+                   ca-certificates \
+                   python3 \
+                   python3-pip && \
+    rm -rf /var/lib/apt/lists
+
+RUN python3 -m pip install --no-cache-dir --upgrade pip && \
+    python3 -m pip install --no-cache-dir \
+    jupyter \
+    tensorflow-cpu \
+    torch
+
+WORKDIR /workspace
+COPY . transformers/
+RUN cd transformers/ && \
+    python3 -m pip install --no-cache-dir .
+
+CMD ["/bin/bash"]
\ No newline at end of file
diff --git a/docker/transformers-gpu/Dockerfile b/docker/transformers-gpu/Dockerfile
new file mode 100644
index 0000000000..6d68d2e480
--- /dev/null
+++ b/docker/transformers-gpu/Dockerfile
@@ -0,0 +1,26 @@
+FROM nvidia/cuda:10.1-cudnn7-runtime-ubuntu18.04
+LABEL maintainer="Hugging Face"
+LABEL repository="transformers"
+
+RUN apt update && \
+    apt install -y bash \
+                   build-essential \
+                   git \
+                   curl \
+                   ca-certificates \
+                   python3 \
+                   python3-pip && \
+    rm -rf /var/lib/apt/lists
+
+RUN python3 -m pip install --no-cache-dir --upgrade pip && \
+    python3 -m pip install --no-cache-dir \
+    jupyter \
+    tensorflow \
+    torch
+
+WORKDIR /workspace
+COPY . transformers/
+RUN cd transformers/ && \
+    python3 -m pip install --no-cache-dir .
+
+CMD ["/bin/bash"]
\ No newline at end of file
diff --git a/docker/transformers-pytorch-cpu/Dockerfile b/docker/transformers-pytorch-cpu/Dockerfile
new file mode 100644
index 0000000000..d1759d650b
--- /dev/null
+++ b/docker/transformers-pytorch-cpu/Dockerfile
@@ -0,0 +1,25 @@
+FROM ubuntu:18.04
+LABEL maintainer="Hugging Face"
+LABEL repository="transformers"
+
+RUN apt update && \
+    apt install -y bash \
+                   build-essential \
+                   git \
+                   curl \
+                   ca-certificates \
+                   python3 \
+                   python3-pip && \
+    rm -rf /var/lib/apt/lists
+
+RUN python3 -m pip install --no-cache-dir --upgrade pip && \
+    python3 -m pip install --no-cache-dir \
+    jupyter \
+    torch
+
+WORKDIR /workspace
+COPY . transformers/
+RUN cd transformers/ && \
+    python3 -m pip install --no-cache-dir .
+
+CMD ["/bin/bash"]
\ No newline at end of file
diff --git a/docker/transformers-pytorch-gpu/Dockerfile b/docker/transformers-pytorch-gpu/Dockerfile
new file mode 100644
index 0000000000..4beff57dc9
--- /dev/null
+++ b/docker/transformers-pytorch-gpu/Dockerfile
@@ -0,0 +1,25 @@
+FROM nvidia/cuda:10.1-cudnn7-runtime-ubuntu18.04
+LABEL maintainer="Hugging Face"
+LABEL repository="transformers"
+
+RUN apt update && \
+    apt install -y bash \
+                   build-essential \
+                   git \
+                   curl \
+                   ca-certificates \
+                   python3 \
+                   python3-pip && \
+    rm -rf /var/lib/apt/lists
+
+RUN python3 -m pip install --no-cache-dir --upgrade pip && \
+    python3 -m pip install --no-cache-dir \
+    mkl \
+    torch
+
+WORKDIR /workspace
+COPY . transformers/
+RUN cd transformers/ && \
+    python3 -m pip install --no-cache-dir .
+
+CMD ["/bin/bash"]
\ No newline at end of file
diff --git a/docker/transformers-tensorflow-cpu/Dockerfile b/docker/transformers-tensorflow-cpu/Dockerfile
new file mode 100644
index 0000000000..e4af2b84bd
--- /dev/null
+++ b/docker/transformers-tensorflow-cpu/Dockerfile
@@ -0,0 +1,25 @@
+FROM ubuntu:18.04
+LABEL maintainer="Hugging Face"
+LABEL repository="transformers"
+
+RUN apt update && \
+    apt install -y bash \
+                   build-essential \
+                   git \
+                   curl \
+                   ca-certificates \
+                   python3 \
+                   python3-pip && \
+    rm -rf /var/lib/apt/lists
+
+RUN python3 -m pip install --no-cache-dir --upgrade pip && \
+    python3 -m pip install --no-cache-dir \
+    mkl \
+    tensorflow-cpu
+
+WORKDIR /workspace
+COPY . transformers/
+RUN cd transformers/ && \
+    python3 -m pip install --no-cache-dir .
+
+CMD ["/bin/bash"]
\ No newline at end of file
diff --git a/docker/transformers-tensorflow-gpu/Dockerfile b/docker/transformers-tensorflow-gpu/Dockerfile
new file mode 100644
index 0000000000..3277434c9f
--- /dev/null
+++ b/docker/transformers-tensorflow-gpu/Dockerfile
@@ -0,0 +1,25 @@
+FROM nvidia/cuda:10.1-cudnn7-runtime-ubuntu18.04
+LABEL maintainer="Hugging Face"
+LABEL repository="transformers"
+
+RUN apt update && \
+    apt install -y bash \
+                   build-essential \
+                   git \
+                   curl \
+                   ca-certificates \
+                   python3 \
+                   python3-pip && \
+    rm -rf /var/lib/apt/lists
+
+RUN python3 -m pip install --no-cache-dir --upgrade pip && \
+    python3 -m pip install --no-cache-dir \
+    mkl \
+    tensorflow
+
+WORKDIR /workspace
+COPY . transformers/
+RUN cd transformers/ && \
+    python3 -m pip install --no-cache-dir .
+
+CMD ["/bin/bash"]
\ No newline at end of file
diff --git a/notebooks/01-training-tokenizers.ipynb b/notebooks/01-training-tokenizers.ipynb
new file mode 100644
index 0000000000..554d25d3ff
--- /dev/null
+++ b/notebooks/01-training-tokenizers.ipynb
@@ -0,0 +1,366 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "source": [
+    "## Tokenization doesn't have to be slow !\n",
+    "\n",
+    "### Introduction\n",
+    "\n",
+    "Before going deep into any Machine Learning or Deep Learning Natural Language Processing models, every practitioner\n",
+    "should find a way to map raw input strings to a representation understandable by a trainable model.\n",
+    "\n",
+    "One very simple approach would be to split inputs over every space and assign an identifier to each word. This approach\n",
+    "would look similar to the code below in python\n",
+    "\n",
+    "```python\n",
+    "s = \"very long corpus...\"\n",
+    "words = s.split(\" \")  # Split over space\n",
+    "vocabulary = dict(enumerate(set(words)))  # Map each id to its corresponding word\n",
+    "```\n",
+    "\n",
+    "This approach might work well if your vocabulary remains small as it would store every word (or **token**) present in your original\n",
+    "input. Moreover, word variations like \"cat\" and \"cats\" would not share the same identifiers even if their meaning is \n",
+    "quite close.\n",
+    "\n",
+    "![tokenization_simple](https://cdn.analyticsvidhya.com/wp-content/uploads/2019/11/tokenization.png)\n",
+    "\n",
+    "### Subtoken Tokenization\n",
+    "\n",
+    "To overcome the issues described above, recent works have been done on tokenization, leveraging \"subtoken\" tokenization.\n",
+    "**Subtokens** extend the previous splitting strategy by further breaking a word into grammatically logical sub-components learned\n",
+    "from the data.\n",
+    "\n",
+    "Taking our previous example of the words __cat__ and __cats__, a sub-tokenization of the word __cats__ would be [cat, ##s]. Where the prefix _\"##\"_ indicates a subtoken of the initial input. \n",
+    "Such training algorithms might extract sub-tokens such as _\"##ing\"_, _\"##ed\"_ over English corpus.\n",
+    "\n",
+    "As you might expect, this kind of sub-token construction leveraging compositions of _\"pieces\"_ overall reduces the size\n",
+    "of the vocabulary you have to carry to train a Machine Learning model. On the other hand, as one token might be exploded\n",
+    "into multiple subtokens, the input of your model might grow and become an issue for models with non-linear complexity over the input sequence's length. \n",
+    " \n",
+    "![subtokenization](https://nlp.fast.ai/images/multifit_vocabularies.png)\n",
+    " \n",
+    "Among all the tokenization algorithms, we can highlight a few subtokens algorithms used in Transformers-based SoTA models : \n",
+    "\n",
+    "- [Byte Pair Encoding (BPE) - Neural Machine Translation of Rare Words with Subword Units (Sennrich et al., 2015)](https://arxiv.org/abs/1508.07909)\n",
+    "- [Word Piece - Japanese and Korean voice search (Schuster, M., and Nakajima, K., 2015)](https://research.google/pubs/pub37842/)\n",
+    "- [Unigram Language Model - Subword Regularization: Improving Neural Network Translation Models with Multiple Subword Candidates (Kudo, T., 2018)](https://arxiv.org/abs/1804.10959)\n",
+    "- [Sentence Piece - A simple and language independent subword tokenizer and detokenizer for Neural Text Processing (Taku Kudo and John Richardson, 2018)](https://arxiv.org/abs/1808.06226)\n",
+    "\n",
+    "Going through all of them is out of the scope of this notebook, so we will just highlight how you can use them.\n",
+    "\n",
+    "### @huggingface/tokenizers library \n",
+    "Along with the transformers library, we @huggingface provide a blazing fast tokenization library\n",
+    "able to train, tokenize and decode dozens of Gb/s of text on a common multi-core machine.\n",
+    "\n",
+    "The library is written in Rust allowing us to take full advantage of multi-core parallel computations in a native and memory-aware way, on-top of which \n",
+    "we provide bindings for Python and NodeJS (more bindings may be added in the future). \n",
+    "\n",
+    "We designed the library so that it provides all the required blocks to create end-to-end tokenizers in an interchangeable way. In that sense, we provide\n",
+    "these various components: \n",
+    "\n",
+    "- **Normalizer**: Executes all the initial transformations over the initial input string. For example when you need to\n",
+    "lowercase some text, maybe strip it, or even apply one of the common unicode normalization process, you will add a Normalizer. \n",
+    "- **PreTokenizer**: In charge of splitting the initial input string. That's the component that decides where and how to\n",
+    "pre-segment the origin string. The simplest example would be like we saw before, to simply split on spaces.\n",
+    "- **Model**: Handles all the sub-token discovery and generation; this part is trainable and really dependent\n",
+    " on your input data.\n",
+    "- **Post-Processor**: Provides advanced construction features to be compatible with some of the Transformers-based SoTA\n",
+    "models. For instance, for BERT it would wrap the tokenized sentence around [CLS] and [SEP] tokens.\n",
+    "- **Decoder**: In charge of mapping back a tokenized input to the original string. The decoder is usually chosen according\n",
+    "to the `PreTokenizer` we used previously.\n",
+    "- **Trainer**: Provides training capabilities to each model.\n",
+    "\n",
+    "For each of the components above we provide multiple implementations:\n",
+    "\n",
+    "- **Normalizer**: Lowercase, Unicode (NFD, NFKD, NFC, NFKC), Bert, Strip, ...\n",
+    "- **PreTokenizer**: ByteLevel, WhitespaceSplit, CharDelimiterSplit, Metaspace, ...\n",
+    "- **Model**: WordLevel, BPE, WordPiece\n",
+    "- **Post-Processor**: BertProcessor, ...\n",
+    "- **Decoder**: WordLevel, BPE, WordPiece, ...\n",
+    "\n",
+    "All of these building blocks can be combined to create working tokenization pipelines. \n",
+    "In the next section we will go over our first pipeline."
+   ],
+   "metadata": {
+    "collapsed": false,
+    "pycharm": {
+     "name": "#%% md\n",
+     "is_executing": false
+    }
+   }
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "Alright, now we are ready to implement our first tokenization pipeline through `tokenizers`. \n",
+    "\n",
+    "For this, we will train a Byte-Pair Encoding (BPE) tokenizer on a quite small input for the purpose of this notebook.\n",
+    "We will work with [the file from Peter Norvig](https://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&uact=8&ved=2ahUKEwjYp9Ppru_nAhUBzIUKHfbUAG8QFjAAegQIBhAB&url=https%3A%2F%2Fnorvig.com%2Fbig.txt&usg=AOvVaw2ed9iwhcP1RKUiEROs15Dz).\n",
+    "This file contains around 130.000 lines of raw text that will be processed by the library to generate a working tokenizer."
+   ],
+   "metadata": {
+    "collapsed": false,
+    "pycharm": {
+     "name": "#%% md\n"
+    }
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "outputs": [],
+   "source": [
+    "BIG_FILE_URL = 'https://raw.githubusercontent.com/dscape/spell/master/test/resources/big.txt'\n",
+    "\n",
+    "# Let's download the file and save it somewhere\n",
+    "from requests import get\n",
+    "with open('big.txt', 'wb') as big_f:\n",
+    "    response = get(BIG_FILE_URL, )\n",
+    "    \n",
+    "    if response.status_code == 200:\n",
+    "        big_f.write(response.content)\n",
+    "    else:\n",
+    "        print(\"Unable to get the file: {}\".format(response.reason))\n"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "pycharm": {
+     "name": "#%% code\n",
+     "is_executing": false
+    }
+   }
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    " \n",
+    "Now that we have our training data we need to create the overall pipeline for the tokenizer\n",
+    " "
+   ],
+   "metadata": {
+    "collapsed": false,
+    "pycharm": {
+     "name": "#%% md\n",
+     "is_executing": false
+    }
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "outputs": [],
+   "source": [
+    "# For the user's convenience `tokenizers` provides some very high-level classes encapsulating\n",
+    "# the overall pipeline for various well-known tokenization algorithm. \n",
+    "# Everything described below can be replaced by the ByteLevelBPETokenizer class. \n",
+    "\n",
+    "from tokenizers import Tokenizer\n",
+    "from tokenizers.decoders import ByteLevel as ByteLevelDecoder\n",
+    "from tokenizers.models import BPE\n",
+    "from tokenizers.normalizers import Lowercase, NFKC, Sequence\n",
+    "from tokenizers.pre_tokenizers import ByteLevel\n",
+    "\n",
+    "# First we create an empty Byte-Pair Encoding model (i.e. not trained model)\n",
+    "tokenizer = Tokenizer(BPE.empty())\n",
+    "\n",
+    "# Then we enable lower-casing and unicode-normalization\n",
+    "# The Sequence normalizer allows us to combine multiple Normalizer, that will be\n",
+    "# executed in sequence.\n",
+    "tokenizer.normalizer = Sequence([\n",
+    "    NFKC(),\n",
+    "    Lowercase()\n",
+    "])\n",
+    "\n",
+    "# Our tokenizer also needs a pre-tokenizer responsible for converting the input to a ByteLevel representation.\n",
+    "tokenizer.pre_tokenizer = ByteLevel()\n",
+    "\n",
+    "# And finally, let's plug a decoder so we can recover from a tokenized input to the original one\n",
+    "tokenizer.decoder = ByteLevelDecoder()"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "pycharm": {
+     "name": "#%% code\n",
+     "is_executing": false
+    }
+   }
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "The overall pipeline is now ready to be trained on the corpus we downloaded earlier in this notebook."
+   ],
+   "metadata": {
+    "collapsed": false,
+    "pycharm": {
+     "name": "#%% md\n"
+    }
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "outputs": [
+    {
+     "name": "stdout",
+     "text": [
+      "Trained vocab size: 25000\n"
+     ],
+     "output_type": "stream"
+    }
+   ],
+   "source": [
+    "from tokenizers.trainers import BpeTrainer\n",
+    "\n",
+    "# We initialize our trainer, giving it the details about the vocabulary we want to generate\n",
+    "trainer = BpeTrainer(vocab_size=25000, show_progress=True, initial_alphabet=ByteLevel.alphabet())\n",
+    "tokenizer.train(trainer, [\"big.txt\"])\n",
+    "\n",
+    "print(\"Trained vocab size: {}\".format(tokenizer.get_vocab_size()))"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "pycharm": {
+     "name": "#%% code\n",
+     "is_executing": false
+    }
+   }
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "Et voilà ! You trained your very first tokenizer from scratch using `tokenizers`. Of course, this \n",
+    "covers only the basics, and you may want to have a look at the `add_special_tokens` or `special_tokens` parameters\n",
+    "on the `Trainer` class, but the overall process should be very similar.\n",
+    "\n",
+    "We can save the content of the model to reuse it later."
+   ],
+   "metadata": {
+    "collapsed": false,
+    "pycharm": {
+     "name": "#%% md\n"
+    }
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "outputs": [
+    {
+     "data": {
+      "text/plain": "['./vocab.json', './merges.txt']"
+     },
+     "metadata": {},
+     "output_type": "execute_result",
+     "execution_count": 12
+    }
+   ],
+   "source": [
+    "# You will see the generated files in the output.\n",
+    "tokenizer.model.save('.')"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "pycharm": {
+     "name": "#%% code\n",
+     "is_executing": false
+    }
+   }
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "Now, let's load the trained model and start using our newly trained tokenizer."
+   ],
+   "metadata": {
+    "collapsed": false,
+    "pycharm": {
+     "name": "#%% md\n"
+    }
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "outputs": [
+    {
+     "name": "stdout",
+     "text": [
+      "Encoded string: ['Ġthis', 'Ġis', 'Ġa', 'Ġsimple', 'Ġin', 'put', 'Ġto', 'Ġbe', 'Ġtoken', 'ized']\n",
+      "Decoded string:  this is a simple input to be tokenized\n"
+     ],
+     "output_type": "stream"
+    }
+   ],
+   "source": [
+    "# Let's tokenize a simple input\n",
+    "tokenizer.model = BPE.from_files('vocab.json', 'merges.txt')\n",
+    "encoding = tokenizer.encode(\"This is a simple input to be tokenized\")\n",
+    "\n",
+    "print(\"Encoded string: {}\".format(encoding.tokens))\n",
+    "\n",
+    "decoded = tokenizer.decode(encoding.ids)\n",
+    "print(\"Decoded string: {}\".format(decoded))"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "pycharm": {
+     "name": "#%% code\n",
+     "is_executing": false
+    }
+   }
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "The Encoding structure exposes multiple properties which are useful when working with transformers models\n",
+    "\n",
+    "- normalized_str: The input string after normalization (lower-casing, unicode, stripping, etc.)\n",
+    "- original_str: The input string as it was provided\n",
+    "- tokens: The generated tokens with their string representation\n",
+    "- input_ids: The generated tokens with their integer representation\n",
+    "- attention_mask: If your input has been padded by the tokenizer, then this would be a vector of 1 for any non padded token and 0 for padded ones.\n",
+    "- special_token_mask: If your input contains special tokens such as [CLS], [SEP], [MASK], [PAD], then this would be a vector with 1 in places where a special token has been added.\n",
+    "- type_ids: If your input was made of multiple \"parts\" such as (question, context), then this would be a vector giving, for each token, the segment it belongs to.\n",
+    "- overflowing: If your input has been truncated into multiple subparts because of a length limit (for BERT for example the sequence length is limited to 512), this will contain all the remaining overflowing parts."
+   ],
+   "metadata": {
+    "collapsed": false,
+    "pycharm": {
+     "name": "#%% md\n"
+    }
+   }
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 2
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython2",
+   "version": "2.7.6"
+  },
+  "pycharm": {
+   "stem_cell": {
+    "cell_type": "raw",
+    "source": [],
+    "metadata": {
+     "collapsed": false
+    }
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
\ No newline at end of file
diff --git a/notebooks/02-transformers.ipynb b/notebooks/02-transformers.ipynb
new file mode 100644
index 0000000000..fcd9db55cd
--- /dev/null
+++ b/notebooks/02-transformers.ipynb
@@ -0,0 +1,502 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "collapsed": true,
+    "pycharm": {
+     "is_executing": false,
+     "name": "#%% md\n"
+    }
+   },
+   "source": [
+    "## Introduction\n",
+    "The transformers library is an open-source, community-based repository to train, use and share models based on \n",
+    "the Transformer architecture [(Vaswani & al., 2017)](https://arxiv.org/abs/1706.03762) such as Bert [(Devlin & al., 2018)](https://arxiv.org/abs/1810.04805),\n",
+    "Roberta [(Liu & al., 2019)](https://arxiv.org/abs/1907.11692), GPT2 [(Radford & al., 2019)](https://cdn.openai.com/better-language-models/language_models_are_unsupervised_multitask_learners.pdf),\n",
+    "XLNet [(Yang & al., 2019)](https://arxiv.org/abs/1906.08237), etc. \n",
+    "\n",
+    "Along with the models, the library contains multiple variations of each of them for a large variety of \n",
+    "downstream-tasks like **Named Entity Recognition (NER)**, **Sentiment Analysis**, \n",
+    "**Language Modeling**, **Question Answering** and so on.\n",
+    "\n",
+    "## Before Transformer\n",
+    "\n",
+    "Back in 2017, most of the people using Neural Networks when working on Natural Language Processing were relying on \n",
+    "sequential processing of the input through [Recurrent Neural Network (RNN)](https://en.wikipedia.org/wiki/Recurrent_neural_network).\n",
+    "\n",
+    "![rnn](http://colah.github.io/posts/2015-09-NN-Types-FP/img/RNN-general.png)   \n",
+    "\n",
+    "RNNs were performing well on a large variety of tasks involving sequential dependency over the input sequence. \n",
+    "However, this sequentially-dependent process had issues modeling very long range dependencies and \n",
+    "was not well suited for the kind of hardware we're currently leveraging due to bad parallelization capabilities. \n",
+    "\n",
+    "Some extensions were provided by the academic community, such as Bidirectional RNN ([Schuster & Paliwal., 1997](https://www.researchgate.net/publication/3316656_Bidirectional_recurrent_neural_networks), [Graves & al., 2005](https://mediatum.ub.tum.de/doc/1290195/file.pdf)), \n",
+    "which can be seen as a concatenation of two sequential processes, one going forward, the other one going backward over the input sequence.\n",
+    "\n",
+    "![birnn](https://miro.medium.com/max/764/1*6QnPUSv_t9BY9Fv8_aLb-Q.png)\n",
+    "\n",
+    "\n",
+    "And also, the Attention mechanism, which introduced a good improvement over \"raw\" RNNs by giving \n",
+    "a learned, weighted-importance to each element in the sequence, allowing the model to focus on important elements.\n",
+    "\n",
+    "![attention_rnn](https://3qeqpr26caki16dnhd19sv6by6v-wpengine.netdna-ssl.com/wp-content/uploads/2017/08/Example-of-Attention.png)  \n",
+    "\n",
+    "## Then comes the Transformer  \n",
+    "\n",
+    "The Transformers era originally started from the work of [(Vaswani & al., 2017)](https://arxiv.org/abs/1706.03762) who\n",
+    "demonstrated its superiority over [Recurrent Neural Network (RNN)](https://en.wikipedia.org/wiki/Recurrent_neural_network)\n",
+    "on translation tasks but it quickly extended to almost all the tasks where RNNs were State-of-the-Art at that time.\n",
+    "\n",
+    "One advantage of Transformer over its RNN counterpart was its non sequential attention model. Remember, the RNNs had to\n",
+    "iterate over each element of the input sequence one-by-one and carry an \"updatable-state\" between each hop. With Transformer,\n",
+    "the model is able to look at every position in the sequence, at the same time, in one operation.\n",
+    "\n",
+    "For a deep-dive into the Transformer architecture, [The Annotated Transformer](https://nlp.seas.harvard.edu/2018/04/03/attention.html#encoder-and-decoder-stacks) \n",
+    "will drive you along all the details of the paper.\n",
+    "\n",
+    "![transformer-encoder-decoder](https://nlp.seas.harvard.edu/images/the-annotated-transformer_14_0.png)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "pycharm": {
+     "name": "#%% md\n"
+    }
+   },
+   "source": [
+    "## Getting started with transformers\n",
+    "\n",
+    "For the rest of this notebook, we will use a BERT model, as it's the simplest and there is plenty of content about it\n",
+    "over the internet, so it will be easy to dig deeper into this architecture if you want to.\n",
+    "\n",
+    "The transformers library allows you to benefit from large, pretrained language models without requiring a huge and costly computational\n",
+    "infrastructure. Most of the State-of-the-Art models are provided directly by their author and made available in the library \n",
+    "in PyTorch and TensorFlow in a transparent and interchangeable way. "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 74,
+   "metadata": {
+    "pycharm": {
+     "is_executing": false,
+     "name": "#%% code\n"
+    }
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "<torch.autograd.grad_mode.set_grad_enabled at 0x1af62fd450>"
+      ]
+     },
+     "execution_count": 74,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import torch\n",
+    "from transformers import AutoModel, AutoTokenizer, BertTokenizer\n",
+    "\n",
+    "torch.set_grad_enabled(False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 75,
+   "metadata": {
+    "pycharm": {
+     "is_executing": false,
+     "name": "#%% code\n"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# Store the model we want to use\n",
+    "MODEL_NAME = \"bert-base-cased\"\n",
+    "\n",
+    "# We need to create the model and tokenizer\n",
+    "model = AutoModel.from_pretrained(MODEL_NAME)\n",
+    "tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "pycharm": {
+     "name": "#%% md\n"
+    }
+   },
+   "source": [
+    "With only the above two lines of code, you're ready to use a BERT pre-trained model. \n",
+    "The tokenizers will allow us to map a raw textual input to a sequence of integers representing our textual input\n",
+    "in a way the model can manipulate."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 76,
+   "metadata": {
+    "pycharm": {
+     "is_executing": false,
+     "name": "#%% code\n"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Tokens: ['[CLS]', 'This', 'is', 'an', 'input', 'example', '[SEP]']\n",
+      "Tokens id: [101, 1188, 1110, 1126, 7758, 1859, 102]\n",
+      "Tokens PyTorch: tensor([[ 101, 1188, 1110, 1126, 7758, 1859,  102]])\n",
+      "Tokenwise output: torch.Size([1, 7, 768]), Pooled output: torch.Size([1, 768])\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Tokens come from a process that splits the input into sub-entities with interesting linguistic properties. \n",
+    "tokens = tokenizer.tokenize(\"This is an input example\")\n",
+    "print(\"Tokens: {}\".format(tokens))\n",
+    "\n",
+    "# This is not sufficient for the model, as it requires integers as input, \n",
+    "# not a problem, let's convert tokens to ids.\n",
+    "tokens_ids = tokenizer.convert_tokens_to_ids(tokens)\n",
+    "print(\"Tokens id: {}\".format(tokens_ids))\n",
+    "\n",
+    "# We need to convert to a Deep Learning framework specific format, let's use PyTorch for now.\n",
+    "tokens_pt = torch.tensor([tokens_ids])\n",
+    "print(\"Tokens PyTorch: {}\".format(tokens_pt))\n",
+    "\n",
+    "# Now we're ready to go through BERT with our input\n",
+    "outputs, pooled = model(tokens_pt)\n",
+    "print(\"Tokenwise output: {}, Pooled output: {}\".format(outputs.shape, pooled.shape))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "pycharm": {
+     "name": "#%% md\n"
+    }
+   },
+   "source": [
+    "As you can see, BERT outputs two tensors:\n",
+    " - One with the generated representation for every token in the input `(1, NB_TOKENS, REPRESENTATION_SIZE)`\n",
+    " - One with an aggregated representation for the whole input `(1, REPRESENTATION_SIZE)`\n",
+    " \n",
+    "The first, token-based, representation can be leveraged if your task requires to keep the sequence representation and you\n",
+    "want to operate at a token-level. This is particularly useful for Named Entity Recognition and Question-Answering.\n",
+    "\n",
+    "The second, aggregated, representation is especially useful if you need to extract the overall context of the sequence and don't\n",
+    "require a fine-grained token-level representation. This is the case for Sentiment-Analysis of the sequence or Information Retrieval."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "pycharm": {
+     "name": "#%% md\n"
+    }
+   },
+   "source": [
+    "The code you saw in the previous section introduced all the steps required to do simple model invocation.\n",
+    "For more day-to-day usage, transformers provides you higher-level methods which will make your NLP journey easier.\n",
+    "Let's improve our previous example"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 77,
+   "metadata": {
+    "pycharm": {
+     "is_executing": false,
+     "name": "#%% code\n"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "input_ids:\n",
+      "\ttensor([[ 101, 1188, 1110, 1126, 7758, 1859,  102]])\n",
+      "token_type_ids:\n",
+      "\ttensor([[0, 0, 0, 0, 0, 0, 0]])\n",
+      "attention_mask:\n",
+      "\ttensor([[1, 1, 1, 1, 1, 1, 1]])\n",
+      "Difference with previous code: (0.0, 0.0)\n"
+     ]
+    }
+   ],
+   "source": [
+    "# tokens = tokenizer.tokenize(\"This is an input example\")\n",
+    "# tokens_ids = tokenizer.convert_tokens_to_ids(tokens)\n",
+    "# tokens_pt = torch.tensor([tokens_ids])\n",
+    "\n",
+    "# This code can be factored into one line as follows\n",
+    "tokens_pt2 = tokenizer.encode_plus(\"This is an input example\", return_tensors=\"pt\")\n",
+    "\n",
+    "for key, value in tokens_pt2.items():\n",
+    "    print(\"{}:\\n\\t{}\".format(key, value))\n",
+    "\n",
+    "outputs2, pooled2 = model(**tokens_pt2)\n",
+    "print(\"Difference with previous code: ({}, {})\".format((outputs2 - outputs).sum(), (pooled2 - pooled).sum()))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "As you can see above, the method `encode_plus` provides a convenient way to generate all the required parameters\n",
+    "that will go through the model. \n",
+    "\n",
+    "Moreover, you might have noticed it generated some additional tensors: \n",
+    "\n",
+    "- token_type_ids: This tensor will map every token to its corresponding segment (see below).\n",
+    "- attention_mask: This tensor is used to \"mask\" padded values in a batch of sequences with different lengths (see below)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 78,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Single segment token (str): ['[CLS]', 'This', 'is', 'a', 'sample', 'input', '[SEP]']\n",
+      "Single segment token (int): [101, 1188, 1110, 170, 6876, 7758, 102]\n",
+      "Single segment type       : [0, 0, 0, 0, 0, 0, 0]\n",
+      "\n",
+      "Multi segment token (str): ['[CLS]', 'This', 'is', 'segment', 'A', '[SEP]', 'This', 'is', 'segment', 'B', '[SEP]']\n",
+      "Multi segment token (int): [101, 1188, 1110, 6441, 138, 102, 1188, 1110, 6441, 139, 102]\n",
+      "Multi segment type       : [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1]\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Single segment input\n",
+    "single_seg_input = tokenizer.encode_plus(\"This is a sample input\")\n",
+    "\n",
+    "# Multiple segment input\n",
+    "multi_seg_input = tokenizer.encode_plus(\"This is segment A\", \"This is segment B\")\n",
+    "\n",
+    "print(\"Single segment token (str): {}\".format(tokenizer.convert_ids_to_tokens(single_seg_input['input_ids'])))\n",
+    "print(\"Single segment token (int): {}\".format(single_seg_input['input_ids']))\n",
+    "print(\"Single segment type       : {}\".format(single_seg_input['token_type_ids']))\n",
+    "\n",
+    "# Segments are concatenated in the input to the model, with a [SEP] token marking the end of each segment\n",
+    "print()\n",
+    "print(\"Multi segment token (str): {}\".format(tokenizer.convert_ids_to_tokens(multi_seg_input['input_ids'])))\n",
+    "print(\"Multi segment token (int): {}\".format(multi_seg_input['input_ids']))\n",
+    "print(\"Multi segment type       : {}\".format(multi_seg_input['token_type_ids']))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 79,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Tokens (int)      : [101, 1188, 1110, 170, 6876, 102, 0, 0]\n",
+      "Tokens (str)      : ['[CLS]', 'This', 'is', 'a', 'sample', '[SEP]', '[PAD]', '[PAD]']\n",
+      "Tokens (attn_mask): [1, 1, 1, 1, 1, 1, 0, 0]\n",
+      "\n",
+      "Tokens (int)      : [101, 1188, 1110, 1330, 2039, 6876, 3087, 102]\n",
+      "Tokens (str)      : ['[CLS]', 'This', 'is', 'another', 'longer', 'sample', 'text', '[SEP]']\n",
+      "Tokens (attn_mask): [1, 1, 1, 1, 1, 1, 1, 1]\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Padding highlight\n",
+    "tokens = tokenizer.batch_encode_plus(\n",
+    "    [\"This is a sample\", \"This is another longer sample text\"], \n",
+    "    pad_to_max_length=True  # First sentence will have some PADDED tokens to match second sequence length\n",
+    ")\n",
+    "\n",
+    "for i in range(2):\n",
+    "    print(\"Tokens (int)      : {}\".format(tokens['input_ids'][i]))\n",
+    "    print(\"Tokens (str)      : {}\".format([tokenizer.convert_ids_to_tokens(s) for s in tokens['input_ids'][i]]))\n",
+    "    print(\"Tokens (attn_mask): {}\".format(tokens['attention_mask'][i]))\n",
+    "    print()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Frameworks interoperability\n",
+    "\n",
+    "One of the most powerful features of transformers is its ability to seamlessly move from PyTorch to TensorFlow\n",
+    "without pain for the user.\n",
+    "\n",
+    "We provide some convenient methods to load TensorFlow pretrained weights inside a PyTorch model and vice versa."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 80,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from transformers import TFBertModel, BertModel\n",
+    "\n",
+    "# Let's load a BERT model for TensorFlow and PyTorch\n",
+    "model_tf = TFBertModel.from_pretrained('bert-base-cased')\n",
+    "model_pt = BertModel.from_pretrained('bert-base-cased')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 81,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "output differences: 2.971128560602665e-05\n",
+      "pooled differences: -8.576549589633942e-06\n"
+     ]
+    }
+   ],
+   "source": [
+    "# transformers generates a ready to use dictionary with all the required parameters for the specific framework.\n",
+    "input_tf = tokenizer.encode_plus(\"This is a sample input\", return_tensors=\"tf\")\n",
+    "input_pt = tokenizer.encode_plus(\"This is a sample input\", return_tensors=\"pt\")\n",
+    "\n",
+    "# Let's compare the outputs\n",
+    "output_tf, output_pt = model_tf(input_tf), model_pt(**input_pt)\n",
+    "\n",
+    "# Each model outputs 2 values (the value for each token, the pooled representation of the input sentence)\n",
+    "# Here we compare the output differences between PyTorch and TensorFlow.\n",
+    "for name, o_tf, o_pt in zip([\"output\", \"pooled\"], output_tf, output_pt):\n",
+    "    print(\"{} differences: {}\".format(name, (o_tf.numpy() - o_pt.numpy()).sum()))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "pycharm": {
+     "name": "#%% md\n"
+    }
+   },
+   "source": [
+    "## Want it lighter? Faster? Let's talk distillation! \n",
+    "\n",
+    "One of the main concerns when using these Transformer based models is the computational power they require. Throughout this notebook we are using the BERT model as it can be run on common machines, but that's not the case for all of the models.\n",
+    "\n",
+    "For example, Google released a few months ago **T5**, an Encoder/Decoder architecture based on Transformer and available in `transformers` with no less than 11 billion parameters. Microsoft also recently entered the game with **Turing-NLG** using 17 billion parameters. This kind of model requires tens of gigabytes to store the weights and a tremendous compute infrastructure to run, which makes it impractical for the common man!\n",
+    "\n",
+    "![transformers-parameters](https://lh5.googleusercontent.com/NRdXzEcgZV3ooykjIaTm9uvbr9QnSjDQHHAHb2kk_Lm9lIF0AhS-PJdXGzpcBDztax922XAp386hyNmWZYsZC1lUN2r4Ip5p9v-PHO19-jevRGg4iQFxgv5Olq4DWaqSA_8ptep7)\n",
+    "\n",
+    "With the goal of making Transformer-based NLP accessible to everyone we @huggingface developed models that take advantage of a training process called **Distillation** which allows us to drastically reduce the resources needed to run such models with almost zero drop in performance.\n",
+    "\n",
+    "Going over the whole Distillation process is out of the scope of this notebook, but if you want more information on the subject you may refer to [this Medium article written by my colleague Victor SANH, author of DistilBERT paper](https://medium.com/huggingface/distilbert-8cf3380435b5), you might also want to directly have a look at the paper [(Sanh & al., 2019)](https://arxiv.org/abs/1910.01108)\n",
+    "\n",
+    "Of course, in `transformers` we have distilled some models and made them available directly in the library ! "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 82,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "CPU times: user 57.1 ms, sys: 2.44 ms, total: 59.5 ms\n",
+      "Wall time: 35.5 ms\n",
+      "CPU times: user 98.8 ms, sys: 725 µs, total: 99.5 ms\n",
+      "Wall time: 50 ms\n"
+     ]
+    }
+   ],
+   "source": [
+    "from transformers import DistilBertModel\n",
+    "\n",
+    "bert_distil = DistilBertModel.from_pretrained('distilbert-base-cased')\n",
+    "input_pt = tokenizer.encode_plus(\n",
+    "    'This is a sample input to demonstrate performance of distiled models especially inference time', \n",
+    "    return_tensors=\"pt\"\n",
+    ")\n",
+    "\n",
+    "\n",
+    "%time _ = bert_distil(input_pt['input_ids'])\n",
+    "%time _ = model_pt(input_pt['input_ids'])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Community provided models\n",
+    "\n",
+    "Last but not least, earlier in this notebook we introduced Hugging Face `transformers` as a repository for the NLP community to exchange pretrained models. We wanted to highlight this feature and all the possibilities it offers for the end-user.\n",
+    "\n",
+    "To leverage community pretrained models, just provide the organisation name and name of the model to `from_pretrained` and it will do all the magic for you ! \n",
+    "\n",
+    "\n",
+    "We currently have more than 50 models provided by the community and more are added every day, don't hesitate to give it a try !"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 83,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Let's load German BERT from the Bavarian State Library\n",
+    "de_bert = BertModel.from_pretrained(\"dbmdz/bert-base-german-cased\")\n",
+    "de_tokenizer = BertTokenizer.from_pretrained(\"dbmdz/bert-base-german-cased\")\n",
+    "\n",
+    "de_input = de_tokenizer.encode_plus(\n",
+    "    \"Hugging Face ist einen französische Firma Mitarbeitern in New-York.\",\n",
+    "    return_tensors=\"pt\"\n",
+    ")\n",
+    "output_de, pooled_de = de_bert(**de_input)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.6"
+  },
+  "pycharm": {
+   "stem_cell": {
+    "cell_type": "raw",
+    "source": [],
+    "metadata": {
+     "collapsed": false
+    }
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 1
+}
\ No newline at end of file
diff --git a/notebooks/03-pipelines.ipynb b/notebooks/03-pipelines.ipynb
new file mode 100644
index 0000000000..9a5b3f7c4f
--- /dev/null
+++ b/notebooks/03-pipelines.ipynb
@@ -0,0 +1,594 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "pycharm": {
+     "name": "#%% md\n"
+    }
+   },
+   "source": [
+    "## How can I leverage State-of-the-Art Natural Language Models with only one line of code ?"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "pycharm": {
+     "name": "#%% md\n"
+    }
+   },
+   "source": [
+    "Newly introduced in transformers v2.3.0, **pipelines** provides a high-level, easy to use,\n",
+    "API for doing inference over a variety of downstream-tasks, including: \n",
+    "\n",
+    "- Sentence Classification (Sentiment Analysis): Indicate if the overall sentence is either positive or negative. _(Binary Classification task or Logistic Regression task)_\n",
+    "- Token Classification (Named Entity Recognition, Part-of-Speech tagging): For each sub-entity _(**token**)_ in the input, assign it a label _(Classification task)_.\n",
+    "- Question-Answering: Provided a tuple (question, context) the model should find the span of text in **context** answering the **question**.\n",
+    "- Mask-Filling: Suggests possible word(s) to fill the masked input with respect to the provided **context**.\n",
+    "- Feature Extraction: Maps the input to a higher, multi-dimensional space learned from the data.\n",
+    "\n",
+    "Pipelines encapsulate the overall process of every NLP process:\n",
+    " \n",
+    " 1. Tokenization: Split the initial input into multiple sub-entities with interesting linguistic properties (i.e. tokens).\n",
+    " 2. Inference: Maps every token into a more meaningful representation. \n",
+    " 3. Decoding: Use the above representation to generate and/or extract the final output for the underlying task.\n",
+    "\n",
+    "The overall API is exposed to the end-user through the `pipeline()` method with the following \n",
+    "structure:\n",
+    "\n",
+    "```python\n",
+    "from transformers import pipeline\n",
+    "\n",
+    "# Using default model and tokenizer for the task\n",
+    "pipeline(\"<task-name>\")\n",
+    "\n",
+    "# Using a user-specified model\n",
+    "pipeline(\"<task-name>\", model=\"<model_name>\")\n",
+    "\n",
+    "# Using custom model/tokenizer as str\n",
+    "pipeline('<task-name>', model='<model name>', tokenizer='<tokenizer_name>')\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "metadata": {
+    "pycharm": {
+     "is_executing": false,
+     "name": "#%% code \n"
+    }
+   },
+   "outputs": [
+    {
+     "ename": "SyntaxError",
+     "evalue": "from __future__ imports must occur at the beginning of the file (<ipython-input-29-c3a037bd4c55>, line 5)",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;36m  File \u001b[0;32m\"<ipython-input-29-c3a037bd4c55>\"\u001b[0;36m, line \u001b[0;32m5\u001b[0m\n\u001b[0;31m    from transformers import pipeline\u001b[0m\n\u001b[0m           ^\u001b[0m\n\u001b[0;31mSyntaxError\u001b[0m\u001b[0;31m:\u001b[0m from __future__ imports must occur at the beginning of the file\n"
+     ]
+    }
+   ],
+   "source": [
+    "import numpy as np\n",
+    "from __future__ import print_function\n",
+    "from ipywidgets import interact, interactive, fixed, interact_manual\n",
+    "import ipywidgets as widgets\n",
+    "from transformers import pipeline"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "pycharm": {
+     "name": "#%% md\n"
+    }
+   },
+   "source": [
+    "## 1. Sentence Classification - Sentiment Analysis"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {
+    "pycharm": {
+     "is_executing": false,
+     "name": "#%% code\n"
+    }
+   },
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "6aeccfdf51994149bdd1f3d3533e380f",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=230.0, style=ProgressStyle(description_…"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "[{'label': 'POSITIVE', 'score': 0.800251},\n",
+       " {'label': 'NEGATIVE', 'score': 1.2489903}]"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "nlp_sentence_classif = pipeline('sentiment-analysis')\n",
+    "nlp_sentence_classif(['Such a nice weather outside !', 'This movie was kind of boring.'])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "pycharm": {
+     "name": "#%% md\n"
+    }
+   },
+   "source": [
+    "## 2. Token Classification - Named Entity Recognition"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {
+    "pycharm": {
+     "is_executing": false,
+     "name": "#%% code\n"
+    }
+   },
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "b5549c53c27346a899af553c977f00bc",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=230.0, style=ProgressStyle(description_…"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "[{'word': 'Hu', 'score': 0.9970937967300415, 'entity': 'I-ORG'},\n",
+       " {'word': '##gging', 'score': 0.9345750212669373, 'entity': 'I-ORG'},\n",
+       " {'word': 'Face', 'score': 0.9787060022354126, 'entity': 'I-ORG'},\n",
+       " {'word': 'French', 'score': 0.9981995820999146, 'entity': 'I-MISC'},\n",
+       " {'word': 'New', 'score': 0.9983047246932983, 'entity': 'I-LOC'},\n",
+       " {'word': '-', 'score': 0.8913455009460449, 'entity': 'I-LOC'},\n",
+       " {'word': 'York', 'score': 0.9979523420333862, 'entity': 'I-LOC'}]"
+      ]
+     },
+     "execution_count": 16,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "nlp_token_class = pipeline('ner')\n",
+    "nlp_token_class('Hugging Face is a French company based in New-York.')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 3. Question Answering"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {
+    "pycharm": {
+     "is_executing": false,
+     "name": "#%% code\n"
+    }
+   },
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "6e56a8edcef44ec2ae838711ecd22d3a",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=230.0, style=ProgressStyle(description_…"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "convert squad examples to features: 100%|██████████| 1/1 [00:00<00:00, 53.05it/s]\n",
+      "add example index and unique id: 100%|██████████| 1/1 [00:00<00:00, 2673.23it/s]\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "{'score': 0.9632966867654424, 'start': 42, 'end': 50, 'answer': 'New-York.'}"
+      ]
+     },
+     "execution_count": 18,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "nlp_qa = pipeline('question-answering')\n",
+    "nlp_qa(context='Hugging Face is a French company based in New-York.', question='Where is based Hugging Face ?')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 4. Text Generation - Mask Filling"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {
+    "pycharm": {
+     "is_executing": false,
+     "name": "#%% code\n"
+    }
+   },
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "1930695ea2d24ca98c6d7c13842d377f",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=230.0, style=ProgressStyle(description_…"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "[{'sequence': '<s> Hugging Face is a French company based in Paris</s>',\n",
+       "  'score': 0.25288480520248413,\n",
+       "  'token': 2201},\n",
+       " {'sequence': '<s> Hugging Face is a French company based in Lyon</s>',\n",
+       "  'score': 0.07639515399932861,\n",
+       "  'token': 12790},\n",
+       " {'sequence': '<s> Hugging Face is a French company based in Brussels</s>',\n",
+       "  'score': 0.055500105023384094,\n",
+       "  'token': 6497},\n",
+       " {'sequence': '<s> Hugging Face is a French company based in Geneva</s>',\n",
+       "  'score': 0.04264815151691437,\n",
+       "  'token': 11559},\n",
+       " {'sequence': '<s> Hugging Face is a French company based in France</s>',\n",
+       "  'score': 0.03868963569402695,\n",
+       "  'token': 1470}]"
+      ]
+     },
+     "execution_count": 20,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "nlp_fill = pipeline('fill-mask')\n",
+    "nlp_fill('Hugging Face is a French company based in <mask>')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 5. Projection - Feature Extraction"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 32,
+   "metadata": {
+    "pycharm": {
+     "is_executing": false,
+     "name": "#%% code\n"
+    }
+   },
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "92fa4d67290f49a3943dc0abd7529892",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=230.0, style=ProgressStyle(description_…"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "(1, 12, 768)"
+      ]
+     },
+     "execution_count": 32,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import numpy as np\n",
+    "nlp_features = pipeline('feature-extraction')\n",
+    "output = nlp_features('Hugging Face is a French company based in Paris')\n",
+    "np.array(output).shape   # (Samples, Tokens, Vector Size)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "pycharm": {
+     "name": "#%% md\n"
+    }
+   },
+   "source": [
+    "Alright! Now you have a nice picture of what is possible through transformers' pipelines, and there is more\n",
+    "to come in future releases.\n",
+    "\n",
+    "In the meantime, you can try the different pipelines with your own inputs."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 40,
+   "metadata": {
+    "pycharm": {
+     "is_executing": false,
+     "name": "#%% code\n"
+    }
+   },
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "261ae9fa30e84d1d84a3b0d9682ac477",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Dropdown(description='Task:', index=1, options=('sentiment-analysis', 'ner', 'fill_mask'), value='ner')"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "ddc51b71c6eb40e5ab60998664e6a857",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Text(value='', description='Your input:', placeholder='Enter something')"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[{'word': 'Paris', 'score': 0.9991844296455383, 'entity': 'I-LOC'}]\n",
+      "[{'sequence': '<s> I\\'m from Paris.\"</s>', 'score': 0.224044069647789, 'token': 72}, {'sequence': \"<s> I'm from Paris.)</s>\", 'score': 0.16959427297115326, 'token': 1592}, {'sequence': \"<s> I'm from Paris.]</s>\", 'score': 0.10994981974363327, 'token': 21838}, {'sequence': '<s> I\\'m from Paris!\"</s>', 'score': 0.0706234946846962, 'token': 2901}, {'sequence': \"<s> I'm from Paris.</s>\", 'score': 0.0698278620839119, 'token': 4}]\n",
+      "[{'sequence': \"<s> I'm from Paris and London</s>\", 'score': 0.12238534539937973, 'token': 928}, {'sequence': \"<s> I'm from Paris and Brussels</s>\", 'score': 0.07107886672019958, 'token': 6497}, {'sequence': \"<s> I'm from Paris and Belgium</s>\", 'score': 0.040912602096796036, 'token': 7320}, {'sequence': \"<s> I'm from Paris and Berlin</s>\", 'score': 0.039884064346551895, 'token': 5459}, {'sequence': \"<s> I'm from Paris and Melbourne</s>\", 'score': 0.038133684545755386, 'token': 5703}]\n",
+      "[{'sequence': '<s> I like go to sleep</s>', 'score': 0.08942786604166031, 'token': 3581}, {'sequence': '<s> I like go to bed</s>', 'score': 0.07789064943790436, 'token': 3267}, {'sequence': '<s> I like go to concerts</s>', 'score': 0.06356740742921829, 'token': 12858}, {'sequence': '<s> I like go to school</s>', 'score': 0.03660670667886734, 'token': 334}, {'sequence': '<s> I like go to dinner</s>', 'score': 0.032155368477106094, 'token': 3630}]\n"
+     ]
+    }
+   ],
+   "source": [
+    "task = widgets.Dropdown(\n",
+    "    options=['sentiment-analysis', 'ner', 'fill_mask'],\n",
+    "    value='ner',\n",
+    "    description='Task:',\n",
+    "    disabled=False\n",
+    ")\n",
+    "\n",
+    "input = widgets.Text(\n",
+    "    value='',\n",
+    "    placeholder='Enter something',\n",
+    "    description='Your input:',\n",
+    "    disabled=False\n",
+    ")\n",
+    "\n",
+    "def forward(_):\n",
+    "    if len(input.value) > 0: \n",
+    "        if task.value == 'ner':\n",
+    "            output = nlp_token_class(input.value)\n",
+    "        elif task.value == 'sentiment-analysis':\n",
+    "            output = nlp_sentence_classif(input.value)\n",
+    "        else:\n",
+    "            if input.value.find('<mask>') == -1:\n",
+    "                output = nlp_fill(input.value + ' <mask>')\n",
+    "            else:\n",
+    "                output = nlp_fill(input.value)                \n",
+    "        print(output)\n",
+    "\n",
+    "input.on_submit(forward)\n",
+    "display(task, input)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 43,
+   "metadata": {
+    "pycharm": {
+     "is_executing": false,
+     "name": "#%% Question Answering\n"
+    }
+   },
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "5ae68677bd8a41f990355aa43840d3f8",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Textarea(value='Einstein is famous for the general theory of relativity', description='Context:', placeholder=…"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "14bcfd9a2c5a47e6b1383989ab7632c8",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Text(value='Why is Einstein famous for ?', description='Question:', placeholder='Enter something')"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "convert squad examples to features: 100%|██████████| 1/1 [00:00<00:00, 168.83it/s]\n",
+      "add example index and unique id: 100%|██████████| 1/1 [00:00<00:00, 1919.59it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{'score': 0.40340670623875496, 'start': 27, 'end': 54, 'answer': 'general theory of relativity'}\n"
+     ]
+    }
+   ],
+   "source": [
+    "context = widgets.Textarea(\n",
+    "    value='Einstein is famous for the general theory of relativity',\n",
+    "    placeholder='Enter something',\n",
+    "    description='Context:',\n",
+    "    disabled=False\n",
+    ")\n",
+    "\n",
+    "query = widgets.Text(\n",
+    "    value='Why is Einstein famous for ?',\n",
+    "    placeholder='Enter something',\n",
+    "    description='Question:',\n",
+    "    disabled=False\n",
+    ")\n",
+    "\n",
+    "def forward(_):\n",
+    "    if len(context.value) > 0 and len(query.value) > 0: \n",
+    "        output = nlp_qa(question=query.value, context=context.value)            \n",
+    "        print(output)\n",
+    "\n",
+    "query.on_submit(forward)\n",
+    "display(context, query)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.6"
+  },
+  "pycharm": {
+   "stem_cell": {
+    "cell_type": "raw",
+    "source": [],
+    "metadata": {
+     "collapsed": false
+    }
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 1
+}
\ No newline at end of file
diff --git a/notebooks/Comparing-PT-and-TF-models.ipynb b/notebooks/Comparing-PT-and-TF-models.ipynb
deleted file mode 100644
index 321c2ebe30..0000000000
--- a/notebooks/Comparing-PT-and-TF-models.ipynb
+++ /dev/null
@@ -1,1630 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# Pytorch to Tensorflow Conversion Test Notebook\n",
-    "\n",
-    "To run this notebook follow these steps, modifying the **Config** section as necessary:\n",
-    "\n",
-    "1. Point `pt_model_dir` to your local directory containing the pytorch Bert model to be converted.\n",
-    "2. Point `tf_bert_dir` to your clone of Google's Bert implementation which can be found here: https://github.com/google-research/bert.\n",
-    "\n",
-    "Note: \n",
-    "1. This feature currently only supports the base BERT models (uncased/cased).\n",
-    "2. Tensorflow model will be dumped in `tf_model_dir`."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Config"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import os\n",
-    "import sys\n",
-    "\n",
-    "model_cls  = 'BertModel'\n",
-    "model_typ  = 'bert-base-uncased'\n",
-    "token_cls  = 'BertTokenizer'\n",
-    "max_seq    = 12\n",
-    "CLS        = \"[CLS]\"\n",
-    "SEP        = \"[SEP]\"\n",
-    "MASK       = \"[MASK]\"\n",
-    "CLS_IDX    = 0\n",
-    "layer_idxs = tuple(range(12))\n",
-    "input_text = \"jim henson was a puppeteer\"\n",
-    "\n",
-    "pt_model_dir = \"/home/ubuntu/.pytorch-pretrained-BERT-cache/{}\".format(model_typ)\n",
-    "tf_bert_dir  = \"/home/ubuntu/bert\"\n",
-    "\n",
-    "pt_vocab_file  = os.path.join(pt_model_dir, \"vocab.txt\")\n",
-    "pt_init_ckpt   = os.path.join(pt_model_dir, model_typ.replace(\"-\", \"_\") + \".bin\")\n",
-    "tf_model_dir   = os.path.join(pt_model_dir, 'tf')\n",
-    "tf_vocab_file  = os.path.join(tf_model_dir, \"vocab.txt\")\n",
-    "tf_init_ckpt   = os.path.join(tf_model_dir, model_typ.replace(\"-\", \"_\") + \".ckpt\")\n",
-    "tf_config_file = os.path.join(tf_model_dir, \"bert_config.json\")\n",
-    "\n",
-    "if not os.path.isdir(tf_model_dir): \n",
-    "    os.makedirs(tf_model_dir, exist_ok=True)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Tokenization"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def tokenize(text, tokenizer):\n",
-    "    text = text.strip().lower()\n",
-    "    tok_ids = tokenizer.tokenize(text)\n",
-    "    if len(tok_ids) > max_seq - 2:\n",
-    "        tok_ids = tok_ids[:max_seq - 2]\n",
-    "    tok_ids.insert(CLS_IDX, CLS)\n",
-    "    tok_ids.append(SEP)\n",
-    "    input_ids = tokenizer.convert_tokens_to_ids(tok_ids)\n",
-    "    mask_ids = [1] * len(input_ids)\n",
-    "    seg_ids = [0] * len(input_ids)\n",
-    "    padding = [0] * (max_seq - len(input_ids))\n",
-    "    input_ids += padding\n",
-    "    mask_ids += padding\n",
-    "    seg_ids += padding\n",
-    "    return input_ids, mask_ids, seg_ids"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Pytorch execution"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "100%|██████████| 231508/231508 [00:00<00:00, 41092464.26B/s]\n",
-      "100%|██████████| 407873900/407873900 [00:07<00:00, 58092479.52B/s]\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Pytorch embedding shape: (1, 768)\n"
-     ]
-    }
-   ],
-   "source": [
-    "import numpy as np\n",
-    "import torch\n",
-    "from pytorch_pretrained_bert import (BertConfig,\n",
-    "                                     BertModel, \n",
-    "                                     BertTokenizer, \n",
-    "                                     BertForSequenceClassification)\n",
-    "\n",
-    "# Save Vocab\n",
-    "pt_tokenizer = BertTokenizer.from_pretrained(\n",
-    "    pretrained_model_name_or_path=model_typ, \n",
-    "    cache_dir=pt_model_dir)\n",
-    "pt_tokenizer.save_vocabulary(pt_model_dir)\n",
-    "pt_tokenizer.save_vocabulary(tf_model_dir)\n",
-    "\n",
-    "# Save Model\n",
-    "pt_model = BertModel.from_pretrained(\n",
-    "    pretrained_model_name_or_path=model_typ, \n",
-    "    cache_dir=pt_model_dir).to('cpu')\n",
-    "pt_model.eval()\n",
-    "pt_model.config.hidden_dropout_prob = 0.0\n",
-    "pt_model.config.attention_probs_dropout_prob = 0.0\n",
-    "pt_model.config.to_json_file(tf_config_file)\n",
-    "torch.save(pt_model.state_dict(), pt_init_ckpt)\n",
-    "\n",
-    "# Inputs\n",
-    "input_ids_pt, mask_ids_pt, seg_ids_pt = tokenize(input_text, pt_tokenizer)\n",
-    "\n",
-    "# PT Embedding\n",
-    "tok_tensor = torch.tensor(input_ids_pt).to('cpu').unsqueeze(0)\n",
-    "seg_tensor = torch.tensor(seg_ids_pt).to('cpu').unsqueeze(0)\n",
-    "msk_tensor = torch.tensor(mask_ids_pt).to('cpu').unsqueeze(0)\n",
-    "attn_blks, nsp_logits = pt_model(tok_tensor, seg_tensor, msk_tensor)\n",
-    "pt_embedding = nsp_logits.detach().numpy() \n",
-    "print(\"Pytorch embedding shape: {}\".format(pt_embedding.shape))"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Pytorch &rarr; Tensorflow conversion"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "WARNING:tensorflow:From /home/ubuntu/anaconda3/envs/nlp/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py:263: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.\n",
-      "Instructions for updating:\n",
-      "Colocations handled automatically by placer.\n",
-      "bert/embeddings/word_embeddings                             initialized\n",
-      "bert/embeddings/position_embeddings                         initialized\n",
-      "bert/embeddings/token_type_embeddings                       initialized\n",
-      "bert/embeddings/LayerNorm/gamma                             initialized\n",
-      "bert/embeddings/LayerNorm/beta                              initialized\n",
-      "bert/encoder/layer_0/attention/self/query/kernel            initialized\n",
-      "bert/encoder/layer_0/attention/self/query/bias              initialized\n",
-      "bert/encoder/layer_0/attention/self/key/kernel              initialized\n",
-      "bert/encoder/layer_0/attention/self/key/bias                initialized\n",
-      "bert/encoder/layer_0/attention/self/value/kernel            initialized\n",
-      "bert/encoder/layer_0/attention/self/value/bias              initialized\n",
-      "bert/encoder/layer_0/attention/output/dense/kernel          initialized\n",
-      "bert/encoder/layer_0/attention/output/dense/bias            initialized\n",
-      "bert/encoder/layer_0/attention/output/LayerNorm/gamma       initialized\n",
-      "bert/encoder/layer_0/attention/output/LayerNorm/beta        initialized\n",
-      "bert/encoder/layer_0/intermediate/dense/kernel              initialized\n",
-      "bert/encoder/layer_0/intermediate/dense/bias                initialized\n",
-      "bert/encoder/layer_0/output/dense/kernel                    initialized\n",
-      "bert/encoder/layer_0/output/dense/bias                      initialized\n",
-      "bert/encoder/layer_0/output/LayerNorm/gamma                 initialized\n",
-      "bert/encoder/layer_0/output/LayerNorm/beta                  initialized\n",
-      "bert/encoder/layer_1/attention/self/query/kernel            initialized\n",
-      "bert/encoder/layer_1/attention/self/query/bias              initialized\n",
-      "bert/encoder/layer_1/attention/self/key/kernel              initialized\n",
-      "bert/encoder/layer_1/attention/self/key/bias                initialized\n",
-      "bert/encoder/layer_1/attention/self/value/kernel            initialized\n",
-      "bert/encoder/layer_1/attention/self/value/bias              initialized\n",
-      "bert/encoder/layer_1/attention/output/dense/kernel          initialized\n",
-      "bert/encoder/layer_1/attention/output/dense/bias            initialized\n",
-      "bert/encoder/layer_1/attention/output/LayerNorm/gamma       initialized\n",
-      "bert/encoder/layer_1/attention/output/LayerNorm/beta        initialized\n",
-      "bert/encoder/layer_1/intermediate/dense/kernel              initialized\n",
-      "bert/encoder/layer_1/intermediate/dense/bias                initialized\n",
-      "bert/encoder/layer_1/output/dense/kernel                    initialized\n",
-      "bert/encoder/layer_1/output/dense/bias                      initialized\n",
-      "bert/encoder/layer_1/output/LayerNorm/gamma                 initialized\n",
-      "bert/encoder/layer_1/output/LayerNorm/beta                  initialized\n",
-      "bert/encoder/layer_2/attention/self/query/kernel            initialized\n",
-      "bert/encoder/layer_2/attention/self/query/bias              initialized\n",
-      "bert/encoder/layer_2/attention/self/key/kernel              initialized\n",
-      "bert/encoder/layer_2/attention/self/key/bias                initialized\n",
-      "bert/encoder/layer_2/attention/self/value/kernel            initialized\n",
-      "bert/encoder/layer_2/attention/self/value/bias              initialized\n",
-      "bert/encoder/layer_2/attention/output/dense/kernel          initialized\n",
-      "bert/encoder/layer_2/attention/output/dense/bias            initialized\n",
-      "bert/encoder/layer_2/attention/output/LayerNorm/gamma       initialized\n",
-      "bert/encoder/layer_2/attention/output/LayerNorm/beta        initialized\n",
-      "bert/encoder/layer_2/intermediate/dense/kernel              initialized\n",
-      "bert/encoder/layer_2/intermediate/dense/bias                initialized\n",
-      "bert/encoder/layer_2/output/dense/kernel                    initialized\n",
-      "bert/encoder/layer_2/output/dense/bias                      initialized\n",
-      "bert/encoder/layer_2/output/LayerNorm/gamma                 initialized\n",
-      "bert/encoder/layer_2/output/LayerNorm/beta                  initialized\n",
-      "bert/encoder/layer_3/attention/self/query/kernel            initialized\n",
-      "bert/encoder/layer_3/attention/self/query/bias              initialized\n",
-      "bert/encoder/layer_3/attention/self/key/kernel              initialized\n",
-      "bert/encoder/layer_3/attention/self/key/bias                initialized\n",
-      "bert/encoder/layer_3/attention/self/value/kernel            initialized\n",
-      "bert/encoder/layer_3/attention/self/value/bias              initialized\n",
-      "bert/encoder/layer_3/attention/output/dense/kernel          initialized\n",
-      "bert/encoder/layer_3/attention/output/dense/bias            initialized\n",
-      "bert/encoder/layer_3/attention/output/LayerNorm/gamma       initialized\n",
-      "bert/encoder/layer_3/attention/output/LayerNorm/beta        initialized\n",
-      "bert/encoder/layer_3/intermediate/dense/kernel              initialized\n",
-      "bert/encoder/layer_3/intermediate/dense/bias                initialized\n",
-      "bert/encoder/layer_3/output/dense/kernel                    initialized\n",
-      "bert/encoder/layer_3/output/dense/bias                      initialized\n",
-      "bert/encoder/layer_3/output/LayerNorm/gamma                 initialized\n",
-      "bert/encoder/layer_3/output/LayerNorm/beta                  initialized\n",
-      "bert/encoder/layer_4/attention/self/query/kernel            initialized\n",
-      "bert/encoder/layer_4/attention/self/query/bias              initialized\n",
-      "bert/encoder/layer_4/attention/self/key/kernel              initialized\n",
-      "bert/encoder/layer_4/attention/self/key/bias                initialized\n",
-      "bert/encoder/layer_4/attention/self/value/kernel            initialized\n",
-      "bert/encoder/layer_4/attention/self/value/bias              initialized\n",
-      "bert/encoder/layer_4/attention/output/dense/kernel          initialized\n",
-      "bert/encoder/layer_4/attention/output/dense/bias            initialized\n",
-      "bert/encoder/layer_4/attention/output/LayerNorm/gamma       initialized\n",
-      "bert/encoder/layer_4/attention/output/LayerNorm/beta        initialized\n",
-      "bert/encoder/layer_4/intermediate/dense/kernel              initialized\n",
-      "bert/encoder/layer_4/intermediate/dense/bias                initialized\n",
-      "bert/encoder/layer_4/output/dense/kernel                    initialized\n",
-      "bert/encoder/layer_4/output/dense/bias                      initialized\n",
-      "bert/encoder/layer_4/output/LayerNorm/gamma                 initialized\n",
-      "bert/encoder/layer_4/output/LayerNorm/beta                  initialized\n",
-      "bert/encoder/layer_5/attention/self/query/kernel            initialized\n",
-      "bert/encoder/layer_5/attention/self/query/bias              initialized\n",
-      "bert/encoder/layer_5/attention/self/key/kernel              initialized\n",
-      "bert/encoder/layer_5/attention/self/key/bias                initialized\n",
-      "bert/encoder/layer_5/attention/self/value/kernel            initialized\n",
-      "bert/encoder/layer_5/attention/self/value/bias              initialized\n",
-      "bert/encoder/layer_5/attention/output/dense/kernel          initialized\n",
-      "bert/encoder/layer_5/attention/output/dense/bias            initialized\n",
-      "bert/encoder/layer_5/attention/output/LayerNorm/gamma       initialized\n",
-      "bert/encoder/layer_5/attention/output/LayerNorm/beta        initialized\n",
-      "bert/encoder/layer_5/intermediate/dense/kernel              initialized\n",
-      "bert/encoder/layer_5/intermediate/dense/bias                initialized\n",
-      "bert/encoder/layer_5/output/dense/kernel                    initialized\n",
-      "bert/encoder/layer_5/output/dense/bias                      initialized\n",
-      "bert/encoder/layer_5/output/LayerNorm/gamma                 initialized\n",
-      "bert/encoder/layer_5/output/LayerNorm/beta                  initialized\n",
-      "bert/encoder/layer_6/attention/self/query/kernel            initialized\n",
-      "bert/encoder/layer_6/attention/self/query/bias              initialized\n",
-      "bert/encoder/layer_6/attention/self/key/kernel              initialized\n",
-      "bert/encoder/layer_6/attention/self/key/bias                initialized\n",
-      "bert/encoder/layer_6/attention/self/value/kernel            initialized\n",
-      "bert/encoder/layer_6/attention/self/value/bias              initialized\n",
-      "bert/encoder/layer_6/attention/output/dense/kernel          initialized\n",
-      "bert/encoder/layer_6/attention/output/dense/bias            initialized\n",
-      "bert/encoder/layer_6/attention/output/LayerNorm/gamma       initialized\n",
-      "bert/encoder/layer_6/attention/output/LayerNorm/beta        initialized\n",
-      "bert/encoder/layer_6/intermediate/dense/kernel              initialized\n",
-      "bert/encoder/layer_6/intermediate/dense/bias                initialized\n",
-      "bert/encoder/layer_6/output/dense/kernel                    initialized\n",
-      "bert/encoder/layer_6/output/dense/bias                      initialized\n",
-      "bert/encoder/layer_6/output/LayerNorm/gamma                 initialized\n",
-      "bert/encoder/layer_6/output/LayerNorm/beta                  initialized\n",
-      "bert/encoder/layer_7/attention/self/query/kernel            initialized\n",
-      "bert/encoder/layer_7/attention/self/query/bias              initialized\n",
-      "bert/encoder/layer_7/attention/self/key/kernel              initialized\n",
-      "bert/encoder/layer_7/attention/self/key/bias                initialized\n",
-      "bert/encoder/layer_7/attention/self/value/kernel            initialized\n",
-      "bert/encoder/layer_7/attention/self/value/bias              initialized\n",
-      "bert/encoder/layer_7/attention/output/dense/kernel          initialized\n",
-      "bert/encoder/layer_7/attention/output/dense/bias            initialized\n",
-      "bert/encoder/layer_7/attention/output/LayerNorm/gamma       initialized\n",
-      "bert/encoder/layer_7/attention/output/LayerNorm/beta        initialized\n",
-      "bert/encoder/layer_7/intermediate/dense/kernel              initialized\n",
-      "bert/encoder/layer_7/intermediate/dense/bias                initialized\n",
-      "bert/encoder/layer_7/output/dense/kernel                    initialized\n",
-      "bert/encoder/layer_7/output/dense/bias                      initialized\n",
-      "bert/encoder/layer_7/output/LayerNorm/gamma                 initialized\n",
-      "bert/encoder/layer_7/output/LayerNorm/beta                  initialized\n",
-      "bert/encoder/layer_8/attention/self/query/kernel            initialized\n",
-      "bert/encoder/layer_8/attention/self/query/bias              initialized\n",
-      "bert/encoder/layer_8/attention/self/key/kernel              initialized\n",
-      "bert/encoder/layer_8/attention/self/key/bias                initialized\n",
-      "bert/encoder/layer_8/attention/self/value/kernel            initialized\n",
-      "bert/encoder/layer_8/attention/self/value/bias              initialized\n",
-      "bert/encoder/layer_8/attention/output/dense/kernel          initialized\n",
-      "bert/encoder/layer_8/attention/output/dense/bias            initialized\n",
-      "bert/encoder/layer_8/attention/output/LayerNorm/gamma       initialized\n",
-      "bert/encoder/layer_8/attention/output/LayerNorm/beta        initialized\n",
-      "bert/encoder/layer_8/intermediate/dense/kernel              initialized\n",
-      "bert/encoder/layer_8/intermediate/dense/bias                initialized\n",
-      "bert/encoder/layer_8/output/dense/kernel                    initialized\n",
-      "bert/encoder/layer_8/output/dense/bias                      initialized\n",
-      "bert/encoder/layer_8/output/LayerNorm/gamma                 initialized\n",
-      "bert/encoder/layer_8/output/LayerNorm/beta                  initialized\n",
-      "bert/encoder/layer_9/attention/self/query/kernel            initialized\n",
-      "bert/encoder/layer_9/attention/self/query/bias              initialized\n",
-      "bert/encoder/layer_9/attention/self/key/kernel              initialized\n",
-      "bert/encoder/layer_9/attention/self/key/bias                initialized\n",
-      "bert/encoder/layer_9/attention/self/value/kernel            initialized\n",
-      "bert/encoder/layer_9/attention/self/value/bias              initialized\n",
-      "bert/encoder/layer_9/attention/output/dense/kernel          initialized\n",
-      "bert/encoder/layer_9/attention/output/dense/bias            initialized\n",
-      "bert/encoder/layer_9/attention/output/LayerNorm/gamma       initialized\n",
-      "bert/encoder/layer_9/attention/output/LayerNorm/beta        initialized\n",
-      "bert/encoder/layer_9/intermediate/dense/kernel              initialized\n",
-      "bert/encoder/layer_9/intermediate/dense/bias                initialized\n",
-      "bert/encoder/layer_9/output/dense/kernel                    initialized\n",
-      "bert/encoder/layer_9/output/dense/bias                      initialized\n",
-      "bert/encoder/layer_9/output/LayerNorm/gamma                 initialized\n",
-      "bert/encoder/layer_9/output/LayerNorm/beta                  initialized\n",
-      "bert/encoder/layer_10/attention/self/query/kernel           initialized\n",
-      "bert/encoder/layer_10/attention/self/query/bias             initialized\n",
-      "bert/encoder/layer_10/attention/self/key/kernel             initialized\n",
-      "bert/encoder/layer_10/attention/self/key/bias               initialized\n",
-      "bert/encoder/layer_10/attention/self/value/kernel           initialized\n",
-      "bert/encoder/layer_10/attention/self/value/bias             initialized\n",
-      "bert/encoder/layer_10/attention/output/dense/kernel         initialized\n",
-      "bert/encoder/layer_10/attention/output/dense/bias           initialized\n",
-      "bert/encoder/layer_10/attention/output/LayerNorm/gamma      initialized\n",
-      "bert/encoder/layer_10/attention/output/LayerNorm/beta       initialized\n",
-      "bert/encoder/layer_10/intermediate/dense/kernel             initialized\n",
-      "bert/encoder/layer_10/intermediate/dense/bias               initialized\n",
-      "bert/encoder/layer_10/output/dense/kernel                   initialized\n",
-      "bert/encoder/layer_10/output/dense/bias                     initialized\n",
-      "bert/encoder/layer_10/output/LayerNorm/gamma                initialized\n",
-      "bert/encoder/layer_10/output/LayerNorm/beta                 initialized\n",
-      "bert/encoder/layer_11/attention/self/query/kernel           initialized\n",
-      "bert/encoder/layer_11/attention/self/query/bias             initialized\n",
-      "bert/encoder/layer_11/attention/self/key/kernel             initialized\n",
-      "bert/encoder/layer_11/attention/self/key/bias               initialized\n",
-      "bert/encoder/layer_11/attention/self/value/kernel           initialized\n",
-      "bert/encoder/layer_11/attention/self/value/bias             initialized\n",
-      "bert/encoder/layer_11/attention/output/dense/kernel         initialized\n",
-      "bert/encoder/layer_11/attention/output/dense/bias           initialized\n",
-      "bert/encoder/layer_11/attention/output/LayerNorm/gamma      initialized\n",
-      "bert/encoder/layer_11/attention/output/LayerNorm/beta       initialized\n",
-      "bert/encoder/layer_11/intermediate/dense/kernel             initialized\n",
-      "bert/encoder/layer_11/intermediate/dense/bias               initialized\n",
-      "bert/encoder/layer_11/output/dense/kernel                   initialized\n",
-      "bert/encoder/layer_11/output/dense/bias                     initialized\n",
-      "bert/encoder/layer_11/output/LayerNorm/gamma                initialized\n",
-      "bert/encoder/layer_11/output/LayerNorm/beta                 initialized\n",
-      "bert/pooler/dense/kernel                                    initialized\n",
-      "bert/pooler/dense/bias                                      initialized\n"
-     ]
-    }
-   ],
-   "source": [
-    "from pytorch_pretrained_bert.convert_pytorch_checkpoint_to_tf import main\n",
-    "\n",
-    "main([\n",
-    "    '--model_name', model_typ, \n",
-    "    '--pytorch_model_path', pt_init_ckpt,\n",
-    "    '--tf_cache_dir', tf_model_dir,\n",
-    "    '--cache_dir', pt_model_dir\n",
-    "])"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Tensorflow execution"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "\n",
-      "WARNING: The TensorFlow contrib module will not be included in TensorFlow 2.0.\n",
-      "For more information, please see:\n",
-      "  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md\n",
-      "  * https://github.com/tensorflow/addons\n",
-      "If you depend on functionality not listed there, please file an issue.\n",
-      "\n",
-      "WARNING:tensorflow:From /home/ubuntu/bert/modeling.py:671: dense (from tensorflow.python.layers.core) is deprecated and will be removed in a future version.\n",
-      "Instructions for updating:\n",
-      "Use keras.layers.dense instead.\n",
-      "WARNING:tensorflow:From /home/ubuntu/anaconda3/envs/nlp/lib/python3.6/site-packages/tensorflow/python/training/saver.py:1266: checkpoint_exists (from tensorflow.python.training.checkpoint_management) is deprecated and will be removed in a future version.\n",
-      "Instructions for updating:\n",
-      "Use standard file APIs to check for files with this prefix.\n",
-      "INFO:tensorflow:Restoring parameters from /home/ubuntu/.pytorch-pretrained-BERT-cache/bert-base-uncased/tf/bert_base_uncased.ckpt\n",
-      "Tensorflow embedding shape: (1, 768)\n"
-     ]
-    }
-   ],
-   "source": [
-    "import tensorflow as tf\n",
-    "sys.path.insert(0, tf_bert_dir)\n",
-    "import modeling\n",
-    "import tokenization\n",
-    "\n",
-    "tf.reset_default_graph()\n",
-    "\n",
-    "# Process text\n",
-    "tf_tokenizer = tokenization.FullTokenizer(vocab_file=tf_vocab_file)\n",
-    "\n",
-    "# Graph inputs\n",
-    "input_ids_tf, mask_ids_tf, seg_ids_tf = tokenize(input_text, tf_tokenizer)\n",
-    "config = modeling.BertConfig.from_json_file(\n",
-    "    os.path.join(tf_model_dir, 'bert_config.json'))\n",
-    "input_tensor = tf.placeholder(\n",
-    "    dtype=tf.int32,\n",
-    "    shape=[1, None],\n",
-    "    name='input_ids')\n",
-    "mask_tensor = tf.placeholder(\n",
-    "    dtype=tf.int32,\n",
-    "    shape=[1, None],\n",
-    "    name='mask_ids')\n",
-    "seg_tensor = tf.placeholder(\n",
-    "    dtype=tf.int32,\n",
-    "    shape=[1, None],\n",
-    "    name='seg_ids')\n",
-    "tf_model = modeling.BertModel(\n",
-    "    config=config,\n",
-    "    is_training=False,\n",
-    "    input_ids=input_tensor,\n",
-    "    input_mask=mask_tensor,\n",
-    "    token_type_ids=seg_tensor,\n",
-    "    use_one_hot_embeddings=False)\n",
-    "output_layer = tf_model.get_pooled_output()\n",
-    "\n",
-    "# Load tf model\n",
-    "session = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))\n",
-    "vars_to_load = [v for v in tf.global_variables()]\n",
-    "session.run(tf.variables_initializer(var_list=vars_to_load))\n",
-    "saver = tf.train.Saver(vars_to_load)\n",
-    "saver.restore(session, save_path=tf_init_ckpt)\n",
-    "\n",
-    "# TF Embedding\n",
-    "fetches = output_layer\n",
-    "feed_dict  = {\n",
-    "    input_tensor: [input_ids_tf],\n",
-    "    mask_tensor: [mask_ids_tf],\n",
-    "    seg_tensor: [seg_ids_tf]\n",
-    "}\n",
-    "tf_embedding = session.run(fetches=fetches, feed_dict=feed_dict)\n",
-    "print(\"Tensorflow embedding shape: {}\".format(tf_embedding.shape))"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Compare Tokenization"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "TOKEN_IDS_PT: [101, 3958, 27227, 2001, 1037, 13997, 11510, 102, 0, 0, 0, 0]\n",
-      "TOKEN_IDS_TF: [101, 3958, 27227, 2001, 1037, 13997, 11510, 102, 0, 0, 0, 0]\n",
-      "SEG_IDS_PT:   [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n",
-      "SEG_IDS_TF:   [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n",
-      "MASK_IDS_PT:  [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0]\n",
-      "MASK_IDS_TF:  [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0]\n"
-     ]
-    }
-   ],
-   "source": [
-    "print(\"TOKEN_IDS_PT: {}\".format(input_ids_pt))\n",
-    "print(\"TOKEN_IDS_TF: {}\".format(input_ids_tf))\n",
-    "print(\"SEG_IDS_PT:   {}\".format(seg_ids_pt))\n",
-    "print(\"SEG_IDS_TF:   {}\".format(seg_ids_tf))\n",
-    "print(\"MASK_IDS_PT:  {}\".format(mask_ids_pt))\n",
-    "print(\"MASK_IDS_TF:  {}\".format(mask_ids_tf))"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Compare Model Weights"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 7,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "bert/embeddings/word_embeddings\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (30522, 768) values: [-0.01018257 -0.06154883 -0.02649689 -0.0420608   0.00116716]\n",
-      "TF: shape: (30522, 768) values: [-0.01018257 -0.06154883 -0.02649689 -0.0420608   0.00116716]\n",
-      "\n",
-      "bert/embeddings/token_type_embeddings\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (2, 768) values: [0.00043164 0.01098826 0.00370439 0.00150542 0.00057812]\n",
-      "TF: shape: (2, 768) values: [0.00043164 0.01098826 0.00370439 0.00150542 0.00057812]\n",
-      "\n",
-      "bert/embeddings/position_embeddings\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (512, 768) values: [ 0.01750538 -0.02563101 -0.03664156 -0.02528613  0.00797095]\n",
-      "TF: shape: (512, 768) values: [ 0.01750538 -0.02563101 -0.03664156 -0.02528613  0.00797095]\n",
-      "\n",
-      "bert/embeddings/LayerNorm/beta\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.02591471 -0.0195513   0.02423946  0.08904593 -0.06281059]\n",
-      "TF: shape: (768,) values: [-0.02591471 -0.0195513   0.02423946  0.08904593 -0.06281059]\n",
-      "\n",
-      "bert/embeddings/LayerNorm/gamma\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [0.9260566  0.8851115  0.85807985 0.8616906  0.8937205 ]\n",
-      "TF: shape: (768,) values: [0.9260566  0.8851115  0.85807985 0.8616906  0.8937205 ]\n",
-      "\n",
-      "bert/encoder/layer_0/attention/self/query/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 768) values: [-0.01640572 -0.03257025  0.01046295 -0.04442816 -0.02256124]\n",
-      "TF: shape: (768, 768) values: [-0.01640572 -0.03257025  0.01046295 -0.04442816 -0.02256124]\n",
-      "\n",
-      "bert/encoder/layer_0/attention/self/query/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [ 0.58488506 -0.3312432  -0.43010172  0.37446147 -0.29811692]\n",
-      "TF: shape: (768,) values: [ 0.58488506 -0.3312432  -0.43010172  0.37446147 -0.29811692]\n",
-      "\n",
-      "bert/encoder/layer_0/attention/self/key/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 768) values: [ 0.00807745  0.02652155 -0.01866494  0.01797846  0.00450485]\n",
-      "TF: shape: (768, 768) values: [ 0.00807745  0.02652155 -0.01866494  0.01797846  0.00450485]\n",
-      "\n",
-      "bert/encoder/layer_0/attention/self/key/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [ 0.00104306  0.00035106 -0.0024626  -0.00010567 -0.00119283]\n",
-      "TF: shape: (768,) values: [ 0.00104306  0.00035106 -0.0024626  -0.00010567 -0.00119283]\n",
-      "\n",
-      "bert/encoder/layer_0/attention/self/value/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 768) values: [ 0.01144261 -0.02663044  0.01911472 -0.02206182 -0.00287949]\n",
-      "TF: shape: (768, 768) values: [ 0.01144261 -0.02663044  0.01911472 -0.02206182 -0.00287949]\n",
-      "\n",
-      "bert/encoder/layer_0/attention/self/value/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.01184616 -0.01596605 -0.00251847  0.01736802  0.00449983]\n",
-      "TF: shape: (768,) values: [-0.01184616 -0.01596605 -0.00251847  0.01736802  0.00449983]\n",
-      "\n",
-      "bert/encoder/layer_0/attention/output/dense/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 768) values: [ 0.00581949  0.03170148 -0.06135742 -0.01706108 -0.00759045]\n",
-      "TF: shape: (768, 768) values: [ 0.00581949  0.03170148 -0.06135742 -0.01706108 -0.00759045]\n",
-      "\n",
-      "bert/encoder/layer_0/attention/output/dense/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [ 0.00511063 -0.0166625   0.02812938 -0.01166061  0.01942627]\n",
-      "TF: shape: (768,) values: [ 0.00511063 -0.0166625   0.02812938 -0.01166061  0.01942627]\n",
-      "\n",
-      "bert/encoder/layer_0/attention/output/LayerNorm/beta\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [ 0.25779155 -0.03077853 -0.2772697  -0.38847703  0.36841765]\n",
-      "TF: shape: (768,) values: [ 0.25779155 -0.03077853 -0.2772697  -0.38847703  0.36841765]\n",
-      "\n",
-      "bert/encoder/layer_0/attention/output/LayerNorm/gamma\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [0.9803408  0.959969   0.96368986 0.9603653  0.9801324 ]\n",
-      "TF: shape: (768,) values: [0.9803408  0.959969   0.96368986 0.9603653  0.9801324 ]\n",
-      "\n",
-      "bert/encoder/layer_0/intermediate/dense/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 3072) values: [-0.01010427 -0.060398   -0.01468864  0.00311493  0.02862451]\n",
-      "TF: shape: (768, 3072) values: [-0.01010427 -0.060398   -0.01468864  0.00311493  0.02862451]\n",
-      "\n",
-      "bert/encoder/layer_0/intermediate/dense/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (3072,) values: [-0.11498757 -0.09629171 -0.12399033 -0.129036   -0.06369043]\n",
-      "TF: shape: (3072,) values: [-0.11498757 -0.09629171 -0.12399033 -0.129036   -0.06369043]\n",
-      "\n",
-      "bert/encoder/layer_0/output/dense/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (3072, 768) values: [-0.03710171  0.0648794   0.00758566 -0.05224452 -0.04348791]\n",
-      "TF: shape: (3072, 768) values: [-0.03710171  0.0648794   0.00758566 -0.05224452 -0.04348791]\n",
-      "\n",
-      "bert/encoder/layer_0/output/dense/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.04801027  0.19766568  0.02154854  0.02880666  0.0444298 ]\n",
-      "TF: shape: (768,) values: [-0.04801027  0.19766568  0.02154854  0.02880666  0.0444298 ]\n",
-      "\n",
-      "bert/encoder/layer_0/output/LayerNorm/beta\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.10142924 -0.00499344  0.04274083  0.09324206 -0.10700516]\n",
-      "TF: shape: (768,) values: [-0.10142924 -0.00499344  0.04274083  0.09324206 -0.10700516]\n",
-      "\n",
-      "bert/encoder/layer_0/output/LayerNorm/gamma\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [0.7835125  0.8072406  0.7670588  0.73706394 0.76303864]\n",
-      "TF: shape: (768,) values: [0.7835125  0.8072406  0.7670588  0.73706394 0.76303864]\n",
-      "\n",
-      "bert/encoder/layer_1/attention/self/query/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 768) values: [ 0.03132744 -0.01340016 -0.07761582  0.0655639  -0.00337808]\n",
-      "TF: shape: (768, 768) values: [ 0.03132744 -0.01340016 -0.07761582  0.0655639  -0.00337808]\n",
-      "\n",
-      "bert/encoder/layer_1/attention/self/query/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.27827993  0.17387655 -0.2497937  -0.8809636   0.41262135]\n",
-      "TF: shape: (768,) values: [-0.27827993  0.17387655 -0.2497937  -0.8809636   0.41262135]\n",
-      "\n",
-      "bert/encoder/layer_1/attention/self/key/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 768) values: [-0.03353037  0.04007257  0.05320328 -0.02166729 -0.03581231]\n",
-      "TF: shape: (768, 768) values: [-0.03353037  0.04007257  0.05320328 -0.02166729 -0.03581231]\n",
-      "\n",
-      "bert/encoder/layer_1/attention/self/key/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.00504407  0.00136887 -0.00394336  0.00646125 -0.00148919]\n",
-      "TF: shape: (768,) values: [-0.00504407  0.00136887 -0.00394336  0.00646125 -0.00148919]\n",
-      "\n",
-      "bert/encoder/layer_1/attention/self/value/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 768) values: [-0.00464159  0.06674305 -0.00970626 -0.0276653  -0.01597566]\n",
-      "TF: shape: (768, 768) values: [-0.00464159  0.06674305 -0.00970626 -0.0276653  -0.01597566]\n",
-      "\n",
-      "bert/encoder/layer_1/attention/self/value/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [ 0.00381288  0.02650839 -0.0059689  -0.00508269 -0.01293722]\n",
-      "TF: shape: (768,) values: [ 0.00381288  0.02650839 -0.0059689  -0.00508269 -0.01293722]\n",
-      "\n",
-      "bert/encoder/layer_1/attention/output/dense/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 768) values: [-0.01390745 -0.01100563  0.01303005 -0.01969771  0.0125082 ]\n",
-      "TF: shape: (768, 768) values: [-0.01390745 -0.01100563  0.01303005 -0.01969771  0.0125082 ]\n",
-      "\n",
-      "bert/encoder/layer_1/attention/output/dense/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [0.02946591 0.05715097 0.01293636 0.01920356 0.00805334]\n",
-      "TF: shape: (768,) values: [0.02946591 0.05715097 0.01293636 0.01920356 0.00805334]\n",
-      "\n",
-      "bert/encoder/layer_1/attention/output/LayerNorm/beta\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [ 0.08583715  0.14199966 -0.0856637  -0.18797271  0.21056814]\n",
-      "TF: shape: (768,) values: [ 0.08583715  0.14199966 -0.0856637  -0.18797271  0.21056814]\n",
-      "\n",
-      "bert/encoder/layer_1/attention/output/LayerNorm/gamma\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [0.896962   0.87148863 0.8531161  0.8690647  0.9488987 ]\n",
-      "TF: shape: (768,) values: [0.896962   0.87148863 0.8531161  0.8690647  0.9488987 ]\n",
-      "\n",
-      "bert/encoder/layer_1/intermediate/dense/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 3072) values: [ 0.01841293 -0.02650284 -0.09708428 -0.01734244 -0.05529237]\n",
-      "TF: shape: (768, 3072) values: [ 0.01841293 -0.02650284 -0.09708428 -0.01734244 -0.05529237]\n",
-      "\n",
-      "bert/encoder/layer_1/intermediate/dense/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (3072,) values: [-0.15203774 -0.10449131 -0.08440229 -0.09323178 -0.08511415]\n",
-      "TF: shape: (3072,) values: [-0.15203774 -0.10449131 -0.08440229 -0.09323178 -0.08511415]\n",
-      "\n",
-      "bert/encoder/layer_1/output/dense/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (3072, 768) values: [-0.02372648  0.03326349  0.08291997 -0.01519038  0.01868557]\n",
-      "TF: shape: (3072, 768) values: [-0.02372648  0.03326349  0.08291997 -0.01519038  0.01868557]\n",
-      "\n",
-      "bert/encoder/layer_1/output/dense/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.02514724  0.09868994 -0.027811    0.03749462  0.01086514]\n",
-      "TF: shape: (768,) values: [-0.02514724  0.09868994 -0.027811    0.03749462  0.01086514]\n",
-      "\n",
-      "bert/encoder/layer_1/output/LayerNorm/beta\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.07662535 -0.10506564  0.03191236  0.07633785 -0.11187791]\n",
-      "TF: shape: (768,) values: [-0.07662535 -0.10506564  0.03191236  0.07633785 -0.11187791]\n",
-      "\n",
-      "bert/encoder/layer_1/output/LayerNorm/gamma\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [0.9017883  0.8868776  0.8862677  0.85865664 0.87496454]\n",
-      "TF: shape: (768,) values: [0.9017883  0.8868776  0.8862677  0.85865664 0.87496454]\n",
-      "\n",
-      "bert/encoder/layer_2/attention/self/query/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 768) values: [ 0.08433672  0.09580533  0.07543895 -0.01126779 -0.01354045]\n",
-      "TF: shape: (768, 768) values: [ 0.08433672  0.09580533  0.07543895 -0.01126779 -0.01354045]\n",
-      "\n",
-      "bert/encoder/layer_2/attention/self/query/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [ 0.0371241   0.03406003  0.27713948 -0.21613775 -0.05275448]\n",
-      "TF: shape: (768,) values: [ 0.0371241   0.03406003  0.27713948 -0.21613775 -0.05275448]\n",
-      "\n",
-      "bert/encoder/layer_2/attention/self/key/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 768) values: [ 0.04794507  0.02517631 -0.01319554 -0.02094732  0.09073472]\n",
-      "TF: shape: (768, 768) values: [ 0.04794507  0.02517631 -0.01319554 -0.02094732  0.09073472]\n",
-      "\n",
-      "bert/encoder/layer_2/attention/self/key/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.00037404 -0.00125881 -0.00114734 -0.00157741  0.00037122]\n",
-      "TF: shape: (768,) values: [-0.00037404 -0.00125881 -0.00114734 -0.00157741  0.00037122]\n",
-      "\n",
-      "bert/encoder/layer_2/attention/self/value/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 768) values: [-0.01119406 -0.01488636 -0.02960914  0.04746444  0.00428481]\n",
-      "TF: shape: (768, 768) values: [-0.01119406 -0.01488636 -0.02960914  0.04746444  0.00428481]\n",
-      "\n",
-      "bert/encoder/layer_2/attention/self/value/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.02728729  0.04979054  0.08326469  0.04150949  0.600959  ]\n",
-      "TF: shape: (768,) values: [-0.02728729  0.04979054  0.08326469  0.04150949  0.600959  ]\n",
-      "\n",
-      "bert/encoder/layer_2/attention/output/dense/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 768) values: [ 0.00517425  0.01197957  0.0393172  -0.0063884  -0.02673388]\n",
-      "TF: shape: (768, 768) values: [ 0.00517425  0.01197957  0.0393172  -0.0063884  -0.02673388]\n",
-      "\n",
-      "bert/encoder/layer_2/attention/output/dense/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [ 0.01754025  0.1226335  -0.05733554  0.06844623  0.00879776]\n",
-      "TF: shape: (768,) values: [ 0.01754025  0.1226335  -0.05733554  0.06844623  0.00879776]\n",
-      "\n",
-      "bert/encoder/layer_2/attention/output/LayerNorm/beta\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [ 0.1490809   0.12386955 -0.19382021 -0.26515856  0.32723007]\n",
-      "TF: shape: (768,) values: [ 0.1490809   0.12386955 -0.19382021 -0.26515856  0.32723007]\n",
-      "\n",
-      "bert/encoder/layer_2/attention/output/LayerNorm/gamma\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [0.8983343  0.88877076 0.86283594 0.8584952  0.9587886 ]\n",
-      "TF: shape: (768,) values: [0.8983343  0.88877076 0.86283594 0.8584952  0.9587886 ]\n",
-      "\n",
-      "bert/encoder/layer_2/intermediate/dense/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 3072) values: [-0.01619919  0.00662888  0.01492284 -0.01280748  0.01318596]\n",
-      "TF: shape: (768, 3072) values: [-0.01619919  0.00662888  0.01492284 -0.01280748  0.01318596]\n",
-      "\n",
-      "bert/encoder/layer_2/intermediate/dense/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (3072,) values: [-0.08474881 -0.12850781 -0.11550345 -0.09513011 -0.02519853]\n",
-      "TF: shape: (3072,) values: [-0.08474881 -0.12850781 -0.11550345 -0.09513011 -0.02519853]\n",
-      "\n",
-      "bert/encoder/layer_2/output/dense/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (3072, 768) values: [-0.07225161 -0.0129784   0.00618811 -0.01593373 -0.02160194]\n",
-      "TF: shape: (3072, 768) values: [-0.07225161 -0.0129784   0.00618811 -0.01593373 -0.02160194]\n",
-      "\n",
-      "bert/encoder/layer_2/output/dense/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.06319264  0.06169628 -0.03041368  0.00924282  0.06277442]\n",
-      "TF: shape: (768,) values: [-0.06319264  0.06169628 -0.03041368  0.00924282  0.06277442]\n",
-      "\n",
-      "bert/encoder/layer_2/output/LayerNorm/beta\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.1139038  -0.11665309  0.07883061  0.07796711 -0.14219187]\n",
-      "TF: shape: (768,) values: [-0.1139038  -0.11665309  0.07883061  0.07796711 -0.14219187]\n",
-      "\n",
-      "bert/encoder/layer_2/output/LayerNorm/gamma\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [0.8813261  0.85744697 0.8511922  0.85261875 0.8329574 ]\n",
-      "TF: shape: (768,) values: [0.8813261  0.85744697 0.8511922  0.85261875 0.8329574 ]\n",
-      "\n",
-      "bert/encoder/layer_3/attention/self/query/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 768) values: [ 0.05855456 -0.00111438 -0.00828963  0.04117409 -0.07591715]\n",
-      "TF: shape: (768, 768) values: [ 0.05855456 -0.00111438 -0.00828963  0.04117409 -0.07591715]\n",
-      "\n",
-      "bert/encoder/layer_3/attention/self/query/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [ 0.09740101 -0.19290674  0.04332267  0.17937997 -0.08023558]\n",
-      "TF: shape: (768,) values: [ 0.09740101 -0.19290674  0.04332267  0.17937997 -0.08023558]\n",
-      "\n",
-      "bert/encoder/layer_3/attention/self/key/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 768) values: [ 0.02562077  0.02507281 -0.03361562  0.05613289 -0.05435724]\n",
-      "TF: shape: (768, 768) values: [ 0.02562077  0.02507281 -0.03361562  0.05613289 -0.05435724]\n",
-      "\n",
-      "bert/encoder/layer_3/attention/self/key/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [ 0.00188639 -0.00379197 -0.01020415  0.00969649 -0.00094182]\n",
-      "TF: shape: (768,) values: [ 0.00188639 -0.00379197 -0.01020415  0.00969649 -0.00094182]\n",
-      "\n",
-      "bert/encoder/layer_3/attention/self/value/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 768) values: [-0.00539032  0.00959642  0.01325458  0.00490616  0.0129908 ]\n",
-      "TF: shape: (768, 768) values: [-0.00539032  0.00959642  0.01325458  0.00490616  0.0129908 ]\n",
-      "\n",
-      "bert/encoder/layer_3/attention/self/value/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [0.04573824 0.05405985 0.00681163 0.00655945 0.01141771]\n",
-      "TF: shape: (768,) values: [0.04573824 0.05405985 0.00681163 0.00655945 0.01141771]\n",
-      "\n",
-      "bert/encoder/layer_3/attention/output/dense/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 768) values: [ 0.01850341  0.03148198  0.02705758 -0.0004669   0.01367511]\n",
-      "TF: shape: (768, 768) values: [ 0.01850341  0.03148198  0.02705758 -0.0004669   0.01367511]\n",
-      "\n",
-      "bert/encoder/layer_3/attention/output/dense/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [ 0.01981483  0.03566506 -0.05016088  0.02958186  0.04989756]\n",
-      "TF: shape: (768,) values: [ 0.01981483  0.03566506 -0.05016088  0.02958186  0.04989756]\n",
-      "\n",
-      "bert/encoder/layer_3/attention/output/LayerNorm/beta\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [ 0.09815404  0.00063774 -0.01257733 -0.26485074  0.22568701]\n",
-      "TF: shape: (768,) values: [ 0.09815404  0.00063774 -0.01257733 -0.26485074  0.22568701]\n",
-      "\n",
-      "bert/encoder/layer_3/attention/output/LayerNorm/gamma\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [0.91457725 0.88453823 0.8340887  0.84203583 0.95247847]\n",
-      "TF: shape: (768,) values: [0.91457725 0.88453823 0.8340887  0.84203583 0.95247847]\n",
-      "\n",
-      "bert/encoder/layer_3/intermediate/dense/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 3072) values: [-0.02733567  0.03307878 -0.01331292 -0.00032527  0.03252084]\n",
-      "TF: shape: (768, 3072) values: [-0.02733567  0.03307878 -0.01331292 -0.00032527  0.03252084]\n",
-      "\n",
-      "bert/encoder/layer_3/intermediate/dense/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (3072,) values: [-0.11436842 -0.15038085 -0.07842971  0.01335877 -0.09492484]\n",
-      "TF: shape: (3072,) values: [-0.11436842 -0.15038085 -0.07842971  0.01335877 -0.09492484]\n",
-      "\n",
-      "bert/encoder/layer_3/output/dense/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (3072, 768) values: [-0.01751153  0.01631314 -0.02660011  0.03569947 -0.01394763]\n",
-      "TF: shape: (3072, 768) values: [-0.01751153  0.01631314 -0.02660011  0.03569947 -0.01394763]\n",
-      "\n",
-      "bert/encoder/layer_3/output/dense/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.03873252  0.08414765 -0.0399323   0.01997361  0.12924597]\n",
-      "TF: shape: (768,) values: [-0.03873252  0.08414765 -0.0399323   0.01997361  0.12924597]\n",
-      "\n",
-      "bert/encoder/layer_3/output/LayerNorm/beta\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.08049371 -0.06923949 -0.03357155  0.05231095 -0.09717073]\n",
-      "TF: shape: (768,) values: [-0.08049371 -0.06923949 -0.03357155  0.05231095 -0.09717073]\n",
-      "\n",
-      "bert/encoder/layer_3/output/LayerNorm/gamma\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [0.827748   0.83012533 0.82399255 0.81772    0.80794513]\n",
-      "TF: shape: (768,) values: [0.827748   0.83012533 0.82399255 0.81772    0.80794513]\n",
-      "\n",
-      "bert/encoder/layer_4/attention/self/query/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 768) values: [ 0.08296382  0.02076941  0.06525186 -0.02659729  0.03491377]\n",
-      "TF: shape: (768, 768) values: [ 0.08296382  0.02076941  0.06525186 -0.02659729  0.03491377]\n",
-      "\n",
-      "bert/encoder/layer_4/attention/self/query/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [ 0.07045844 -0.13412629 -0.0514146   0.00061329  0.1248519 ]\n",
-      "TF: shape: (768,) values: [ 0.07045844 -0.13412629 -0.0514146   0.00061329  0.1248519 ]\n",
-      "\n",
-      "bert/encoder/layer_4/attention/self/key/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 768) values: [ 0.06941643  0.08133814 -0.0453992   0.0668715  -0.06014847]\n",
-      "TF: shape: (768, 768) values: [ 0.06941643  0.08133814 -0.0453992   0.0668715  -0.06014847]\n",
-      "\n",
-      "bert/encoder/layer_4/attention/self/key/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.00588725 -0.00235185  0.00281131  0.00173088 -0.00546653]\n",
-      "TF: shape: (768,) values: [-0.00588725 -0.00235185  0.00281131  0.00173088 -0.00546653]\n",
-      "\n",
-      "bert/encoder/layer_4/attention/self/value/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 768) values: [ 0.06889665  0.06645385  0.01232084  0.0132611  -0.01595679]\n",
-      "TF: shape: (768, 768) values: [ 0.06889665  0.06645385  0.01232084  0.0132611  -0.01595679]\n",
-      "\n",
-      "bert/encoder/layer_4/attention/self/value/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.01126871 -0.02704018  0.0301532   0.02332082 -0.04233487]\n",
-      "TF: shape: (768,) values: [-0.01126871 -0.02704018  0.0301532   0.02332082 -0.04233487]\n",
-      "\n",
-      "bert/encoder/layer_4/attention/output/dense/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 768) values: [ 0.02285513 -0.04172142 -0.0146292   0.04862929 -0.0442014 ]\n",
-      "TF: shape: (768, 768) values: [ 0.02285513 -0.04172142 -0.0146292   0.04862929 -0.0442014 ]\n",
-      "\n",
-      "bert/encoder/layer_4/attention/output/dense/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [ 0.03054528  0.00479777 -0.02729505 -0.0325212  -0.00525727]\n",
-      "TF: shape: (768,) values: [ 0.03054528  0.00479777 -0.02729505 -0.0325212  -0.00525727]\n",
-      "\n",
-      "bert/encoder/layer_4/attention/output/LayerNorm/beta\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [ 0.00903359  0.0052285  -0.02841488 -0.22355485  0.28281343]\n",
-      "TF: shape: (768,) values: [ 0.00903359  0.0052285  -0.02841488 -0.22355485  0.28281343]\n",
-      "\n",
-      "bert/encoder/layer_4/attention/output/LayerNorm/gamma\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [0.8849676  0.86927813 0.8114595  0.80269504 0.94864094]\n",
-      "TF: shape: (768,) values: [0.8849676  0.86927813 0.8114595  0.80269504 0.94864094]\n",
-      "\n",
-      "bert/encoder/layer_4/intermediate/dense/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 3072) values: [-0.00639783  0.06198016 -0.03184223  0.00485356 -0.02453273]\n",
-      "TF: shape: (768, 3072) values: [-0.00639783  0.06198016 -0.03184223  0.00485356 -0.02453273]\n",
-      "\n",
-      "bert/encoder/layer_4/intermediate/dense/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (3072,) values: [-0.08770327 -0.11779705 -0.11764182 -0.00192611 -0.1335473 ]\n",
-      "TF: shape: (3072,) values: [-0.08770327 -0.11779705 -0.11764182 -0.00192611 -0.1335473 ]\n",
-      "\n",
-      "bert/encoder/layer_4/output/dense/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (3072, 768) values: [-0.05421264  0.0221118  -0.02674172  0.03672203 -0.02399626]\n",
-      "TF: shape: (3072, 768) values: [-0.05421264  0.0221118  -0.02674172  0.03672203 -0.02399626]\n",
-      "\n",
-      "bert/encoder/layer_4/output/dense/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.05068972  0.04838871  0.01156022  0.05381602  0.08857913]\n",
-      "TF: shape: (768,) values: [-0.05068972  0.04838871  0.01156022  0.05381602  0.08857913]\n",
-      "\n",
-      "bert/encoder/layer_4/output/LayerNorm/beta\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.04338909 -0.0781464  -0.01518662  0.04936362 -0.12378412]\n",
-      "TF: shape: (768,) values: [-0.04338909 -0.0781464  -0.01518662  0.04936362 -0.12378412]\n",
-      "\n",
-      "bert/encoder/layer_4/output/LayerNorm/gamma\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [0.8734387 0.8576282 0.8339444 0.8450325 0.8105372]\n",
-      "TF: shape: (768,) values: [0.8734387 0.8576282 0.8339444 0.8450325 0.8105372]\n",
-      "\n",
-      "bert/encoder/layer_5/attention/self/query/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 768) values: [-0.00858843 -0.03920127  0.02552994 -0.02786552  0.02436485]\n",
-      "TF: shape: (768, 768) values: [-0.00858843 -0.03920127  0.02552994 -0.02786552  0.02436485]\n",
-      "\n",
-      "bert/encoder/layer_5/attention/self/query/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.00859117 -0.01642405 -0.04391079  0.01085692  0.02925887]\n",
-      "TF: shape: (768,) values: [-0.00859117 -0.01642405 -0.04391079  0.01085692  0.02925887]\n",
-      "\n",
-      "bert/encoder/layer_5/attention/self/key/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 768) values: [ 0.00352847  0.02330176 -0.00369894 -0.03904612  0.00294574]\n",
-      "TF: shape: (768, 768) values: [ 0.00352847  0.02330176 -0.00369894 -0.03904612  0.00294574]\n",
-      "\n",
-      "bert/encoder/layer_5/attention/self/key/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.01087186 -0.01176561  0.00016575 -0.01163023  0.00946616]\n",
-      "TF: shape: (768,) values: [-0.01087186 -0.01176561  0.00016575 -0.01163023  0.00946616]\n",
-      "\n",
-      "bert/encoder/layer_5/attention/self/value/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 768) values: [ 0.06134222  0.04238288  0.02796064 -0.01284983  0.03683741]\n",
-      "TF: shape: (768, 768) values: [ 0.06134222  0.04238288  0.02796064 -0.01284983  0.03683741]\n",
-      "\n",
-      "bert/encoder/layer_5/attention/self/value/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [ 0.05061118 -0.02954445 -0.0034053  -0.00025261  0.0437019 ]\n",
-      "TF: shape: (768,) values: [ 0.05061118 -0.02954445 -0.0034053  -0.00025261  0.0437019 ]\n",
-      "\n",
-      "bert/encoder/layer_5/attention/output/dense/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 768) values: [-0.00739815  0.0533964  -0.03736389 -0.04999201  0.01693069]\n",
-      "TF: shape: (768, 768) values: [-0.00739815  0.0533964  -0.03736389 -0.04999201  0.01693069]\n",
-      "\n",
-      "bert/encoder/layer_5/attention/output/dense/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.0021682   0.01711399 -0.04201518  0.01605333  0.00552063]\n",
-      "TF: shape: (768,) values: [-0.0021682   0.01711399 -0.04201518  0.01605333  0.00552063]\n",
-      "\n",
-      "bert/encoder/layer_5/attention/output/LayerNorm/beta\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.06841327 -0.0146848   0.09792476 -0.23284538  0.2785602 ]\n",
-      "TF: shape: (768,) values: [-0.06841327 -0.0146848   0.09792476 -0.23284538  0.2785602 ]\n",
-      "\n",
-      "bert/encoder/layer_5/attention/output/LayerNorm/gamma\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [0.8908311  0.87884724 0.81637293 0.8047641  0.96539867]\n",
-      "TF: shape: (768,) values: [0.8908311  0.87884724 0.81637293 0.8047641  0.96539867]\n",
-      "\n",
-      "bert/encoder/layer_5/intermediate/dense/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 3072) values: [-0.03246041  0.07251058 -0.08201726  0.00772481  0.02532209]\n",
-      "TF: shape: (768, 3072) values: [-0.03246041  0.07251058 -0.08201726  0.00772481  0.02532209]\n",
-      "\n",
-      "bert/encoder/layer_5/intermediate/dense/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (3072,) values: [-0.09689714 -0.27696273 -0.13047501 -0.10892326 -0.1057625 ]\n",
-      "TF: shape: (3072,) values: [-0.09689714 -0.27696273 -0.13047501 -0.10892326 -0.1057625 ]\n",
-      "\n",
-      "bert/encoder/layer_5/output/dense/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (3072, 768) values: [ 0.0642072  -0.01738782 -0.05095377  0.00523853  0.04425264]\n",
-      "TF: shape: (3072, 768) values: [ 0.0642072  -0.01738782 -0.05095377  0.00523853  0.04425264]\n",
-      "\n",
-      "bert/encoder/layer_5/output/dense/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.0007217   0.06006297  0.0016595   0.03848181  0.06703516]\n",
-      "TF: shape: (768,) values: [-0.0007217   0.06006297  0.0016595   0.03848181  0.06703516]\n",
-      "\n",
-      "bert/encoder/layer_5/output/LayerNorm/beta\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.00278729 -0.05594506 -0.0631047   0.06023621 -0.18672828]\n",
-      "TF: shape: (768,) values: [-0.00278729 -0.05594506 -0.0631047   0.06023621 -0.18672828]\n",
-      "\n",
-      "bert/encoder/layer_5/output/LayerNorm/gamma\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [0.8621183  0.8515807  0.82654256 0.81729776 0.7985204 ]\n",
-      "TF: shape: (768,) values: [0.8621183  0.8515807  0.82654256 0.81729776 0.7985204 ]\n",
-      "\n",
-      "bert/encoder/layer_6/attention/self/query/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 768) values: [-0.02527807 -0.01429243  0.01467054  0.08624706 -0.00188593]\n",
-      "TF: shape: (768, 768) values: [-0.02527807 -0.01429243  0.01467054  0.08624706 -0.00188593]\n",
-      "\n",
-      "bert/encoder/layer_6/attention/self/query/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.17319514  0.27564248  0.16801168 -0.10946485  0.1643271 ]\n",
-      "TF: shape: (768,) values: [-0.17319514  0.27564248  0.16801168 -0.10946485  0.1643271 ]\n",
-      "\n",
-      "bert/encoder/layer_6/attention/self/key/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 768) values: [ 0.05886372  0.00706217  0.0398422   0.00882155 -0.04571463]\n",
-      "TF: shape: (768, 768) values: [ 0.05886372  0.00706217  0.0398422   0.00882155 -0.04571463]\n",
-      "\n",
-      "bert/encoder/layer_6/attention/self/key/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.00424696 -0.0001192   0.0046079  -0.00315606  0.00434314]\n",
-      "TF: shape: (768,) values: [-0.00424696 -0.0001192   0.0046079  -0.00315606  0.00434314]\n",
-      "\n",
-      "bert/encoder/layer_6/attention/self/value/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 768) values: [-0.01720381  0.01170722  0.02346902 -0.02284313 -0.03173028]\n",
-      "TF: shape: (768, 768) values: [-0.01720381  0.01170722  0.02346902 -0.02284313 -0.03173028]\n",
-      "\n",
-      "bert/encoder/layer_6/attention/self/value/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.03492057  0.01813157 -0.00182878 -0.01420629 -0.00508944]\n",
-      "TF: shape: (768,) values: [-0.03492057  0.01813157 -0.00182878 -0.01420629 -0.00508944]\n",
-      "\n",
-      "bert/encoder/layer_6/attention/output/dense/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 768) values: [ 0.0323688  -0.00689882  0.07379091  0.01121114 -0.02059202]\n",
-      "TF: shape: (768, 768) values: [ 0.0323688  -0.00689882  0.07379091  0.01121114 -0.02059202]\n",
-      "\n",
-      "bert/encoder/layer_6/attention/output/dense/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.00648672 -0.05935453 -0.05673229 -0.01152384 -0.02766573]\n",
-      "TF: shape: (768,) values: [-0.00648672 -0.05935453 -0.05673229 -0.01152384 -0.02766573]\n",
-      "\n",
-      "bert/encoder/layer_6/attention/output/LayerNorm/beta\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.06793639  0.03157783  0.15647687 -0.15025291  0.14727171]\n",
-      "TF: shape: (768,) values: [-0.06793639  0.03157783  0.15647687 -0.15025291  0.14727171]\n",
-      "\n",
-      "bert/encoder/layer_6/attention/output/LayerNorm/gamma\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [0.8882361  0.8704905  0.80289173 0.77365315 0.92333615]\n",
-      "TF: shape: (768,) values: [0.8882361  0.8704905  0.80289173 0.77365315 0.92333615]\n",
-      "\n",
-      "bert/encoder/layer_6/intermediate/dense/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 3072) values: [ 0.04492201  0.05160861  0.09041415 -0.00742628  0.048133  ]\n",
-      "TF: shape: (768, 3072) values: [ 0.04492201  0.05160861  0.09041415 -0.00742628  0.048133  ]\n",
-      "\n",
-      "bert/encoder/layer_6/intermediate/dense/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (3072,) values: [-0.09301704 -0.158612   -0.10633879 -0.09706812 -0.17319229]\n",
-      "TF: shape: (3072,) values: [-0.09301704 -0.158612   -0.10633879 -0.09706812 -0.17319229]\n",
-      "\n",
-      "bert/encoder/layer_6/output/dense/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (3072, 768) values: [-0.00085372 -0.00974195  0.00684915  0.00038686  0.06610142]\n",
-      "TF: shape: (3072, 768) values: [-0.00085372 -0.00974195  0.00684915  0.00038686  0.06610142]\n",
-      "\n",
-      "bert/encoder/layer_6/output/dense/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.03254414  0.05681704  0.03720434  0.01936359  0.09134153]\n",
-      "TF: shape: (768,) values: [-0.03254414  0.05681704  0.03720434  0.01936359  0.09134153]\n",
-      "\n",
-      "bert/encoder/layer_6/output/LayerNorm/beta\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.0117129  -0.03209404 -0.08646043  0.03760341 -0.13841423]\n",
-      "TF: shape: (768,) values: [-0.0117129  -0.03209404 -0.08646043  0.03760341 -0.13841423]\n",
-      "\n",
-      "bert/encoder/layer_6/output/LayerNorm/gamma\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [0.8674175  0.8657014  0.8151861  0.82301307 0.8305737 ]\n",
-      "TF: shape: (768,) values: [0.8674175  0.8657014  0.8151861  0.82301307 0.8305737 ]\n",
-      "\n",
-      "bert/encoder/layer_7/attention/self/query/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 768) values: [-0.00075523 -0.01501983  0.04090893  0.01884826  0.04670674]\n",
-      "TF: shape: (768, 768) values: [-0.00075523 -0.01501983  0.04090893  0.01884826  0.04670674]\n",
-      "\n",
-      "bert/encoder/layer_7/attention/self/query/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [ 0.0010344  -0.00423982  0.3117479   0.04494623 -0.01260845]\n",
-      "TF: shape: (768,) values: [ 0.0010344  -0.00423982  0.3117479   0.04494623 -0.01260845]\n",
-      "\n",
-      "bert/encoder/layer_7/attention/self/key/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 768) values: [ 0.02781927 -0.00906972  0.02121989  0.0298591   0.05854786]\n",
-      "TF: shape: (768, 768) values: [ 0.02781927 -0.00906972  0.02121989  0.0298591   0.05854786]\n",
-      "\n",
-      "bert/encoder/layer_7/attention/self/key/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.00074918  0.00731079  0.00089338  0.00345652  0.00043817]\n",
-      "TF: shape: (768,) values: [-0.00074918  0.00731079  0.00089338  0.00345652  0.00043817]\n",
-      "\n",
-      "bert/encoder/layer_7/attention/self/value/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 768) values: [-0.01080035 -0.03468366  0.03167168  0.01583073  0.0327719 ]\n",
-      "TF: shape: (768, 768) values: [-0.01080035 -0.03468366  0.03167168  0.01583073  0.0327719 ]\n",
-      "\n",
-      "bert/encoder/layer_7/attention/self/value/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.02824226  0.01605172  0.00067929 -0.04553111  0.0076044 ]\n",
-      "TF: shape: (768,) values: [-0.02824226  0.01605172  0.00067929 -0.04553111  0.0076044 ]\n",
-      "\n",
-      "bert/encoder/layer_7/attention/output/dense/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 768) values: [-0.05496112  0.01006968  0.02206531 -0.01873116  0.02149118]\n",
-      "TF: shape: (768, 768) values: [-0.05496112  0.01006968  0.02206531 -0.01873116  0.02149118]\n",
-      "\n",
-      "bert/encoder/layer_7/attention/output/dense/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [ 0.00349772 -0.05831751 -0.0594084  -0.0342187   0.02965918]\n",
-      "TF: shape: (768,) values: [ 0.00349772 -0.05831751 -0.0594084  -0.0342187   0.02965918]\n",
-      "\n",
-      "bert/encoder/layer_7/attention/output/LayerNorm/beta\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.02826844  0.04427591  0.05678326 -0.0475907   0.16136196]\n",
-      "TF: shape: (768,) values: [-0.02826844  0.04427591  0.05678326 -0.0475907   0.16136196]\n",
-      "\n",
-      "bert/encoder/layer_7/attention/output/LayerNorm/gamma\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [0.8742141  0.870608   0.79147685 0.7595279  0.9223656 ]\n",
-      "TF: shape: (768,) values: [0.8742141  0.870608   0.79147685 0.7595279  0.9223656 ]\n",
-      "\n",
-      "bert/encoder/layer_7/intermediate/dense/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 3072) values: [ 0.03598932 -0.12225644  0.03019998  0.05691092  0.03717208]\n",
-      "TF: shape: (768, 3072) values: [ 0.03598932 -0.12225644  0.03019998  0.05691092  0.03717208]\n",
-      "\n",
-      "bert/encoder/layer_7/intermediate/dense/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (3072,) values: [-0.12465011 -0.08639494 -0.06206005 -0.08012587 -0.08773767]\n",
-      "TF: shape: (3072,) values: [-0.12465011 -0.08639494 -0.06206005 -0.08012587 -0.08773767]\n",
-      "\n",
-      "bert/encoder/layer_7/output/dense/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (3072, 768) values: [-0.02190432 -0.02279165  0.03279508  0.01011065 -0.07793335]\n",
-      "TF: shape: (3072, 768) values: [-0.02190432 -0.02279165  0.03279508  0.01011065 -0.07793335]\n",
-      "\n",
-      "bert/encoder/layer_7/output/dense/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.04282642  0.03700675  0.06142357 -0.04787201  0.02958163]\n",
-      "TF: shape: (768,) values: [-0.04282642  0.03700675  0.06142357 -0.04787201  0.02958163]\n",
-      "\n",
-      "bert/encoder/layer_7/output/LayerNorm/beta\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.03142036 -0.04358427 -0.05132087 -0.01788123 -0.16399944]\n",
-      "TF: shape: (768,) values: [-0.03142036 -0.04358427 -0.05132087 -0.01788123 -0.16399944]\n",
-      "\n",
-      "bert/encoder/layer_7/output/LayerNorm/gamma\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [0.83858097 0.8179645  0.80693793 0.81225365 0.7844832 ]\n",
-      "TF: shape: (768,) values: [0.83858097 0.8179645  0.80693793 0.81225365 0.7844832 ]\n",
-      "\n",
-      "bert/encoder/layer_8/attention/self/query/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 768) values: [0.0448719  0.02289526 0.03083764 0.03048073 0.02436891]\n",
-      "TF: shape: (768, 768) values: [0.0448719  0.02289526 0.03083764 0.03048073 0.02436891]\n",
-      "\n",
-      "bert/encoder/layer_8/attention/self/query/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.25132924 -0.23753347  0.02581017  0.00901509  0.18424493]\n",
-      "TF: shape: (768,) values: [-0.25132924 -0.23753347  0.02581017  0.00901509  0.18424493]\n",
-      "\n",
-      "bert/encoder/layer_8/attention/self/key/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 768) values: [-0.01999719  0.00711403  0.03949134 -0.0102224   0.03152475]\n",
-      "TF: shape: (768, 768) values: [-0.01999719  0.00711403  0.03949134 -0.0102224   0.03152475]\n",
-      "\n",
-      "bert/encoder/layer_8/attention/self/key/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [ 5.5668897e-05  3.4638541e-03 -1.7605867e-03 -6.1321147e-03\n",
-      " -4.4074579e-04]\n",
-      "TF: shape: (768,) values: [ 5.5668897e-05  3.4638541e-03 -1.7605867e-03 -6.1321147e-03\n",
-      " -4.4074579e-04]\n",
-      "\n",
-      "bert/encoder/layer_8/attention/self/value/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 768) values: [-0.00736056 -0.01795213  0.00104576 -0.00034653  0.03190543]\n",
-      "TF: shape: (768, 768) values: [-0.00736056 -0.01795213  0.00104576 -0.00034653  0.03190543]\n",
-      "\n",
-      "bert/encoder/layer_8/attention/self/value/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [ 0.02892835  0.00642501 -0.03608712  0.00264269 -0.0245198 ]\n",
-      "TF: shape: (768,) values: [ 0.02892835  0.00642501 -0.03608712  0.00264269 -0.0245198 ]\n",
-      "\n",
-      "bert/encoder/layer_8/attention/output/dense/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 768) values: [ 0.03971623  0.05307067 -0.01298818  0.00946693 -0.00121235]\n",
-      "TF: shape: (768, 768) values: [ 0.03971623  0.05307067 -0.01298818  0.00946693 -0.00121235]\n",
-      "\n",
-      "bert/encoder/layer_8/attention/output/dense/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [ 0.01468131 -0.05406622 -0.06289103  0.004484    0.0240819 ]\n",
-      "TF: shape: (768,) values: [ 0.01468131 -0.05406622 -0.06289103  0.004484    0.0240819 ]\n",
-      "\n",
-      "bert/encoder/layer_8/attention/output/LayerNorm/beta\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.06004262  0.0457275   0.08688109 -0.14416659 -0.05500487]\n",
-      "TF: shape: (768,) values: [-0.06004262  0.0457275   0.08688109 -0.14416659 -0.05500487]\n",
-      "\n",
-      "bert/encoder/layer_8/attention/output/LayerNorm/gamma\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [0.8907534  0.89116573 0.811639   0.7810443  0.9045574 ]\n",
-      "TF: shape: (768,) values: [0.8907534  0.89116573 0.811639   0.7810443  0.9045574 ]\n",
-      "\n",
-      "bert/encoder/layer_8/intermediate/dense/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 3072) values: [-0.01962814 -0.01482586 -0.02292624  0.03397145  0.02457482]\n",
-      "TF: shape: (768, 3072) values: [-0.01962814 -0.01482586 -0.02292624  0.03397145  0.02457482]\n",
-      "\n",
-      "bert/encoder/layer_8/intermediate/dense/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (3072,) values: [-0.08129632 -0.1691108  -0.10681771 -0.10392351 -0.13120006]\n",
-      "TF: shape: (3072,) values: [-0.08129632 -0.1691108  -0.10681771 -0.10392351 -0.13120006]\n",
-      "\n",
-      "bert/encoder/layer_8/output/dense/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (3072, 768) values: [-0.04683433 -0.02690669  0.02979059  0.02223369 -0.00130287]\n",
-      "TF: shape: (3072, 768) values: [-0.04683433 -0.02690669  0.02979059  0.02223369 -0.00130287]\n",
-      "\n",
-      "bert/encoder/layer_8/output/dense/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.09155537 -0.04465394  0.05649116 -0.09628641  0.11875238]\n",
-      "TF: shape: (768,) values: [-0.09155537 -0.04465394  0.05649116 -0.09628641  0.11875238]\n",
-      "\n",
-      "bert/encoder/layer_8/output/LayerNorm/beta\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.06043394 -0.06657387 -0.05341128 -0.00374733 -0.10855272]\n",
-      "TF: shape: (768,) values: [-0.06043394 -0.06657387 -0.05341128 -0.00374733 -0.10855272]\n",
-      "\n",
-      "bert/encoder/layer_8/output/LayerNorm/gamma\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [0.84467345 0.84421015 0.82582206 0.84553087 0.8207573 ]\n",
-      "TF: shape: (768,) values: [0.84467345 0.84421015 0.82582206 0.84553087 0.8207573 ]\n",
-      "\n",
-      "bert/encoder/layer_9/attention/self/query/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 768) values: [ 0.08004542 -0.0143706  -0.04219061 -0.05175152 -0.01147588]\n",
-      "TF: shape: (768, 768) values: [ 0.08004542 -0.0143706  -0.04219061 -0.05175152 -0.01147588]\n",
-      "\n",
-      "bert/encoder/layer_9/attention/self/query/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.14508031  0.40926442 -0.3281781  -0.02869792 -0.26104516]\n",
-      "TF: shape: (768,) values: [-0.14508031  0.40926442 -0.3281781  -0.02869792 -0.26104516]\n",
-      "\n",
-      "bert/encoder/layer_9/attention/self/key/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 768) values: [-0.01337681  0.00615428 -0.0455939   0.03379053 -0.01992556]\n",
-      "TF: shape: (768, 768) values: [-0.01337681  0.00615428 -0.0455939   0.03379053 -0.01992556]\n",
-      "\n",
-      "bert/encoder/layer_9/attention/self/key/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.0051302   0.0083288   0.00377641  0.00928865 -0.00418182]\n",
-      "TF: shape: (768,) values: [-0.0051302   0.0083288   0.00377641  0.00928865 -0.00418182]\n",
-      "\n",
-      "bert/encoder/layer_9/attention/self/value/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 768) values: [-0.02485976 -0.0301923   0.00984638 -0.02495162  0.01074037]\n",
-      "TF: shape: (768, 768) values: [-0.02485976 -0.0301923   0.00984638 -0.02495162  0.01074037]\n",
-      "\n",
-      "bert/encoder/layer_9/attention/self/value/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.04229928 -0.02636711  0.0060447   0.00222829  0.04979481]\n",
-      "TF: shape: (768,) values: [-0.04229928 -0.02636711  0.0060447   0.00222829  0.04979481]\n",
-      "\n",
-      "bert/encoder/layer_9/attention/output/dense/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 768) values: [-0.01258144  0.00871274  0.00482882 -0.00675888 -0.04390825]\n",
-      "TF: shape: (768, 768) values: [-0.01258144  0.00871274  0.00482882 -0.00675888 -0.04390825]\n",
-      "\n",
-      "bert/encoder/layer_9/attention/output/dense/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [ 0.02457753  0.05051134 -0.06890804 -0.00962795  0.00864793]\n",
-      "TF: shape: (768,) values: [ 0.02457753  0.05051134 -0.06890804 -0.00962795  0.00864793]\n",
-      "\n",
-      "bert/encoder/layer_9/attention/output/LayerNorm/beta\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.08963391 -0.06362236  0.0676669  -0.09895685  0.08318913]\n",
-      "TF: shape: (768,) values: [-0.08963391 -0.06362236  0.0676669  -0.09895685  0.08318913]\n",
-      "\n",
-      "bert/encoder/layer_9/attention/output/LayerNorm/gamma\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [0.85100883 0.82569736 0.7927931  0.7660444  0.8912934 ]\n",
-      "TF: shape: (768,) values: [0.85100883 0.82569736 0.7927931  0.7660444  0.8912934 ]\n",
-      "\n",
-      "bert/encoder/layer_9/intermediate/dense/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 3072) values: [ 0.06290598  0.0203122  -0.05384256  0.05442941  0.00484769]\n",
-      "TF: shape: (768, 3072) values: [ 0.06290598  0.0203122  -0.05384256  0.05442941  0.00484769]\n",
-      "\n",
-      "bert/encoder/layer_9/intermediate/dense/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (3072,) values: [-0.10818483 -0.00169527 -0.08962701 -0.10280421 -0.14310956]\n",
-      "TF: shape: (3072,) values: [-0.10818483 -0.00169527 -0.08962701 -0.10280421 -0.14310956]\n",
-      "\n",
-      "bert/encoder/layer_9/output/dense/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (3072, 768) values: [ 0.05487705  0.01644666  0.00436198 -0.00490768 -0.03238423]\n",
-      "TF: shape: (3072, 768) values: [ 0.05487705  0.01644666  0.00436198 -0.00490768 -0.03238423]\n",
-      "\n",
-      "bert/encoder/layer_9/output/dense/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.08755219 -0.01910074 -0.02988298 -0.08150438  0.09897955]\n",
-      "TF: shape: (768,) values: [-0.08755219 -0.01910074 -0.02988298 -0.08150438  0.09897955]\n",
-      "\n",
-      "bert/encoder/layer_9/output/LayerNorm/beta\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.04136161 -0.02113917 -0.07581077 -0.00809791 -0.09790538]\n",
-      "TF: shape: (768,) values: [-0.04136161 -0.02113917 -0.07581077 -0.00809791 -0.09790538]\n",
-      "\n",
-      "bert/encoder/layer_9/output/LayerNorm/gamma\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [0.8250572  0.83477134 0.7794141  0.81264955 0.7827918 ]\n",
-      "TF: shape: (768,) values: [0.8250572  0.83477134 0.7794141  0.81264955 0.7827918 ]\n",
-      "\n",
-      "bert/encoder/layer_10/attention/self/query/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 768) values: [ 0.00071212 -0.00853064  0.01776993  0.03189976  0.02183623]\n",
-      "TF: shape: (768, 768) values: [ 0.00071212 -0.00853064  0.01776993  0.03189976  0.02183623]\n",
-      "\n",
-      "bert/encoder/layer_10/attention/self/query/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.03667567 -0.01449654 -0.03822913  0.00118343 -0.05489838]\n",
-      "TF: shape: (768,) values: [-0.03667567 -0.01449654 -0.03822913  0.00118343 -0.05489838]\n",
-      "\n",
-      "bert/encoder/layer_10/attention/self/key/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 768) values: [-0.0494106   0.05531096 -0.02459413 -0.06019118 -0.02829785]\n",
-      "TF: shape: (768, 768) values: [-0.0494106   0.05531096 -0.02459413 -0.06019118 -0.02829785]\n",
-      "\n",
-      "bert/encoder/layer_10/attention/self/key/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.00692997  0.00855893  0.00670777 -0.0052475  -0.00017074]\n",
-      "TF: shape: (768,) values: [-0.00692997  0.00855893  0.00670777 -0.0052475  -0.00017074]\n",
-      "\n",
-      "bert/encoder/layer_10/attention/self/value/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 768) values: [ 0.01911842  0.04858809 -0.02608485  0.00794924 -0.02246636]\n",
-      "TF: shape: (768, 768) values: [ 0.01911842  0.04858809 -0.02608485  0.00794924 -0.02246636]\n",
-      "\n",
-      "bert/encoder/layer_10/attention/self/value/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.0133503  -0.01224133 -0.0051834  -0.00232528  0.00148614]\n",
-      "TF: shape: (768,) values: [-0.0133503  -0.01224133 -0.0051834  -0.00232528  0.00148614]\n",
-      "\n",
-      "bert/encoder/layer_10/attention/output/dense/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 768) values: [-0.05904732  0.02616     0.00794104 -0.02889086 -0.03692576]\n",
-      "TF: shape: (768, 768) values: [-0.05904732  0.02616     0.00794104 -0.02889086 -0.03692576]\n",
-      "\n",
-      "bert/encoder/layer_10/attention/output/dense/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [0.02089205 0.01458059 0.05217785 0.0324267  0.00907548]\n",
-      "TF: shape: (768,) values: [0.02089205 0.01458059 0.05217785 0.0324267  0.00907548]\n",
-      "\n",
-      "bert/encoder/layer_10/attention/output/LayerNorm/beta\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.10986238 -0.04332284  0.02603893 -0.06236923  0.14469369]\n",
-      "TF: shape: (768,) values: [-0.10986238 -0.04332284  0.02603893 -0.06236923  0.14469369]\n",
-      "\n",
-      "bert/encoder/layer_10/attention/output/LayerNorm/gamma\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [0.8515822  0.81392974 0.836747   0.78040504 0.88091415]\n",
-      "TF: shape: (768,) values: [0.8515822  0.81392974 0.836747   0.78040504 0.88091415]\n",
-      "\n",
-      "bert/encoder/layer_10/intermediate/dense/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 3072) values: [-0.07061081  0.06997397  0.01433633  0.04150929  0.02865192]\n",
-      "TF: shape: (768, 3072) values: [-0.07061081  0.06997397  0.01433633  0.04150929  0.02865192]\n",
-      "\n",
-      "bert/encoder/layer_10/intermediate/dense/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (3072,) values: [-0.13879126 -0.06401426 -0.1408043  -0.15043251 -0.10193057]\n",
-      "TF: shape: (3072,) values: [-0.13879126 -0.06401426 -0.1408043  -0.15043251 -0.10193057]\n",
-      "\n",
-      "bert/encoder/layer_10/output/dense/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (3072, 768) values: [ 0.02918765  0.02609882 -0.02259856  0.01636725 -0.00038442]\n",
-      "TF: shape: (3072, 768) values: [ 0.02918765  0.02609882 -0.02259856  0.01636725 -0.00038442]\n",
-      "\n",
-      "bert/encoder/layer_10/output/dense/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.01799502  0.10970547 -0.02384165 -0.03350981  0.10491351]\n",
-      "TF: shape: (768,) values: [-0.01799502  0.10970547 -0.02384165 -0.03350981  0.10491351]\n",
-      "\n",
-      "bert/encoder/layer_10/output/LayerNorm/beta\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [ 0.00999107 -0.0217309  -0.0854177  -0.01109101 -0.07902174]\n",
-      "TF: shape: (768,) values: [ 0.00999107 -0.0217309  -0.0854177  -0.01109101 -0.07902174]\n",
-      "\n",
-      "bert/encoder/layer_10/output/LayerNorm/gamma\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [0.8272796  0.8597452  0.79116803 0.81267637 0.8273501 ]\n",
-      "TF: shape: (768,) values: [0.8272796  0.8597452  0.79116803 0.81267637 0.8273501 ]\n",
-      "\n",
-      "bert/encoder/layer_11/attention/self/query/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 768) values: [-0.04141425 -0.06491017 -0.03202523  0.06226195  0.02193764]\n",
-      "TF: shape: (768, 768) values: [-0.04141425 -0.06491017 -0.03202523  0.06226195  0.02193764]\n",
-      "\n",
-      "bert/encoder/layer_11/attention/self/query/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [ 0.0501296   0.11886728  0.2186807   0.08720991 -0.20476632]\n",
-      "TF: shape: (768,) values: [ 0.0501296   0.11886728  0.2186807   0.08720991 -0.20476632]\n",
-      "\n",
-      "bert/encoder/layer_11/attention/self/key/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 768) values: [ 0.02634268 -0.01357682 -0.06076496  0.04210597  0.01783857]\n",
-      "TF: shape: (768, 768) values: [ 0.02634268 -0.01357682 -0.06076496  0.04210597  0.01783857]\n",
-      "\n",
-      "bert/encoder/layer_11/attention/self/key/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.0007798  -0.00065806 -0.00010521  0.00119144 -0.00180091]\n",
-      "TF: shape: (768,) values: [-0.0007798  -0.00065806 -0.00010521  0.00119144 -0.00180091]\n",
-      "\n",
-      "bert/encoder/layer_11/attention/self/value/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 768) values: [ 0.03520973 -0.00678078 -0.02883583 -0.01011515  0.04519828]\n",
-      "TF: shape: (768, 768) values: [ 0.03520973 -0.00678078 -0.02883583 -0.01011515  0.04519828]\n",
-      "\n",
-      "bert/encoder/layer_11/attention/self/value/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [ 0.01502306 -0.00530942  0.00023572  0.00205218 -0.00578036]\n",
-      "TF: shape: (768,) values: [ 0.01502306 -0.00530942  0.00023572  0.00205218 -0.00578036]\n",
-      "\n",
-      "bert/encoder/layer_11/attention/output/dense/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 768) values: [ 0.02361419  0.03112707 -0.00063031  0.04209773 -0.02434015]\n",
-      "TF: shape: (768, 768) values: [ 0.02361419  0.03112707 -0.00063031  0.04209773 -0.02434015]\n",
-      "\n",
-      "bert/encoder/layer_11/attention/output/dense/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [ 0.02566087  0.0028438  -0.00475678  0.02149458 -0.01755187]\n",
-      "TF: shape: (768,) values: [ 0.02566087  0.0028438  -0.00475678  0.02149458 -0.01755187]\n",
-      "\n",
-      "bert/encoder/layer_11/attention/output/LayerNorm/beta\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.03134411  0.01207957 -0.04636396 -0.03013046  0.07944281]\n",
-      "TF: shape: (768,) values: [-0.03134411  0.01207957 -0.04636396 -0.03013046  0.07944281]\n",
-      "\n",
-      "bert/encoder/layer_11/attention/output/LayerNorm/gamma\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [0.85203767 0.8020145  0.8554237  0.8150477  0.8441815 ]\n",
-      "TF: shape: (768,) values: [0.85203767 0.8020145  0.8554237  0.8150477  0.8441815 ]\n",
-      "\n",
-      "bert/encoder/layer_11/intermediate/dense/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 3072) values: [ 0.05871898 -0.01124212  0.00206979 -0.04366514 -0.00716808]\n",
-      "TF: shape: (768, 3072) values: [ 0.05871898 -0.01124212  0.00206979 -0.04366514 -0.00716808]\n",
-      "\n",
-      "bert/encoder/layer_11/intermediate/dense/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (3072,) values: [-0.09762521 -0.06175711 -0.05153917 -0.08580919 -0.09734315]\n",
-      "TF: shape: (3072,) values: [-0.09762521 -0.06175711 -0.05153917 -0.08580919 -0.09734315]\n",
-      "\n",
-      "bert/encoder/layer_11/output/dense/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (3072, 768) values: [-0.022382    0.01073206 -0.01357213  0.02484621  0.01403091]\n",
-      "TF: shape: (3072, 768) values: [-0.022382    0.01073206 -0.01357213  0.02484621  0.01403091]\n",
-      "\n",
-      "bert/encoder/layer_11/output/dense/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.06574099  0.04207807  0.01201084  0.00229322  0.05551811]\n",
-      "TF: shape: (768,) values: [-0.06574099  0.04207807  0.01201084  0.00229322  0.05551811]\n",
-      "\n",
-      "bert/encoder/layer_11/output/LayerNorm/beta\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.00634605 -0.01989403  0.04628465  0.01585056 -0.04256899]\n",
-      "TF: shape: (768,) values: [-0.00634605 -0.01989403  0.04628465  0.01585056 -0.04256899]\n",
-      "\n",
-      "bert/encoder/layer_11/output/LayerNorm/gamma\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [0.6384234  0.6300364  0.66570055 0.6126921  0.63756436]\n",
-      "TF: shape: (768,) values: [0.6384234  0.6300364  0.66570055 0.6126921  0.63756436]\n",
-      "\n",
-      "bert/pooler/dense/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 768) values: [-0.00127425  0.00199868 -0.03863145 -0.00139355  0.00691627]\n",
-      "TF: shape: (768, 768) values: [-0.00127425  0.00199868 -0.03863145 -0.00139355  0.00691627]\n",
-      "\n",
-      "bert/pooler/dense/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.03597581 -0.00389536  0.05181352  0.02224747 -0.00493723]\n",
-      "TF: shape: (768,) values: [-0.03597581 -0.00389536  0.05181352  0.02224747 -0.00493723]\n",
-      "\n"
-     ]
-    }
-   ],
-   "source": [
-    "tensors_to_transopse = (\n",
-    "    \"dense.weight\",\n",
-    "    \"attention.self.query\",\n",
-    "    \"attention.self.key\",\n",
-    "    \"attention.self.value\"\n",
-    ")\n",
-    "var_map = (\n",
-    "    ('layer.', 'layer_'),\n",
-    "    ('word_embeddings.weight', 'word_embeddings'),\n",
-    "    ('position_embeddings.weight', 'position_embeddings'),\n",
-    "    ('token_type_embeddings.weight', 'token_type_embeddings'),\n",
-    "    ('.', '/'),\n",
-    "    ('LayerNorm/weight', 'LayerNorm/gamma'),\n",
-    "    ('LayerNorm/bias', 'LayerNorm/beta'),\n",
-    "    ('weight', 'kernel')\n",
-    ")\n",
-    "\n",
-    "def to_tf_var_name(name:str):\n",
-    "    for patt, repl in iter(var_map):\n",
-    "        name = name.replace(patt, repl)\n",
-    "    return 'bert/{}'.format(name)\n",
-    "\n",
-    "tf_vars = {v.name: session.run(fetches=v) for v in tf.global_variables()}\n",
-    "pt_vars = {}\n",
-    "for v, T in pt_model.state_dict().items():\n",
-    "    T = T.detach().numpy()\n",
-    "    if any([x in v for x in tensors_to_transopse]):\n",
-    "        T = T.T\n",
-    "    pt_vars.update({to_tf_var_name(v): T})\n",
-    "\n",
-    "for var_name in tf_vars:\n",
-    "    \n",
-    "    pt = pt_vars[var_name.strip(\":0\")]\n",
-    "    tf = tf_vars[var_name]\n",
-    "\n",
-    "    print(var_name.strip(\":0\"))\n",
-    "    \n",
-    "    # Assert equivalence\n",
-    "    print(\"|sum(pt_wts - tf_wts)| = {}\".format(\n",
-    "        np.abs(np.sum(pt - tf, keepdims=False))\n",
-    "    ))\n",
-    "    assert not np.sum(pt - tf, keepdims=False)\n",
-    "    \n",
-    "    if len(pt.shape) == 2:\n",
-    "        print(\"PT: shape: {0} values: {1}\".format(pt.shape, pt[0, :5]))\n",
-    "        print(\"TF: shape: {0} values: {1}\".format(tf.shape, tf[0, :5]))\n",
-    "    else:\n",
-    "        print(\"PT: shape: {0} values: {1}\".format(pt.shape, pt[:5]))\n",
-    "        print(\"TF: shape: {0} values: {1}\".format(tf.shape, tf[:5]))\n",
-    "    print()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Compare Layer-12 Projections"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 8,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "MSE: 2.7155439966009e-05\n",
-      "PT-values: [-0.876663   -0.41088238 -0.12200808  0.44941     0.19445966]\n",
-      "TF-values: [-0.8742865  -0.40621698 -0.10585472  0.444904    0.1825743 ]\n"
-     ]
-    }
-   ],
-   "source": [
-    "# Mean Squared Error (MSE) between last projection of each model\n",
-    "MSE = np.mean((pt_embedding - tf_embedding) ** 2, keepdims=False)\n",
-    "print(\"MSE: {}\".format(MSE))\n",
-    "print(\"PT-values: {}\".format(pt_embedding[0, :5]))\n",
-    "print(\"TF-values: {}\".format(tf_embedding[0, :5]))"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "nlp",
-   "language": "python",
-   "name": "nlp"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.6.8"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
diff --git a/notebooks/Comparing-TF-and-PT-models-MLM-NSP.ipynb b/notebooks/Comparing-TF-and-PT-models-MLM-NSP.ipynb
deleted file mode 100644
index 809f6ea6e0..0000000000
--- a/notebooks/Comparing-TF-and-PT-models-MLM-NSP.ipynb
+++ /dev/null
@@ -1,4815 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# Comparing TensorFlow (original) and PyTorch models\n",
-    "\n",
-    "You can use this small notebook to check the conversion of the model's weights from the TensorFlow model to the PyTorch model. In the following, we compare the weights of the last layer on a simple example (in `input.txt`) but both models returns all the hidden layers so you can check every stage of the model.\n",
-    "\n",
-    "To run this notebook, follow these instructions:\n",
-    "- make sure that your Python environment has both TensorFlow and PyTorch installed,\n",
-    "- download the original TensorFlow implementation,\n",
-    "- download a pre-trained TensorFlow model as indicaded in the TensorFlow implementation readme,\n",
-    "- run the script `convert_tf_checkpoint_to_pytorch.py` as indicated in the `README` to convert the pre-trained TensorFlow model to PyTorch.\n",
-    "\n",
-    "If needed change the relative paths indicated in this notebook (at the beggining of Sections 1 and 2) to point to the relevent models and code."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-16T10:02:26.999106Z",
-     "start_time": "2018-11-16T10:02:26.985709Z"
-    }
-   },
-   "outputs": [],
-   "source": [
-    "import os\n",
-    "os.chdir('../')"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## 1/ TensorFlow code"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-16T10:02:27.664528Z",
-     "start_time": "2018-11-16T10:02:27.651019Z"
-    }
-   },
-   "outputs": [],
-   "source": [
-    "original_tf_inplem_dir = \"./tensorflow_code/\"\n",
-    "model_dir = \"../google_models/uncased_L-12_H-768_A-12/\"\n",
-    "\n",
-    "vocab_file = model_dir + \"vocab.txt\"\n",
-    "bert_config_file = model_dir + \"bert_config.json\"\n",
-    "init_checkpoint = model_dir + \"bert_model.ckpt\"\n",
-    "\n",
-    "input_file = \"./samples/input.txt\"\n",
-    "max_seq_length = 128\n",
-    "max_predictions_per_seq = 20\n",
-    "\n",
-    "masked_lm_positions = [6]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-16T10:02:30.202182Z",
-     "start_time": "2018-11-16T10:02:28.112570Z"
-    }
-   },
-   "outputs": [],
-   "source": [
-    "import importlib.util\n",
-    "import sys\n",
-    "import tensorflow as tf\n",
-    "import pytorch_transformers as ppb\n",
-    "\n",
-    "def del_all_flags(FLAGS):\n",
-    "    flags_dict = FLAGS._flags()    \n",
-    "    keys_list = [keys for keys in flags_dict]    \n",
-    "    for keys in keys_list:\n",
-    "        FLAGS.__delattr__(keys)\n",
-    "\n",
-    "del_all_flags(tf.flags.FLAGS)\n",
-    "import tensorflow_code.extract_features as ef\n",
-    "del_all_flags(tf.flags.FLAGS)\n",
-    "import tensorflow_code.modeling as tfm\n",
-    "del_all_flags(tf.flags.FLAGS)\n",
-    "import tensorflow_code.tokenization as tft\n",
-    "del_all_flags(tf.flags.FLAGS)\n",
-    "import tensorflow_code.run_pretraining as rp\n",
-    "del_all_flags(tf.flags.FLAGS)\n",
-    "import tensorflow_code.create_pretraining_data as cpp"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-16T10:02:30.238027Z",
-     "start_time": "2018-11-16T10:02:30.204943Z"
-    },
-    "code_folding": [
-     15
-    ]
-   },
-   "outputs": [],
-   "source": [
-    "import re\n",
-    "class InputExample(object):\n",
-    "    \"\"\"A single instance example.\"\"\"\n",
-    "\n",
-    "    def __init__(self, tokens, segment_ids, masked_lm_positions,\n",
-    "                 masked_lm_labels, is_random_next):\n",
-    "        self.tokens = tokens\n",
-    "        self.segment_ids = segment_ids\n",
-    "        self.masked_lm_positions = masked_lm_positions\n",
-    "        self.masked_lm_labels = masked_lm_labels\n",
-    "        self.is_random_next = is_random_next\n",
-    "    def __repr__(self):\n",
-    "        return '\\n'.join(k + \":\" + str(v) for k, v in self.__dict__.items())\n",
-    "\n",
-    "\n",
-    "def read_examples(input_file, tokenizer, masked_lm_positions):\n",
-    "    \"\"\"Read a list of `InputExample`s from an input file.\"\"\"\n",
-    "    examples = []\n",
-    "    unique_id = 0\n",
-    "    with tf.gfile.GFile(input_file, \"r\") as reader:\n",
-    "        while True:\n",
-    "            line = reader.readline()\n",
-    "            if not line:\n",
-    "                break\n",
-    "            line = line.strip()\n",
-    "            text_a = None\n",
-    "            text_b = None\n",
-    "            m = re.match(r\"^(.*) \\|\\|\\| (.*)$\", line)\n",
-    "            if m is None:\n",
-    "                text_a = line\n",
-    "            else:\n",
-    "                text_a = m.group(1)\n",
-    "                text_b = m.group(2)\n",
-    "            tokens_a = tokenizer.tokenize(text_a)\n",
-    "            tokens_b = None\n",
-    "            if text_b:\n",
-    "                tokens_b = tokenizer.tokenize(text_b)\n",
-    "            tokens = tokens_a + tokens_b\n",
-    "            masked_lm_labels = []\n",
-    "            for m_pos in masked_lm_positions:\n",
-    "                masked_lm_labels.append(tokens[m_pos])\n",
-    "                tokens[m_pos] = '[MASK]'\n",
-    "            examples.append(\n",
-    "                InputExample(\n",
-    "                    tokens = tokens,\n",
-    "                    segment_ids = [0] * len(tokens_a) + [1] * len(tokens_b),\n",
-    "                    masked_lm_positions = masked_lm_positions,\n",
-    "                    masked_lm_labels = masked_lm_labels,\n",
-    "                    is_random_next = False))\n",
-    "            unique_id += 1\n",
-    "    return examples"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-16T10:02:30.304018Z",
-     "start_time": "2018-11-16T10:02:30.240189Z"
-    }
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "tokens:['who', 'was', 'jim', 'henson', '?', 'jim', '[MASK]', 'was', 'a', 'puppet', '##eer']\n",
-      "segment_ids:[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]\n",
-      "masked_lm_positions:[6]\n",
-      "masked_lm_labels:['henson']\n",
-      "is_random_next:False\n"
-     ]
-    }
-   ],
-   "source": [
-    "bert_config = tfm.BertConfig.from_json_file(bert_config_file)\n",
-    "tokenizer = ppb.BertTokenizer(\n",
-    "    vocab_file=vocab_file, do_lower_case=True)\n",
-    "examples = read_examples(input_file, tokenizer, masked_lm_positions=masked_lm_positions)\n",
-    "\n",
-    "print(examples[0])"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-16T10:02:33.324167Z",
-     "start_time": "2018-11-16T10:02:33.291909Z"
-    },
-    "code_folding": [
-     16
-    ]
-   },
-   "outputs": [],
-   "source": [
-    "class InputFeatures(object):\n",
-    "    \"\"\"A single set of features of data.\"\"\"\n",
-    "\n",
-    "    def __init__(self, input_ids, input_mask, segment_ids, masked_lm_positions,\n",
-    "                 masked_lm_ids, masked_lm_weights, next_sentence_label):\n",
-    "        self.input_ids = input_ids\n",
-    "        self.input_mask = input_mask\n",
-    "        self.segment_ids = segment_ids\n",
-    "        self.masked_lm_positions = masked_lm_positions\n",
-    "        self.masked_lm_ids = masked_lm_ids\n",
-    "        self.masked_lm_weights = masked_lm_weights\n",
-    "        self.next_sentence_labels = next_sentence_label\n",
-    "\n",
-    "    def __repr__(self):\n",
-    "        return '\\n'.join(k + \":\" + str(v) for k, v in self.__dict__.items())\n",
-    "\n",
-    "def pretraining_convert_examples_to_features(instances, tokenizer, max_seq_length,\n",
-    "                                 max_predictions_per_seq):\n",
-    "    \"\"\"Create TF example files from `TrainingInstance`s.\"\"\"\n",
-    "    features = []\n",
-    "    for (inst_index, instance) in enumerate(instances):\n",
-    "        input_ids = tokenizer.convert_tokens_to_ids(instance.tokens)\n",
-    "        input_mask = [1] * len(input_ids)\n",
-    "        segment_ids = list(instance.segment_ids)\n",
-    "        assert len(input_ids) <= max_seq_length\n",
-    "\n",
-    "        while len(input_ids) < max_seq_length:\n",
-    "            input_ids.append(0)\n",
-    "            input_mask.append(0)\n",
-    "            segment_ids.append(0)\n",
-    "\n",
-    "        assert len(input_ids) == max_seq_length\n",
-    "        assert len(input_mask) == max_seq_length\n",
-    "        assert len(segment_ids) == max_seq_length\n",
-    "\n",
-    "        masked_lm_positions = list(instance.masked_lm_positions)\n",
-    "        masked_lm_ids = tokenizer.convert_tokens_to_ids(instance.masked_lm_labels)\n",
-    "        masked_lm_weights = [1.0] * len(masked_lm_ids)\n",
-    "\n",
-    "        while len(masked_lm_positions) < max_predictions_per_seq:\n",
-    "            masked_lm_positions.append(0)\n",
-    "            masked_lm_ids.append(0)\n",
-    "            masked_lm_weights.append(0.0)\n",
-    "\n",
-    "        next_sentence_label = 1 if instance.is_random_next else 0\n",
-    "\n",
-    "        features.append(\n",
-    "            InputFeatures(input_ids, input_mask, segment_ids,\n",
-    "                          masked_lm_positions, masked_lm_ids,\n",
-    "                          masked_lm_weights, next_sentence_label))\n",
-    "\n",
-    "        if inst_index < 5:\n",
-    "            tf.logging.info(\"*** Example ***\")\n",
-    "            tf.logging.info(\"tokens: %s\" % \" \".join(\n",
-    "                [str(x) for x in instance.tokens]))\n",
-    "            tf.logging.info(\"features: %s\" % str(features[-1]))\n",
-    "    return features"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 7,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-16T10:02:34.185367Z",
-     "start_time": "2018-11-16T10:02:34.155046Z"
-    }
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:*** Example ***\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:34 - INFO - tensorflow -   *** Example ***\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:tokens: who was jim henson ? jim [MASK] was a puppet ##eer\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:34 - INFO - tensorflow -   tokens: who was jim henson ? jim [MASK] was a puppet ##eer\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:features: input_ids:[2040, 2001, 3958, 27227, 1029, 3958, 103, 2001, 1037, 13997, 11510, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n",
-      "input_mask:[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n",
-      "segment_ids:[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n",
-      "masked_lm_positions:[6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n",
-      "masked_lm_ids:[27227, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n",
-      "masked_lm_weights:[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]\n",
-      "next_sentence_labels:0\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:34 - INFO - tensorflow -   features: input_ids:[2040, 2001, 3958, 27227, 1029, 3958, 103, 2001, 1037, 13997, 11510, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n",
-      "input_mask:[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n",
-      "segment_ids:[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n",
-      "masked_lm_positions:[6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n",
-      "masked_lm_ids:[27227, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n",
-      "masked_lm_weights:[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]\n",
-      "next_sentence_labels:0\n"
-     ]
-    }
-   ],
-   "source": [
-    "features = pretraining_convert_examples_to_features(\n",
-    "    instances=examples, max_seq_length=max_seq_length, \n",
-    "    max_predictions_per_seq=max_predictions_per_seq, tokenizer=tokenizer)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 8,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-16T10:02:34.912005Z",
-     "start_time": "2018-11-16T10:02:34.882111Z"
-    }
-   },
-   "outputs": [],
-   "source": [
-    "def input_fn_builder(features, seq_length, max_predictions_per_seq, tokenizer):\n",
-    "    \"\"\"Creates an `input_fn` closure to be passed to TPUEstimator.\"\"\"\n",
-    "\n",
-    "    all_input_ids = []\n",
-    "    all_input_mask = []\n",
-    "    all_segment_ids = []\n",
-    "    all_masked_lm_positions = []\n",
-    "    all_masked_lm_ids = []\n",
-    "    all_masked_lm_weights = []\n",
-    "    all_next_sentence_labels = []\n",
-    "\n",
-    "    for feature in features:\n",
-    "        all_input_ids.append(feature.input_ids)\n",
-    "        all_input_mask.append(feature.input_mask)\n",
-    "        all_segment_ids.append(feature.segment_ids)\n",
-    "        all_masked_lm_positions.append(feature.masked_lm_positions)\n",
-    "        all_masked_lm_ids.append(feature.masked_lm_ids)\n",
-    "        all_masked_lm_weights.append(feature.masked_lm_weights)\n",
-    "        all_next_sentence_labels.append(feature.next_sentence_labels)\n",
-    "\n",
-    "    def input_fn(params):\n",
-    "        \"\"\"The actual input function.\"\"\"\n",
-    "        batch_size = params[\"batch_size\"]\n",
-    "\n",
-    "        num_examples = len(features)\n",
-    "\n",
-    "        # This is for demo purposes and does NOT scale to large data sets. We do\n",
-    "        # not use Dataset.from_generator() because that uses tf.py_func which is\n",
-    "        # not TPU compatible. The right way to load data is with TFRecordReader.\n",
-    "        d = tf.data.Dataset.from_tensor_slices({\n",
-    "            \"input_ids\":\n",
-    "                tf.constant(\n",
-    "                    all_input_ids, shape=[num_examples, seq_length],\n",
-    "                    dtype=tf.int32),\n",
-    "            \"input_mask\":\n",
-    "                tf.constant(\n",
-    "                    all_input_mask,\n",
-    "                    shape=[num_examples, seq_length],\n",
-    "                    dtype=tf.int32),\n",
-    "            \"segment_ids\":\n",
-    "                tf.constant(\n",
-    "                    all_segment_ids,\n",
-    "                    shape=[num_examples, seq_length],\n",
-    "                    dtype=tf.int32),\n",
-    "            \"masked_lm_positions\":\n",
-    "                tf.constant(\n",
-    "                    all_masked_lm_positions,\n",
-    "                    shape=[num_examples, max_predictions_per_seq],\n",
-    "                    dtype=tf.int32),\n",
-    "        \"masked_lm_ids\":\n",
-    "                tf.constant(\n",
-    "                    all_masked_lm_ids,\n",
-    "                    shape=[num_examples, max_predictions_per_seq],\n",
-    "                    dtype=tf.int32),\n",
-    "        \"masked_lm_weights\":\n",
-    "                tf.constant(\n",
-    "                    all_masked_lm_weights,\n",
-    "                    shape=[num_examples, max_predictions_per_seq],\n",
-    "                    dtype=tf.float32),\n",
-    "        \"next_sentence_labels\":\n",
-    "                tf.constant(\n",
-    "                    all_next_sentence_labels,\n",
-    "                    shape=[num_examples, 1],\n",
-    "                    dtype=tf.int32),\n",
-    "        })\n",
-    "\n",
-    "        d = d.batch(batch_size=batch_size, drop_remainder=False)\n",
-    "        return d\n",
-    "\n",
-    "    return input_fn\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 9,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-16T10:02:35.671603Z",
-     "start_time": "2018-11-16T10:02:35.626167Z"
-    },
-    "code_folding": [
-     64,
-     77
-    ]
-   },
-   "outputs": [],
-   "source": [
-    "def model_fn_builder(bert_config, init_checkpoint, learning_rate,\n",
-    "                     num_train_steps, num_warmup_steps, use_tpu,\n",
-    "                     use_one_hot_embeddings):\n",
-    "    \"\"\"Returns `model_fn` closure for TPUEstimator.\"\"\"\n",
-    "\n",
-    "    def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument\n",
-    "        \"\"\"The `model_fn` for TPUEstimator.\"\"\"\n",
-    "\n",
-    "        tf.logging.info(\"*** Features ***\")\n",
-    "        for name in sorted(features.keys()):\n",
-    "            tf.logging.info(\"  name = %s, shape = %s\" % (name, features[name].shape))\n",
-    "\n",
-    "        input_ids = features[\"input_ids\"]\n",
-    "        input_mask = features[\"input_mask\"]\n",
-    "        segment_ids = features[\"segment_ids\"]\n",
-    "        masked_lm_positions = features[\"masked_lm_positions\"]\n",
-    "        masked_lm_ids = features[\"masked_lm_ids\"]\n",
-    "        masked_lm_weights = features[\"masked_lm_weights\"]\n",
-    "        next_sentence_labels = features[\"next_sentence_labels\"]\n",
-    "\n",
-    "        is_training = (mode == tf.estimator.ModeKeys.TRAIN)\n",
-    "\n",
-    "        model = tfm.BertModel(\n",
-    "            config=bert_config,\n",
-    "            is_training=is_training,\n",
-    "            input_ids=input_ids,\n",
-    "            input_mask=input_mask,\n",
-    "            token_type_ids=segment_ids,\n",
-    "            use_one_hot_embeddings=use_one_hot_embeddings)\n",
-    "\n",
-    "        (masked_lm_loss,\n",
-    "         masked_lm_example_loss, masked_lm_log_probs) = rp.get_masked_lm_output(\n",
-    "            bert_config, model.get_sequence_output(), model.get_embedding_table(),\n",
-    "            masked_lm_positions, masked_lm_ids, masked_lm_weights)\n",
-    "\n",
-    "        (next_sentence_loss, next_sentence_example_loss,\n",
-    "         next_sentence_log_probs) = rp.get_next_sentence_output(\n",
-    "            bert_config, model.get_pooled_output(), next_sentence_labels)\n",
-    "\n",
-    "        total_loss = masked_lm_loss + next_sentence_loss\n",
-    "\n",
-    "        tvars = tf.trainable_variables()\n",
-    "\n",
-    "        initialized_variable_names = {}\n",
-    "        scaffold_fn = None\n",
-    "        if init_checkpoint:\n",
-    "            (assignment_map,\n",
-    "             initialized_variable_names) = tfm.get_assigment_map_from_checkpoint(\n",
-    "                tvars, init_checkpoint)\n",
-    "            if use_tpu:\n",
-    "\n",
-    "                def tpu_scaffold():\n",
-    "                    tf.train.init_from_checkpoint(init_checkpoint, assignment_map)\n",
-    "                    return tf.train.Scaffold()\n",
-    "\n",
-    "                scaffold_fn = tpu_scaffold\n",
-    "            else:\n",
-    "                tf.train.init_from_checkpoint(init_checkpoint, assignment_map)\n",
-    "\n",
-    "        tf.logging.info(\"**** Trainable Variables ****\")\n",
-    "        for var in tvars:\n",
-    "            init_string = \"\"\n",
-    "            if var.name in initialized_variable_names:\n",
-    "                init_string = \", *INIT_FROM_CKPT*\"\n",
-    "            tf.logging.info(\"  name = %s, shape = %s%s\", var.name, var.shape,\n",
-    "                            init_string)\n",
-    "\n",
-    "        output_spec = None\n",
-    "        if mode == tf.estimator.ModeKeys.TRAIN:\n",
-    "            masked_lm_positions = features[\"masked_lm_positions\"]\n",
-    "            masked_lm_ids = features[\"masked_lm_ids\"]\n",
-    "            masked_lm_weights = features[\"masked_lm_weights\"]\n",
-    "            next_sentence_labels = features[\"next_sentence_labels\"]\n",
-    "            train_op = optimization.create_optimizer(\n",
-    "                total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu)\n",
-    "\n",
-    "            output_spec = tf.contrib.tpu.TPUEstimatorSpec(\n",
-    "                mode=mode,\n",
-    "                loss=total_loss,\n",
-    "                train_op=train_op,\n",
-    "                scaffold_fn=scaffold_fn)\n",
-    "        elif mode == tf.estimator.ModeKeys.EVAL:\n",
-    "            masked_lm_positions = features[\"masked_lm_positions\"]\n",
-    "            masked_lm_ids = features[\"masked_lm_ids\"]\n",
-    "            masked_lm_weights = features[\"masked_lm_weights\"]\n",
-    "            next_sentence_labels = features[\"next_sentence_labels\"]\n",
-    "\n",
-    "            def metric_fn(masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids,\n",
-    "                          masked_lm_weights, next_sentence_example_loss,\n",
-    "                          next_sentence_log_probs, next_sentence_labels):\n",
-    "                \"\"\"Computes the loss and accuracy of the model.\"\"\"\n",
-    "                masked_lm_log_probs = tf.reshape(masked_lm_log_probs,\n",
-    "                                                 [-1, masked_lm_log_probs.shape[-1]])\n",
-    "                masked_lm_predictions = tf.argmax(\n",
-    "                    masked_lm_log_probs, axis=-1, output_type=tf.int32)\n",
-    "                masked_lm_example_loss = tf.reshape(masked_lm_example_loss, [-1])\n",
-    "                masked_lm_ids = tf.reshape(masked_lm_ids, [-1])\n",
-    "                masked_lm_weights = tf.reshape(masked_lm_weights, [-1])\n",
-    "                masked_lm_accuracy = tf.metrics.accuracy(\n",
-    "                    labels=masked_lm_ids,\n",
-    "                    predictions=masked_lm_predictions,\n",
-    "                    weights=masked_lm_weights)\n",
-    "                masked_lm_mean_loss = tf.metrics.mean(\n",
-    "                    values=masked_lm_example_loss, weights=masked_lm_weights)\n",
-    "\n",
-    "                next_sentence_log_probs = tf.reshape(\n",
-    "                    next_sentence_log_probs, [-1, next_sentence_log_probs.shape[-1]])\n",
-    "                next_sentence_predictions = tf.argmax(\n",
-    "                    next_sentence_log_probs, axis=-1, output_type=tf.int32)\n",
-    "                next_sentence_labels = tf.reshape(next_sentence_labels, [-1])\n",
-    "                next_sentence_accuracy = tf.metrics.accuracy(\n",
-    "                    labels=next_sentence_labels, predictions=next_sentence_predictions)\n",
-    "                next_sentence_mean_loss = tf.metrics.mean(\n",
-    "                    values=next_sentence_example_loss)\n",
-    "\n",
-    "                return {\n",
-    "                    \"masked_lm_accuracy\": masked_lm_accuracy,\n",
-    "                    \"masked_lm_loss\": masked_lm_mean_loss,\n",
-    "                    \"next_sentence_accuracy\": next_sentence_accuracy,\n",
-    "                    \"next_sentence_loss\": next_sentence_mean_loss,\n",
-    "                }\n",
-    "\n",
-    "            eval_metrics = (metric_fn, [\n",
-    "                masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids,\n",
-    "                masked_lm_weights, next_sentence_example_loss,\n",
-    "                next_sentence_log_probs, next_sentence_labels\n",
-    "            ])\n",
-    "            output_spec = tf.contrib.tpu.TPUEstimatorSpec(\n",
-    "                mode=mode,\n",
-    "                loss=total_loss,\n",
-    "                eval_metrics=eval_metrics,\n",
-    "                scaffold_fn=scaffold_fn)\n",
-    "        elif mode == tf.estimator.ModeKeys.PREDICT:\n",
-    "            masked_lm_log_probs = tf.reshape(masked_lm_log_probs,\n",
-    "                                                [-1, masked_lm_log_probs.shape[-1]])\n",
-    "            masked_lm_predictions = tf.argmax(\n",
-    "                masked_lm_log_probs, axis=-1, output_type=tf.int32)\n",
-    "\n",
-    "            next_sentence_log_probs = tf.reshape(\n",
-    "                next_sentence_log_probs, [-1, next_sentence_log_probs.shape[-1]])\n",
-    "            next_sentence_predictions = tf.argmax(\n",
-    "                next_sentence_log_probs, axis=-1, output_type=tf.int32)\n",
-    "\n",
-    "            masked_lm_predictions = tf.reshape(masked_lm_predictions,\n",
-    "                                                [1, masked_lm_positions.shape[-1]])\n",
-    "            next_sentence_predictions = tf.reshape(next_sentence_predictions,\n",
-    "                                                [1, 1])\n",
-    "\n",
-    "            predictions = {\n",
-    "                \"masked_lm_predictions\": masked_lm_predictions,\n",
-    "                \"next_sentence_predictions\": next_sentence_predictions\n",
-    "            }\n",
-    "\n",
-    "            output_spec = tf.contrib.tpu.TPUEstimatorSpec(\n",
-    "                mode=mode, predictions=predictions, scaffold_fn=scaffold_fn)\n",
-    "            return output_spec\n",
-    "        else:\n",
-    "            raise ValueError(\"Only TRAIN, EVAL and PREDICT modes are supported: %s\" % (mode))\n",
-    "\n",
-    "        return output_spec\n",
-    "\n",
-    "    return model_fn"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 10,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-16T10:02:40.328700Z",
-     "start_time": "2018-11-16T10:02:36.289676Z"
-    }
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "WARNING:tensorflow:Estimator's model_fn (<function model_fn_builder.<locals>.model_fn at 0x12a864ae8>) includes params argument, but params are not passed to Estimator.\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:40 - WARNING - tensorflow -   Estimator's model_fn (<function model_fn_builder.<locals>.model_fn at 0x12a864ae8>) includes params argument, but params are not passed to Estimator.\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "WARNING:tensorflow:Using temporary folder as model directory: /var/folders/yx/cw8n_njx3js5jksyw_qlp8p00000gn/T/tmp4x8r3x3d\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:40 - WARNING - tensorflow -   Using temporary folder as model directory: /var/folders/yx/cw8n_njx3js5jksyw_qlp8p00000gn/T/tmp4x8r3x3d\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:Using config: {'_model_dir': '/var/folders/yx/cw8n_njx3js5jksyw_qlp8p00000gn/T/tmp4x8r3x3d', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true\n",
-      "graph_options {\n",
-      "  rewrite_options {\n",
-      "    meta_optimizer_iterations: ONE\n",
-      "  }\n",
-      "}\n",
-      ", '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x12dbb5ac8>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1, '_tpu_config': TPUConfig(iterations_per_loop=2, num_shards=1, num_cores_per_replica=None, per_host_input_for_training=3, tpu_job_name=None, initial_infeed_sleep_secs=None, input_partition_dims=None), '_cluster': None}\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:40 - INFO - tensorflow -   Using config: {'_model_dir': '/var/folders/yx/cw8n_njx3js5jksyw_qlp8p00000gn/T/tmp4x8r3x3d', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true\n",
-      "graph_options {\n",
-      "  rewrite_options {\n",
-      "    meta_optimizer_iterations: ONE\n",
-      "  }\n",
-      "}\n",
-      ", '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x12dbb5ac8>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1, '_tpu_config': TPUConfig(iterations_per_loop=2, num_shards=1, num_cores_per_replica=None, per_host_input_for_training=3, tpu_job_name=None, initial_infeed_sleep_secs=None, input_partition_dims=None), '_cluster': None}\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "WARNING:tensorflow:Setting TPUConfig.num_shards==1 is an unsupported behavior. Please fix as soon as possible (leaving num_shards as None.\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:40 - WARNING - tensorflow -   Setting TPUConfig.num_shards==1 is an unsupported behavior. Please fix as soon as possible (leaving num_shards as None.\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:_TPUContext: eval_on_tpu True\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:40 - INFO - tensorflow -   _TPUContext: eval_on_tpu True\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "WARNING:tensorflow:eval_on_tpu ignored because use_tpu is False.\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:40 - WARNING - tensorflow -   eval_on_tpu ignored because use_tpu is False.\n"
-     ]
-    }
-   ],
-   "source": [
-    "is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2\n",
-    "run_config = tf.contrib.tpu.RunConfig(\n",
-    "    master=None,\n",
-    "    tpu_config=tf.contrib.tpu.TPUConfig(\n",
-    "        num_shards=1,\n",
-    "        per_host_input_for_training=is_per_host))\n",
-    "\n",
-    "model_fn = model_fn_builder(\n",
-    "    bert_config=bert_config,\n",
-    "    init_checkpoint=init_checkpoint,\n",
-    "    learning_rate=0,\n",
-    "    num_train_steps=1,\n",
-    "    num_warmup_steps=1,\n",
-    "    use_tpu=False,\n",
-    "    use_one_hot_embeddings=False)\n",
-    "\n",
-    "# If TPU is not available, this will fall back to normal Estimator on CPU\n",
-    "# or GPU.\n",
-    "estimator = tf.contrib.tpu.TPUEstimator(\n",
-    "    use_tpu=False,\n",
-    "    model_fn=model_fn,\n",
-    "    config=run_config,\n",
-    "    predict_batch_size=1)\n",
-    "\n",
-    "input_fn = input_fn_builder(\n",
-    "    features=features, seq_length=max_seq_length, max_predictions_per_seq=max_predictions_per_seq,\n",
-    "tokenizer=tokenizer)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 11,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-16T10:02:46.596956Z",
-     "start_time": "2018-11-16T10:02:40.331008Z"
-    }
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:Could not find trained model in model_dir: /var/folders/yx/cw8n_njx3js5jksyw_qlp8p00000gn/T/tmp4x8r3x3d, running initialization to predict.\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:40 - INFO - tensorflow -   Could not find trained model in model_dir: /var/folders/yx/cw8n_njx3js5jksyw_qlp8p00000gn/T/tmp4x8r3x3d, running initialization to predict.\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:Calling model_fn.\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:40 - INFO - tensorflow -   Calling model_fn.\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:Running infer on CPU\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:40 - INFO - tensorflow -   Running infer on CPU\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:*** Features ***\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:40 - INFO - tensorflow -   *** Features ***\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = input_ids, shape = (?, 128)\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:40 - INFO - tensorflow -     name = input_ids, shape = (?, 128)\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = input_mask, shape = (?, 128)\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:40 - INFO - tensorflow -     name = input_mask, shape = (?, 128)\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = masked_lm_ids, shape = (?, 20)\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:40 - INFO - tensorflow -     name = masked_lm_ids, shape = (?, 20)\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = masked_lm_positions, shape = (?, 20)\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:40 - INFO - tensorflow -     name = masked_lm_positions, shape = (?, 20)\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = masked_lm_weights, shape = (?, 20)\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:40 - INFO - tensorflow -     name = masked_lm_weights, shape = (?, 20)\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = next_sentence_labels, shape = (?, 1)\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:40 - INFO - tensorflow -     name = next_sentence_labels, shape = (?, 1)\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = segment_ids, shape = (?, 128)\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:40 - INFO - tensorflow -     name = segment_ids, shape = (?, 128)\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:**** Trainable Variables ****\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -   **** Trainable Variables ****\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/embeddings/word_embeddings:0, shape = (30522, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/embeddings/word_embeddings:0, shape = (30522, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/embeddings/token_type_embeddings:0, shape = (2, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/embeddings/token_type_embeddings:0, shape = (2, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/embeddings/position_embeddings:0, shape = (512, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/embeddings/position_embeddings:0, shape = (512, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/embeddings/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/embeddings/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/embeddings/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/embeddings/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_0/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_0/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_0/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_0/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_0/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_0/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_0/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_0/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_0/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_0/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_0/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_0/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_0/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_0/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_0/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_0/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_0/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_0/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_0/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_0/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_0/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_0/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_0/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_0/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_0/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_0/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_0/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_0/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_0/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_0/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_0/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_0/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_1/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_1/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_1/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_1/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_1/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_1/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_1/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_1/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_1/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_1/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_1/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_1/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_1/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_1/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_1/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_1/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_1/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_1/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_1/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_1/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_1/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_1/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_1/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_1/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_1/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_1/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_1/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_1/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_1/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_1/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_1/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_1/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_2/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_2/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_2/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_2/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_2/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_2/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_2/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_2/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_2/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_2/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_2/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_2/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_2/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_2/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_2/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_2/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_2/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_2/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_2/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_2/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_2/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_2/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_2/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_2/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_2/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_2/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_2/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_2/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_2/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_2/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_2/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_2/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_3/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_3/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_3/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_3/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_3/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_3/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_3/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_3/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_3/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_3/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_3/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_3/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_3/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_3/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_3/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_3/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_3/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_3/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_3/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_3/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_3/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_3/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_3/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_3/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_3/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_3/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_3/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_3/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_3/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_3/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_3/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_3/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_4/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_4/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_4/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_4/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_4/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_4/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_4/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_4/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_4/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_4/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_4/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_4/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_4/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_4/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_4/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_4/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_4/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_4/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_4/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_4/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_4/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_4/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_4/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_4/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_4/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_4/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_4/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_4/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_4/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_4/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_4/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_4/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_5/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_5/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_5/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_5/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_5/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_5/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_5/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_5/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_5/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_5/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_5/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_5/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_5/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_5/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_5/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_5/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_5/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_5/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_5/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_5/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_5/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_5/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_5/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_5/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_5/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_5/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_5/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_5/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_5/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_5/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_5/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_5/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_6/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_6/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_6/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_6/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_6/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_6/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_6/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_6/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_6/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_6/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_6/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_6/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_6/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_6/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_6/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_6/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_6/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_6/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_6/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_6/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_6/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_6/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_6/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_6/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_6/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_6/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_6/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_6/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_6/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_6/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_6/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_6/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_7/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_7/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_7/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_7/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_7/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_7/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_7/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_7/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_7/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_7/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_7/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_7/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_7/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_7/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_7/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_7/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_7/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_7/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_7/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_7/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_7/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_7/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_7/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_7/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_7/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_7/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_7/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_7/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_7/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_7/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_7/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_7/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_8/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_8/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_8/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_8/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_8/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_8/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_8/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_8/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_8/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_8/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_8/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_8/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_8/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_8/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_8/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_8/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_8/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_8/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_8/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_8/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_8/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_8/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_8/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_8/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_8/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_8/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_8/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_8/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_8/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_8/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_8/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_8/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_9/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_9/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_9/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_9/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_9/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_9/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_9/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_9/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_9/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_9/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_9/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_9/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_9/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_9/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_9/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_9/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_9/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_9/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_9/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_9/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_9/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_9/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_9/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_9/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_9/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_9/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_9/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_9/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_9/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_9/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_9/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_9/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_10/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_10/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_10/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_10/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_10/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_10/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_10/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_10/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_10/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_10/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_10/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_10/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_10/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_10/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_10/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_10/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_10/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_10/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_10/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_10/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_10/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_10/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_10/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_10/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_10/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_10/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_10/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_10/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_10/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_10/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_10/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_10/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_11/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_11/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_11/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_11/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_11/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_11/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_11/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_11/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_11/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_11/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_11/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_11/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_11/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_11/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_11/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_11/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_11/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_11/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_11/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_11/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_11/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_11/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_11/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_11/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_11/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_11/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_11/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_11/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_11/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_11/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_11/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_11/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/pooler/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/pooler/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/pooler/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/pooler/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = cls/predictions/transform/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = cls/predictions/transform/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = cls/predictions/transform/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = cls/predictions/transform/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = cls/predictions/transform/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = cls/predictions/transform/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = cls/predictions/transform/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = cls/predictions/transform/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = cls/predictions/output_bias:0, shape = (30522,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = cls/predictions/output_bias:0, shape = (30522,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = cls/seq_relationship/output_weights:0, shape = (2, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = cls/seq_relationship/output_weights:0, shape = (2, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = cls/seq_relationship/output_bias:0, shape = (2,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = cls/seq_relationship/output_bias:0, shape = (2,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:Done calling model_fn.\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -   Done calling model_fn.\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:Graph was finalized.\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:44 - INFO - tensorflow -   Graph was finalized.\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:Running local_init_op.\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:45 - INFO - tensorflow -   Running local_init_op.\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:Done running local_init_op.\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:45 - INFO - tensorflow -   Done running local_init_op.\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:prediction_loop marked as finished\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:46 - INFO - tensorflow -   prediction_loop marked as finished\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:prediction_loop marked as finished\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:46 - INFO - tensorflow -   prediction_loop marked as finished\n"
-     ]
-    }
-   ],
-   "source": [
-    "tensorflow_all_out = []\n",
-    "for result in estimator.predict(input_fn, yield_single_examples=True):\n",
-    "    tensorflow_all_out.append(result)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 12,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-16T10:02:46.634304Z",
-     "start_time": "2018-11-16T10:02:46.598800Z"
-    }
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "1\n",
-      "2\n",
-      "dict_keys(['masked_lm_predictions', 'next_sentence_predictions'])\n",
-      "masked_lm_predictions [27227  1010  1010  1010  1010  1010  1010  1010  1010  1010  1010  1010\n",
-      "  1010  1010  1010  1010  1010  1010  1010  1010]\n",
-      "predicted token ['henson', ',', ',', ',', ',', ',', ',', ',', ',', ',', ',', ',', ',', ',', ',', ',', ',', ',', ',', ',']\n"
-     ]
-    }
-   ],
-   "source": [
-    "print(len(tensorflow_all_out))\n",
-    "print(len(tensorflow_all_out[0]))\n",
-    "print(tensorflow_all_out[0].keys())\n",
-    "print(\"masked_lm_predictions\", tensorflow_all_out[0]['masked_lm_predictions'])\n",
-    "print(\"predicted token\", tokenizer.convert_ids_to_tokens(tensorflow_all_out[0]['masked_lm_predictions']))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 13,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-16T10:02:46.671229Z",
-     "start_time": "2018-11-16T10:02:46.637102Z"
-    }
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "tensorflow_output: ['henson']\n"
-     ]
-    }
-   ],
-   "source": [
-    "tensorflow_outputs = tokenizer.convert_ids_to_tokens(tensorflow_all_out[0]['masked_lm_predictions'])[:len(masked_lm_positions)]\n",
-    "print(\"tensorflow_output:\", tensorflow_outputs)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## 2/ PyTorch code"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 14,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-16T10:03:03.556557Z",
-     "start_time": "2018-11-16T10:03:03.519654Z"
-    }
-   },
-   "outputs": [],
-   "source": [
-    "from examples import extract_features\n",
-    "from examples.extract_features import *"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 15,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-16T10:03:03.952710Z",
-     "start_time": "2018-11-16T10:03:03.921917Z"
-    }
-   },
-   "outputs": [],
-   "source": [
-    "init_checkpoint_pt = \"../google_models/uncased_L-12_H-768_A-12/pytorch_model.bin\""
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 16,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-16T10:03:12.307673Z",
-     "start_time": "2018-11-16T10:03:04.439317Z"
-    },
-    "scrolled": true
-   },
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:03:05 - INFO - pytorch_transformers.modeling_bert -   loading archive file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz from cache at /Users/thomaswolf/.pytorch_transformers/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba\n",
-      "11/16/2018 11:03:05 - INFO - pytorch_transformers.modeling_bert -   extracting archive file /Users/thomaswolf/.pytorch_transformers/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba to temp dir /var/folders/yx/cw8n_njx3js5jksyw_qlp8p00000gn/T/tmpaqgsm566\n",
-      "11/16/2018 11:03:08 - INFO - pytorch_transformers.modeling_bert -   Model config {\n",
-      "  \"attention_probs_dropout_prob\": 0.1,\n",
-      "  \"hidden_act\": \"gelu\",\n",
-      "  \"hidden_dropout_prob\": 0.1,\n",
-      "  \"hidden_size\": 768,\n",
-      "  \"initializer_range\": 0.02,\n",
-      "  \"intermediate_size\": 3072,\n",
-      "  \"max_position_embeddings\": 512,\n",
-      "  \"num_attention_heads\": 12,\n",
-      "  \"num_hidden_layers\": 12,\n",
-      "  \"type_vocab_size\": 2,\n",
-      "  \"vocab_size\": 30522\n",
-      "}\n",
-      "\n"
-     ]
-    },
-    {
-     "data": {
-      "text/plain": [
-       "BertForPreTraining(\n",
-       "  (bert): BertModel(\n",
-       "    (embeddings): BertEmbeddings(\n",
-       "      (word_embeddings): Embedding(30522, 768)\n",
-       "      (position_embeddings): Embedding(512, 768)\n",
-       "      (token_type_embeddings): Embedding(2, 768)\n",
-       "      (LayerNorm): BertLayerNorm()\n",
-       "      (dropout): Dropout(p=0.1)\n",
-       "    )\n",
-       "    (encoder): BertEncoder(\n",
-       "      (layer): ModuleList(\n",
-       "        (0): BertLayer(\n",
-       "          (attention): BertAttention(\n",
-       "            (self): BertSelfAttention(\n",
-       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (dropout): Dropout(p=0.1)\n",
-       "            )\n",
-       "            (output): BertSelfOutput(\n",
-       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (LayerNorm): BertLayerNorm()\n",
-       "              (dropout): Dropout(p=0.1)\n",
-       "            )\n",
-       "          )\n",
-       "          (intermediate): BertIntermediate(\n",
-       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "          )\n",
-       "          (output): BertOutput(\n",
-       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "            (LayerNorm): BertLayerNorm()\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "        )\n",
-       "        (1): BertLayer(\n",
-       "          (attention): BertAttention(\n",
-       "            (self): BertSelfAttention(\n",
-       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (dropout): Dropout(p=0.1)\n",
-       "            )\n",
-       "            (output): BertSelfOutput(\n",
-       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (LayerNorm): BertLayerNorm()\n",
-       "              (dropout): Dropout(p=0.1)\n",
-       "            )\n",
-       "          )\n",
-       "          (intermediate): BertIntermediate(\n",
-       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "          )\n",
-       "          (output): BertOutput(\n",
-       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "            (LayerNorm): BertLayerNorm()\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "        )\n",
-       "        (2): BertLayer(\n",
-       "          (attention): BertAttention(\n",
-       "            (self): BertSelfAttention(\n",
-       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (dropout): Dropout(p=0.1)\n",
-       "            )\n",
-       "            (output): BertSelfOutput(\n",
-       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (LayerNorm): BertLayerNorm()\n",
-       "              (dropout): Dropout(p=0.1)\n",
-       "            )\n",
-       "          )\n",
-       "          (intermediate): BertIntermediate(\n",
-       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "          )\n",
-       "          (output): BertOutput(\n",
-       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "            (LayerNorm): BertLayerNorm()\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "        )\n",
-       "        (3): BertLayer(\n",
-       "          (attention): BertAttention(\n",
-       "            (self): BertSelfAttention(\n",
-       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (dropout): Dropout(p=0.1)\n",
-       "            )\n",
-       "            (output): BertSelfOutput(\n",
-       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (LayerNorm): BertLayerNorm()\n",
-       "              (dropout): Dropout(p=0.1)\n",
-       "            )\n",
-       "          )\n",
-       "          (intermediate): BertIntermediate(\n",
-       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "          )\n",
-       "          (output): BertOutput(\n",
-       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "            (LayerNorm): BertLayerNorm()\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "        )\n",
-       "        (4): BertLayer(\n",
-       "          (attention): BertAttention(\n",
-       "            (self): BertSelfAttention(\n",
-       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (dropout): Dropout(p=0.1)\n",
-       "            )\n",
-       "            (output): BertSelfOutput(\n",
-       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (LayerNorm): BertLayerNorm()\n",
-       "              (dropout): Dropout(p=0.1)\n",
-       "            )\n",
-       "          )\n",
-       "          (intermediate): BertIntermediate(\n",
-       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "          )\n",
-       "          (output): BertOutput(\n",
-       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "            (LayerNorm): BertLayerNorm()\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "        )\n",
-       "        (5): BertLayer(\n",
-       "          (attention): BertAttention(\n",
-       "            (self): BertSelfAttention(\n",
-       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (dropout): Dropout(p=0.1)\n",
-       "            )\n",
-       "            (output): BertSelfOutput(\n",
-       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (LayerNorm): BertLayerNorm()\n",
-       "              (dropout): Dropout(p=0.1)\n",
-       "            )\n",
-       "          )\n",
-       "          (intermediate): BertIntermediate(\n",
-       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "          )\n",
-       "          (output): BertOutput(\n",
-       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "            (LayerNorm): BertLayerNorm()\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "        )\n",
-       "        (6): BertLayer(\n",
-       "          (attention): BertAttention(\n",
-       "            (self): BertSelfAttention(\n",
-       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (dropout): Dropout(p=0.1)\n",
-       "            )\n",
-       "            (output): BertSelfOutput(\n",
-       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (LayerNorm): BertLayerNorm()\n",
-       "              (dropout): Dropout(p=0.1)\n",
-       "            )\n",
-       "          )\n",
-       "          (intermediate): BertIntermediate(\n",
-       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "          )\n",
-       "          (output): BertOutput(\n",
-       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "            (LayerNorm): BertLayerNorm()\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "        )\n",
-       "        (7): BertLayer(\n",
-       "          (attention): BertAttention(\n",
-       "            (self): BertSelfAttention(\n",
-       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (dropout): Dropout(p=0.1)\n",
-       "            )\n",
-       "            (output): BertSelfOutput(\n",
-       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (LayerNorm): BertLayerNorm()\n",
-       "              (dropout): Dropout(p=0.1)\n",
-       "            )\n",
-       "          )\n",
-       "          (intermediate): BertIntermediate(\n",
-       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "          )\n",
-       "          (output): BertOutput(\n",
-       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "            (LayerNorm): BertLayerNorm()\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "        )\n",
-       "        (8): BertLayer(\n",
-       "          (attention): BertAttention(\n",
-       "            (self): BertSelfAttention(\n",
-       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (dropout): Dropout(p=0.1)\n",
-       "            )\n",
-       "            (output): BertSelfOutput(\n",
-       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (LayerNorm): BertLayerNorm()\n",
-       "              (dropout): Dropout(p=0.1)\n",
-       "            )\n",
-       "          )\n",
-       "          (intermediate): BertIntermediate(\n",
-       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "          )\n",
-       "          (output): BertOutput(\n",
-       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "            (LayerNorm): BertLayerNorm()\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "        )\n",
-       "        (9): BertLayer(\n",
-       "          (attention): BertAttention(\n",
-       "            (self): BertSelfAttention(\n",
-       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (dropout): Dropout(p=0.1)\n",
-       "            )\n",
-       "            (output): BertSelfOutput(\n",
-       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (LayerNorm): BertLayerNorm()\n",
-       "              (dropout): Dropout(p=0.1)\n",
-       "            )\n",
-       "          )\n",
-       "          (intermediate): BertIntermediate(\n",
-       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "          )\n",
-       "          (output): BertOutput(\n",
-       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "            (LayerNorm): BertLayerNorm()\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "        )\n",
-       "        (10): BertLayer(\n",
-       "          (attention): BertAttention(\n",
-       "            (self): BertSelfAttention(\n",
-       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (dropout): Dropout(p=0.1)\n",
-       "            )\n",
-       "            (output): BertSelfOutput(\n",
-       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (LayerNorm): BertLayerNorm()\n",
-       "              (dropout): Dropout(p=0.1)\n",
-       "            )\n",
-       "          )\n",
-       "          (intermediate): BertIntermediate(\n",
-       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "          )\n",
-       "          (output): BertOutput(\n",
-       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "            (LayerNorm): BertLayerNorm()\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "        )\n",
-       "        (11): BertLayer(\n",
-       "          (attention): BertAttention(\n",
-       "            (self): BertSelfAttention(\n",
-       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (dropout): Dropout(p=0.1)\n",
-       "            )\n",
-       "            (output): BertSelfOutput(\n",
-       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (LayerNorm): BertLayerNorm()\n",
-       "              (dropout): Dropout(p=0.1)\n",
-       "            )\n",
-       "          )\n",
-       "          (intermediate): BertIntermediate(\n",
-       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "          )\n",
-       "          (output): BertOutput(\n",
-       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "            (LayerNorm): BertLayerNorm()\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "        )\n",
-       "      )\n",
-       "    )\n",
-       "    (pooler): BertPooler(\n",
-       "      (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "      (activation): Tanh()\n",
-       "    )\n",
-       "  )\n",
-       "  (cls): BertPreTrainingHeads(\n",
-       "    (predictions): BertLMPredictionHead(\n",
-       "      (transform): BertPredictionHeadTransform(\n",
-       "        (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "        (LayerNorm): BertLayerNorm()\n",
-       "      )\n",
-       "      (decoder): Linear(in_features=768, out_features=30522, bias=False)\n",
-       "    )\n",
-       "    (seq_relationship): Linear(in_features=768, out_features=2, bias=True)\n",
-       "  )\n",
-       ")"
-      ]
-     },
-     "execution_count": 16,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "device = torch.device(\"cpu\")\n",
-    "model = ppb.BertForPreTraining.from_pretrained('bert-base-uncased')\n",
-    "model.to(device)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 17,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-16T10:03:12.351625Z",
-     "start_time": "2018-11-16T10:03:12.310736Z"
-    },
-    "code_folding": []
-   },
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "BertForPreTraining(\n",
-       "  (bert): BertModel(\n",
-       "    (embeddings): BertEmbeddings(\n",
-       "      (word_embeddings): Embedding(30522, 768)\n",
-       "      (position_embeddings): Embedding(512, 768)\n",
-       "      (token_type_embeddings): Embedding(2, 768)\n",
-       "      (LayerNorm): BertLayerNorm()\n",
-       "      (dropout): Dropout(p=0.1)\n",
-       "    )\n",
-       "    (encoder): BertEncoder(\n",
-       "      (layer): ModuleList(\n",
-       "        (0): BertLayer(\n",
-       "          (attention): BertAttention(\n",
-       "            (self): BertSelfAttention(\n",
-       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (dropout): Dropout(p=0.1)\n",
-       "            )\n",
-       "            (output): BertSelfOutput(\n",
-       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (LayerNorm): BertLayerNorm()\n",
-       "              (dropout): Dropout(p=0.1)\n",
-       "            )\n",
-       "          )\n",
-       "          (intermediate): BertIntermediate(\n",
-       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "          )\n",
-       "          (output): BertOutput(\n",
-       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "            (LayerNorm): BertLayerNorm()\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "        )\n",
-       "        (1): BertLayer(\n",
-       "          (attention): BertAttention(\n",
-       "            (self): BertSelfAttention(\n",
-       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (dropout): Dropout(p=0.1)\n",
-       "            )\n",
-       "            (output): BertSelfOutput(\n",
-       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (LayerNorm): BertLayerNorm()\n",
-       "              (dropout): Dropout(p=0.1)\n",
-       "            )\n",
-       "          )\n",
-       "          (intermediate): BertIntermediate(\n",
-       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "          )\n",
-       "          (output): BertOutput(\n",
-       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "            (LayerNorm): BertLayerNorm()\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "        )\n",
-       "        (2): BertLayer(\n",
-       "          (attention): BertAttention(\n",
-       "            (self): BertSelfAttention(\n",
-       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (dropout): Dropout(p=0.1)\n",
-       "            )\n",
-       "            (output): BertSelfOutput(\n",
-       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (LayerNorm): BertLayerNorm()\n",
-       "              (dropout): Dropout(p=0.1)\n",
-       "            )\n",
-       "          )\n",
-       "          (intermediate): BertIntermediate(\n",
-       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "          )\n",
-       "          (output): BertOutput(\n",
-       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "            (LayerNorm): BertLayerNorm()\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "        )\n",
-       "        (3): BertLayer(\n",
-       "          (attention): BertAttention(\n",
-       "            (self): BertSelfAttention(\n",
-       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (dropout): Dropout(p=0.1)\n",
-       "            )\n",
-       "            (output): BertSelfOutput(\n",
-       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (LayerNorm): BertLayerNorm()\n",
-       "              (dropout): Dropout(p=0.1)\n",
-       "            )\n",
-       "          )\n",
-       "          (intermediate): BertIntermediate(\n",
-       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "          )\n",
-       "          (output): BertOutput(\n",
-       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "            (LayerNorm): BertLayerNorm()\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "        )\n",
-       "        (4): BertLayer(\n",
-       "          (attention): BertAttention(\n",
-       "            (self): BertSelfAttention(\n",
-       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (dropout): Dropout(p=0.1)\n",
-       "            )\n",
-       "            (output): BertSelfOutput(\n",
-       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (LayerNorm): BertLayerNorm()\n",
-       "              (dropout): Dropout(p=0.1)\n",
-       "            )\n",
-       "          )\n",
-       "          (intermediate): BertIntermediate(\n",
-       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "          )\n",
-       "          (output): BertOutput(\n",
-       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "            (LayerNorm): BertLayerNorm()\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "        )\n",
-       "        (5): BertLayer(\n",
-       "          (attention): BertAttention(\n",
-       "            (self): BertSelfAttention(\n",
-       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (dropout): Dropout(p=0.1)\n",
-       "            )\n",
-       "            (output): BertSelfOutput(\n",
-       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (LayerNorm): BertLayerNorm()\n",
-       "              (dropout): Dropout(p=0.1)\n",
-       "            )\n",
-       "          )\n",
-       "          (intermediate): BertIntermediate(\n",
-       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "          )\n",
-       "          (output): BertOutput(\n",
-       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "            (LayerNorm): BertLayerNorm()\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "        )\n",
-       "        (6): BertLayer(\n",
-       "          (attention): BertAttention(\n",
-       "            (self): BertSelfAttention(\n",
-       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (dropout): Dropout(p=0.1)\n",
-       "            )\n",
-       "            (output): BertSelfOutput(\n",
-       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (LayerNorm): BertLayerNorm()\n",
-       "              (dropout): Dropout(p=0.1)\n",
-       "            )\n",
-       "          )\n",
-       "          (intermediate): BertIntermediate(\n",
-       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "          )\n",
-       "          (output): BertOutput(\n",
-       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "            (LayerNorm): BertLayerNorm()\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "        )\n",
-       "        (7): BertLayer(\n",
-       "          (attention): BertAttention(\n",
-       "            (self): BertSelfAttention(\n",
-       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (dropout): Dropout(p=0.1)\n",
-       "            )\n",
-       "            (output): BertSelfOutput(\n",
-       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (LayerNorm): BertLayerNorm()\n",
-       "              (dropout): Dropout(p=0.1)\n",
-       "            )\n",
-       "          )\n",
-       "          (intermediate): BertIntermediate(\n",
-       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "          )\n",
-       "          (output): BertOutput(\n",
-       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "            (LayerNorm): BertLayerNorm()\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "        )\n",
-       "        (8): BertLayer(\n",
-       "          (attention): BertAttention(\n",
-       "            (self): BertSelfAttention(\n",
-       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (dropout): Dropout(p=0.1)\n",
-       "            )\n",
-       "            (output): BertSelfOutput(\n",
-       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (LayerNorm): BertLayerNorm()\n",
-       "              (dropout): Dropout(p=0.1)\n",
-       "            )\n",
-       "          )\n",
-       "          (intermediate): BertIntermediate(\n",
-       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "          )\n",
-       "          (output): BertOutput(\n",
-       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "            (LayerNorm): BertLayerNorm()\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "        )\n",
-       "        (9): BertLayer(\n",
-       "          (attention): BertAttention(\n",
-       "            (self): BertSelfAttention(\n",
-       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (dropout): Dropout(p=0.1)\n",
-       "            )\n",
-       "            (output): BertSelfOutput(\n",
-       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (LayerNorm): BertLayerNorm()\n",
-       "              (dropout): Dropout(p=0.1)\n",
-       "            )\n",
-       "          )\n",
-       "          (intermediate): BertIntermediate(\n",
-       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "          )\n",
-       "          (output): BertOutput(\n",
-       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "            (LayerNorm): BertLayerNorm()\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "        )\n",
-       "        (10): BertLayer(\n",
-       "          (attention): BertAttention(\n",
-       "            (self): BertSelfAttention(\n",
-       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (dropout): Dropout(p=0.1)\n",
-       "            )\n",
-       "            (output): BertSelfOutput(\n",
-       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (LayerNorm): BertLayerNorm()\n",
-       "              (dropout): Dropout(p=0.1)\n",
-       "            )\n",
-       "          )\n",
-       "          (intermediate): BertIntermediate(\n",
-       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "          )\n",
-       "          (output): BertOutput(\n",
-       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "            (LayerNorm): BertLayerNorm()\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "        )\n",
-       "        (11): BertLayer(\n",
-       "          (attention): BertAttention(\n",
-       "            (self): BertSelfAttention(\n",
-       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (dropout): Dropout(p=0.1)\n",
-       "            )\n",
-       "            (output): BertSelfOutput(\n",
-       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (LayerNorm): BertLayerNorm()\n",
-       "              (dropout): Dropout(p=0.1)\n",
-       "            )\n",
-       "          )\n",
-       "          (intermediate): BertIntermediate(\n",
-       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "          )\n",
-       "          (output): BertOutput(\n",
-       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "            (LayerNorm): BertLayerNorm()\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "        )\n",
-       "      )\n",
-       "    )\n",
-       "    (pooler): BertPooler(\n",
-       "      (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "      (activation): Tanh()\n",
-       "    )\n",
-       "  )\n",
-       "  (cls): BertPreTrainingHeads(\n",
-       "    (predictions): BertLMPredictionHead(\n",
-       "      (transform): BertPredictionHeadTransform(\n",
-       "        (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "        (LayerNorm): BertLayerNorm()\n",
-       "      )\n",
-       "      (decoder): Linear(in_features=768, out_features=30522, bias=False)\n",
-       "    )\n",
-       "    (seq_relationship): Linear(in_features=768, out_features=2, bias=True)\n",
-       "  )\n",
-       ")"
-      ]
-     },
-     "execution_count": 17,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)\n",
-    "all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)\n",
-    "all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)\n",
-    "all_masked_lm_positions = torch.tensor([f.masked_lm_positions for f in features], dtype=torch.long)\n",
-    "\n",
-    "eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_masked_lm_positions)\n",
-    "eval_sampler = SequentialSampler(eval_data)\n",
-    "eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=1)\n",
-    "\n",
-    "model.eval()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 18,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-16T10:03:12.792741Z",
-     "start_time": "2018-11-16T10:03:12.354253Z"
-    }
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "tensor([[ 2040,  2001,  3958, 27227,  1029,  3958,   103,  2001,  1037, 13997,\n",
-      "         11510,     0,     0,     0,     0,     0,     0,     0,     0,     0,\n",
-      "             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,\n",
-      "             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,\n",
-      "             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,\n",
-      "             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,\n",
-      "             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,\n",
-      "             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,\n",
-      "             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,\n",
-      "             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,\n",
-      "             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,\n",
-      "             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,\n",
-      "             0,     0,     0,     0,     0,     0,     0,     0]])\n",
-      "tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
-      "         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
-      "         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
-      "         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
-      "         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
-      "         0, 0, 0, 0, 0, 0, 0, 0]])\n",
-      "tensor([[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
-      "         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
-      "         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
-      "         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
-      "         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
-      "         0, 0, 0, 0, 0, 0, 0, 0]])\n",
-      "(1, 20, 30522)\n",
-      "[27227, 1010, 1010, 1010, 1010, 1010, 1010, 1010, 1010, 1010, 1010, 1010, 1010, 1010, 1010, 1010, 1010, 1010, 1010, 1010]\n"
-     ]
-    }
-   ],
-   "source": [
-    "import numpy as np\n",
-    "pytorch_all_out = []\n",
-    "for input_ids, input_mask, segment_ids, tensor_masked_lm_positions in eval_dataloader:\n",
-    "    print(input_ids)\n",
-    "    print(input_mask)\n",
-    "    print(segment_ids)\n",
-    "    input_ids = input_ids.to(device)\n",
-    "    input_mask = input_mask.to(device)\n",
-    "    segment_ids = segment_ids.to(device)\n",
-    "\n",
-    "    prediction_scores, _ = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask)\n",
-    "    prediction_scores = prediction_scores[0, tensor_masked_lm_positions].detach().cpu().numpy()\n",
-    "    print(prediction_scores.shape)\n",
-    "    masked_lm_predictions = np.argmax(prediction_scores, axis=-1).squeeze().tolist()\n",
-    "    print(masked_lm_predictions)\n",
-    "    pytorch_all_out.append(masked_lm_predictions)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 19,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-16T10:03:12.828439Z",
-     "start_time": "2018-11-16T10:03:12.795420Z"
-    }
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "pytorch_output: ['henson']\n",
-      "tensorflow_output: ['henson']\n"
-     ]
-    }
-   ],
-   "source": [
-    "pytorch_outputs = tokenizer.convert_ids_to_tokens(pytorch_all_out[0])[:len(masked_lm_positions)]\n",
-    "print(\"pytorch_output:\", pytorch_outputs)\n",
-    "print(\"tensorflow_output:\", tensorflow_outputs)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "hide_input": false,
-  "kernelspec": {
-   "display_name": "Python [default]",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.6.7"
-  },
-  "toc": {
-   "colors": {
-    "hover_highlight": "#DAA520",
-    "running_highlight": "#FF0000",
-    "selected_highlight": "#FFD700"
-   },
-   "moveMenuLeft": true,
-   "nav_menu": {
-    "height": "48px",
-    "width": "252px"
-   },
-   "navigate_menu": true,
-   "number_sections": true,
-   "sideBar": true,
-   "threshold": 4,
-   "toc_cell": false,
-   "toc_section_display": "block",
-   "toc_window_display": false
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
diff --git a/notebooks/Comparing-TF-and-PT-models-SQuAD.ipynb b/notebooks/Comparing-TF-and-PT-models-SQuAD.ipynb
deleted file mode 100644
index a75e052643..0000000000
--- a/notebooks/Comparing-TF-and-PT-models-SQuAD.ipynb
+++ /dev/null
@@ -1,1644 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# Comparing TensorFlow (original) and PyTorch model on the SQuAD task\n",
-    "\n",
-    "You can use this small notebook to check the loss computation from the TensorFlow model to the PyTorch model. In the following, we compare the total loss computed by the models starting from identical initializations (position prediction linear layers with weights at 1 and bias at 0).\n",
-    "\n",
-    "To run this notebook, follow these instructions:\n",
-    "- make sure that your Python environment has both TensorFlow and PyTorch installed,\n",
-    "- download the original TensorFlow implementation,\n",
-    "- download a pre-trained TensorFlow model as indicaded in the TensorFlow implementation readme,\n",
-    "- run the script `convert_tf_checkpoint_to_pytorch.py` as indicated in the `README` to convert the pre-trained TensorFlow model to PyTorch.\n",
-    "\n",
-    "If needed change the relative paths indicated in this notebook (at the beggining of Sections 1 and 2) to point to the relevent models and code."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-06T10:11:33.636911Z",
-     "start_time": "2018-11-06T10:11:33.623091Z"
-    }
-   },
-   "outputs": [],
-   "source": [
-    "import os\n",
-    "os.chdir('../')"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## 1/ TensorFlow code"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-06T10:11:33.651792Z",
-     "start_time": "2018-11-06T10:11:33.638984Z"
-    }
-   },
-   "outputs": [],
-   "source": [
-    "original_tf_inplem_dir = \"./tensorflow_code/\"\n",
-    "model_dir = \"../google_models/uncased_L-12_H-768_A-12/\"\n",
-    "\n",
-    "vocab_file = model_dir + \"vocab.txt\"\n",
-    "bert_config_file = model_dir + \"bert_config.json\"\n",
-    "init_checkpoint = model_dir + \"bert_model.ckpt\"\n",
-    "\n",
-    "input_file = \"../data/squad_data/train-v1.1.json\"\n",
-    "max_seq_length = 384\n",
-    "outside_pos = max_seq_length + 10\n",
-    "doc_stride = 128\n",
-    "max_query_length = 64\n",
-    "max_answer_length = 30\n",
-    "output_dir = \"/tmp/squad_base/\"\n",
-    "learning_rate = 3e-5"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-06T10:11:35.165788Z",
-     "start_time": "2018-11-06T10:11:33.653401Z"
-    }
-   },
-   "outputs": [],
-   "source": [
-    "import importlib.util\n",
-    "import sys\n",
-    "\n",
-    "spec = importlib.util.spec_from_file_location('*', original_tf_inplem_dir + '/modeling.py')\n",
-    "module = importlib.util.module_from_spec(spec)\n",
-    "spec.loader.exec_module(module)\n",
-    "sys.modules['modeling_tensorflow'] = module\n",
-    "\n",
-    "spec = importlib.util.spec_from_file_location('*', original_tf_inplem_dir + '/run_bert_squad.py')\n",
-    "module = importlib.util.module_from_spec(spec)\n",
-    "spec.loader.exec_module(module)\n",
-    "sys.modules['run_squad_tensorflow'] = module\n",
-    "import modeling_tensorflow\n",
-    "from run_squad_tensorflow import *"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-06T10:11:37.494391Z",
-     "start_time": "2018-11-06T10:11:35.168615Z"
-    }
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:*** Example ***\n",
-      "INFO:tensorflow:unique_id: 1000000000\n",
-      "INFO:tensorflow:example_index: 0\n",
-      "INFO:tensorflow:doc_span_index: 0\n",
-      "INFO:tensorflow:tokens: [CLS] to whom did the virgin mary allegedly appear in 1858 in lou ##rdes france ? [SEP] architectural ##ly , the school has a catholic character . atop the main building ' s gold dome is a golden statue of the virgin mary . immediately in front of the main building and facing it , is a copper statue of christ with arms up ##rai ##sed with the legend \" ve ##ni ##te ad me om ##nes \" . next to the main building is the basilica of the sacred heart . immediately behind the basilica is the gr ##otto , a marian place of prayer and reflection . it is a replica of the gr ##otto at lou ##rdes , france where the virgin mary reputed ##ly appeared to saint bern ##ade ##tte so ##ub ##iro ##us in 1858 . at the end of the main drive ( and in a direct line that connects through 3 statues and the gold dome ) , is a simple , modern stone statue of mary . [SEP]\n",
-      "INFO:tensorflow:token_to_orig_map: 17:0 18:0 19:0 20:1 21:2 22:3 23:4 24:5 25:6 26:6 27:7 28:8 29:9 30:10 31:10 32:10 33:11 34:12 35:13 36:14 37:15 38:16 39:17 40:18 41:19 42:20 43:20 44:21 45:22 46:23 47:24 48:25 49:26 50:27 51:28 52:29 53:30 54:30 55:31 56:32 57:33 58:34 59:35 60:36 61:37 62:38 63:39 64:39 65:39 66:40 67:41 68:42 69:43 70:43 71:43 72:43 73:44 74:45 75:46 76:46 77:46 78:46 79:47 80:48 81:49 82:50 83:51 84:52 85:53 86:54 87:55 88:56 89:57 90:58 91:58 92:59 93:60 94:61 95:62 96:63 97:64 98:65 99:65 100:65 101:66 102:67 103:68 104:69 105:70 106:71 107:72 108:72 109:73 110:74 111:75 112:76 113:77 114:78 115:79 116:79 117:80 118:81 119:81 120:81 121:82 122:83 123:84 124:85 125:86 126:87 127:87 128:88 129:89 130:90 131:91 132:91 133:91 134:92 135:92 136:92 137:92 138:93 139:94 140:94 141:95 142:96 143:97 144:98 145:99 146:100 147:101 148:102 149:102 150:103 151:104 152:105 153:106 154:107 155:108 156:109 157:110 158:111 159:112 160:113 161:114 162:115 163:115 164:115 165:116 166:117 167:118 168:118 169:119 170:120 171:121 172:122 173:123 174:123\n",
-      "INFO:tensorflow:token_is_max_context: 17:True 18:True 19:True 20:True 21:True 22:True 23:True 24:True 25:True 26:True 27:True 28:True 29:True 30:True 31:True 32:True 33:True 34:True 35:True 36:True 37:True 38:True 39:True 40:True 41:True 42:True 43:True 44:True 45:True 46:True 47:True 48:True 49:True 50:True 51:True 52:True 53:True 54:True 55:True 56:True 57:True 58:True 59:True 60:True 61:True 62:True 63:True 64:True 65:True 66:True 67:True 68:True 69:True 70:True 71:True 72:True 73:True 74:True 75:True 76:True 77:True 78:True 79:True 80:True 81:True 82:True 83:True 84:True 85:True 86:True 87:True 88:True 89:True 90:True 91:True 92:True 93:True 94:True 95:True 96:True 97:True 98:True 99:True 100:True 101:True 102:True 103:True 104:True 105:True 106:True 107:True 108:True 109:True 110:True 111:True 112:True 113:True 114:True 115:True 116:True 117:True 118:True 119:True 120:True 121:True 122:True 123:True 124:True 125:True 126:True 127:True 128:True 129:True 130:True 131:True 132:True 133:True 134:True 135:True 136:True 137:True 138:True 139:True 140:True 141:True 142:True 143:True 144:True 145:True 146:True 147:True 148:True 149:True 150:True 151:True 152:True 153:True 154:True 155:True 156:True 157:True 158:True 159:True 160:True 161:True 162:True 163:True 164:True 165:True 166:True 167:True 168:True 169:True 170:True 171:True 172:True 173:True 174:True\n",
-      "INFO:tensorflow:input_ids: 101 2000 3183 2106 1996 6261 2984 9382 3711 1999 8517 1999 10223 26371 2605 1029 102 6549 2135 1010 1996 2082 2038 1037 3234 2839 1012 10234 1996 2364 2311 1005 1055 2751 8514 2003 1037 3585 6231 1997 1996 6261 2984 1012 3202 1999 2392 1997 1996 2364 2311 1998 5307 2009 1010 2003 1037 6967 6231 1997 4828 2007 2608 2039 14995 6924 2007 1996 5722 1000 2310 3490 2618 4748 2033 18168 5267 1000 1012 2279 2000 1996 2364 2311 2003 1996 13546 1997 1996 6730 2540 1012 3202 2369 1996 13546 2003 1996 24665 23052 1010 1037 14042 2173 1997 7083 1998 9185 1012 2009 2003 1037 15059 1997 1996 24665 23052 2012 10223 26371 1010 2605 2073 1996 6261 2984 22353 2135 2596 2000 3002 16595 9648 4674 2061 12083 9711 2271 1999 8517 1012 2012 1996 2203 1997 1996 2364 3298 1006 1998 1999 1037 3622 2240 2008 8539 2083 1017 11342 1998 1996 2751 8514 1007 1010 2003 1037 3722 1010 2715 2962 6231 1997 2984 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:start_position: 130\n",
-      "INFO:tensorflow:end_position: 137\n",
-      "INFO:tensorflow:answer: saint bern ##ade ##tte so ##ub ##iro ##us\n",
-      "INFO:tensorflow:*** Example ***\n",
-      "INFO:tensorflow:unique_id: 1000000001\n",
-      "INFO:tensorflow:example_index: 1\n",
-      "INFO:tensorflow:doc_span_index: 0\n",
-      "INFO:tensorflow:tokens: [CLS] what is in front of the notre dame main building ? [SEP] architectural ##ly , the school has a catholic character . atop the main building ' s gold dome is a golden statue of the virgin mary . immediately in front of the main building and facing it , is a copper statue of christ with arms up ##rai ##sed with the legend \" ve ##ni ##te ad me om ##nes \" . next to the main building is the basilica of the sacred heart . immediately behind the basilica is the gr ##otto , a marian place of prayer and reflection . it is a replica of the gr ##otto at lou ##rdes , france where the virgin mary reputed ##ly appeared to saint bern ##ade ##tte so ##ub ##iro ##us in 1858 . at the end of the main drive ( and in a direct line that connects through 3 statues and the gold dome ) , is a simple , modern stone statue of mary . [SEP]\n",
-      "INFO:tensorflow:token_to_orig_map: 13:0 14:0 15:0 16:1 17:2 18:3 19:4 20:5 21:6 22:6 23:7 24:8 25:9 26:10 27:10 28:10 29:11 30:12 31:13 32:14 33:15 34:16 35:17 36:18 37:19 38:20 39:20 40:21 41:22 42:23 43:24 44:25 45:26 46:27 47:28 48:29 49:30 50:30 51:31 52:32 53:33 54:34 55:35 56:36 57:37 58:38 59:39 60:39 61:39 62:40 63:41 64:42 65:43 66:43 67:43 68:43 69:44 70:45 71:46 72:46 73:46 74:46 75:47 76:48 77:49 78:50 79:51 80:52 81:53 82:54 83:55 84:56 85:57 86:58 87:58 88:59 89:60 90:61 91:62 92:63 93:64 94:65 95:65 96:65 97:66 98:67 99:68 100:69 101:70 102:71 103:72 104:72 105:73 106:74 107:75 108:76 109:77 110:78 111:79 112:79 113:80 114:81 115:81 116:81 117:82 118:83 119:84 120:85 121:86 122:87 123:87 124:88 125:89 126:90 127:91 128:91 129:91 130:92 131:92 132:92 133:92 134:93 135:94 136:94 137:95 138:96 139:97 140:98 141:99 142:100 143:101 144:102 145:102 146:103 147:104 148:105 149:106 150:107 151:108 152:109 153:110 154:111 155:112 156:113 157:114 158:115 159:115 160:115 161:116 162:117 163:118 164:118 165:119 166:120 167:121 168:122 169:123 170:123\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:token_is_max_context: 13:True 14:True 15:True 16:True 17:True 18:True 19:True 20:True 21:True 22:True 23:True 24:True 25:True 26:True 27:True 28:True 29:True 30:True 31:True 32:True 33:True 34:True 35:True 36:True 37:True 38:True 39:True 40:True 41:True 42:True 43:True 44:True 45:True 46:True 47:True 48:True 49:True 50:True 51:True 52:True 53:True 54:True 55:True 56:True 57:True 58:True 59:True 60:True 61:True 62:True 63:True 64:True 65:True 66:True 67:True 68:True 69:True 70:True 71:True 72:True 73:True 74:True 75:True 76:True 77:True 78:True 79:True 80:True 81:True 82:True 83:True 84:True 85:True 86:True 87:True 88:True 89:True 90:True 91:True 92:True 93:True 94:True 95:True 96:True 97:True 98:True 99:True 100:True 101:True 102:True 103:True 104:True 105:True 106:True 107:True 108:True 109:True 110:True 111:True 112:True 113:True 114:True 115:True 116:True 117:True 118:True 119:True 120:True 121:True 122:True 123:True 124:True 125:True 126:True 127:True 128:True 129:True 130:True 131:True 132:True 133:True 134:True 135:True 136:True 137:True 138:True 139:True 140:True 141:True 142:True 143:True 144:True 145:True 146:True 147:True 148:True 149:True 150:True 151:True 152:True 153:True 154:True 155:True 156:True 157:True 158:True 159:True 160:True 161:True 162:True 163:True 164:True 165:True 166:True 167:True 168:True 169:True 170:True\n",
-      "INFO:tensorflow:input_ids: 101 2054 2003 1999 2392 1997 1996 10289 8214 2364 2311 1029 102 6549 2135 1010 1996 2082 2038 1037 3234 2839 1012 10234 1996 2364 2311 1005 1055 2751 8514 2003 1037 3585 6231 1997 1996 6261 2984 1012 3202 1999 2392 1997 1996 2364 2311 1998 5307 2009 1010 2003 1037 6967 6231 1997 4828 2007 2608 2039 14995 6924 2007 1996 5722 1000 2310 3490 2618 4748 2033 18168 5267 1000 1012 2279 2000 1996 2364 2311 2003 1996 13546 1997 1996 6730 2540 1012 3202 2369 1996 13546 2003 1996 24665 23052 1010 1037 14042 2173 1997 7083 1998 9185 1012 2009 2003 1037 15059 1997 1996 24665 23052 2012 10223 26371 1010 2605 2073 1996 6261 2984 22353 2135 2596 2000 3002 16595 9648 4674 2061 12083 9711 2271 1999 8517 1012 2012 1996 2203 1997 1996 2364 3298 1006 1998 1999 1037 3622 2240 2008 8539 2083 1017 11342 1998 1996 2751 8514 1007 1010 2003 1037 3722 1010 2715 2962 6231 1997 2984 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:start_position: 52\n",
-      "INFO:tensorflow:end_position: 56\n",
-      "INFO:tensorflow:answer: a copper statue of christ\n",
-      "INFO:tensorflow:*** Example ***\n",
-      "INFO:tensorflow:unique_id: 1000000002\n",
-      "INFO:tensorflow:example_index: 2\n",
-      "INFO:tensorflow:doc_span_index: 0\n",
-      "INFO:tensorflow:tokens: [CLS] the basilica of the sacred heart at notre dame is beside to which structure ? [SEP] architectural ##ly , the school has a catholic character . atop the main building ' s gold dome is a golden statue of the virgin mary . immediately in front of the main building and facing it , is a copper statue of christ with arms up ##rai ##sed with the legend \" ve ##ni ##te ad me om ##nes \" . next to the main building is the basilica of the sacred heart . immediately behind the basilica is the gr ##otto , a marian place of prayer and reflection . it is a replica of the gr ##otto at lou ##rdes , france where the virgin mary reputed ##ly appeared to saint bern ##ade ##tte so ##ub ##iro ##us in 1858 . at the end of the main drive ( and in a direct line that connects through 3 statues and the gold dome ) , is a simple , modern stone statue of mary . [SEP]\n",
-      "INFO:tensorflow:token_to_orig_map: 17:0 18:0 19:0 20:1 21:2 22:3 23:4 24:5 25:6 26:6 27:7 28:8 29:9 30:10 31:10 32:10 33:11 34:12 35:13 36:14 37:15 38:16 39:17 40:18 41:19 42:20 43:20 44:21 45:22 46:23 47:24 48:25 49:26 50:27 51:28 52:29 53:30 54:30 55:31 56:32 57:33 58:34 59:35 60:36 61:37 62:38 63:39 64:39 65:39 66:40 67:41 68:42 69:43 70:43 71:43 72:43 73:44 74:45 75:46 76:46 77:46 78:46 79:47 80:48 81:49 82:50 83:51 84:52 85:53 86:54 87:55 88:56 89:57 90:58 91:58 92:59 93:60 94:61 95:62 96:63 97:64 98:65 99:65 100:65 101:66 102:67 103:68 104:69 105:70 106:71 107:72 108:72 109:73 110:74 111:75 112:76 113:77 114:78 115:79 116:79 117:80 118:81 119:81 120:81 121:82 122:83 123:84 124:85 125:86 126:87 127:87 128:88 129:89 130:90 131:91 132:91 133:91 134:92 135:92 136:92 137:92 138:93 139:94 140:94 141:95 142:96 143:97 144:98 145:99 146:100 147:101 148:102 149:102 150:103 151:104 152:105 153:106 154:107 155:108 156:109 157:110 158:111 159:112 160:113 161:114 162:115 163:115 164:115 165:116 166:117 167:118 168:118 169:119 170:120 171:121 172:122 173:123 174:123\n",
-      "INFO:tensorflow:token_is_max_context: 17:True 18:True 19:True 20:True 21:True 22:True 23:True 24:True 25:True 26:True 27:True 28:True 29:True 30:True 31:True 32:True 33:True 34:True 35:True 36:True 37:True 38:True 39:True 40:True 41:True 42:True 43:True 44:True 45:True 46:True 47:True 48:True 49:True 50:True 51:True 52:True 53:True 54:True 55:True 56:True 57:True 58:True 59:True 60:True 61:True 62:True 63:True 64:True 65:True 66:True 67:True 68:True 69:True 70:True 71:True 72:True 73:True 74:True 75:True 76:True 77:True 78:True 79:True 80:True 81:True 82:True 83:True 84:True 85:True 86:True 87:True 88:True 89:True 90:True 91:True 92:True 93:True 94:True 95:True 96:True 97:True 98:True 99:True 100:True 101:True 102:True 103:True 104:True 105:True 106:True 107:True 108:True 109:True 110:True 111:True 112:True 113:True 114:True 115:True 116:True 117:True 118:True 119:True 120:True 121:True 122:True 123:True 124:True 125:True 126:True 127:True 128:True 129:True 130:True 131:True 132:True 133:True 134:True 135:True 136:True 137:True 138:True 139:True 140:True 141:True 142:True 143:True 144:True 145:True 146:True 147:True 148:True 149:True 150:True 151:True 152:True 153:True 154:True 155:True 156:True 157:True 158:True 159:True 160:True 161:True 162:True 163:True 164:True 165:True 166:True 167:True 168:True 169:True 170:True 171:True 172:True 173:True 174:True\n",
-      "INFO:tensorflow:input_ids: 101 1996 13546 1997 1996 6730 2540 2012 10289 8214 2003 3875 2000 2029 3252 1029 102 6549 2135 1010 1996 2082 2038 1037 3234 2839 1012 10234 1996 2364 2311 1005 1055 2751 8514 2003 1037 3585 6231 1997 1996 6261 2984 1012 3202 1999 2392 1997 1996 2364 2311 1998 5307 2009 1010 2003 1037 6967 6231 1997 4828 2007 2608 2039 14995 6924 2007 1996 5722 1000 2310 3490 2618 4748 2033 18168 5267 1000 1012 2279 2000 1996 2364 2311 2003 1996 13546 1997 1996 6730 2540 1012 3202 2369 1996 13546 2003 1996 24665 23052 1010 1037 14042 2173 1997 7083 1998 9185 1012 2009 2003 1037 15059 1997 1996 24665 23052 2012 10223 26371 1010 2605 2073 1996 6261 2984 22353 2135 2596 2000 3002 16595 9648 4674 2061 12083 9711 2271 1999 8517 1012 2012 1996 2203 1997 1996 2364 3298 1006 1998 1999 1037 3622 2240 2008 8539 2083 1017 11342 1998 1996 2751 8514 1007 1010 2003 1037 3722 1010 2715 2962 6231 1997 2984 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:start_position: 81\n",
-      "INFO:tensorflow:end_position: 83\n",
-      "INFO:tensorflow:answer: the main building\n",
-      "INFO:tensorflow:*** Example ***\n",
-      "INFO:tensorflow:unique_id: 1000000003\n",
-      "INFO:tensorflow:example_index: 3\n",
-      "INFO:tensorflow:doc_span_index: 0\n",
-      "INFO:tensorflow:tokens: [CLS] what is the gr ##otto at notre dame ? [SEP] architectural ##ly , the school has a catholic character . atop the main building ' s gold dome is a golden statue of the virgin mary . immediately in front of the main building and facing it , is a copper statue of christ with arms up ##rai ##sed with the legend \" ve ##ni ##te ad me om ##nes \" . next to the main building is the basilica of the sacred heart . immediately behind the basilica is the gr ##otto , a marian place of prayer and reflection . it is a replica of the gr ##otto at lou ##rdes , france where the virgin mary reputed ##ly appeared to saint bern ##ade ##tte so ##ub ##iro ##us in 1858 . at the end of the main drive ( and in a direct line that connects through 3 statues and the gold dome ) , is a simple , modern stone statue of mary . [SEP]\n",
-      "INFO:tensorflow:token_to_orig_map: 11:0 12:0 13:0 14:1 15:2 16:3 17:4 18:5 19:6 20:6 21:7 22:8 23:9 24:10 25:10 26:10 27:11 28:12 29:13 30:14 31:15 32:16 33:17 34:18 35:19 36:20 37:20 38:21 39:22 40:23 41:24 42:25 43:26 44:27 45:28 46:29 47:30 48:30 49:31 50:32 51:33 52:34 53:35 54:36 55:37 56:38 57:39 58:39 59:39 60:40 61:41 62:42 63:43 64:43 65:43 66:43 67:44 68:45 69:46 70:46 71:46 72:46 73:47 74:48 75:49 76:50 77:51 78:52 79:53 80:54 81:55 82:56 83:57 84:58 85:58 86:59 87:60 88:61 89:62 90:63 91:64 92:65 93:65 94:65 95:66 96:67 97:68 98:69 99:70 100:71 101:72 102:72 103:73 104:74 105:75 106:76 107:77 108:78 109:79 110:79 111:80 112:81 113:81 114:81 115:82 116:83 117:84 118:85 119:86 120:87 121:87 122:88 123:89 124:90 125:91 126:91 127:91 128:92 129:92 130:92 131:92 132:93 133:94 134:94 135:95 136:96 137:97 138:98 139:99 140:100 141:101 142:102 143:102 144:103 145:104 146:105 147:106 148:107 149:108 150:109 151:110 152:111 153:112 154:113 155:114 156:115 157:115 158:115 159:116 160:117 161:118 162:118 163:119 164:120 165:121 166:122 167:123 168:123\n",
-      "INFO:tensorflow:token_is_max_context: 11:True 12:True 13:True 14:True 15:True 16:True 17:True 18:True 19:True 20:True 21:True 22:True 23:True 24:True 25:True 26:True 27:True 28:True 29:True 30:True 31:True 32:True 33:True 34:True 35:True 36:True 37:True 38:True 39:True 40:True 41:True 42:True 43:True 44:True 45:True 46:True 47:True 48:True 49:True 50:True 51:True 52:True 53:True 54:True 55:True 56:True 57:True 58:True 59:True 60:True 61:True 62:True 63:True 64:True 65:True 66:True 67:True 68:True 69:True 70:True 71:True 72:True 73:True 74:True 75:True 76:True 77:True 78:True 79:True 80:True 81:True 82:True 83:True 84:True 85:True 86:True 87:True 88:True 89:True 90:True 91:True 92:True 93:True 94:True 95:True 96:True 97:True 98:True 99:True 100:True 101:True 102:True 103:True 104:True 105:True 106:True 107:True 108:True 109:True 110:True 111:True 112:True 113:True 114:True 115:True 116:True 117:True 118:True 119:True 120:True 121:True 122:True 123:True 124:True 125:True 126:True 127:True 128:True 129:True 130:True 131:True 132:True 133:True 134:True 135:True 136:True 137:True 138:True 139:True 140:True 141:True 142:True 143:True 144:True 145:True 146:True 147:True 148:True 149:True 150:True 151:True 152:True 153:True 154:True 155:True 156:True 157:True 158:True 159:True 160:True 161:True 162:True 163:True 164:True 165:True 166:True 167:True 168:True\n",
-      "INFO:tensorflow:input_ids: 101 2054 2003 1996 24665 23052 2012 10289 8214 1029 102 6549 2135 1010 1996 2082 2038 1037 3234 2839 1012 10234 1996 2364 2311 1005 1055 2751 8514 2003 1037 3585 6231 1997 1996 6261 2984 1012 3202 1999 2392 1997 1996 2364 2311 1998 5307 2009 1010 2003 1037 6967 6231 1997 4828 2007 2608 2039 14995 6924 2007 1996 5722 1000 2310 3490 2618 4748 2033 18168 5267 1000 1012 2279 2000 1996 2364 2311 2003 1996 13546 1997 1996 6730 2540 1012 3202 2369 1996 13546 2003 1996 24665 23052 1010 1037 14042 2173 1997 7083 1998 9185 1012 2009 2003 1037 15059 1997 1996 24665 23052 2012 10223 26371 1010 2605 2073 1996 6261 2984 22353 2135 2596 2000 3002 16595 9648 4674 2061 12083 9711 2271 1999 8517 1012 2012 1996 2203 1997 1996 2364 3298 1006 1998 1999 1037 3622 2240 2008 8539 2083 1017 11342 1998 1996 2751 8514 1007 1010 2003 1037 3722 1010 2715 2962 6231 1997 2984 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:start_position: 95\n",
-      "INFO:tensorflow:end_position: 101\n",
-      "INFO:tensorflow:answer: a marian place of prayer and reflection\n",
-      "INFO:tensorflow:*** Example ***\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:unique_id: 1000000004\n",
-      "INFO:tensorflow:example_index: 4\n",
-      "INFO:tensorflow:doc_span_index: 0\n",
-      "INFO:tensorflow:tokens: [CLS] what sits on top of the main building at notre dame ? [SEP] architectural ##ly , the school has a catholic character . atop the main building ' s gold dome is a golden statue of the virgin mary . immediately in front of the main building and facing it , is a copper statue of christ with arms up ##rai ##sed with the legend \" ve ##ni ##te ad me om ##nes \" . next to the main building is the basilica of the sacred heart . immediately behind the basilica is the gr ##otto , a marian place of prayer and reflection . it is a replica of the gr ##otto at lou ##rdes , france where the virgin mary reputed ##ly appeared to saint bern ##ade ##tte so ##ub ##iro ##us in 1858 . at the end of the main drive ( and in a direct line that connects through 3 statues and the gold dome ) , is a simple , modern stone statue of mary . [SEP]\n",
-      "INFO:tensorflow:token_to_orig_map: 14:0 15:0 16:0 17:1 18:2 19:3 20:4 21:5 22:6 23:6 24:7 25:8 26:9 27:10 28:10 29:10 30:11 31:12 32:13 33:14 34:15 35:16 36:17 37:18 38:19 39:20 40:20 41:21 42:22 43:23 44:24 45:25 46:26 47:27 48:28 49:29 50:30 51:30 52:31 53:32 54:33 55:34 56:35 57:36 58:37 59:38 60:39 61:39 62:39 63:40 64:41 65:42 66:43 67:43 68:43 69:43 70:44 71:45 72:46 73:46 74:46 75:46 76:47 77:48 78:49 79:50 80:51 81:52 82:53 83:54 84:55 85:56 86:57 87:58 88:58 89:59 90:60 91:61 92:62 93:63 94:64 95:65 96:65 97:65 98:66 99:67 100:68 101:69 102:70 103:71 104:72 105:72 106:73 107:74 108:75 109:76 110:77 111:78 112:79 113:79 114:80 115:81 116:81 117:81 118:82 119:83 120:84 121:85 122:86 123:87 124:87 125:88 126:89 127:90 128:91 129:91 130:91 131:92 132:92 133:92 134:92 135:93 136:94 137:94 138:95 139:96 140:97 141:98 142:99 143:100 144:101 145:102 146:102 147:103 148:104 149:105 150:106 151:107 152:108 153:109 154:110 155:111 156:112 157:113 158:114 159:115 160:115 161:115 162:116 163:117 164:118 165:118 166:119 167:120 168:121 169:122 170:123 171:123\n",
-      "INFO:tensorflow:token_is_max_context: 14:True 15:True 16:True 17:True 18:True 19:True 20:True 21:True 22:True 23:True 24:True 25:True 26:True 27:True 28:True 29:True 30:True 31:True 32:True 33:True 34:True 35:True 36:True 37:True 38:True 39:True 40:True 41:True 42:True 43:True 44:True 45:True 46:True 47:True 48:True 49:True 50:True 51:True 52:True 53:True 54:True 55:True 56:True 57:True 58:True 59:True 60:True 61:True 62:True 63:True 64:True 65:True 66:True 67:True 68:True 69:True 70:True 71:True 72:True 73:True 74:True 75:True 76:True 77:True 78:True 79:True 80:True 81:True 82:True 83:True 84:True 85:True 86:True 87:True 88:True 89:True 90:True 91:True 92:True 93:True 94:True 95:True 96:True 97:True 98:True 99:True 100:True 101:True 102:True 103:True 104:True 105:True 106:True 107:True 108:True 109:True 110:True 111:True 112:True 113:True 114:True 115:True 116:True 117:True 118:True 119:True 120:True 121:True 122:True 123:True 124:True 125:True 126:True 127:True 128:True 129:True 130:True 131:True 132:True 133:True 134:True 135:True 136:True 137:True 138:True 139:True 140:True 141:True 142:True 143:True 144:True 145:True 146:True 147:True 148:True 149:True 150:True 151:True 152:True 153:True 154:True 155:True 156:True 157:True 158:True 159:True 160:True 161:True 162:True 163:True 164:True 165:True 166:True 167:True 168:True 169:True 170:True 171:True\n",
-      "INFO:tensorflow:input_ids: 101 2054 7719 2006 2327 1997 1996 2364 2311 2012 10289 8214 1029 102 6549 2135 1010 1996 2082 2038 1037 3234 2839 1012 10234 1996 2364 2311 1005 1055 2751 8514 2003 1037 3585 6231 1997 1996 6261 2984 1012 3202 1999 2392 1997 1996 2364 2311 1998 5307 2009 1010 2003 1037 6967 6231 1997 4828 2007 2608 2039 14995 6924 2007 1996 5722 1000 2310 3490 2618 4748 2033 18168 5267 1000 1012 2279 2000 1996 2364 2311 2003 1996 13546 1997 1996 6730 2540 1012 3202 2369 1996 13546 2003 1996 24665 23052 1010 1037 14042 2173 1997 7083 1998 9185 1012 2009 2003 1037 15059 1997 1996 24665 23052 2012 10223 26371 1010 2605 2073 1996 6261 2984 22353 2135 2596 2000 3002 16595 9648 4674 2061 12083 9711 2271 1999 8517 1012 2012 1996 2203 1997 1996 2364 3298 1006 1998 1999 1037 3622 2240 2008 8539 2083 1017 11342 1998 1996 2751 8514 1007 1010 2003 1037 3722 1010 2715 2962 6231 1997 2984 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:start_position: 33\n",
-      "INFO:tensorflow:end_position: 39\n",
-      "INFO:tensorflow:answer: a golden statue of the virgin mary\n",
-      "INFO:tensorflow:*** Example ***\n",
-      "INFO:tensorflow:unique_id: 1000000005\n",
-      "INFO:tensorflow:example_index: 5\n",
-      "INFO:tensorflow:doc_span_index: 0\n",
-      "INFO:tensorflow:tokens: [CLS] when did the scholastic magazine of notre dame begin publishing ? [SEP] as at most other universities , notre dame ' s students run a number of news media outlets . the nine student - run outlets include three newspapers , both a radio and television station , and several magazines and journals . begun as a one - page journal in september 1876 , the scholastic magazine is issued twice monthly and claims to be the oldest continuous collegiate publication in the united states . the other magazine , the jug ##gler , is released twice a year and focuses on student literature and artwork . the dome yearbook is published annually . the newspapers have varying publication interests , with the observer published daily and mainly reporting university and other news , and staffed by students from both notre dame and saint mary ' s college . unlike scholastic and the dome , the observer is an independent publication and does not have a faculty advisor or any editorial oversight from the university . in 1987 , when some students believed that the observer began to show a conservative bias , a liberal newspaper , common sense was published . likewise , in 2003 , when other students believed that the paper showed a liberal bias , the conservative paper irish rover went into production . neither paper is published as often as the observer ; however , all three are distributed to all students . finally , in spring 2008 an undergraduate journal for political science research , beyond politics , made its debut . [SEP]\n",
-      "INFO:tensorflow:token_to_orig_map: 13:0 14:1 15:2 16:3 17:4 18:4 19:5 20:6 21:6 22:6 23:7 24:8 25:9 26:10 27:11 28:12 29:13 30:14 31:14 32:15 33:16 34:17 35:17 36:17 37:18 38:19 39:20 40:21 41:21 42:22 43:23 44:24 45:25 46:26 47:27 48:27 49:28 50:29 51:30 52:31 53:32 54:32 55:33 56:34 57:35 58:36 59:36 60:36 61:37 62:38 63:39 64:40 65:40 66:41 67:42 68:43 69:44 70:45 71:46 72:47 73:48 74:49 75:50 76:51 77:52 78:53 79:54 80:55 81:56 82:57 83:58 84:59 85:60 86:60 87:61 88:62 89:63 90:63 91:64 92:65 93:65 94:65 95:66 96:67 97:68 98:69 99:70 100:71 101:72 102:73 103:74 104:75 105:76 106:77 107:77 108:78 109:79 110:80 111:81 112:82 113:83 114:83 115:84 116:85 117:86 118:87 119:88 120:89 121:89 122:90 123:91 124:92 125:93 126:94 127:95 128:96 129:97 130:98 131:99 132:100 133:101 134:101 135:102 136:103 137:104 138:105 139:106 140:107 141:108 142:109 143:110 144:111 145:112 146:112 147:112 148:113 149:113 150:114 151:115 152:116 153:117 154:118 155:118 156:119 157:120 158:121 159:122 160:123 161:124 162:125 163:126 164:127 165:128 166:129 167:130 168:131 169:132 170:133 171:134 172:135 173:136 174:137 175:138 176:138 177:139 178:140 179:140 180:141 181:142 182:143 183:144 184:145 185:146 186:147 187:148 188:149 189:150 190:151 191:152 192:153 193:153 194:154 195:155 196:156 197:156 198:157 199:158 200:159 201:160 202:160 203:161 204:161 205:162 206:163 207:163 208:164 209:165 210:166 211:167 212:168 213:169 214:170 215:171 216:172 217:173 218:174 219:174 220:175 221:176 222:177 223:178 224:179 225:180 226:181 227:182 228:182 229:183 230:184 231:185 232:186 233:187 234:188 235:189 236:190 237:191 238:191 239:192 240:192 241:193 242:194 243:195 244:196 245:197 246:198 247:199 248:199 249:200 250:200 251:201 252:202 253:203 254:204 255:205 256:206 257:207 258:208 259:209 260:210 261:210 262:211 263:212 264:212 265:213 266:214 267:215 268:215\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:token_is_max_context: 13:True 14:True 15:True 16:True 17:True 18:True 19:True 20:True 21:True 22:True 23:True 24:True 25:True 26:True 27:True 28:True 29:True 30:True 31:True 32:True 33:True 34:True 35:True 36:True 37:True 38:True 39:True 40:True 41:True 42:True 43:True 44:True 45:True 46:True 47:True 48:True 49:True 50:True 51:True 52:True 53:True 54:True 55:True 56:True 57:True 58:True 59:True 60:True 61:True 62:True 63:True 64:True 65:True 66:True 67:True 68:True 69:True 70:True 71:True 72:True 73:True 74:True 75:True 76:True 77:True 78:True 79:True 80:True 81:True 82:True 83:True 84:True 85:True 86:True 87:True 88:True 89:True 90:True 91:True 92:True 93:True 94:True 95:True 96:True 97:True 98:True 99:True 100:True 101:True 102:True 103:True 104:True 105:True 106:True 107:True 108:True 109:True 110:True 111:True 112:True 113:True 114:True 115:True 116:True 117:True 118:True 119:True 120:True 121:True 122:True 123:True 124:True 125:True 126:True 127:True 128:True 129:True 130:True 131:True 132:True 133:True 134:True 135:True 136:True 137:True 138:True 139:True 140:True 141:True 142:True 143:True 144:True 145:True 146:True 147:True 148:True 149:True 150:True 151:True 152:True 153:True 154:True 155:True 156:True 157:True 158:True 159:True 160:True 161:True 162:True 163:True 164:True 165:True 166:True 167:True 168:True 169:True 170:True 171:True 172:True 173:True 174:True 175:True 176:True 177:True 178:True 179:True 180:True 181:True 182:True 183:True 184:True 185:True 186:True 187:True 188:True 189:True 190:True 191:True 192:True 193:True 194:True 195:True 196:True 197:True 198:True 199:True 200:True 201:True 202:True 203:True 204:True 205:True 206:True 207:True 208:True 209:True 210:True 211:True 212:True 213:True 214:True 215:True 216:True 217:True 218:True 219:True 220:True 221:True 222:True 223:True 224:True 225:True 226:True 227:True 228:True 229:True 230:True 231:True 232:True 233:True 234:True 235:True 236:True 237:True 238:True 
239:True 240:True 241:True 242:True 243:True 244:True 245:True 246:True 247:True 248:True 249:True 250:True 251:True 252:True 253:True 254:True 255:True 256:True 257:True 258:True 259:True 260:True 261:True 262:True 263:True 264:True 265:True 266:True 267:True 268:True\n",
-      "INFO:tensorflow:input_ids: 101 2043 2106 1996 24105 2932 1997 10289 8214 4088 4640 1029 102 2004 2012 2087 2060 5534 1010 10289 8214 1005 1055 2493 2448 1037 2193 1997 2739 2865 11730 1012 1996 3157 3076 1011 2448 11730 2421 2093 6399 1010 2119 1037 2557 1998 2547 2276 1010 1998 2195 7298 1998 9263 1012 5625 2004 1037 2028 1011 3931 3485 1999 2244 7326 1010 1996 24105 2932 2003 3843 3807 7058 1998 4447 2000 2022 1996 4587 7142 9234 4772 1999 1996 2142 2163 1012 1996 2060 2932 1010 1996 26536 17420 1010 2003 2207 3807 1037 2095 1998 7679 2006 3076 3906 1998 8266 1012 1996 8514 24803 2003 2405 6604 1012 1996 6399 2031 9671 4772 5426 1010 2007 1996 9718 2405 3679 1998 3701 7316 2118 1998 2060 2739 1010 1998 21121 2011 2493 2013 2119 10289 8214 1998 3002 2984 1005 1055 2267 1012 4406 24105 1998 1996 8514 1010 1996 9718 2003 2019 2981 4772 1998 2515 2025 2031 1037 4513 8619 2030 2151 8368 15709 2013 1996 2118 1012 1999 3055 1010 2043 2070 2493 3373 2008 1996 9718 2211 2000 2265 1037 4603 13827 1010 1037 4314 3780 1010 2691 3168 2001 2405 1012 10655 1010 1999 2494 1010 2043 2060 2493 3373 2008 1996 3259 3662 1037 4314 13827 1010 1996 4603 3259 3493 13631 2253 2046 2537 1012 4445 3259 2003 2405 2004 2411 2004 1996 9718 1025 2174 1010 2035 2093 2024 5500 2000 2035 2493 1012 2633 1010 1999 3500 2263 2019 8324 3485 2005 2576 2671 2470 1010 3458 4331 1010 2081 2049 2834 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:start_position: 63\n",
-      "INFO:tensorflow:end_position: 64\n",
-      "INFO:tensorflow:answer: september 1876\n",
-      "INFO:tensorflow:*** Example ***\n",
-      "INFO:tensorflow:unique_id: 1000000006\n",
-      "INFO:tensorflow:example_index: 6\n",
-      "INFO:tensorflow:doc_span_index: 0\n",
-      "INFO:tensorflow:tokens: [CLS] how often is notre dame ' s the jug ##gler published ? [SEP] as at most other universities , notre dame ' s students run a number of news media outlets . the nine student - run outlets include three newspapers , both a radio and television station , and several magazines and journals . begun as a one - page journal in september 1876 , the scholastic magazine is issued twice monthly and claims to be the oldest continuous collegiate publication in the united states . the other magazine , the jug ##gler , is released twice a year and focuses on student literature and artwork . the dome yearbook is published annually . the newspapers have varying publication interests , with the observer published daily and mainly reporting university and other news , and staffed by students from both notre dame and saint mary ' s college . unlike scholastic and the dome , the observer is an independent publication and does not have a faculty advisor or any editorial oversight from the university . in 1987 , when some students believed that the observer began to show a conservative bias , a liberal newspaper , common sense was published . likewise , in 2003 , when other students believed that the paper showed a liberal bias , the conservative paper irish rover went into production . neither paper is published as often as the observer ; however , all three are distributed to all students . finally , in spring 2008 an undergraduate journal for political science research , beyond politics , made its debut . [SEP]\n",
-      "INFO:tensorflow:token_to_orig_map: 14:0 15:1 16:2 17:3 18:4 19:4 20:5 21:6 22:6 23:6 24:7 25:8 26:9 27:10 28:11 29:12 30:13 31:14 32:14 33:15 34:16 35:17 36:17 37:17 38:18 39:19 40:20 41:21 42:21 43:22 44:23 45:24 46:25 47:26 48:27 49:27 50:28 51:29 52:30 53:31 54:32 55:32 56:33 57:34 58:35 59:36 60:36 61:36 62:37 63:38 64:39 65:40 66:40 67:41 68:42 69:43 70:44 71:45 72:46 73:47 74:48 75:49 76:50 77:51 78:52 79:53 80:54 81:55 82:56 83:57 84:58 85:59 86:60 87:60 88:61 89:62 90:63 91:63 92:64 93:65 94:65 95:65 96:66 97:67 98:68 99:69 100:70 101:71 102:72 103:73 104:74 105:75 106:76 107:77 108:77 109:78 110:79 111:80 112:81 113:82 114:83 115:83 116:84 117:85 118:86 119:87 120:88 121:89 122:89 123:90 124:91 125:92 126:93 127:94 128:95 129:96 130:97 131:98 132:99 133:100 134:101 135:101 136:102 137:103 138:104 139:105 140:106 141:107 142:108 143:109 144:110 145:111 146:112 147:112 148:112 149:113 150:113 151:114 152:115 153:116 154:117 155:118 156:118 157:119 158:120 159:121 160:122 161:123 162:124 163:125 164:126 165:127 166:128 167:129 168:130 169:131 170:132 171:133 172:134 173:135 174:136 175:137 176:138 177:138 178:139 179:140 180:140 181:141 182:142 183:143 184:144 185:145 186:146 187:147 188:148 189:149 190:150 191:151 192:152 193:153 194:153 195:154 196:155 197:156 198:156 199:157 200:158 201:159 202:160 203:160 204:161 205:161 206:162 207:163 208:163 209:164 210:165 211:166 212:167 213:168 214:169 215:170 216:171 217:172 218:173 219:174 220:174 221:175 222:176 223:177 224:178 225:179 226:180 227:181 228:182 229:182 230:183 231:184 232:185 233:186 234:187 235:188 236:189 237:190 238:191 239:191 240:192 241:192 242:193 243:194 244:195 245:196 246:197 247:198 248:199 249:199 250:200 251:200 252:201 253:202 254:203 255:204 256:205 257:206 258:207 259:208 260:209 261:210 262:210 263:211 264:212 265:212 266:213 267:214 268:215 269:215\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:token_is_max_context: 14:True 15:True 16:True 17:True 18:True 19:True 20:True 21:True 22:True 23:True 24:True 25:True 26:True 27:True 28:True 29:True 30:True 31:True 32:True 33:True 34:True 35:True 36:True 37:True 38:True 39:True 40:True 41:True 42:True 43:True 44:True 45:True 46:True 47:True 48:True 49:True 50:True 51:True 52:True 53:True 54:True 55:True 56:True 57:True 58:True 59:True 60:True 61:True 62:True 63:True 64:True 65:True 66:True 67:True 68:True 69:True 70:True 71:True 72:True 73:True 74:True 75:True 76:True 77:True 78:True 79:True 80:True 81:True 82:True 83:True 84:True 85:True 86:True 87:True 88:True 89:True 90:True 91:True 92:True 93:True 94:True 95:True 96:True 97:True 98:True 99:True 100:True 101:True 102:True 103:True 104:True 105:True 106:True 107:True 108:True 109:True 110:True 111:True 112:True 113:True 114:True 115:True 116:True 117:True 118:True 119:True 120:True 121:True 122:True 123:True 124:True 125:True 126:True 127:True 128:True 129:True 130:True 131:True 132:True 133:True 134:True 135:True 136:True 137:True 138:True 139:True 140:True 141:True 142:True 143:True 144:True 145:True 146:True 147:True 148:True 149:True 150:True 151:True 152:True 153:True 154:True 155:True 156:True 157:True 158:True 159:True 160:True 161:True 162:True 163:True 164:True 165:True 166:True 167:True 168:True 169:True 170:True 171:True 172:True 173:True 174:True 175:True 176:True 177:True 178:True 179:True 180:True 181:True 182:True 183:True 184:True 185:True 186:True 187:True 188:True 189:True 190:True 191:True 192:True 193:True 194:True 195:True 196:True 197:True 198:True 199:True 200:True 201:True 202:True 203:True 204:True 205:True 206:True 207:True 208:True 209:True 210:True 211:True 212:True 213:True 214:True 215:True 216:True 217:True 218:True 219:True 220:True 221:True 222:True 223:True 224:True 225:True 226:True 227:True 228:True 229:True 230:True 231:True 232:True 233:True 234:True 235:True 236:True 237:True 238:True 239:True 
240:True 241:True 242:True 243:True 244:True 245:True 246:True 247:True 248:True 249:True 250:True 251:True 252:True 253:True 254:True 255:True 256:True 257:True 258:True 259:True 260:True 261:True 262:True 263:True 264:True 265:True 266:True 267:True 268:True 269:True\n",
-      "INFO:tensorflow:input_ids: 101 2129 2411 2003 10289 8214 1005 1055 1996 26536 17420 2405 1029 102 2004 2012 2087 2060 5534 1010 10289 8214 1005 1055 2493 2448 1037 2193 1997 2739 2865 11730 1012 1996 3157 3076 1011 2448 11730 2421 2093 6399 1010 2119 1037 2557 1998 2547 2276 1010 1998 2195 7298 1998 9263 1012 5625 2004 1037 2028 1011 3931 3485 1999 2244 7326 1010 1996 24105 2932 2003 3843 3807 7058 1998 4447 2000 2022 1996 4587 7142 9234 4772 1999 1996 2142 2163 1012 1996 2060 2932 1010 1996 26536 17420 1010 2003 2207 3807 1037 2095 1998 7679 2006 3076 3906 1998 8266 1012 1996 8514 24803 2003 2405 6604 1012 1996 6399 2031 9671 4772 5426 1010 2007 1996 9718 2405 3679 1998 3701 7316 2118 1998 2060 2739 1010 1998 21121 2011 2493 2013 2119 10289 8214 1998 3002 2984 1005 1055 2267 1012 4406 24105 1998 1996 8514 1010 1996 9718 2003 2019 2981 4772 1998 2515 2025 2031 1037 4513 8619 2030 2151 8368 15709 2013 1996 2118 1012 1999 3055 1010 2043 2070 2493 3373 2008 1996 9718 2211 2000 2265 1037 4603 13827 1010 1037 4314 3780 1010 2691 3168 2001 2405 1012 10655 1010 1999 2494 1010 2043 2060 2493 3373 2008 1996 3259 3662 1037 4314 13827 1010 1996 4603 3259 3493 13631 2253 2046 2537 1012 4445 3259 2003 2405 2004 2411 2004 1996 9718 1025 2174 1010 2035 2093 2024 5500 2000 2035 2493 1012 2633 1010 1999 3500 2263 2019 8324 3485 2005 2576 2671 2470 1010 3458 4331 1010 2081 2049 2834 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:start_position: 98\n",
-      "INFO:tensorflow:end_position: 98\n",
-      "INFO:tensorflow:answer: twice\n",
-      "INFO:tensorflow:*** Example ***\n",
-      "INFO:tensorflow:unique_id: 1000000007\n",
-      "INFO:tensorflow:example_index: 7\n",
-      "INFO:tensorflow:doc_span_index: 0\n",
-      "INFO:tensorflow:tokens: [CLS] what is the daily student paper at notre dame called ? [SEP] as at most other universities , notre dame ' s students run a number of news media outlets . the nine student - run outlets include three newspapers , both a radio and television station , and several magazines and journals . begun as a one - page journal in september 1876 , the scholastic magazine is issued twice monthly and claims to be the oldest continuous collegiate publication in the united states . the other magazine , the jug ##gler , is released twice a year and focuses on student literature and artwork . the dome yearbook is published annually . the newspapers have varying publication interests , with the observer published daily and mainly reporting university and other news , and staffed by students from both notre dame and saint mary ' s college . unlike scholastic and the dome , the observer is an independent publication and does not have a faculty advisor or any editorial oversight from the university . in 1987 , when some students believed that the observer began to show a conservative bias , a liberal newspaper , common sense was published . likewise , in 2003 , when other students believed that the paper showed a liberal bias , the conservative paper irish rover went into production . neither paper is published as often as the observer ; however , all three are distributed to all students . finally , in spring 2008 an undergraduate journal for political science research , beyond politics , made its debut . [SEP]\n",
-      "INFO:tensorflow:token_to_orig_map: 13:0 14:1 15:2 16:3 17:4 18:4 19:5 20:6 21:6 22:6 23:7 24:8 25:9 26:10 27:11 28:12 29:13 30:14 31:14 32:15 33:16 34:17 35:17 36:17 37:18 38:19 39:20 40:21 41:21 42:22 43:23 44:24 45:25 46:26 47:27 48:27 49:28 50:29 51:30 52:31 53:32 54:32 55:33 56:34 57:35 58:36 59:36 60:36 61:37 62:38 63:39 64:40 65:40 66:41 67:42 68:43 69:44 70:45 71:46 72:47 73:48 74:49 75:50 76:51 77:52 78:53 79:54 80:55 81:56 82:57 83:58 84:59 85:60 86:60 87:61 88:62 89:63 90:63 91:64 92:65 93:65 94:65 95:66 96:67 97:68 98:69 99:70 100:71 101:72 102:73 103:74 104:75 105:76 106:77 107:77 108:78 109:79 110:80 111:81 112:82 113:83 114:83 115:84 116:85 117:86 118:87 119:88 120:89 121:89 122:90 123:91 124:92 125:93 126:94 127:95 128:96 129:97 130:98 131:99 132:100 133:101 134:101 135:102 136:103 137:104 138:105 139:106 140:107 141:108 142:109 143:110 144:111 145:112 146:112 147:112 148:113 149:113 150:114 151:115 152:116 153:117 154:118 155:118 156:119 157:120 158:121 159:122 160:123 161:124 162:125 163:126 164:127 165:128 166:129 167:130 168:131 169:132 170:133 171:134 172:135 173:136 174:137 175:138 176:138 177:139 178:140 179:140 180:141 181:142 182:143 183:144 184:145 185:146 186:147 187:148 188:149 189:150 190:151 191:152 192:153 193:153 194:154 195:155 196:156 197:156 198:157 199:158 200:159 201:160 202:160 203:161 204:161 205:162 206:163 207:163 208:164 209:165 210:166 211:167 212:168 213:169 214:170 215:171 216:172 217:173 218:174 219:174 220:175 221:176 222:177 223:178 224:179 225:180 226:181 227:182 228:182 229:183 230:184 231:185 232:186 233:187 234:188 235:189 236:190 237:191 238:191 239:192 240:192 241:193 242:194 243:195 244:196 245:197 246:198 247:199 248:199 249:200 250:200 251:201 252:202 253:203 254:204 255:205 256:206 257:207 258:208 259:209 260:210 261:210 262:211 263:212 264:212 265:213 266:214 267:215 268:215\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:token_is_max_context: 13:True 14:True 15:True 16:True 17:True 18:True 19:True 20:True 21:True 22:True 23:True 24:True 25:True 26:True 27:True 28:True 29:True 30:True 31:True 32:True 33:True 34:True 35:True 36:True 37:True 38:True 39:True 40:True 41:True 42:True 43:True 44:True 45:True 46:True 47:True 48:True 49:True 50:True 51:True 52:True 53:True 54:True 55:True 56:True 57:True 58:True 59:True 60:True 61:True 62:True 63:True 64:True 65:True 66:True 67:True 68:True 69:True 70:True 71:True 72:True 73:True 74:True 75:True 76:True 77:True 78:True 79:True 80:True 81:True 82:True 83:True 84:True 85:True 86:True 87:True 88:True 89:True 90:True 91:True 92:True 93:True 94:True 95:True 96:True 97:True 98:True 99:True 100:True 101:True 102:True 103:True 104:True 105:True 106:True 107:True 108:True 109:True 110:True 111:True 112:True 113:True 114:True 115:True 116:True 117:True 118:True 119:True 120:True 121:True 122:True 123:True 124:True 125:True 126:True 127:True 128:True 129:True 130:True 131:True 132:True 133:True 134:True 135:True 136:True 137:True 138:True 139:True 140:True 141:True 142:True 143:True 144:True 145:True 146:True 147:True 148:True 149:True 150:True 151:True 152:True 153:True 154:True 155:True 156:True 157:True 158:True 159:True 160:True 161:True 162:True 163:True 164:True 165:True 166:True 167:True 168:True 169:True 170:True 171:True 172:True 173:True 174:True 175:True 176:True 177:True 178:True 179:True 180:True 181:True 182:True 183:True 184:True 185:True 186:True 187:True 188:True 189:True 190:True 191:True 192:True 193:True 194:True 195:True 196:True 197:True 198:True 199:True 200:True 201:True 202:True 203:True 204:True 205:True 206:True 207:True 208:True 209:True 210:True 211:True 212:True 213:True 214:True 215:True 216:True 217:True 218:True 219:True 220:True 221:True 222:True 223:True 224:True 225:True 226:True 227:True 228:True 229:True 230:True 231:True 232:True 233:True 234:True 235:True 236:True 237:True 238:True 
239:True 240:True 241:True 242:True 243:True 244:True 245:True 246:True 247:True 248:True 249:True 250:True 251:True 252:True 253:True 254:True 255:True 256:True 257:True 258:True 259:True 260:True 261:True 262:True 263:True 264:True 265:True 266:True 267:True 268:True\n",
-      "INFO:tensorflow:input_ids: 101 2054 2003 1996 3679 3076 3259 2012 10289 8214 2170 1029 102 2004 2012 2087 2060 5534 1010 10289 8214 1005 1055 2493 2448 1037 2193 1997 2739 2865 11730 1012 1996 3157 3076 1011 2448 11730 2421 2093 6399 1010 2119 1037 2557 1998 2547 2276 1010 1998 2195 7298 1998 9263 1012 5625 2004 1037 2028 1011 3931 3485 1999 2244 7326 1010 1996 24105 2932 2003 3843 3807 7058 1998 4447 2000 2022 1996 4587 7142 9234 4772 1999 1996 2142 2163 1012 1996 2060 2932 1010 1996 26536 17420 1010 2003 2207 3807 1037 2095 1998 7679 2006 3076 3906 1998 8266 1012 1996 8514 24803 2003 2405 6604 1012 1996 6399 2031 9671 4772 5426 1010 2007 1996 9718 2405 3679 1998 3701 7316 2118 1998 2060 2739 1010 1998 21121 2011 2493 2013 2119 10289 8214 1998 3002 2984 1005 1055 2267 1012 4406 24105 1998 1996 8514 1010 1996 9718 2003 2019 2981 4772 1998 2515 2025 2031 1037 4513 8619 2030 2151 8368 15709 2013 1996 2118 1012 1999 3055 1010 2043 2070 2493 3373 2008 1996 9718 2211 2000 2265 1037 4603 13827 1010 1037 4314 3780 1010 2691 3168 2001 2405 1012 10655 1010 1999 2494 1010 2043 2060 2493 3373 2008 1996 3259 3662 1037 4314 13827 1010 1996 4603 3259 3493 13631 2253 2046 2537 1012 4445 3259 2003 2405 2004 2411 2004 1996 9718 1025 2174 1010 2035 2093 2024 5500 2000 2035 2493 1012 2633 1010 1999 3500 2263 2019 8324 3485 2005 2576 2671 2470 1010 3458 4331 1010 2081 2049 2834 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:start_position: 123\n",
-      "INFO:tensorflow:end_position: 124\n",
-      "INFO:tensorflow:answer: the observer\n",
-      "INFO:tensorflow:*** Example ***\n",
-      "INFO:tensorflow:unique_id: 1000000008\n",
-      "INFO:tensorflow:example_index: 8\n",
-      "INFO:tensorflow:doc_span_index: 0\n",
-      "INFO:tensorflow:tokens: [CLS] how many student news papers are found at notre dame ? [SEP] as at most other universities , notre dame ' s students run a number of news media outlets . the nine student - run outlets include three newspapers , both a radio and television station , and several magazines and journals . begun as a one - page journal in september 1876 , the scholastic magazine is issued twice monthly and claims to be the oldest continuous collegiate publication in the united states . the other magazine , the jug ##gler , is released twice a year and focuses on student literature and artwork . the dome yearbook is published annually . the newspapers have varying publication interests , with the observer published daily and mainly reporting university and other news , and staffed by students from both notre dame and saint mary ' s college . unlike scholastic and the dome , the observer is an independent publication and does not have a faculty advisor or any editorial oversight from the university . in 1987 , when some students believed that the observer began to show a conservative bias , a liberal newspaper , common sense was published . likewise , in 2003 , when other students believed that the paper showed a liberal bias , the conservative paper irish rover went into production . neither paper is published as often as the observer ; however , all three are distributed to all students . finally , in spring 2008 an undergraduate journal for political science research , beyond politics , made its debut . [SEP]\n",
-      "INFO:tensorflow:token_to_orig_map: 13:0 14:1 15:2 16:3 17:4 18:4 19:5 20:6 21:6 22:6 23:7 24:8 25:9 26:10 27:11 28:12 29:13 30:14 31:14 32:15 33:16 34:17 35:17 36:17 37:18 38:19 39:20 40:21 41:21 42:22 43:23 44:24 45:25 46:26 47:27 48:27 49:28 50:29 51:30 52:31 53:32 54:32 55:33 56:34 57:35 58:36 59:36 60:36 61:37 62:38 63:39 64:40 65:40 66:41 67:42 68:43 69:44 70:45 71:46 72:47 73:48 74:49 75:50 76:51 77:52 78:53 79:54 80:55 81:56 82:57 83:58 84:59 85:60 86:60 87:61 88:62 89:63 90:63 91:64 92:65 93:65 94:65 95:66 96:67 97:68 98:69 99:70 100:71 101:72 102:73 103:74 104:75 105:76 106:77 107:77 108:78 109:79 110:80 111:81 112:82 113:83 114:83 115:84 116:85 117:86 118:87 119:88 120:89 121:89 122:90 123:91 124:92 125:93 126:94 127:95 128:96 129:97 130:98 131:99 132:100 133:101 134:101 135:102 136:103 137:104 138:105 139:106 140:107 141:108 142:109 143:110 144:111 145:112 146:112 147:112 148:113 149:113 150:114 151:115 152:116 153:117 154:118 155:118 156:119 157:120 158:121 159:122 160:123 161:124 162:125 163:126 164:127 165:128 166:129 167:130 168:131 169:132 170:133 171:134 172:135 173:136 174:137 175:138 176:138 177:139 178:140 179:140 180:141 181:142 182:143 183:144 184:145 185:146 186:147 187:148 188:149 189:150 190:151 191:152 192:153 193:153 194:154 195:155 196:156 197:156 198:157 199:158 200:159 201:160 202:160 203:161 204:161 205:162 206:163 207:163 208:164 209:165 210:166 211:167 212:168 213:169 214:170 215:171 216:172 217:173 218:174 219:174 220:175 221:176 222:177 223:178 224:179 225:180 226:181 227:182 228:182 229:183 230:184 231:185 232:186 233:187 234:188 235:189 236:190 237:191 238:191 239:192 240:192 241:193 242:194 243:195 244:196 245:197 246:198 247:199 248:199 249:200 250:200 251:201 252:202 253:203 254:204 255:205 256:206 257:207 258:208 259:209 260:210 261:210 262:211 263:212 264:212 265:213 266:214 267:215 268:215\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:token_is_max_context: 13:True 14:True 15:True 16:True 17:True 18:True 19:True 20:True 21:True 22:True 23:True 24:True 25:True 26:True 27:True 28:True 29:True 30:True 31:True 32:True 33:True 34:True 35:True 36:True 37:True 38:True 39:True 40:True 41:True 42:True 43:True 44:True 45:True 46:True 47:True 48:True 49:True 50:True 51:True 52:True 53:True 54:True 55:True 56:True 57:True 58:True 59:True 60:True 61:True 62:True 63:True 64:True 65:True 66:True 67:True 68:True 69:True 70:True 71:True 72:True 73:True 74:True 75:True 76:True 77:True 78:True 79:True 80:True 81:True 82:True 83:True 84:True 85:True 86:True 87:True 88:True 89:True 90:True 91:True 92:True 93:True 94:True 95:True 96:True 97:True 98:True 99:True 100:True 101:True 102:True 103:True 104:True 105:True 106:True 107:True 108:True 109:True 110:True 111:True 112:True 113:True 114:True 115:True 116:True 117:True 118:True 119:True 120:True 121:True 122:True 123:True 124:True 125:True 126:True 127:True 128:True 129:True 130:True 131:True 132:True 133:True 134:True 135:True 136:True 137:True 138:True 139:True 140:True 141:True 142:True 143:True 144:True 145:True 146:True 147:True 148:True 149:True 150:True 151:True 152:True 153:True 154:True 155:True 156:True 157:True 158:True 159:True 160:True 161:True 162:True 163:True 164:True 165:True 166:True 167:True 168:True 169:True 170:True 171:True 172:True 173:True 174:True 175:True 176:True 177:True 178:True 179:True 180:True 181:True 182:True 183:True 184:True 185:True 186:True 187:True 188:True 189:True 190:True 191:True 192:True 193:True 194:True 195:True 196:True 197:True 198:True 199:True 200:True 201:True 202:True 203:True 204:True 205:True 206:True 207:True 208:True 209:True 210:True 211:True 212:True 213:True 214:True 215:True 216:True 217:True 218:True 219:True 220:True 221:True 222:True 223:True 224:True 225:True 226:True 227:True 228:True 229:True 230:True 231:True 232:True 233:True 234:True 235:True 236:True 237:True 238:True 
239:True 240:True 241:True 242:True 243:True 244:True 245:True 246:True 247:True 248:True 249:True 250:True 251:True 252:True 253:True 254:True 255:True 256:True 257:True 258:True 259:True 260:True 261:True 262:True 263:True 264:True 265:True 266:True 267:True 268:True\n",
-      "INFO:tensorflow:input_ids: 101 2129 2116 3076 2739 4981 2024 2179 2012 10289 8214 1029 102 2004 2012 2087 2060 5534 1010 10289 8214 1005 1055 2493 2448 1037 2193 1997 2739 2865 11730 1012 1996 3157 3076 1011 2448 11730 2421 2093 6399 1010 2119 1037 2557 1998 2547 2276 1010 1998 2195 7298 1998 9263 1012 5625 2004 1037 2028 1011 3931 3485 1999 2244 7326 1010 1996 24105 2932 2003 3843 3807 7058 1998 4447 2000 2022 1996 4587 7142 9234 4772 1999 1996 2142 2163 1012 1996 2060 2932 1010 1996 26536 17420 1010 2003 2207 3807 1037 2095 1998 7679 2006 3076 3906 1998 8266 1012 1996 8514 24803 2003 2405 6604 1012 1996 6399 2031 9671 4772 5426 1010 2007 1996 9718 2405 3679 1998 3701 7316 2118 1998 2060 2739 1010 1998 21121 2011 2493 2013 2119 10289 8214 1998 3002 2984 1005 1055 2267 1012 4406 24105 1998 1996 8514 1010 1996 9718 2003 2019 2981 4772 1998 2515 2025 2031 1037 4513 8619 2030 2151 8368 15709 2013 1996 2118 1012 1999 3055 1010 2043 2070 2493 3373 2008 1996 9718 2211 2000 2265 1037 4603 13827 1010 1037 4314 3780 1010 2691 3168 2001 2405 1012 10655 1010 1999 2494 1010 2043 2060 2493 3373 2008 1996 3259 3662 1037 4314 13827 1010 1996 4603 3259 3493 13631 2253 2046 2537 1012 4445 3259 2003 2405 2004 2411 2004 1996 9718 1025 2174 1010 2035 2093 2024 5500 2000 2035 2493 1012 2633 1010 1999 3500 2263 2019 8324 3485 2005 2576 2671 2470 1010 3458 4331 1010 2081 2049 2834 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:start_position: 39\n",
-      "INFO:tensorflow:end_position: 39\n",
-      "INFO:tensorflow:answer: three\n",
-      "INFO:tensorflow:*** Example ***\n",
-      "INFO:tensorflow:unique_id: 1000000009\n",
-      "INFO:tensorflow:example_index: 9\n",
-      "INFO:tensorflow:doc_span_index: 0\n",
-      "INFO:tensorflow:tokens: [CLS] in what year did the student paper common sense begin publication at notre dame ? [SEP] as at most other universities , notre dame ' s students run a number of news media outlets . the nine student - run outlets include three newspapers , both a radio and television station , and several magazines and journals . begun as a one - page journal in september 1876 , the scholastic magazine is issued twice monthly and claims to be the oldest continuous collegiate publication in the united states . the other magazine , the jug ##gler , is released twice a year and focuses on student literature and artwork . the dome yearbook is published annually . the newspapers have varying publication interests , with the observer published daily and mainly reporting university and other news , and staffed by students from both notre dame and saint mary ' s college . unlike scholastic and the dome , the observer is an independent publication and does not have a faculty advisor or any editorial oversight from the university . in 1987 , when some students believed that the observer began to show a conservative bias , a liberal newspaper , common sense was published . likewise , in 2003 , when other students believed that the paper showed a liberal bias , the conservative paper irish rover went into production . neither paper is published as often as the observer ; however , all three are distributed to all students . finally , in spring 2008 an undergraduate journal for political science research , beyond politics , made its debut . [SEP]\n",
-      "INFO:tensorflow:token_to_orig_map: 17:0 18:1 19:2 20:3 21:4 22:4 23:5 24:6 25:6 26:6 27:7 28:8 29:9 30:10 31:11 32:12 33:13 34:14 35:14 36:15 37:16 38:17 39:17 40:17 41:18 42:19 43:20 44:21 45:21 46:22 47:23 48:24 49:25 50:26 51:27 52:27 53:28 54:29 55:30 56:31 57:32 58:32 59:33 60:34 61:35 62:36 63:36 64:36 65:37 66:38 67:39 68:40 69:40 70:41 71:42 72:43 73:44 74:45 75:46 76:47 77:48 78:49 79:50 80:51 81:52 82:53 83:54 84:55 85:56 86:57 87:58 88:59 89:60 90:60 91:61 92:62 93:63 94:63 95:64 96:65 97:65 98:65 99:66 100:67 101:68 102:69 103:70 104:71 105:72 106:73 107:74 108:75 109:76 110:77 111:77 112:78 113:79 114:80 115:81 116:82 117:83 118:83 119:84 120:85 121:86 122:87 123:88 124:89 125:89 126:90 127:91 128:92 129:93 130:94 131:95 132:96 133:97 134:98 135:99 136:100 137:101 138:101 139:102 140:103 141:104 142:105 143:106 144:107 145:108 146:109 147:110 148:111 149:112 150:112 151:112 152:113 153:113 154:114 155:115 156:116 157:117 158:118 159:118 160:119 161:120 162:121 163:122 164:123 165:124 166:125 167:126 168:127 169:128 170:129 171:130 172:131 173:132 174:133 175:134 176:135 177:136 178:137 179:138 180:138 181:139 182:140 183:140 184:141 185:142 186:143 187:144 188:145 189:146 190:147 191:148 192:149 193:150 194:151 195:152 196:153 197:153 198:154 199:155 200:156 201:156 202:157 203:158 204:159 205:160 206:160 207:161 208:161 209:162 210:163 211:163 212:164 213:165 214:166 215:167 216:168 217:169 218:170 219:171 220:172 221:173 222:174 223:174 224:175 225:176 226:177 227:178 228:179 229:180 230:181 231:182 232:182 233:183 234:184 235:185 236:186 237:187 238:188 239:189 240:190 241:191 242:191 243:192 244:192 245:193 246:194 247:195 248:196 249:197 250:198 251:199 252:199 253:200 254:200 255:201 256:202 257:203 258:204 259:205 260:206 261:207 262:208 263:209 264:210 265:210 266:211 267:212 268:212 269:213 270:214 271:215 272:215\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:token_is_max_context: 17:True 18:True 19:True 20:True 21:True 22:True 23:True 24:True 25:True 26:True 27:True 28:True 29:True 30:True 31:True 32:True 33:True 34:True 35:True 36:True 37:True 38:True 39:True 40:True 41:True 42:True 43:True 44:True 45:True 46:True 47:True 48:True 49:True 50:True 51:True 52:True 53:True 54:True 55:True 56:True 57:True 58:True 59:True 60:True 61:True 62:True 63:True 64:True 65:True 66:True 67:True 68:True 69:True 70:True 71:True 72:True 73:True 74:True 75:True 76:True 77:True 78:True 79:True 80:True 81:True 82:True 83:True 84:True 85:True 86:True 87:True 88:True 89:True 90:True 91:True 92:True 93:True 94:True 95:True 96:True 97:True 98:True 99:True 100:True 101:True 102:True 103:True 104:True 105:True 106:True 107:True 108:True 109:True 110:True 111:True 112:True 113:True 114:True 115:True 116:True 117:True 118:True 119:True 120:True 121:True 122:True 123:True 124:True 125:True 126:True 127:True 128:True 129:True 130:True 131:True 132:True 133:True 134:True 135:True 136:True 137:True 138:True 139:True 140:True 141:True 142:True 143:True 144:True 145:True 146:True 147:True 148:True 149:True 150:True 151:True 152:True 153:True 154:True 155:True 156:True 157:True 158:True 159:True 160:True 161:True 162:True 163:True 164:True 165:True 166:True 167:True 168:True 169:True 170:True 171:True 172:True 173:True 174:True 175:True 176:True 177:True 178:True 179:True 180:True 181:True 182:True 183:True 184:True 185:True 186:True 187:True 188:True 189:True 190:True 191:True 192:True 193:True 194:True 195:True 196:True 197:True 198:True 199:True 200:True 201:True 202:True 203:True 204:True 205:True 206:True 207:True 208:True 209:True 210:True 211:True 212:True 213:True 214:True 215:True 216:True 217:True 218:True 219:True 220:True 221:True 222:True 223:True 224:True 225:True 226:True 227:True 228:True 229:True 230:True 231:True 232:True 233:True 234:True 235:True 236:True 237:True 238:True 239:True 240:True 241:True 242:True 
243:True 244:True 245:True 246:True 247:True 248:True 249:True 250:True 251:True 252:True 253:True 254:True 255:True 256:True 257:True 258:True 259:True 260:True 261:True 262:True 263:True 264:True 265:True 266:True 267:True 268:True 269:True 270:True 271:True 272:True\n",
-      "INFO:tensorflow:input_ids: 101 1999 2054 2095 2106 1996 3076 3259 2691 3168 4088 4772 2012 10289 8214 1029 102 2004 2012 2087 2060 5534 1010 10289 8214 1005 1055 2493 2448 1037 2193 1997 2739 2865 11730 1012 1996 3157 3076 1011 2448 11730 2421 2093 6399 1010 2119 1037 2557 1998 2547 2276 1010 1998 2195 7298 1998 9263 1012 5625 2004 1037 2028 1011 3931 3485 1999 2244 7326 1010 1996 24105 2932 2003 3843 3807 7058 1998 4447 2000 2022 1996 4587 7142 9234 4772 1999 1996 2142 2163 1012 1996 2060 2932 1010 1996 26536 17420 1010 2003 2207 3807 1037 2095 1998 7679 2006 3076 3906 1998 8266 1012 1996 8514 24803 2003 2405 6604 1012 1996 6399 2031 9671 4772 5426 1010 2007 1996 9718 2405 3679 1998 3701 7316 2118 1998 2060 2739 1010 1998 21121 2011 2493 2013 2119 10289 8214 1998 3002 2984 1005 1055 2267 1012 4406 24105 1998 1996 8514 1010 1996 9718 2003 2019 2981 4772 1998 2515 2025 2031 1037 4513 8619 2030 2151 8368 15709 2013 1996 2118 1012 1999 3055 1010 2043 2070 2493 3373 2008 1996 9718 2211 2000 2265 1037 4603 13827 1010 1037 4314 3780 1010 2691 3168 2001 2405 1012 10655 1010 1999 2494 1010 2043 2060 2493 3373 2008 1996 3259 3662 1037 4314 13827 1010 1996 4603 3259 3493 13631 2253 2046 2537 1012 4445 3259 2003 2405 2004 2411 2004 1996 9718 1025 2174 1010 2035 2093 2024 5500 2000 2035 2493 1012 2633 1010 1999 3500 2263 2019 8324 3485 2005 2576 2671 2470 1010 3458 4331 1010 2081 2049 2834 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:start_position: 182\n",
-      "INFO:tensorflow:end_position: 182\n",
-      "INFO:tensorflow:answer: 1987\n",
-      "INFO:tensorflow:*** Example ***\n",
-      "INFO:tensorflow:unique_id: 1000000010\n",
-      "INFO:tensorflow:example_index: 10\n",
-      "INFO:tensorflow:doc_span_index: 0\n",
-      "INFO:tensorflow:tokens: [CLS] where is the headquarters of the congregation of the holy cross ? [SEP] the university is the major seat of the congregation of holy cross ( albeit not its official headquarters , which are in rome ) . its main seminary , more ##au seminary , is located on the campus across st . joseph lake from the main building . old college , the oldest building on campus and located near the shore of st . mary lake , houses undergraduate seminar ##ians . retired priests and brothers reside in fatima house ( a former retreat center ) , holy cross house , as well as col ##umb ##a hall near the gr ##otto . the university through the more ##au seminary has ties to theologian frederick bu ##ech ##ner . while not catholic , bu ##ech ##ner has praised writers from notre dame and more ##au seminary created a bu ##ech ##ner prize for preaching . [SEP]\n",
-      "INFO:tensorflow:token_to_orig_map: 14:0 15:1 16:2 17:3 18:4 19:5 20:6 21:7 22:8 23:9 24:10 25:11 26:12 27:12 28:13 29:14 30:15 31:16 32:16 33:17 34:18 35:19 36:20 37:20 38:20 39:21 40:22 41:23 42:23 43:24 44:24 45:25 46:25 47:26 48:27 49:28 50:29 51:30 52:31 53:32 54:32 55:33 56:34 57:35 58:36 59:37 60:38 61:38 62:39 63:40 64:40 65:41 66:42 67:43 68:44 69:45 70:46 71:47 72:48 73:49 74:50 75:51 76:52 77:52 78:53 79:54 80:54 81:55 82:56 83:57 84:57 85:57 86:58 87:59 88:60 89:61 90:62 91:63 92:64 93:65 94:66 95:66 96:67 97:68 98:69 99:69 100:69 101:70 102:71 103:72 104:72 105:73 106:74 107:75 108:76 109:76 110:76 111:77 112:78 113:79 114:80 115:80 116:80 117:81 118:82 119:83 120:84 121:85 122:85 123:86 124:87 125:88 126:89 127:90 128:91 129:92 130:92 131:92 132:92 133:93 134:94 135:95 136:95 137:96 138:96 139:96 140:97 141:98 142:99 143:100 144:101 145:102 146:103 147:104 148:104 149:105 150:106 151:107 152:108 153:108 154:108 155:109 156:110 157:111 158:111\n",
-      "INFO:tensorflow:token_is_max_context: 14:True 15:True 16:True 17:True 18:True 19:True 20:True 21:True 22:True 23:True 24:True 25:True 26:True 27:True 28:True 29:True 30:True 31:True 32:True 33:True 34:True 35:True 36:True 37:True 38:True 39:True 40:True 41:True 42:True 43:True 44:True 45:True 46:True 47:True 48:True 49:True 50:True 51:True 52:True 53:True 54:True 55:True 56:True 57:True 58:True 59:True 60:True 61:True 62:True 63:True 64:True 65:True 66:True 67:True 68:True 69:True 70:True 71:True 72:True 73:True 74:True 75:True 76:True 77:True 78:True 79:True 80:True 81:True 82:True 83:True 84:True 85:True 86:True 87:True 88:True 89:True 90:True 91:True 92:True 93:True 94:True 95:True 96:True 97:True 98:True 99:True 100:True 101:True 102:True 103:True 104:True 105:True 106:True 107:True 108:True 109:True 110:True 111:True 112:True 113:True 114:True 115:True 116:True 117:True 118:True 119:True 120:True 121:True 122:True 123:True 124:True 125:True 126:True 127:True 128:True 129:True 130:True 131:True 132:True 133:True 134:True 135:True 136:True 137:True 138:True 139:True 140:True 141:True 142:True 143:True 144:True 145:True 146:True 147:True 148:True 149:True 150:True 151:True 152:True 153:True 154:True 155:True 156:True 157:True 158:True\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:input_ids: 101 2073 2003 1996 4075 1997 1996 7769 1997 1996 4151 2892 1029 102 1996 2118 2003 1996 2350 2835 1997 1996 7769 1997 4151 2892 1006 12167 2025 2049 2880 4075 1010 2029 2024 1999 4199 1007 1012 2049 2364 8705 1010 2062 4887 8705 1010 2003 2284 2006 1996 3721 2408 2358 1012 3312 2697 2013 1996 2364 2311 1012 2214 2267 1010 1996 4587 2311 2006 3721 1998 2284 2379 1996 5370 1997 2358 1012 2984 2697 1010 3506 8324 18014 7066 1012 3394 8656 1998 3428 13960 1999 27596 2160 1006 1037 2280 7822 2415 1007 1010 4151 2892 2160 1010 2004 2092 2004 8902 25438 2050 2534 2379 1996 24665 23052 1012 1996 2118 2083 1996 2062 4887 8705 2038 7208 2000 17200 5406 20934 15937 3678 1012 2096 2025 3234 1010 20934 15937 3678 2038 5868 4898 2013 10289 8214 1998 2062 4887 8705 2580 1037 20934 15937 3678 3396 2005 17979 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:start_position: 36\n",
-      "INFO:tensorflow:end_position: 36\n",
-      "INFO:tensorflow:answer: rome\n",
-      "INFO:tensorflow:*** Example ***\n",
-      "INFO:tensorflow:unique_id: 1000000011\n",
-      "INFO:tensorflow:example_index: 11\n",
-      "INFO:tensorflow:doc_span_index: 0\n",
-      "INFO:tensorflow:tokens: [CLS] what is the primary seminary of the congregation of the holy cross ? [SEP] the university is the major seat of the congregation of holy cross ( albeit not its official headquarters , which are in rome ) . its main seminary , more ##au seminary , is located on the campus across st . joseph lake from the main building . old college , the oldest building on campus and located near the shore of st . mary lake , houses undergraduate seminar ##ians . retired priests and brothers reside in fatima house ( a former retreat center ) , holy cross house , as well as col ##umb ##a hall near the gr ##otto . the university through the more ##au seminary has ties to theologian frederick bu ##ech ##ner . while not catholic , bu ##ech ##ner has praised writers from notre dame and more ##au seminary created a bu ##ech ##ner prize for preaching . [SEP]\n",
-      "INFO:tensorflow:token_to_orig_map: 15:0 16:1 17:2 18:3 19:4 20:5 21:6 22:7 23:8 24:9 25:10 26:11 27:12 28:12 29:13 30:14 31:15 32:16 33:16 34:17 35:18 36:19 37:20 38:20 39:20 40:21 41:22 42:23 43:23 44:24 45:24 46:25 47:25 48:26 49:27 50:28 51:29 52:30 53:31 54:32 55:32 56:33 57:34 58:35 59:36 60:37 61:38 62:38 63:39 64:40 65:40 66:41 67:42 68:43 69:44 70:45 71:46 72:47 73:48 74:49 75:50 76:51 77:52 78:52 79:53 80:54 81:54 82:55 83:56 84:57 85:57 86:57 87:58 88:59 89:60 90:61 91:62 92:63 93:64 94:65 95:66 96:66 97:67 98:68 99:69 100:69 101:69 102:70 103:71 104:72 105:72 106:73 107:74 108:75 109:76 110:76 111:76 112:77 113:78 114:79 115:80 116:80 117:80 118:81 119:82 120:83 121:84 122:85 123:85 124:86 125:87 126:88 127:89 128:90 129:91 130:92 131:92 132:92 133:92 134:93 135:94 136:95 137:95 138:96 139:96 140:96 141:97 142:98 143:99 144:100 145:101 146:102 147:103 148:104 149:104 150:105 151:106 152:107 153:108 154:108 155:108 156:109 157:110 158:111 159:111\n",
-      "INFO:tensorflow:token_is_max_context: 15:True 16:True 17:True 18:True 19:True 20:True 21:True 22:True 23:True 24:True 25:True 26:True 27:True 28:True 29:True 30:True 31:True 32:True 33:True 34:True 35:True 36:True 37:True 38:True 39:True 40:True 41:True 42:True 43:True 44:True 45:True 46:True 47:True 48:True 49:True 50:True 51:True 52:True 53:True 54:True 55:True 56:True 57:True 58:True 59:True 60:True 61:True 62:True 63:True 64:True 65:True 66:True 67:True 68:True 69:True 70:True 71:True 72:True 73:True 74:True 75:True 76:True 77:True 78:True 79:True 80:True 81:True 82:True 83:True 84:True 85:True 86:True 87:True 88:True 89:True 90:True 91:True 92:True 93:True 94:True 95:True 96:True 97:True 98:True 99:True 100:True 101:True 102:True 103:True 104:True 105:True 106:True 107:True 108:True 109:True 110:True 111:True 112:True 113:True 114:True 115:True 116:True 117:True 118:True 119:True 120:True 121:True 122:True 123:True 124:True 125:True 126:True 127:True 128:True 129:True 130:True 131:True 132:True 133:True 134:True 135:True 136:True 137:True 138:True 139:True 140:True 141:True 142:True 143:True 144:True 145:True 146:True 147:True 148:True 149:True 150:True 151:True 152:True 153:True 154:True 155:True 156:True 157:True 158:True 159:True\n",
-      "INFO:tensorflow:input_ids: 101 2054 2003 1996 3078 8705 1997 1996 7769 1997 1996 4151 2892 1029 102 1996 2118 2003 1996 2350 2835 1997 1996 7769 1997 4151 2892 1006 12167 2025 2049 2880 4075 1010 2029 2024 1999 4199 1007 1012 2049 2364 8705 1010 2062 4887 8705 1010 2003 2284 2006 1996 3721 2408 2358 1012 3312 2697 2013 1996 2364 2311 1012 2214 2267 1010 1996 4587 2311 2006 3721 1998 2284 2379 1996 5370 1997 2358 1012 2984 2697 1010 3506 8324 18014 7066 1012 3394 8656 1998 3428 13960 1999 27596 2160 1006 1037 2280 7822 2415 1007 1010 4151 2892 2160 1010 2004 2092 2004 8902 25438 2050 2534 2379 1996 24665 23052 1012 1996 2118 2083 1996 2062 4887 8705 2038 7208 2000 17200 5406 20934 15937 3678 1012 2096 2025 3234 1010 20934 15937 3678 2038 5868 4898 2013 10289 8214 1998 2062 4887 8705 2580 1037 20934 15937 3678 3396 2005 17979 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:start_position: 44\n",
-      "INFO:tensorflow:end_position: 46\n",
-      "INFO:tensorflow:answer: more ##au seminary\n",
-      "INFO:tensorflow:*** Example ***\n",
-      "INFO:tensorflow:unique_id: 1000000012\n",
-      "INFO:tensorflow:example_index: 12\n",
-      "INFO:tensorflow:doc_span_index: 0\n",
-      "INFO:tensorflow:tokens: [CLS] what is the oldest structure at notre dame ? [SEP] the university is the major seat of the congregation of holy cross ( albeit not its official headquarters , which are in rome ) . its main seminary , more ##au seminary , is located on the campus across st . joseph lake from the main building . old college , the oldest building on campus and located near the shore of st . mary lake , houses undergraduate seminar ##ians . retired priests and brothers reside in fatima house ( a former retreat center ) , holy cross house , as well as col ##umb ##a hall near the gr ##otto . the university through the more ##au seminary has ties to theologian frederick bu ##ech ##ner . while not catholic , bu ##ech ##ner has praised writers from notre dame and more ##au seminary created a bu ##ech ##ner prize for preaching . [SEP]\n",
-      "INFO:tensorflow:token_to_orig_map: 11:0 12:1 13:2 14:3 15:4 16:5 17:6 18:7 19:8 20:9 21:10 22:11 23:12 24:12 25:13 26:14 27:15 28:16 29:16 30:17 31:18 32:19 33:20 34:20 35:20 36:21 37:22 38:23 39:23 40:24 41:24 42:25 43:25 44:26 45:27 46:28 47:29 48:30 49:31 50:32 51:32 52:33 53:34 54:35 55:36 56:37 57:38 58:38 59:39 60:40 61:40 62:41 63:42 64:43 65:44 66:45 67:46 68:47 69:48 70:49 71:50 72:51 73:52 74:52 75:53 76:54 77:54 78:55 79:56 80:57 81:57 82:57 83:58 84:59 85:60 86:61 87:62 88:63 89:64 90:65 91:66 92:66 93:67 94:68 95:69 96:69 97:69 98:70 99:71 100:72 101:72 102:73 103:74 104:75 105:76 106:76 107:76 108:77 109:78 110:79 111:80 112:80 113:80 114:81 115:82 116:83 117:84 118:85 119:85 120:86 121:87 122:88 123:89 124:90 125:91 126:92 127:92 128:92 129:92 130:93 131:94 132:95 133:95 134:96 135:96 136:96 137:97 138:98 139:99 140:100 141:101 142:102 143:103 144:104 145:104 146:105 147:106 148:107 149:108 150:108 151:108 152:109 153:110 154:111 155:111\n",
-      "INFO:tensorflow:token_is_max_context: 11:True 12:True 13:True 14:True 15:True 16:True 17:True 18:True 19:True 20:True 21:True 22:True 23:True 24:True 25:True 26:True 27:True 28:True 29:True 30:True 31:True 32:True 33:True 34:True 35:True 36:True 37:True 38:True 39:True 40:True 41:True 42:True 43:True 44:True 45:True 46:True 47:True 48:True 49:True 50:True 51:True 52:True 53:True 54:True 55:True 56:True 57:True 58:True 59:True 60:True 61:True 62:True 63:True 64:True 65:True 66:True 67:True 68:True 69:True 70:True 71:True 72:True 73:True 74:True 75:True 76:True 77:True 78:True 79:True 80:True 81:True 82:True 83:True 84:True 85:True 86:True 87:True 88:True 89:True 90:True 91:True 92:True 93:True 94:True 95:True 96:True 97:True 98:True 99:True 100:True 101:True 102:True 103:True 104:True 105:True 106:True 107:True 108:True 109:True 110:True 111:True 112:True 113:True 114:True 115:True 116:True 117:True 118:True 119:True 120:True 121:True 122:True 123:True 124:True 125:True 126:True 127:True 128:True 129:True 130:True 131:True 132:True 133:True 134:True 135:True 136:True 137:True 138:True 139:True 140:True 141:True 142:True 143:True 144:True 145:True 146:True 147:True 148:True 149:True 150:True 151:True 152:True 153:True 154:True 155:True\n",
-      "INFO:tensorflow:input_ids: 101 2054 2003 1996 4587 3252 2012 10289 8214 1029 102 1996 2118 2003 1996 2350 2835 1997 1996 7769 1997 4151 2892 1006 12167 2025 2049 2880 4075 1010 2029 2024 1999 4199 1007 1012 2049 2364 8705 1010 2062 4887 8705 1010 2003 2284 2006 1996 3721 2408 2358 1012 3312 2697 2013 1996 2364 2311 1012 2214 2267 1010 1996 4587 2311 2006 3721 1998 2284 2379 1996 5370 1997 2358 1012 2984 2697 1010 3506 8324 18014 7066 1012 3394 8656 1998 3428 13960 1999 27596 2160 1006 1037 2280 7822 2415 1007 1010 4151 2892 2160 1010 2004 2092 2004 8902 25438 2050 2534 2379 1996 24665 23052 1012 1996 2118 2083 1996 2062 4887 8705 2038 7208 2000 17200 5406 20934 15937 3678 1012 2096 2025 3234 1010 20934 15937 3678 2038 5868 4898 2013 10289 8214 1998 2062 4887 8705 2580 1037 20934 15937 3678 3396 2005 17979 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:start_position: 59\n",
-      "INFO:tensorflow:end_position: 60\n",
-      "INFO:tensorflow:answer: old college\n",
-      "INFO:tensorflow:*** Example ***\n",
-      "INFO:tensorflow:unique_id: 1000000013\n",
-      "INFO:tensorflow:example_index: 13\n",
-      "INFO:tensorflow:doc_span_index: 0\n",
-      "INFO:tensorflow:tokens: [CLS] what individuals live at fatima house at notre dame ? [SEP] the university is the major seat of the congregation of holy cross ( albeit not its official headquarters , which are in rome ) . its main seminary , more ##au seminary , is located on the campus across st . joseph lake from the main building . old college , the oldest building on campus and located near the shore of st . mary lake , houses undergraduate seminar ##ians . retired priests and brothers reside in fatima house ( a former retreat center ) , holy cross house , as well as col ##umb ##a hall near the gr ##otto . the university through the more ##au seminary has ties to theologian frederick bu ##ech ##ner . while not catholic , bu ##ech ##ner has praised writers from notre dame and more ##au seminary created a bu ##ech ##ner prize for preaching . [SEP]\n",
-      "INFO:tensorflow:token_to_orig_map: 12:0 13:1 14:2 15:3 16:4 17:5 18:6 19:7 20:8 21:9 22:10 23:11 24:12 25:12 26:13 27:14 28:15 29:16 30:16 31:17 32:18 33:19 34:20 35:20 36:20 37:21 38:22 39:23 40:23 41:24 42:24 43:25 44:25 45:26 46:27 47:28 48:29 49:30 50:31 51:32 52:32 53:33 54:34 55:35 56:36 57:37 58:38 59:38 60:39 61:40 62:40 63:41 64:42 65:43 66:44 67:45 68:46 69:47 70:48 71:49 72:50 73:51 74:52 75:52 76:53 77:54 78:54 79:55 80:56 81:57 82:57 83:57 84:58 85:59 86:60 87:61 88:62 89:63 90:64 91:65 92:66 93:66 94:67 95:68 96:69 97:69 98:69 99:70 100:71 101:72 102:72 103:73 104:74 105:75 106:76 107:76 108:76 109:77 110:78 111:79 112:80 113:80 114:80 115:81 116:82 117:83 118:84 119:85 120:85 121:86 122:87 123:88 124:89 125:90 126:91 127:92 128:92 129:92 130:92 131:93 132:94 133:95 134:95 135:96 136:96 137:96 138:97 139:98 140:99 141:100 142:101 143:102 144:103 145:104 146:104 147:105 148:106 149:107 150:108 151:108 152:108 153:109 154:110 155:111 156:111\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:token_is_max_context: 12:True 13:True 14:True 15:True 16:True 17:True 18:True 19:True 20:True 21:True 22:True 23:True 24:True 25:True 26:True 27:True 28:True 29:True 30:True 31:True 32:True 33:True 34:True 35:True 36:True 37:True 38:True 39:True 40:True 41:True 42:True 43:True 44:True 45:True 46:True 47:True 48:True 49:True 50:True 51:True 52:True 53:True 54:True 55:True 56:True 57:True 58:True 59:True 60:True 61:True 62:True 63:True 64:True 65:True 66:True 67:True 68:True 69:True 70:True 71:True 72:True 73:True 74:True 75:True 76:True 77:True 78:True 79:True 80:True 81:True 82:True 83:True 84:True 85:True 86:True 87:True 88:True 89:True 90:True 91:True 92:True 93:True 94:True 95:True 96:True 97:True 98:True 99:True 100:True 101:True 102:True 103:True 104:True 105:True 106:True 107:True 108:True 109:True 110:True 111:True 112:True 113:True 114:True 115:True 116:True 117:True 118:True 119:True 120:True 121:True 122:True 123:True 124:True 125:True 126:True 127:True 128:True 129:True 130:True 131:True 132:True 133:True 134:True 135:True 136:True 137:True 138:True 139:True 140:True 141:True 142:True 143:True 144:True 145:True 146:True 147:True 148:True 149:True 150:True 151:True 152:True 153:True 154:True 155:True 156:True\n",
-      "INFO:tensorflow:input_ids: 101 2054 3633 2444 2012 27596 2160 2012 10289 8214 1029 102 1996 2118 2003 1996 2350 2835 1997 1996 7769 1997 4151 2892 1006 12167 2025 2049 2880 4075 1010 2029 2024 1999 4199 1007 1012 2049 2364 8705 1010 2062 4887 8705 1010 2003 2284 2006 1996 3721 2408 2358 1012 3312 2697 2013 1996 2364 2311 1012 2214 2267 1010 1996 4587 2311 2006 3721 1998 2284 2379 1996 5370 1997 2358 1012 2984 2697 1010 3506 8324 18014 7066 1012 3394 8656 1998 3428 13960 1999 27596 2160 1006 1037 2280 7822 2415 1007 1010 4151 2892 2160 1010 2004 2092 2004 8902 25438 2050 2534 2379 1996 24665 23052 1012 1996 2118 2083 1996 2062 4887 8705 2038 7208 2000 17200 5406 20934 15937 3678 1012 2096 2025 3234 1010 20934 15937 3678 2038 5868 4898 2013 10289 8214 1998 2062 4887 8705 2580 1037 20934 15937 3678 3396 2005 17979 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:start_position: 84\n",
-      "INFO:tensorflow:end_position: 87\n",
-      "INFO:tensorflow:answer: retired priests and brothers\n",
-      "INFO:tensorflow:*** Example ***\n",
-      "INFO:tensorflow:unique_id: 1000000014\n",
-      "INFO:tensorflow:example_index: 14\n",
-      "INFO:tensorflow:doc_span_index: 0\n",
-      "INFO:tensorflow:tokens: [CLS] which prize did frederick bu ##ech ##ner create ? [SEP] the university is the major seat of the congregation of holy cross ( albeit not its official headquarters , which are in rome ) . its main seminary , more ##au seminary , is located on the campus across st . joseph lake from the main building . old college , the oldest building on campus and located near the shore of st . mary lake , houses undergraduate seminar ##ians . retired priests and brothers reside in fatima house ( a former retreat center ) , holy cross house , as well as col ##umb ##a hall near the gr ##otto . the university through the more ##au seminary has ties to theologian frederick bu ##ech ##ner . while not catholic , bu ##ech ##ner has praised writers from notre dame and more ##au seminary created a bu ##ech ##ner prize for preaching . [SEP]\n",
-      "INFO:tensorflow:token_to_orig_map: 11:0 12:1 13:2 14:3 15:4 16:5 17:6 18:7 19:8 20:9 21:10 22:11 23:12 24:12 25:13 26:14 27:15 28:16 29:16 30:17 31:18 32:19 33:20 34:20 35:20 36:21 37:22 38:23 39:23 40:24 41:24 42:25 43:25 44:26 45:27 46:28 47:29 48:30 49:31 50:32 51:32 52:33 53:34 54:35 55:36 56:37 57:38 58:38 59:39 60:40 61:40 62:41 63:42 64:43 65:44 66:45 67:46 68:47 69:48 70:49 71:50 72:51 73:52 74:52 75:53 76:54 77:54 78:55 79:56 80:57 81:57 82:57 83:58 84:59 85:60 86:61 87:62 88:63 89:64 90:65 91:66 92:66 93:67 94:68 95:69 96:69 97:69 98:70 99:71 100:72 101:72 102:73 103:74 104:75 105:76 106:76 107:76 108:77 109:78 110:79 111:80 112:80 113:80 114:81 115:82 116:83 117:84 118:85 119:85 120:86 121:87 122:88 123:89 124:90 125:91 126:92 127:92 128:92 129:92 130:93 131:94 132:95 133:95 134:96 135:96 136:96 137:97 138:98 139:99 140:100 141:101 142:102 143:103 144:104 145:104 146:105 147:106 148:107 149:108 150:108 151:108 152:109 153:110 154:111 155:111\n",
-      "INFO:tensorflow:token_is_max_context: 11:True 12:True 13:True 14:True 15:True 16:True 17:True 18:True 19:True 20:True 21:True 22:True 23:True 24:True 25:True 26:True 27:True 28:True 29:True 30:True 31:True 32:True 33:True 34:True 35:True 36:True 37:True 38:True 39:True 40:True 41:True 42:True 43:True 44:True 45:True 46:True 47:True 48:True 49:True 50:True 51:True 52:True 53:True 54:True 55:True 56:True 57:True 58:True 59:True 60:True 61:True 62:True 63:True 64:True 65:True 66:True 67:True 68:True 69:True 70:True 71:True 72:True 73:True 74:True 75:True 76:True 77:True 78:True 79:True 80:True 81:True 82:True 83:True 84:True 85:True 86:True 87:True 88:True 89:True 90:True 91:True 92:True 93:True 94:True 95:True 96:True 97:True 98:True 99:True 100:True 101:True 102:True 103:True 104:True 105:True 106:True 107:True 108:True 109:True 110:True 111:True 112:True 113:True 114:True 115:True 116:True 117:True 118:True 119:True 120:True 121:True 122:True 123:True 124:True 125:True 126:True 127:True 128:True 129:True 130:True 131:True 132:True 133:True 134:True 135:True 136:True 137:True 138:True 139:True 140:True 141:True 142:True 143:True 144:True 145:True 146:True 147:True 148:True 149:True 150:True 151:True 152:True 153:True 154:True 155:True\n",
-      "INFO:tensorflow:input_ids: 101 2029 3396 2106 5406 20934 15937 3678 3443 1029 102 1996 2118 2003 1996 2350 2835 1997 1996 7769 1997 4151 2892 1006 12167 2025 2049 2880 4075 1010 2029 2024 1999 4199 1007 1012 2049 2364 8705 1010 2062 4887 8705 1010 2003 2284 2006 1996 3721 2408 2358 1012 3312 2697 2013 1996 2364 2311 1012 2214 2267 1010 1996 4587 2311 2006 3721 1998 2284 2379 1996 5370 1997 2358 1012 2984 2697 1010 3506 8324 18014 7066 1012 3394 8656 1998 3428 13960 1999 27596 2160 1006 1037 2280 7822 2415 1007 1010 4151 2892 2160 1010 2004 2092 2004 8902 25438 2050 2534 2379 1996 24665 23052 1012 1996 2118 2083 1996 2062 4887 8705 2038 7208 2000 17200 5406 20934 15937 3678 1012 2096 2025 3234 1010 20934 15937 3678 2038 5868 4898 2013 10289 8214 1998 2062 4887 8705 2580 1037 20934 15937 3678 3396 2005 17979 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:start_position: 149\n",
-      "INFO:tensorflow:end_position: 154\n",
-      "INFO:tensorflow:answer: bu ##ech ##ner prize for preaching\n",
-      "INFO:tensorflow:*** Example ***\n",
-      "INFO:tensorflow:unique_id: 1000000015\n",
-      "INFO:tensorflow:example_index: 15\n",
-      "INFO:tensorflow:doc_span_index: 0\n",
-      "INFO:tensorflow:tokens: [CLS] how many bs level degrees are offered in the college of engineering at notre dame ? [SEP] the college of engineering was established in 1920 , however , early courses in civil and mechanical engineering were a part of the college of science since the 1870s . today the college , housed in the fitzpatrick , cu ##shing , and st ##ins ##on - re ##mic ##k halls of engineering , includes five departments of study – aerospace and mechanical engineering , chemical and bio ##mo ##le ##cular engineering , civil engineering and geological sciences , computer science and engineering , and electrical engineering – with eight b . s . degrees offered . additionally , the college offers five - year dual degree programs with the colleges of arts and letters and of business awarding additional b . a . and master of business administration ( mba ) degrees , respectively . [SEP]\n",
-      "INFO:tensorflow:token_to_orig_map: 18:0 19:1 20:2 21:3 22:4 23:5 24:6 25:7 26:7 27:8 28:8 29:9 30:10 31:11 32:12 33:13 34:14 35:15 36:16 37:17 38:18 39:19 40:20 41:21 42:22 43:23 44:24 45:25 46:26 47:26 48:27 49:28 50:29 51:29 52:30 53:31 54:32 55:33 56:33 57:34 58:34 59:34 60:35 61:36 62:36 63:36 64:36 65:36 66:36 67:36 68:37 69:38 70:39 71:39 72:40 73:41 74:42 75:43 76:44 77:45 78:46 79:47 80:48 81:49 82:49 83:50 84:51 85:52 86:52 87:52 88:52 89:53 90:53 91:54 92:55 93:56 94:57 95:58 96:58 97:59 98:60 99:61 100:62 101:62 102:63 103:64 104:65 105:66 106:67 107:68 108:69 109:69 110:69 111:69 112:70 113:71 114:71 115:72 116:72 117:73 118:74 119:75 120:76 121:76 122:76 123:77 124:78 125:79 126:80 127:81 128:82 129:83 130:84 131:85 132:86 133:87 134:88 135:89 136:90 137:91 138:92 139:92 140:92 141:92 142:93 143:94 144:95 145:96 146:97 147:98 148:98 149:98 150:99 151:99 152:100 153:100\n",
-      "INFO:tensorflow:token_is_max_context: 18:True 19:True 20:True 21:True 22:True 23:True 24:True 25:True 26:True 27:True 28:True 29:True 30:True 31:True 32:True 33:True 34:True 35:True 36:True 37:True 38:True 39:True 40:True 41:True 42:True 43:True 44:True 45:True 46:True 47:True 48:True 49:True 50:True 51:True 52:True 53:True 54:True 55:True 56:True 57:True 58:True 59:True 60:True 61:True 62:True 63:True 64:True 65:True 66:True 67:True 68:True 69:True 70:True 71:True 72:True 73:True 74:True 75:True 76:True 77:True 78:True 79:True 80:True 81:True 82:True 83:True 84:True 85:True 86:True 87:True 88:True 89:True 90:True 91:True 92:True 93:True 94:True 95:True 96:True 97:True 98:True 99:True 100:True 101:True 102:True 103:True 104:True 105:True 106:True 107:True 108:True 109:True 110:True 111:True 112:True 113:True 114:True 115:True 116:True 117:True 118:True 119:True 120:True 121:True 122:True 123:True 124:True 125:True 126:True 127:True 128:True 129:True 130:True 131:True 132:True 133:True 134:True 135:True 136:True 137:True 138:True 139:True 140:True 141:True 142:True 143:True 144:True 145:True 146:True 147:True 148:True 149:True 150:True 151:True 152:True 153:True\n",
-      "INFO:tensorflow:input_ids: 101 2129 2116 18667 2504 5445 2024 3253 1999 1996 2267 1997 3330 2012 10289 8214 1029 102 1996 2267 1997 3330 2001 2511 1999 4444 1010 2174 1010 2220 5352 1999 2942 1998 6228 3330 2020 1037 2112 1997 1996 2267 1997 2671 2144 1996 14896 1012 2651 1996 2267 1010 7431 1999 1996 26249 1010 12731 12227 1010 1998 2358 7076 2239 1011 2128 7712 2243 9873 1997 3330 1010 2950 2274 7640 1997 2817 1516 13395 1998 6228 3330 1010 5072 1998 16012 5302 2571 15431 3330 1010 2942 3330 1998 9843 4163 1010 3274 2671 1998 3330 1010 1998 5992 3330 1516 2007 2809 1038 1012 1055 1012 5445 3253 1012 5678 1010 1996 2267 4107 2274 1011 2095 7037 3014 3454 2007 1996 6667 1997 2840 1998 4144 1998 1997 2449 21467 3176 1038 1012 1037 1012 1998 3040 1997 2449 3447 1006 15038 1007 5445 1010 4414 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:start_position: 107\n",
-      "INFO:tensorflow:end_position: 107\n",
-      "INFO:tensorflow:answer: eight\n",
-      "INFO:tensorflow:*** Example ***\n",
-      "INFO:tensorflow:unique_id: 1000000016\n",
-      "INFO:tensorflow:example_index: 16\n",
-      "INFO:tensorflow:doc_span_index: 0\n",
-      "INFO:tensorflow:tokens: [CLS] in what year was the college of engineering at notre dame formed ? [SEP] the college of engineering was established in 1920 , however , early courses in civil and mechanical engineering were a part of the college of science since the 1870s . today the college , housed in the fitzpatrick , cu ##shing , and st ##ins ##on - re ##mic ##k halls of engineering , includes five departments of study – aerospace and mechanical engineering , chemical and bio ##mo ##le ##cular engineering , civil engineering and geological sciences , computer science and engineering , and electrical engineering – with eight b . s . degrees offered . additionally , the college offers five - year dual degree programs with the colleges of arts and letters and of business awarding additional b . a . and master of business administration ( mba ) degrees , respectively . [SEP]\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:token_to_orig_map: 15:0 16:1 17:2 18:3 19:4 20:5 21:6 22:7 23:7 24:8 25:8 26:9 27:10 28:11 29:12 30:13 31:14 32:15 33:16 34:17 35:18 36:19 37:20 38:21 39:22 40:23 41:24 42:25 43:26 44:26 45:27 46:28 47:29 48:29 49:30 50:31 51:32 52:33 53:33 54:34 55:34 56:34 57:35 58:36 59:36 60:36 61:36 62:36 63:36 64:36 65:37 66:38 67:39 68:39 69:40 70:41 71:42 72:43 73:44 74:45 75:46 76:47 77:48 78:49 79:49 80:50 81:51 82:52 83:52 84:52 85:52 86:53 87:53 88:54 89:55 90:56 91:57 92:58 93:58 94:59 95:60 96:61 97:62 98:62 99:63 100:64 101:65 102:66 103:67 104:68 105:69 106:69 107:69 108:69 109:70 110:71 111:71 112:72 113:72 114:73 115:74 116:75 117:76 118:76 119:76 120:77 121:78 122:79 123:80 124:81 125:82 126:83 127:84 128:85 129:86 130:87 131:88 132:89 133:90 134:91 135:92 136:92 137:92 138:92 139:93 140:94 141:95 142:96 143:97 144:98 145:98 146:98 147:99 148:99 149:100 150:100\n",
-      "INFO:tensorflow:token_is_max_context: 15:True 16:True 17:True 18:True 19:True 20:True 21:True 22:True 23:True 24:True 25:True 26:True 27:True 28:True 29:True 30:True 31:True 32:True 33:True 34:True 35:True 36:True 37:True 38:True 39:True 40:True 41:True 42:True 43:True 44:True 45:True 46:True 47:True 48:True 49:True 50:True 51:True 52:True 53:True 54:True 55:True 56:True 57:True 58:True 59:True 60:True 61:True 62:True 63:True 64:True 65:True 66:True 67:True 68:True 69:True 70:True 71:True 72:True 73:True 74:True 75:True 76:True 77:True 78:True 79:True 80:True 81:True 82:True 83:True 84:True 85:True 86:True 87:True 88:True 89:True 90:True 91:True 92:True 93:True 94:True 95:True 96:True 97:True 98:True 99:True 100:True 101:True 102:True 103:True 104:True 105:True 106:True 107:True 108:True 109:True 110:True 111:True 112:True 113:True 114:True 115:True 116:True 117:True 118:True 119:True 120:True 121:True 122:True 123:True 124:True 125:True 126:True 127:True 128:True 129:True 130:True 131:True 132:True 133:True 134:True 135:True 136:True 137:True 138:True 139:True 140:True 141:True 142:True 143:True 144:True 145:True 146:True 147:True 148:True 149:True 150:True\n",
-      "INFO:tensorflow:input_ids: 101 1999 2054 2095 2001 1996 2267 1997 3330 2012 10289 8214 2719 1029 102 1996 2267 1997 3330 2001 2511 1999 4444 1010 2174 1010 2220 5352 1999 2942 1998 6228 3330 2020 1037 2112 1997 1996 2267 1997 2671 2144 1996 14896 1012 2651 1996 2267 1010 7431 1999 1996 26249 1010 12731 12227 1010 1998 2358 7076 2239 1011 2128 7712 2243 9873 1997 3330 1010 2950 2274 7640 1997 2817 1516 13395 1998 6228 3330 1010 5072 1998 16012 5302 2571 15431 3330 1010 2942 3330 1998 9843 4163 1010 3274 2671 1998 3330 1010 1998 5992 3330 1516 2007 2809 1038 1012 1055 1012 5445 3253 1012 5678 1010 1996 2267 4107 2274 1011 2095 7037 3014 3454 2007 1996 6667 1997 2840 1998 4144 1998 1997 2449 21467 3176 1038 1012 1037 1012 1998 3040 1997 2449 3447 1006 15038 1007 5445 1010 4414 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:start_position: 22\n",
-      "INFO:tensorflow:end_position: 22\n",
-      "INFO:tensorflow:answer: 1920\n",
-      "INFO:tensorflow:*** Example ***\n",
-      "INFO:tensorflow:unique_id: 1000000017\n",
-      "INFO:tensorflow:example_index: 17\n",
-      "INFO:tensorflow:doc_span_index: 0\n",
-      "INFO:tensorflow:tokens: [CLS] before the creation of the college of engineering similar studies were carried out at which notre dame college ? [SEP] the college of engineering was established in 1920 , however , early courses in civil and mechanical engineering were a part of the college of science since the 1870s . today the college , housed in the fitzpatrick , cu ##shing , and st ##ins ##on - re ##mic ##k halls of engineering , includes five departments of study – aerospace and mechanical engineering , chemical and bio ##mo ##le ##cular engineering , civil engineering and geological sciences , computer science and engineering , and electrical engineering – with eight b . s . degrees offered . additionally , the college offers five - year dual degree programs with the colleges of arts and letters and of business awarding additional b . a . and master of business administration ( mba ) degrees , respectively . [SEP]\n",
-      "INFO:tensorflow:token_to_orig_map: 21:0 22:1 23:2 24:3 25:4 26:5 27:6 28:7 29:7 30:8 31:8 32:9 33:10 34:11 35:12 36:13 37:14 38:15 39:16 40:17 41:18 42:19 43:20 44:21 45:22 46:23 47:24 48:25 49:26 50:26 51:27 52:28 53:29 54:29 55:30 56:31 57:32 58:33 59:33 60:34 61:34 62:34 63:35 64:36 65:36 66:36 67:36 68:36 69:36 70:36 71:37 72:38 73:39 74:39 75:40 76:41 77:42 78:43 79:44 80:45 81:46 82:47 83:48 84:49 85:49 86:50 87:51 88:52 89:52 90:52 91:52 92:53 93:53 94:54 95:55 96:56 97:57 98:58 99:58 100:59 101:60 102:61 103:62 104:62 105:63 106:64 107:65 108:66 109:67 110:68 111:69 112:69 113:69 114:69 115:70 116:71 117:71 118:72 119:72 120:73 121:74 122:75 123:76 124:76 125:76 126:77 127:78 128:79 129:80 130:81 131:82 132:83 133:84 134:85 135:86 136:87 137:88 138:89 139:90 140:91 141:92 142:92 143:92 144:92 145:93 146:94 147:95 148:96 149:97 150:98 151:98 152:98 153:99 154:99 155:100 156:100\n",
-      "INFO:tensorflow:token_is_max_context: 21:True 22:True 23:True 24:True 25:True 26:True 27:True 28:True 29:True 30:True 31:True 32:True 33:True 34:True 35:True 36:True 37:True 38:True 39:True 40:True 41:True 42:True 43:True 44:True 45:True 46:True 47:True 48:True 49:True 50:True 51:True 52:True 53:True 54:True 55:True 56:True 57:True 58:True 59:True 60:True 61:True 62:True 63:True 64:True 65:True 66:True 67:True 68:True 69:True 70:True 71:True 72:True 73:True 74:True 75:True 76:True 77:True 78:True 79:True 80:True 81:True 82:True 83:True 84:True 85:True 86:True 87:True 88:True 89:True 90:True 91:True 92:True 93:True 94:True 95:True 96:True 97:True 98:True 99:True 100:True 101:True 102:True 103:True 104:True 105:True 106:True 107:True 108:True 109:True 110:True 111:True 112:True 113:True 114:True 115:True 116:True 117:True 118:True 119:True 120:True 121:True 122:True 123:True 124:True 125:True 126:True 127:True 128:True 129:True 130:True 131:True 132:True 133:True 134:True 135:True 136:True 137:True 138:True 139:True 140:True 141:True 142:True 143:True 144:True 145:True 146:True 147:True 148:True 149:True 150:True 151:True 152:True 153:True 154:True 155:True 156:True\n",
-      "INFO:tensorflow:input_ids: 101 2077 1996 4325 1997 1996 2267 1997 3330 2714 2913 2020 3344 2041 2012 2029 10289 8214 2267 1029 102 1996 2267 1997 3330 2001 2511 1999 4444 1010 2174 1010 2220 5352 1999 2942 1998 6228 3330 2020 1037 2112 1997 1996 2267 1997 2671 2144 1996 14896 1012 2651 1996 2267 1010 7431 1999 1996 26249 1010 12731 12227 1010 1998 2358 7076 2239 1011 2128 7712 2243 9873 1997 3330 1010 2950 2274 7640 1997 2817 1516 13395 1998 6228 3330 1010 5072 1998 16012 5302 2571 15431 3330 1010 2942 3330 1998 9843 4163 1010 3274 2671 1998 3330 1010 1998 5992 3330 1516 2007 2809 1038 1012 1055 1012 5445 3253 1012 5678 1010 1996 2267 4107 2274 1011 2095 7037 3014 3454 2007 1996 6667 1997 2840 1998 4144 1998 1997 2449 21467 3176 1038 1012 1037 1012 1998 3040 1997 2449 3447 1006 15038 1007 5445 1010 4414 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:start_position: 43\n",
-      "INFO:tensorflow:end_position: 46\n",
-      "INFO:tensorflow:answer: the college of science\n",
-      "INFO:tensorflow:*** Example ***\n",
-      "INFO:tensorflow:unique_id: 1000000018\n",
-      "INFO:tensorflow:example_index: 18\n",
-      "INFO:tensorflow:doc_span_index: 0\n",
-      "INFO:tensorflow:tokens: [CLS] how many departments are within the st ##ins ##on - re ##mic ##k hall of engineering ? [SEP] the college of engineering was established in 1920 , however , early courses in civil and mechanical engineering were a part of the college of science since the 1870s . today the college , housed in the fitzpatrick , cu ##shing , and st ##ins ##on - re ##mic ##k halls of engineering , includes five departments of study – aerospace and mechanical engineering , chemical and bio ##mo ##le ##cular engineering , civil engineering and geological sciences , computer science and engineering , and electrical engineering – with eight b . s . degrees offered . additionally , the college offers five - year dual degree programs with the colleges of arts and letters and of business awarding additional b . a . and master of business administration ( mba ) degrees , respectively . [SEP]\n",
-      "INFO:tensorflow:token_to_orig_map: 19:0 20:1 21:2 22:3 23:4 24:5 25:6 26:7 27:7 28:8 29:8 30:9 31:10 32:11 33:12 34:13 35:14 36:15 37:16 38:17 39:18 40:19 41:20 42:21 43:22 44:23 45:24 46:25 47:26 48:26 49:27 50:28 51:29 52:29 53:30 54:31 55:32 56:33 57:33 58:34 59:34 60:34 61:35 62:36 63:36 64:36 65:36 66:36 67:36 68:36 69:37 70:38 71:39 72:39 73:40 74:41 75:42 76:43 77:44 78:45 79:46 80:47 81:48 82:49 83:49 84:50 85:51 86:52 87:52 88:52 89:52 90:53 91:53 92:54 93:55 94:56 95:57 96:58 97:58 98:59 99:60 100:61 101:62 102:62 103:63 104:64 105:65 106:66 107:67 108:68 109:69 110:69 111:69 112:69 113:70 114:71 115:71 116:72 117:72 118:73 119:74 120:75 121:76 122:76 123:76 124:77 125:78 126:79 127:80 128:81 129:82 130:83 131:84 132:85 133:86 134:87 135:88 136:89 137:90 138:91 139:92 140:92 141:92 142:92 143:93 144:94 145:95 146:96 147:97 148:98 149:98 150:98 151:99 152:99 153:100 154:100\n",
-      "INFO:tensorflow:token_is_max_context: 19:True 20:True 21:True 22:True 23:True 24:True 25:True 26:True 27:True 28:True 29:True 30:True 31:True 32:True 33:True 34:True 35:True 36:True 37:True 38:True 39:True 40:True 41:True 42:True 43:True 44:True 45:True 46:True 47:True 48:True 49:True 50:True 51:True 52:True 53:True 54:True 55:True 56:True 57:True 58:True 59:True 60:True 61:True 62:True 63:True 64:True 65:True 66:True 67:True 68:True 69:True 70:True 71:True 72:True 73:True 74:True 75:True 76:True 77:True 78:True 79:True 80:True 81:True 82:True 83:True 84:True 85:True 86:True 87:True 88:True 89:True 90:True 91:True 92:True 93:True 94:True 95:True 96:True 97:True 98:True 99:True 100:True 101:True 102:True 103:True 104:True 105:True 106:True 107:True 108:True 109:True 110:True 111:True 112:True 113:True 114:True 115:True 116:True 117:True 118:True 119:True 120:True 121:True 122:True 123:True 124:True 125:True 126:True 127:True 128:True 129:True 130:True 131:True 132:True 133:True 134:True 135:True 136:True 137:True 138:True 139:True 140:True 141:True 142:True 143:True 144:True 145:True 146:True 147:True 148:True 149:True 150:True 151:True 152:True 153:True 154:True\n",
-      "INFO:tensorflow:input_ids: 101 2129 2116 7640 2024 2306 1996 2358 7076 2239 1011 2128 7712 2243 2534 1997 3330 1029 102 1996 2267 1997 3330 2001 2511 1999 4444 1010 2174 1010 2220 5352 1999 2942 1998 6228 3330 2020 1037 2112 1997 1996 2267 1997 2671 2144 1996 14896 1012 2651 1996 2267 1010 7431 1999 1996 26249 1010 12731 12227 1010 1998 2358 7076 2239 1011 2128 7712 2243 9873 1997 3330 1010 2950 2274 7640 1997 2817 1516 13395 1998 6228 3330 1010 5072 1998 16012 5302 2571 15431 3330 1010 2942 3330 1998 9843 4163 1010 3274 2671 1998 3330 1010 1998 5992 3330 1516 2007 2809 1038 1012 1055 1012 5445 3253 1012 5678 1010 1996 2267 4107 2274 1011 2095 7037 3014 3454 2007 1996 6667 1997 2840 1998 4144 1998 1997 2449 21467 3176 1038 1012 1037 1012 1998 3040 1997 2449 3447 1006 15038 1007 5445 1010 4414 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:start_position: 74\n",
-      "INFO:tensorflow:end_position: 74\n",
-      "INFO:tensorflow:answer: five\n",
-      "INFO:tensorflow:*** Example ***\n",
-      "INFO:tensorflow:unique_id: 1000000019\n",
-      "INFO:tensorflow:example_index: 19\n",
-      "INFO:tensorflow:doc_span_index: 0\n",
-      "INFO:tensorflow:tokens: [CLS] the college of science began to offer civil engineering courses beginning at what time at notre dame ? [SEP] the college of engineering was established in 1920 , however , early courses in civil and mechanical engineering were a part of the college of science since the 1870s . today the college , housed in the fitzpatrick , cu ##shing , and st ##ins ##on - re ##mic ##k halls of engineering , includes five departments of study – aerospace and mechanical engineering , chemical and bio ##mo ##le ##cular engineering , civil engineering and geological sciences , computer science and engineering , and electrical engineering – with eight b . s . degrees offered . additionally , the college offers five - year dual degree programs with the colleges of arts and letters and of business awarding additional b . a . and master of business administration ( mba ) degrees , respectively . [SEP]\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:token_to_orig_map: 20:0 21:1 22:2 23:3 24:4 25:5 26:6 27:7 28:7 29:8 30:8 31:9 32:10 33:11 34:12 35:13 36:14 37:15 38:16 39:17 40:18 41:19 42:20 43:21 44:22 45:23 46:24 47:25 48:26 49:26 50:27 51:28 52:29 53:29 54:30 55:31 56:32 57:33 58:33 59:34 60:34 61:34 62:35 63:36 64:36 65:36 66:36 67:36 68:36 69:36 70:37 71:38 72:39 73:39 74:40 75:41 76:42 77:43 78:44 79:45 80:46 81:47 82:48 83:49 84:49 85:50 86:51 87:52 88:52 89:52 90:52 91:53 92:53 93:54 94:55 95:56 96:57 97:58 98:58 99:59 100:60 101:61 102:62 103:62 104:63 105:64 106:65 107:66 108:67 109:68 110:69 111:69 112:69 113:69 114:70 115:71 116:71 117:72 118:72 119:73 120:74 121:75 122:76 123:76 124:76 125:77 126:78 127:79 128:80 129:81 130:82 131:83 132:84 133:85 134:86 135:87 136:88 137:89 138:90 139:91 140:92 141:92 142:92 143:92 144:93 145:94 146:95 147:96 148:97 149:98 150:98 151:98 152:99 153:99 154:100 155:100\n",
-      "INFO:tensorflow:token_is_max_context: 20:True 21:True 22:True 23:True 24:True 25:True 26:True 27:True 28:True 29:True 30:True 31:True 32:True 33:True 34:True 35:True 36:True 37:True 38:True 39:True 40:True 41:True 42:True 43:True 44:True 45:True 46:True 47:True 48:True 49:True 50:True 51:True 52:True 53:True 54:True 55:True 56:True 57:True 58:True 59:True 60:True 61:True 62:True 63:True 64:True 65:True 66:True 67:True 68:True 69:True 70:True 71:True 72:True 73:True 74:True 75:True 76:True 77:True 78:True 79:True 80:True 81:True 82:True 83:True 84:True 85:True 86:True 87:True 88:True 89:True 90:True 91:True 92:True 93:True 94:True 95:True 96:True 97:True 98:True 99:True 100:True 101:True 102:True 103:True 104:True 105:True 106:True 107:True 108:True 109:True 110:True 111:True 112:True 113:True 114:True 115:True 116:True 117:True 118:True 119:True 120:True 121:True 122:True 123:True 124:True 125:True 126:True 127:True 128:True 129:True 130:True 131:True 132:True 133:True 134:True 135:True 136:True 137:True 138:True 139:True 140:True 141:True 142:True 143:True 144:True 145:True 146:True 147:True 148:True 149:True 150:True 151:True 152:True 153:True 154:True 155:True\n",
-      "INFO:tensorflow:input_ids: 101 1996 2267 1997 2671 2211 2000 3749 2942 3330 5352 2927 2012 2054 2051 2012 10289 8214 1029 102 1996 2267 1997 3330 2001 2511 1999 4444 1010 2174 1010 2220 5352 1999 2942 1998 6228 3330 2020 1037 2112 1997 1996 2267 1997 2671 2144 1996 14896 1012 2651 1996 2267 1010 7431 1999 1996 26249 1010 12731 12227 1010 1998 2358 7076 2239 1011 2128 7712 2243 9873 1997 3330 1010 2950 2274 7640 1997 2817 1516 13395 1998 6228 3330 1010 5072 1998 16012 5302 2571 15431 3330 1010 2942 3330 1998 9843 4163 1010 3274 2671 1998 3330 1010 1998 5992 3330 1516 2007 2809 1038 1012 1055 1012 5445 3253 1012 5678 1010 1996 2267 4107 2274 1011 2095 7037 3014 3454 2007 1996 6667 1997 2840 1998 4144 1998 1997 2449 21467 3176 1038 1012 1037 1012 1998 3040 1997 2449 3447 1006 15038 1007 5445 1010 4414 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:start_position: 47\n",
-      "INFO:tensorflow:end_position: 48\n",
-      "INFO:tensorflow:answer: the 1870s\n"
-     ]
-    }
-   ],
-   "source": [
-    "bert_config = modeling_tensorflow.BertConfig.from_json_file(bert_config_file)\n",
-    "tokenizer = tokenization.BertTokenizer(\n",
-    "    vocab_file=vocab_file, do_lower_case=True)\n",
-    "\n",
-    "eval_examples = read_squad_examples(\n",
-    "    input_file=input_file, is_training=True, max_num=16)\n",
-    "\n",
-    "eval_features = convert_examples_to_features(\n",
-    "    examples=eval_examples,\n",
-    "    tokenizer=tokenizer,\n",
-    "    max_seq_length=max_seq_length,\n",
-    "    doc_stride=doc_stride,\n",
-    "    max_query_length=max_query_length,\n",
-    "    is_training=True)\n",
-    "\n",
-    "# You can use that to test the behavior of the models when target are outside of the model input sequence\n",
-    "# for feature in eval_features:\n",
-    "#     feature.start_position = outside_pos\n",
-    "#     feature.end_position = outside_pos"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-06T10:11:37.525632Z",
-     "start_time": "2018-11-06T10:11:37.498695Z"
-    }
-   },
-   "outputs": [],
-   "source": [
-    "eval_unique_id_to_feature = {}\n",
-    "for eval_feature in eval_features:\n",
-    "    eval_unique_id_to_feature[eval_feature.unique_id] = eval_feature"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-06T10:11:37.558325Z",
-     "start_time": "2018-11-06T10:11:37.527972Z"
-    }
-   },
-   "outputs": [],
-   "source": [
-    "def input_fn_builder(features, seq_length, drop_remainder):\n",
-    "    \"\"\"Creates an `input_fn` closure to be passed to TPUEstimator.\"\"\"\n",
-    "\n",
-    "    all_unique_ids = []\n",
-    "    all_input_ids = []\n",
-    "    all_input_mask = []\n",
-    "    all_segment_ids = []\n",
-    "    all_start_positions = []\n",
-    "    all_end_positions = []\n",
-    "\n",
-    "    for feature in features:\n",
-    "        all_unique_ids.append(feature.unique_id)\n",
-    "        all_input_ids.append(feature.input_ids)\n",
-    "        all_input_mask.append(feature.input_mask)\n",
-    "        all_segment_ids.append(feature.segment_ids)\n",
-    "        all_start_positions.append(feature.start_position)\n",
-    "        all_end_positions.append(feature.end_position)\n",
-    "\n",
-    "    def input_fn(params):\n",
-    "        \"\"\"The actual input function.\"\"\"\n",
-    "        batch_size = params[\"batch_size\"]\n",
-    "\n",
-    "        num_examples = len(features)\n",
-    "\n",
-    "        # This is for demo purposes and does NOT scale to large data sets. We do\n",
-    "        # not use Dataset.from_generator() because that uses tf.py_func which is\n",
-    "        # not TPU compatible. The right way to load data is with TFRecordReader.\n",
-    "        feature_map = {\n",
-    "            \"unique_ids\":\n",
-    "                tf.constant(all_unique_ids, shape=[num_examples], dtype=tf.int32),\n",
-    "            \"input_ids\":\n",
-    "                tf.constant(\n",
-    "                    all_input_ids, shape=[num_examples, seq_length],\n",
-    "                    dtype=tf.int32),\n",
-    "            \"input_mask\":\n",
-    "                tf.constant(\n",
-    "                    all_input_mask,\n",
-    "                    shape=[num_examples, seq_length],\n",
-    "                    dtype=tf.int32),\n",
-    "            \"segment_ids\":\n",
-    "                tf.constant(\n",
-    "                    all_segment_ids,\n",
-    "                    shape=[num_examples, seq_length],\n",
-    "                    dtype=tf.int32),\n",
-    "            \"start_positions\":\n",
-    "                tf.constant(\n",
-    "                    all_start_positions,\n",
-    "                    shape=[num_examples],\n",
-    "                    dtype=tf.int32),\n",
-    "            \"end_positions\":\n",
-    "                tf.constant(\n",
-    "                    all_end_positions,\n",
-    "                    shape=[num_examples],\n",
-    "                    dtype=tf.int32),\n",
-    "        }\n",
-    "\n",
-    "        d = tf.data.Dataset.from_tensor_slices(feature_map)\n",
-    "        d = d.repeat()\n",
-    "        d = d.batch(batch_size=batch_size, drop_remainder=drop_remainder)\n",
-    "        return d\n",
-    "\n",
-    "    return input_fn"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 7,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-06T10:11:37.601666Z",
-     "start_time": "2018-11-06T10:11:37.560082Z"
-    }
-   },
-   "outputs": [],
-   "source": [
-    "def model_fn_builder(bert_config, init_checkpoint, learning_rate,\n",
-    "                     num_train_steps, num_warmup_steps, use_tpu,\n",
-    "                     use_one_hot_embeddings):\n",
-    "    \"\"\"Returns `model_fn` closure for TPUEstimator.\"\"\"\n",
-    "\n",
-    "    def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument\n",
-    "        \"\"\"The `model_fn` for TPUEstimator.\"\"\"\n",
-    "\n",
-    "        tf.logging.info(\"*** Features ***\")\n",
-    "        for name in sorted(features.keys()):\n",
-    "            tf.logging.info(\"  name = %s, shape = %s\" % (name, features[name].shape))\n",
-    "\n",
-    "        unique_ids = features[\"unique_ids\"]\n",
-    "        input_ids = features[\"input_ids\"]\n",
-    "        input_mask = features[\"input_mask\"]\n",
-    "        segment_ids = features[\"segment_ids\"]\n",
-    "\n",
-    "        is_training = (mode == tf.estimator.ModeKeys.TRAIN)\n",
-    "\n",
-    "        (start_logits, end_logits) = create_model(\n",
-    "            bert_config=bert_config,\n",
-    "            is_training=is_training,\n",
-    "            input_ids=input_ids,\n",
-    "            input_mask=input_mask,\n",
-    "            segment_ids=segment_ids,\n",
-    "            use_one_hot_embeddings=use_one_hot_embeddings)\n",
-    "\n",
-    "        tvars = tf.trainable_variables()\n",
-    "\n",
-    "        initialized_variable_names = {}\n",
-    "        scaffold_fn = None\n",
-    "        if init_checkpoint:\n",
-    "            (assignment_map,\n",
-    "             initialized_variable_names) = modeling_tensorflow.get_assigment_map_from_checkpoint(\n",
-    "                tvars, init_checkpoint)\n",
-    "            if use_tpu:\n",
-    "\n",
-    "                def tpu_scaffold():\n",
-    "                    tf.train.init_from_checkpoint(init_checkpoint, assignment_map)\n",
-    "                    return tf.train.Scaffold()\n",
-    "\n",
-    "                scaffold_fn = tpu_scaffold\n",
-    "            else:\n",
-    "                tf.train.init_from_checkpoint(init_checkpoint, assignment_map)\n",
-    "\n",
-    "        tf.logging.info(\"**** Trainable Variables ****\")\n",
-    "        for var in tvars:\n",
-    "            init_string = \"\"\n",
-    "            if var.name in initialized_variable_names:\n",
-    "                init_string = \", *INIT_FROM_CKPT*\"\n",
-    "            tf.logging.info(\"  name = %s, shape = %s%s\", var.name, var.shape,\n",
-    "                            init_string)\n",
-    "\n",
-    "        output_spec = None\n",
-    "        if mode == tf.estimator.ModeKeys.TRAIN:\n",
-    "            seq_length = modeling_tensorflow.get_shape_list(input_ids)[1]\n",
-    "\n",
-    "            def compute_loss(logits, positions):\n",
-    "                one_hot_positions = tf.one_hot(\n",
-    "                    positions, depth=seq_length, dtype=tf.float32)\n",
-    "                log_probs = tf.nn.log_softmax(logits, axis=-1)\n",
-    "                loss = -tf.reduce_mean(\n",
-    "                    tf.reduce_sum(one_hot_positions * log_probs, axis=-1))\n",
-    "                return loss\n",
-    "\n",
-    "            start_positions = features[\"start_positions\"]\n",
-    "            end_positions = features[\"end_positions\"]\n",
-    "\n",
-    "            start_loss = compute_loss(start_logits, start_positions)\n",
-    "            end_loss = compute_loss(end_logits, end_positions)\n",
-    "\n",
-    "            total_loss = (start_loss + end_loss) / 2.0\n",
-    "\n",
-    "            train_op = optimization.create_optimizer(\n",
-    "                total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu)\n",
-    "\n",
-    "            output_spec = tf.contrib.tpu.TPUEstimatorSpec(\n",
-    "                mode=mode,\n",
-    "                loss=total_loss,\n",
-    "                train_op=train_op,\n",
-    "                scaffold_fn=scaffold_fn)\n",
-    "        elif mode == tf.estimator.ModeKeys.PREDICT:\n",
-    "            batch_size = modeling_tensorflow.get_shape_list(start_logits)[0]\n",
-    "            seq_length = modeling_tensorflow.get_shape_list(input_ids)[1]\n",
-    "\n",
-    "            def compute_loss(logits, positions):\n",
-    "                one_hot_positions = tf.one_hot(\n",
-    "                    positions, depth=seq_length, dtype=tf.float32)\n",
-    "                log_probs = tf.nn.log_softmax(logits, axis=-1)\n",
-    "                loss = -tf.reduce_mean(\n",
-    "                    tf.reduce_sum(one_hot_positions * log_probs, axis=-1))\n",
-    "                return loss\n",
-    "\n",
-    "            start_positions = features[\"start_positions\"]\n",
-    "            end_positions = features[\"end_positions\"]\n",
-    "\n",
-    "            start_loss = compute_loss(start_logits, start_positions)\n",
-    "            end_loss = compute_loss(end_logits, end_positions)\n",
-    "\n",
-    "            total_loss = (start_loss + end_loss) / 2.0\n",
-    "\n",
-    "            predictions = {\n",
-    "                \"unique_ids\": unique_ids,\n",
-    "                \"start_logits\": start_logits,\n",
-    "                \"end_logits\": end_logits,\n",
-    "                \"total_loss\": tf.reshape(total_loss, [batch_size, 1]),\n",
-    "                \"start_loss\": tf.reshape(start_loss, [batch_size, 1]),\n",
-    "                \"end_loss\": tf.reshape(end_loss, [batch_size, 1]),\n",
-    "            }\n",
-    "            output_spec = tf.contrib.tpu.TPUEstimatorSpec(\n",
-    "                mode=mode, predictions=predictions, scaffold_fn=scaffold_fn)\n",
-    "        else:\n",
-    "            raise ValueError(\n",
-    "                \"Only TRAIN and PREDICT modes are supported: %s\" % (mode))\n",
-    "\n",
-    "        return output_spec\n",
-    "\n",
-    "    return model_fn"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 8,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-06T10:11:41.104542Z",
-     "start_time": "2018-11-06T10:11:37.603474Z"
-    }
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "WARNING:tensorflow:Estimator's model_fn (<function model_fn_builder.<locals>.model_fn at 0x120df3f28>) includes params argument, but params are not passed to Estimator.\n",
-      "INFO:tensorflow:Using config: {'_model_dir': '/tmp/squad_base/', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 1000, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true\n",
-      "graph_options {\n",
-      "  rewrite_options {\n",
-      "    meta_optimizer_iterations: ONE\n",
-      "  }\n",
-      "}\n",
-      ", '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x11fd09630>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1, '_tpu_config': TPUConfig(iterations_per_loop=1000, num_shards=8, num_cores_per_replica=None, per_host_input_for_training=3, tpu_job_name=None, initial_infeed_sleep_secs=None, input_partition_dims=None), '_cluster': None}\n",
-      "INFO:tensorflow:_TPUContext: eval_on_tpu True\n",
-      "WARNING:tensorflow:eval_on_tpu ignored because use_tpu is False.\n"
-     ]
-    }
-   ],
-   "source": [
-    "is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2\n",
-    "run_config = tf.contrib.tpu.RunConfig(\n",
-    "    cluster=None,\n",
-    "    master=None,\n",
-    "    model_dir=output_dir,\n",
-    "    save_checkpoints_steps=1000,\n",
-    "    tpu_config=tf.contrib.tpu.TPUConfig(\n",
-    "        iterations_per_loop=1000,\n",
-    "        num_shards=8,\n",
-    "        per_host_input_for_training=is_per_host))\n",
-    "\n",
-    "model_fn = model_fn_builder(\n",
-    "    bert_config=bert_config,\n",
-    "    init_checkpoint=init_checkpoint,\n",
-    "    learning_rate=learning_rate,\n",
-    "    num_train_steps=None,\n",
-    "    num_warmup_steps=None,\n",
-    "    use_tpu=False,\n",
-    "    use_one_hot_embeddings=False)\n",
-    "\n",
-    "estimator = tf.contrib.tpu.TPUEstimator(\n",
-    "    use_tpu=False,\n",
-    "    model_fn=model_fn,\n",
-    "    config=run_config,\n",
-    "    train_batch_size=12,\n",
-    "    predict_batch_size=1)\n",
-    "\n",
-    "predict_input_fn = input_fn_builder(\n",
-    "    features=eval_features,\n",
-    "    seq_length=max_seq_length,\n",
-    "    drop_remainder=True)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 9,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-06T10:11:47.857601Z",
-     "start_time": "2018-11-06T10:11:41.106219Z"
-    }
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:Could not find trained model in model_dir: /tmp/squad_base/, running initialization to predict.\n",
-      "INFO:tensorflow:Calling model_fn.\n",
-      "INFO:tensorflow:Running infer on CPU\n",
-      "INFO:tensorflow:*** Features ***\n",
-      "INFO:tensorflow:  name = end_positions, shape = (1,)\n",
-      "INFO:tensorflow:  name = input_ids, shape = (1, 384)\n",
-      "INFO:tensorflow:  name = input_mask, shape = (1, 384)\n",
-      "INFO:tensorflow:  name = segment_ids, shape = (1, 384)\n",
-      "INFO:tensorflow:  name = start_positions, shape = (1,)\n",
-      "INFO:tensorflow:  name = unique_ids, shape = (1,)\n",
-      "INFO:tensorflow:**** Trainable Variables ****\n",
-      "INFO:tensorflow:  name = bert/embeddings/word_embeddings:0, shape = (30522, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/embeddings/token_type_embeddings:0, shape = (2, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/embeddings/position_embeddings:0, shape = (512, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/embeddings/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/embeddings/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_0/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_0/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_0/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_0/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_0/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_0/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_0/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_0/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_0/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_0/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_0/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_0/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_0/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_0/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_0/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_0/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_1/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_1/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_1/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_1/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_1/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_1/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_1/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_1/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_1/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_1/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_1/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_1/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_1/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_1/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_1/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_1/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_2/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_2/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_2/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_2/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_2/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_2/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_2/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_2/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_2/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_2/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_2/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_2/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_2/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_2/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_2/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_2/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_3/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_3/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_3/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_3/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_3/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_3/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_3/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_3/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_3/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_3/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_3/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_3/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_3/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_3/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_3/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_3/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_4/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_4/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_4/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_4/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_4/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_4/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_4/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_4/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_4/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_4/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_4/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_4/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_4/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_4/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_4/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_4/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_5/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_5/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_5/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_5/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_5/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_5/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_5/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_5/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_5/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_5/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_5/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_5/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_5/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_5/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_5/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_5/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_6/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_6/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_6/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_6/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_6/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_6/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_6/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_6/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_6/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_6/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_6/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_6/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_6/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_6/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_6/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_6/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_7/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_7/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_7/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_7/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_7/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_7/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_7/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_7/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_7/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_7/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_7/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_7/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_7/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_7/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_7/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_7/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_8/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_8/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_8/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_8/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_8/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_8/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_8/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_8/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_8/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_8/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_8/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_8/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_8/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_8/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_8/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_8/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_9/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_9/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_9/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_9/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_9/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_9/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_9/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_9/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_9/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_9/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_9/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_9/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_9/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_9/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_9/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_9/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_10/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_10/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_10/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_10/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_10/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_10/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_10/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_10/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_10/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_10/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_10/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_10/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_10/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_10/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_10/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_10/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_11/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_11/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_11/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_11/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_11/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_11/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_11/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_11/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_11/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_11/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_11/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_11/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_11/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_11/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_11/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_11/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/pooler/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/pooler/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = cls/squad/output_weights:0, shape = (2, 768)\n",
-      "INFO:tensorflow:  name = cls/squad/output_bias:0, shape = (2,)\n",
-      "INFO:tensorflow:Done calling model_fn.\n",
-      "INFO:tensorflow:Graph was finalized.\n",
-      "INFO:tensorflow:Running local_init_op.\n",
-      "INFO:tensorflow:Done running local_init_op.\n",
-      "INFO:tensorflow:prediction_loop marked as finished\n"
-     ]
-    }
-   ],
-   "source": [
-    "tensorflow_all_out = []\n",
-    "tensorflow_all_results = []\n",
-    "for result in estimator.predict(predict_input_fn, yield_single_examples=True):\n",
-    "    unique_id = int(result[\"unique_ids\"])\n",
-    "    eval_feature = eval_unique_id_to_feature[unique_id]\n",
-    "    start_logits = result[\"start_logits\"]\n",
-    "    end_logits = result[\"end_logits\"]\n",
-    "    total_loss = result[\"total_loss\"]\n",
-    "    start_loss = result[\"start_loss\"]\n",
-    "    end_loss = result[\"end_loss\"]\n",
-    "\n",
-    "    output_json = collections.OrderedDict()\n",
-    "    output_json[\"linex_index\"] = unique_id\n",
-    "    output_json[\"tokens\"] = [token for (i, token) in enumerate(eval_feature.tokens)]\n",
-    "    output_json[\"start_logits\"] = [round(float(x), 6) for x in start_logits.flat]\n",
-    "    output_json[\"end_logits\"] = [round(float(x), 6) for x in end_logits.flat]\n",
-    "    output_json[\"total_loss\"] = [round(float(x), 6) for x in total_loss.flat]\n",
-    "    output_json[\"start_loss\"] = [round(float(x), 6) for x in start_loss.flat]\n",
-    "    output_json[\"end_loss\"] = [round(float(x), 6) for x in end_loss.flat]\n",
-    "    tensorflow_all_out.append(output_json)\n",
-    "    tensorflow_all_results.append(RawResult(\n",
-    "                                    unique_id=unique_id,\n",
-    "                                    start_logits=start_logits,\n",
-    "                                    end_logits=end_logits))\n",
-    "    break"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 10,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-06T10:11:47.912836Z",
-     "start_time": "2018-11-06T10:11:47.859679Z"
-    },
-    "code_folding": []
-   },
-   "outputs": [],
-   "source": [
-    "def _get_best_indexes(logits, n_best_size):\n",
-    "    \"\"\"Get the n-best logits from a list.\"\"\"\n",
-    "    index_and_score = sorted(enumerate(logits), key=lambda x: x[1], reverse=True)\n",
-    "\n",
-    "    best_indexes = []\n",
-    "    for i in range(len(index_and_score)):\n",
-    "        if i >= n_best_size:\n",
-    "            break\n",
-    "        best_indexes.append(index_and_score[i][0])\n",
-    "    return best_indexes\n",
-    "\n",
-    "def _compute_softmax(scores):\n",
-    "    \"\"\"Compute softmax probability over raw logits.\"\"\"\n",
-    "    if not scores:\n",
-    "        return []\n",
-    "\n",
-    "    max_score = None\n",
-    "    for score in scores:\n",
-    "        if max_score is None or score > max_score:\n",
-    "            max_score = score\n",
-    "\n",
-    "    exp_scores = []\n",
-    "    total_sum = 0.0\n",
-    "    for score in scores:\n",
-    "        x = math.exp(score - max_score)\n",
-    "        exp_scores.append(x)\n",
-    "        total_sum += x\n",
-    "\n",
-    "    probs = []\n",
-    "    for score in exp_scores:\n",
-    "        probs.append(score / total_sum)\n",
-    "    return probs\n",
-    "\n",
-    "\n",
-    "def compute_predictions(all_examples, all_features, all_results, n_best_size,\n",
-    "                      max_answer_length, do_lower_case):\n",
-    "    \"\"\"Compute final predictions.\"\"\"\n",
-    "    example_index_to_features = collections.defaultdict(list)\n",
-    "    for feature in all_features:\n",
-    "        example_index_to_features[feature.example_index].append(feature)\n",
-    "\n",
-    "    unique_id_to_result = {}\n",
-    "    for result in all_results:\n",
-    "        unique_id_to_result[result.unique_id] = result\n",
-    "\n",
-    "    _PrelimPrediction = collections.namedtuple(  # pylint: disable=invalid-name\n",
-    "        \"PrelimPrediction\",\n",
-    "        [\"feature_index\", \"start_index\", \"end_index\", \"start_logit\", \"end_logit\"])\n",
-    "\n",
-    "    all_predictions = collections.OrderedDict()\n",
-    "    all_nbest_json = collections.OrderedDict()\n",
-    "    for (example_index, example) in enumerate(all_examples):\n",
-    "        features = example_index_to_features[example_index]\n",
-    "\n",
-    "        prelim_predictions = []\n",
-    "        for (feature_index, feature) in enumerate(features):\n",
-    "            result = unique_id_to_result[feature.unique_id]\n",
-    "\n",
-    "            start_indexes = _get_best_indexes(result.start_logits, n_best_size)\n",
-    "            end_indexes = _get_best_indexes(result.end_logits, n_best_size)\n",
-    "            for start_index in start_indexes:\n",
-    "                for end_index in end_indexes:\n",
-    "                    # We could hypothetically create invalid predictions, e.g., predict\n",
-    "                    # that the start of the span is in the question. We throw out all\n",
-    "                    # invalid predictions.\n",
-    "                    if start_index >= len(feature.tokens):\n",
-    "                        continue\n",
-    "                    if end_index >= len(feature.tokens):\n",
-    "                        continue\n",
-    "                    if start_index not in feature.token_to_orig_map:\n",
-    "                        continue\n",
-    "                    if end_index not in feature.token_to_orig_map:\n",
-    "                        continue\n",
-    "                    if not feature.token_is_max_context.get(start_index, False):\n",
-    "                        continue\n",
-    "                    if end_index < start_index:\n",
-    "                        continue\n",
-    "                    length = end_index - start_index + 1\n",
-    "                    if length > max_answer_length:\n",
-    "                        continue\n",
-    "                    prelim_predictions.append(\n",
-    "                        _PrelimPrediction(\n",
-    "                            feature_index=feature_index,\n",
-    "                            start_index=start_index,\n",
-    "                            end_index=end_index,\n",
-    "                            start_logit=result.start_logits[start_index],\n",
-    "                            end_logit=result.end_logits[end_index]))\n",
-    "\n",
-    "        prelim_predictions = sorted(\n",
-    "            prelim_predictions,\n",
-    "            key=lambda x: (x.start_logit + x.end_logit),\n",
-    "            reverse=True)\n",
-    "\n",
-    "        _NbestPrediction = collections.namedtuple(  # pylint: disable=invalid-name\n",
-    "            \"NbestPrediction\", [\"text\", \"start_logit\", \"end_logit\"])\n",
-    "\n",
-    "        seen_predictions = {}\n",
-    "        nbest = []\n",
-    "        for pred in prelim_predictions:\n",
-    "            if len(nbest) >= n_best_size:\n",
-    "                break\n",
-    "            feature = features[pred.feature_index]\n",
-    "\n",
-    "            tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1)]\n",
-    "            orig_doc_start = feature.token_to_orig_map[pred.start_index]\n",
-    "            orig_doc_end = feature.token_to_orig_map[pred.end_index]\n",
-    "            orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end + 1)]\n",
-    "            tok_text = \" \".join(tok_tokens)\n",
-    "\n",
-    "            # De-tokenize WordPieces that have been split off.\n",
-    "            tok_text = tok_text.replace(\" ##\", \"\")\n",
-    "            tok_text = tok_text.replace(\"##\", \"\")\n",
-    "\n",
-    "            # Clean whitespace\n",
-    "            tok_text = tok_text.strip()\n",
-    "            tok_text = \" \".join(tok_text.split())\n",
-    "            orig_text = \" \".join(orig_tokens)\n",
-    "\n",
-    "            final_text = get_final_text(tok_text, orig_text, do_lower_case)\n",
-    "            if final_text in seen_predictions:\n",
-    "                continue\n",
-    "\n",
-    "            seen_predictions[final_text] = True\n",
-    "            nbest.append(\n",
-    "                _NbestPrediction(\n",
-    "                    text=final_text,\n",
-    "                    start_logit=pred.start_logit,\n",
-    "                    end_logit=pred.end_logit))\n",
-    "\n",
-    "        # In very rare edge cases we could have no valid predictions. So we\n",
-    "        # just create a nonce prediction in this case to avoid failure.\n",
-    "        if not nbest:\n",
-    "            nbest.append(\n",
-    "                _NbestPrediction(text=\"empty\", start_logit=0.0, end_logit=0.0))\n",
-    "\n",
-    "        assert len(nbest) >= 1\n",
-    "\n",
-    "        total_scores = []\n",
-    "        for entry in nbest:\n",
-    "            total_scores.append(entry.start_logit + entry.end_logit)\n",
-    "\n",
-    "        probs = _compute_softmax(total_scores)\n",
-    "\n",
-    "        nbest_json = []\n",
-    "        for (i, entry) in enumerate(nbest):\n",
-    "            output = collections.OrderedDict()\n",
-    "            output[\"text\"] = entry.text\n",
-    "            output[\"probability\"] = probs[i]\n",
-    "            output[\"start_logit\"] = entry.start_logit\n",
-    "            output[\"end_logit\"] = entry.end_logit\n",
-    "            nbest_json.append(output)\n",
-    "\n",
-    "        assert len(nbest_json) >= 1\n",
-    "\n",
-    "        all_predictions[example.qas_id] = nbest_json[0][\"text\"]\n",
-    "        all_nbest_json[example.qas_id] = nbest_json\n",
-    "\n",
-    "    return all_predictions, all_nbest_json"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 11,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-06T10:11:47.953205Z",
-     "start_time": "2018-11-06T10:11:47.914751Z"
-    }
-   },
-   "outputs": [],
-   "source": [
-    "all_predictions, all_nbest_json = compute_predictions(eval_examples[:1], eval_features[:1], tensorflow_all_results, 20, max_answer_length, True)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 12,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-06T10:11:47.994647Z",
-     "start_time": "2018-11-06T10:11:47.955015Z"
-    }
-   },
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "OrderedDict([('5733be284776f41900661182',\n",
-       "              [OrderedDict([('text', 'empty'),\n",
-       "                            ('probability', 1.0),\n",
-       "                            ('start_logit', 0.0),\n",
-       "                            ('end_logit', 0.0)])])])"
-      ]
-     },
-     "execution_count": 12,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "all_nbest_json"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 13,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-06T10:11:48.028473Z",
-     "start_time": "2018-11-06T10:11:47.996311Z"
-    }
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "1\n",
-      "7\n",
-      "odict_keys(['linex_index', 'tokens', 'start_logits', 'end_logits', 'total_loss', 'start_loss', 'end_loss'])\n",
-      "number of tokens 176\n",
-      "number of start_logits 384\n",
-      "shape of end_logits 384\n"
-     ]
-    }
-   ],
-   "source": [
-    "print(len(tensorflow_all_out))\n",
-    "print(len(tensorflow_all_out[0]))\n",
-    "print(tensorflow_all_out[0].keys())\n",
-    "print(\"number of tokens\", len(tensorflow_all_out[0]['tokens']))\n",
-    "print(\"number of start_logits\", len(tensorflow_all_out[0]['start_logits']))\n",
-    "print(\"shape of end_logits\", len(tensorflow_all_out[0]['end_logits']))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 14,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-06T10:11:48.060658Z",
-     "start_time": "2018-11-06T10:11:48.030289Z"
-    }
-   },
-   "outputs": [],
-   "source": [
-    "tensorflow_outputs = [tensorflow_all_out[0]['start_logits'], tensorflow_all_out[0]['end_logits'],\n",
-    "                     tensorflow_all_out[0]['total_loss'], tensorflow_all_out[0]['start_loss'],\n",
-    "                     tensorflow_all_out[0]['end_loss']]"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## 2/ PyTorch code"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 15,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-06T10:11:48.478814Z",
-     "start_time": "2018-11-06T10:11:48.062585Z"
-    }
-   },
-   "outputs": [],
-   "source": [
-    "import modeling\n",
-    "from run_squad import *"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 16,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-06T10:11:48.512607Z",
-     "start_time": "2018-11-06T10:11:48.480729Z"
-    }
-   },
-   "outputs": [],
-   "source": [
-    "init_checkpoint_pt = \"../google_models/uncased_L-12_H-768_A-12/pytorch_model.bin\""
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 17,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-06T10:11:51.023405Z",
-     "start_time": "2018-11-06T10:11:48.514306Z"
-    },
-    "scrolled": true
-   },
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "tensor([0., 0.])"
-      ]
-     },
-     "execution_count": 17,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "device = torch.device(\"cpu\")\n",
-    "model = modeling.BertForQuestionAnswering(bert_config)\n",
-    "model.bert.load_state_dict(torch.load(init_checkpoint_pt, map_location='cpu'))\n",
-    "model.to(device)\n",
-    "model.qa_outputs.weight.data.fill_(1.0)\n",
-    "model.qa_outputs.bias.data.zero_()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 18,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-06T10:11:51.079364Z",
-     "start_time": "2018-11-06T10:11:51.028228Z"
-    },
-    "code_folding": []
-   },
-   "outputs": [],
-   "source": [
-    "all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)\n",
-    "all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)\n",
-    "all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)\n",
-    "all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)\n",
-    "all_start_positions = torch.tensor([[f.start_position] for f in eval_features], dtype=torch.long)\n",
-    "all_end_positions = torch.tensor([[f.end_position] for f in eval_features], dtype=torch.long)\n",
-    "\n",
-    "eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,\n",
-    "                                   all_start_positions, all_end_positions, all_example_index)\n",
-    "eval_sampler = SequentialSampler(eval_data)\n",
-    "eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=1)\n",
-    "\n",
-    "model.eval()\n",
-    "None"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 19,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-06T10:11:51.114686Z",
-     "start_time": "2018-11-06T10:11:51.081474Z"
-    }
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[torch.Size([1, 384]), torch.Size([1, 384]), torch.Size([1, 384]), torch.Size([1, 1]), torch.Size([1, 1]), torch.Size([1])]\n"
-     ]
-    },
-    {
-     "data": {
-      "text/plain": [
-       "torch.Size([1, 1])"
-      ]
-     },
-     "execution_count": 19,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "batch = iter(eval_dataloader).next()\n",
-    "input_ids, input_mask, segment_ids, start_positions, end_positions, example_index = batch\n",
-    "print([t.shape for t in batch])\n",
-    "start_positions.size()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 20,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-06T10:11:52.298367Z",
-     "start_time": "2018-11-06T10:11:51.116219Z"
-    }
-   },
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Evaluating:   0%|          | 0/270 [00:00<?, ?it/s]\n"
-     ]
-    }
-   ],
-   "source": [
-    "pytorch_all_out = []\n",
-    "for batch in tqdm(eval_dataloader, desc=\"Evaluating\"):\n",
-    "    input_ids, input_mask, segment_ids, start_positions, end_positions, example_index = batch\n",
-    "    input_ids = input_ids.to(device)\n",
-    "    input_mask = input_mask.to(device)\n",
-    "    segment_ids = segment_ids.to(device)\n",
-    "    start_positions = start_positions.to(device)\n",
-    "    end_positions = end_positions.to(device)\n",
-    "\n",
-    "    total_loss, (start_logits, end_logits) = model(input_ids, segment_ids, input_mask, start_positions, end_positions)\n",
-    "    \n",
-    "    eval_feature = eval_features[example_index.item()]\n",
-    "\n",
-    "    output_json = collections.OrderedDict()\n",
-    "    output_json[\"linex_index\"] = unique_id\n",
-    "    output_json[\"tokens\"] = [token for (i, token) in enumerate(eval_feature.tokens)]\n",
-    "    output_json[\"total_loss\"] = total_loss.detach().cpu().numpy()\n",
-    "    output_json[\"start_logits\"] = start_logits.detach().cpu().numpy()\n",
-    "    output_json[\"end_logits\"] = end_logits.detach().cpu().numpy()\n",
-    "    pytorch_all_out.append(output_json)\n",
-    "    break"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 21,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-06T10:11:52.339553Z",
-     "start_time": "2018-11-06T10:11:52.300335Z"
-    }
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "1\n",
-      "5\n",
-      "odict_keys(['linex_index', 'tokens', 'total_loss', 'start_logits', 'end_logits'])\n",
-      "number of tokens 176\n",
-      "number of start_logits 1\n",
-      "number of end_logits 1\n"
-     ]
-    }
-   ],
-   "source": [
-    "print(len(pytorch_all_out))\n",
-    "print(len(pytorch_all_out[0]))\n",
-    "print(pytorch_all_out[0].keys())\n",
-    "print(\"number of tokens\", len(pytorch_all_out[0]['tokens']))\n",
-    "print(\"number of start_logits\", len(pytorch_all_out[0]['start_logits']))\n",
-    "print(\"number of end_logits\", len(pytorch_all_out[0]['end_logits']))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 22,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-06T10:11:52.372827Z",
-     "start_time": "2018-11-06T10:11:52.341393Z"
-    }
-   },
-   "outputs": [],
-   "source": [
-    "pytorch_outputs = [pytorch_all_out[0]['start_logits'], pytorch_all_out[0]['end_logits'], pytorch_all_out[0]['total_loss']]"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## 3/ Comparing the standard deviation of start_logits, end_logits and loss of both models"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 23,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-06T10:11:52.402814Z",
-     "start_time": "2018-11-06T10:11:52.374329Z"
-    }
-   },
-   "outputs": [],
-   "source": [
-    "import numpy as np"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 24,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-06T10:11:52.434743Z",
-     "start_time": "2018-11-06T10:11:52.404345Z"
-    }
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "shape tensorflow layer, shape pytorch layer, standard deviation\n",
-      "((384,), (1, 384), 5.244962470555037e-06)\n",
-      "((384,), (1, 384), 5.244962470555037e-06)\n",
-      "((1,), (), 4.560241698925438e-06)\n"
-     ]
-    }
-   ],
-   "source": [
-    "print('shape tensorflow layer, shape pytorch layer, standard deviation')\n",
-    "print('\\n'.join(list(str((np.array(tensorflow_outputs[i]).shape,\n",
-    "                          np.array(pytorch_outputs[i]).shape, \n",
-    "                          np.sqrt(np.mean((np.array(tensorflow_outputs[i]) - np.array(pytorch_outputs[i]))**2.0)))) for i in range(3))))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 27,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-06T10:12:54.200059Z",
-     "start_time": "2018-11-06T10:12:54.167355Z"
-    }
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Total loss of the TF model 9.06024 - Total loss of the PT model 9.0602445602417\n"
-     ]
-    }
-   ],
-   "source": [
-    "print(\"Total loss of the TF model {} - Total loss of the PT model {}\".format(tensorflow_outputs[2][0], pytorch_outputs[2]))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "hide_input": false,
-  "kernelspec": {
-   "display_name": "Python [default]",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.6.7"
-  },
-  "toc": {
-   "colors": {
-    "hover_highlight": "#DAA520",
-    "running_highlight": "#FF0000",
-    "selected_highlight": "#FFD700"
-   },
-   "moveMenuLeft": true,
-   "nav_menu": {
-    "height": "48px",
-    "width": "252px"
-   },
-   "navigate_menu": true,
-   "number_sections": true,
-   "sideBar": true,
-   "threshold": 4,
-   "toc_cell": false,
-   "toc_section_display": "block",
-   "toc_window_display": false
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
diff --git a/notebooks/Comparing-TF-and-PT-models.ipynb b/notebooks/Comparing-TF-and-PT-models.ipynb
deleted file mode 100644
index b7382e4652..0000000000
--- a/notebooks/Comparing-TF-and-PT-models.ipynb
+++ /dev/null
@@ -1,1318 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# Comparing TensorFlow (original) and PyTorch models\n",
-    "\n",
-    "You can use this small notebook to check the conversion of the model's weights from the TensorFlow model to the PyTorch model. In the following, we compare the weights of the last layer on a simple example (in `input.txt`) but both models returns all the hidden layers so you can check every stage of the model.\n",
-    "\n",
-    "To run this notebook, follow these instructions:\n",
-    "- make sure that your Python environment has both TensorFlow and PyTorch installed,\n",
-    "- download the original TensorFlow implementation,\n",
-    "- download a pre-trained TensorFlow model as indicaded in the TensorFlow implementation readme,\n",
-    "- run the script `convert_tf_checkpoint_to_pytorch.py` as indicated in the `README` to convert the pre-trained TensorFlow model to PyTorch.\n",
-    "\n",
-    "If needed change the relative paths indicated in this notebook (at the beggining of Sections 1 and 2) to point to the relevent models and code."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-15T14:56:48.412622Z",
-     "start_time": "2018-11-15T14:56:48.400110Z"
-    }
-   },
-   "outputs": [],
-   "source": [
-    "import os\n",
-    "os.chdir('../')"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## 1/ TensorFlow code"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-15T14:56:49.483829Z",
-     "start_time": "2018-11-15T14:56:49.471296Z"
-    }
-   },
-   "outputs": [],
-   "source": [
-    "original_tf_inplem_dir = \"./tensorflow_code/\"\n",
-    "model_dir = \"../google_models/uncased_L-12_H-768_A-12/\"\n",
-    "\n",
-    "vocab_file = model_dir + \"vocab.txt\"\n",
-    "bert_config_file = model_dir + \"bert_config.json\"\n",
-    "init_checkpoint = model_dir + \"bert_model.ckpt\"\n",
-    "\n",
-    "input_file = \"./samples/input.txt\"\n",
-    "max_seq_length = 128"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-15T14:57:51.597932Z",
-     "start_time": "2018-11-15T14:57:51.549466Z"
-    }
-   },
-   "outputs": [
-    {
-     "ename": "DuplicateFlagError",
-     "evalue": "The flag 'input_file' is defined twice. First from *, Second from *.  Description from first occurrence: (no help available)",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mDuplicateFlagError\u001b[0m                        Traceback (most recent call last)",
-      "\u001b[0;32m<ipython-input-6-86ecffb49060>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      4\u001b[0m \u001b[0mspec\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mimportlib\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mutil\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mspec_from_file_location\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'*'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0moriginal_tf_inplem_dir\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m'/extract_features_tensorflow.py'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      5\u001b[0m \u001b[0mmodule\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mimportlib\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mutil\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmodule_from_spec\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mspec\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 6\u001b[0;31m \u001b[0mspec\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mloader\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexec_module\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmodule\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      7\u001b[0m \u001b[0msys\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmodules\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'extract_features_tensorflow'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmodule\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      8\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;32m~/miniconda3/envs/bert/lib/python3.6/importlib/_bootstrap_external.py\u001b[0m in \u001b[0;36mexec_module\u001b[0;34m(self, module)\u001b[0m\n",
-      "\u001b[0;32m~/miniconda3/envs/bert/lib/python3.6/importlib/_bootstrap.py\u001b[0m in \u001b[0;36m_call_with_frames_removed\u001b[0;34m(f, *args, **kwds)\u001b[0m\n",
-      "\u001b[0;32m~/Documents/Thomas/Code/HF/BERT/pytorch-pretrained-BERT/tensorflow_code/extract_features_tensorflow.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m     32\u001b[0m \u001b[0mFLAGS\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mflags\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mFLAGS\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     33\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 34\u001b[0;31m \u001b[0mflags\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDEFINE_string\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"input_file\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     35\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     36\u001b[0m \u001b[0mflags\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDEFINE_string\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"output_file\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;32m~/miniconda3/envs/bert/lib/python3.6/site-packages/tensorflow/python/platform/flags.py\u001b[0m in \u001b[0;36mwrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m     56\u001b[0m           \u001b[0;34m'Use of the keyword argument names (flag_name, default_value, '\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     57\u001b[0m           'docstring) is deprecated, please use (name, default, help) instead.')\n\u001b[0;32m---> 58\u001b[0;31m     \u001b[0;32mreturn\u001b[0m \u001b[0moriginal_function\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     59\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     60\u001b[0m   \u001b[0;32mreturn\u001b[0m \u001b[0mtf_decorator\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmake_decorator\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0moriginal_function\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mwrapper\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;32m~/miniconda3/envs/bert/lib/python3.6/site-packages/absl/flags/_defines.py\u001b[0m in \u001b[0;36mDEFINE_string\u001b[0;34m(name, default, help, flag_values, **args)\u001b[0m\n\u001b[1;32m    239\u001b[0m   \u001b[0mparser\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_argument_parser\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mArgumentParser\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    240\u001b[0m   \u001b[0mserializer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_argument_parser\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mArgumentSerializer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 241\u001b[0;31m   \u001b[0mDEFINE\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mparser\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdefault\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mhelp\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mflag_values\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mserializer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    242\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    243\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;32m~/miniconda3/envs/bert/lib/python3.6/site-packages/absl/flags/_defines.py\u001b[0m in \u001b[0;36mDEFINE\u001b[0;34m(parser, name, default, help, flag_values, serializer, module_name, **args)\u001b[0m\n\u001b[1;32m     80\u001b[0m   \"\"\"\n\u001b[1;32m     81\u001b[0m   DEFINE_flag(_flag.Flag(parser, serializer, name, default, help, **args),\n\u001b[0;32m---> 82\u001b[0;31m               flag_values, module_name)\n\u001b[0m\u001b[1;32m     83\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     84\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;32m~/miniconda3/envs/bert/lib/python3.6/site-packages/absl/flags/_defines.py\u001b[0m in \u001b[0;36mDEFINE_flag\u001b[0;34m(flag, flag_values, module_name)\u001b[0m\n\u001b[1;32m    102\u001b[0m   \u001b[0;31m# Copying the reference to flag_values prevents pychecker warnings.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    103\u001b[0m   \u001b[0mfv\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mflag_values\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 104\u001b[0;31m   \u001b[0mfv\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mflag\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mflag\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    105\u001b[0m   \u001b[0;31m# Tell flag_values who's defining the flag.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    106\u001b[0m   \u001b[0;32mif\u001b[0m \u001b[0mmodule_name\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;32m~/miniconda3/envs/bert/lib/python3.6/site-packages/absl/flags/_flagvalues.py\u001b[0m in \u001b[0;36m__setitem__\u001b[0;34m(self, name, flag)\u001b[0m\n\u001b[1;32m    427\u001b[0m         \u001b[0;31m# module is simply being imported a subsequent time.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    428\u001b[0m         \u001b[0;32mreturn\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 429\u001b[0;31m       \u001b[0;32mraise\u001b[0m \u001b[0m_exceptions\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDuplicateFlagError\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfrom_flag\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    430\u001b[0m     \u001b[0mshort_name\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mflag\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshort_name\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    431\u001b[0m     \u001b[0;31m# If a new flag overrides an old one, we need to cleanup the old flag's\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;31mDuplicateFlagError\u001b[0m: The flag 'input_file' is defined twice. First from *, Second from *.  Description from first occurrence: (no help available)"
-     ]
-    }
-   ],
-   "source": [
-    "import importlib.util\n",
-    "import sys\n",
-    "\n",
-    "spec = importlib.util.spec_from_file_location('*', original_tf_inplem_dir + '/extract_features_tensorflow.py')\n",
-    "module = importlib.util.module_from_spec(spec)\n",
-    "spec.loader.exec_module(module)\n",
-    "sys.modules['extract_features_tensorflow'] = module\n",
-    "\n",
-    "from extract_features_tensorflow import *"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 8,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-15T14:58:05.650987Z",
-     "start_time": "2018-11-15T14:58:05.541620Z"
-    }
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:*** Example ***\n",
-      "INFO:tensorflow:unique_id: 0\n",
-      "INFO:tensorflow:tokens: [CLS] who was jim henson ? [SEP] jim henson was a puppet ##eer [SEP]\n",
-      "INFO:tensorflow:input_ids: 101 2040 2001 3958 27227 1029 102 3958 27227 2001 1037 13997 11510 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:input_type_ids: 0 0 0 0 0 0 0 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n"
-     ]
-    }
-   ],
-   "source": [
-    "layer_indexes = list(range(12))\n",
-    "bert_config = modeling.BertConfig.from_json_file(bert_config_file)\n",
-    "tokenizer = tokenization.FullTokenizer(\n",
-    "    vocab_file=vocab_file, do_lower_case=True)\n",
-    "examples = read_examples(input_file)\n",
-    "\n",
-    "features = convert_examples_to_features(\n",
-    "    examples=examples, seq_length=max_seq_length, tokenizer=tokenizer)\n",
-    "unique_id_to_feature = {}\n",
-    "for feature in features:\n",
-    "    unique_id_to_feature[feature.unique_id] = feature"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 9,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-15T14:58:11.562443Z",
-     "start_time": "2018-11-15T14:58:08.036485Z"
-    }
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "WARNING:tensorflow:Estimator's model_fn (<function model_fn_builder.<locals>.model_fn at 0x11ea7f1e0>) includes params argument, but params are not passed to Estimator.\n",
-      "WARNING:tensorflow:Using temporary folder as model directory: /var/folders/yx/cw8n_njx3js5jksyw_qlp8p00000gn/T/tmphs4_nsq9\n",
-      "INFO:tensorflow:Using config: {'_model_dir': '/var/folders/yx/cw8n_njx3js5jksyw_qlp8p00000gn/T/tmphs4_nsq9', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true\n",
-      "graph_options {\n",
-      "  rewrite_options {\n",
-      "    meta_optimizer_iterations: ONE\n",
-      "  }\n",
-      "}\n",
-      ", '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x121b163c8>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1, '_tpu_config': TPUConfig(iterations_per_loop=2, num_shards=1, num_cores_per_replica=None, per_host_input_for_training=3, tpu_job_name=None, initial_infeed_sleep_secs=None, input_partition_dims=None), '_cluster': None}\n",
-      "WARNING:tensorflow:Setting TPUConfig.num_shards==1 is an unsupported behavior. Please fix as soon as possible (leaving num_shards as None.\n",
-      "INFO:tensorflow:_TPUContext: eval_on_tpu True\n",
-      "WARNING:tensorflow:eval_on_tpu ignored because use_tpu is False.\n"
-     ]
-    }
-   ],
-   "source": [
-    "is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2\n",
-    "run_config = tf.contrib.tpu.RunConfig(\n",
-    "    master=None,\n",
-    "    tpu_config=tf.contrib.tpu.TPUConfig(\n",
-    "        num_shards=1,\n",
-    "        per_host_input_for_training=is_per_host))\n",
-    "\n",
-    "model_fn = model_fn_builder(\n",
-    "    bert_config=bert_config,\n",
-    "    init_checkpoint=init_checkpoint,\n",
-    "    layer_indexes=layer_indexes,\n",
-    "    use_tpu=False,\n",
-    "    use_one_hot_embeddings=False)\n",
-    "\n",
-    "# If TPU is not available, this will fall back to normal Estimator on CPU\n",
-    "# or GPU.\n",
-    "estimator = tf.contrib.tpu.TPUEstimator(\n",
-    "    use_tpu=False,\n",
-    "    model_fn=model_fn,\n",
-    "    config=run_config,\n",
-    "    predict_batch_size=1)\n",
-    "\n",
-    "input_fn = input_fn_builder(\n",
-    "    features=features, seq_length=max_seq_length)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 10,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-15T14:58:21.736543Z",
-     "start_time": "2018-11-15T14:58:16.723829Z"
-    }
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:Could not find trained model in model_dir: /var/folders/yx/cw8n_njx3js5jksyw_qlp8p00000gn/T/tmphs4_nsq9, running initialization to predict.\n",
-      "INFO:tensorflow:Calling model_fn.\n",
-      "INFO:tensorflow:Running infer on CPU\n",
-      "INFO:tensorflow:Done calling model_fn.\n",
-      "INFO:tensorflow:Graph was finalized.\n",
-      "INFO:tensorflow:Running local_init_op.\n",
-      "INFO:tensorflow:Done running local_init_op.\n",
-      "extracting layer 0\n",
-      "extracting layer 1\n",
-      "extracting layer 2\n",
-      "extracting layer 3\n",
-      "extracting layer 4\n",
-      "extracting layer 5\n",
-      "extracting layer 6\n",
-      "extracting layer 7\n",
-      "extracting layer 8\n",
-      "extracting layer 9\n",
-      "extracting layer 10\n",
-      "extracting layer 11\n",
-      "INFO:tensorflow:prediction_loop marked as finished\n",
-      "INFO:tensorflow:prediction_loop marked as finished\n"
-     ]
-    }
-   ],
-   "source": [
-    "tensorflow_all_out = []\n",
-    "for result in estimator.predict(input_fn, yield_single_examples=True):\n",
-    "    unique_id = int(result[\"unique_id\"])\n",
-    "    feature = unique_id_to_feature[unique_id]\n",
-    "    output_json = collections.OrderedDict()\n",
-    "    output_json[\"linex_index\"] = unique_id\n",
-    "    tensorflow_all_out_features = []\n",
-    "    # for (i, token) in enumerate(feature.tokens):\n",
-    "    all_layers = []\n",
-    "    for (j, layer_index) in enumerate(layer_indexes):\n",
-    "        print(\"extracting layer {}\".format(j))\n",
-    "        layer_output = result[\"layer_output_%d\" % j]\n",
-    "        layers = collections.OrderedDict()\n",
-    "        layers[\"index\"] = layer_index\n",
-    "        layers[\"values\"] = layer_output\n",
-    "        all_layers.append(layers)\n",
-    "    tensorflow_out_features = collections.OrderedDict()\n",
-    "    tensorflow_out_features[\"layers\"] = all_layers\n",
-    "    tensorflow_all_out_features.append(tensorflow_out_features)\n",
-    "\n",
-    "    output_json[\"features\"] = tensorflow_all_out_features\n",
-    "    tensorflow_all_out.append(output_json)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 11,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-15T14:58:23.970714Z",
-     "start_time": "2018-11-15T14:58:23.931930Z"
-    }
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "1\n",
-      "2\n",
-      "odict_keys(['linex_index', 'features'])\n",
-      "number of tokens 1\n",
-      "number of layers 12\n"
-     ]
-    },
-    {
-     "data": {
-      "text/plain": [
-       "(128, 768)"
-      ]
-     },
-     "execution_count": 11,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "print(len(tensorflow_all_out))\n",
-    "print(len(tensorflow_all_out[0]))\n",
-    "print(tensorflow_all_out[0].keys())\n",
-    "print(\"number of tokens\", len(tensorflow_all_out[0]['features']))\n",
-    "print(\"number of layers\", len(tensorflow_all_out[0]['features'][0]['layers']))\n",
-    "tensorflow_all_out[0]['features'][0]['layers'][0]['values'].shape"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 12,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-15T14:58:25.547012Z",
-     "start_time": "2018-11-15T14:58:25.516076Z"
-    }
-   },
-   "outputs": [],
-   "source": [
-    "tensorflow_outputs = list(tensorflow_all_out[0]['features'][0]['layers'][t]['values'] for t in layer_indexes)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## 2/ PyTorch code"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "os.chdir('./examples')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 17,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-15T15:03:49.528679Z",
-     "start_time": "2018-11-15T15:03:49.497697Z"
-    }
-   },
-   "outputs": [],
-   "source": [
-    "import extract_features\n",
-    "import pytorch_transformers as ppb\n",
-    "from extract_features import *"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 25,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-15T15:21:18.001177Z",
-     "start_time": "2018-11-15T15:21:17.970369Z"
-    }
-   },
-   "outputs": [],
-   "source": [
-    "init_checkpoint_pt = \"../../google_models/uncased_L-12_H-768_A-12/\""
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 26,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-15T15:21:20.893669Z",
-     "start_time": "2018-11-15T15:21:18.786623Z"
-    },
-    "scrolled": true
-   },
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/15/2018 16:21:18 - INFO - pytorch_transformers.modeling_bert -   loading archive file ../../google_models/uncased_L-12_H-768_A-12/\n",
-      "11/15/2018 16:21:18 - INFO - pytorch_transformers.modeling_bert -   Model config {\n",
-      "  \"attention_probs_dropout_prob\": 0.1,\n",
-      "  \"hidden_act\": \"gelu\",\n",
-      "  \"hidden_dropout_prob\": 0.1,\n",
-      "  \"hidden_size\": 768,\n",
-      "  \"initializer_range\": 0.02,\n",
-      "  \"intermediate_size\": 3072,\n",
-      "  \"max_position_embeddings\": 512,\n",
-      "  \"num_attention_heads\": 12,\n",
-      "  \"num_hidden_layers\": 12,\n",
-      "  \"type_vocab_size\": 2,\n",
-      "  \"vocab_size\": 30522\n",
-      "}\n",
-      "\n"
-     ]
-    },
-    {
-     "data": {
-      "text/plain": [
-       "BertModel(\n",
-       "  (embeddings): BertEmbeddings(\n",
-       "    (word_embeddings): Embedding(30522, 768)\n",
-       "    (position_embeddings): Embedding(512, 768)\n",
-       "    (token_type_embeddings): Embedding(2, 768)\n",
-       "    (LayerNorm): BertLayerNorm()\n",
-       "    (dropout): Dropout(p=0.1)\n",
-       "  )\n",
-       "  (encoder): BertEncoder(\n",
-       "    (layer): ModuleList(\n",
-       "      (0): BertLayer(\n",
-       "        (attention): BertAttention(\n",
-       "          (self): BertSelfAttention(\n",
-       "            (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "          (output): BertSelfOutput(\n",
-       "            (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (LayerNorm): BertLayerNorm()\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "        )\n",
-       "        (intermediate): BertIntermediate(\n",
-       "          (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "        )\n",
-       "        (output): BertOutput(\n",
-       "          (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "          (LayerNorm): BertLayerNorm()\n",
-       "          (dropout): Dropout(p=0.1)\n",
-       "        )\n",
-       "      )\n",
-       "      (1): BertLayer(\n",
-       "        (attention): BertAttention(\n",
-       "          (self): BertSelfAttention(\n",
-       "            (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "          (output): BertSelfOutput(\n",
-       "            (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (LayerNorm): BertLayerNorm()\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "        )\n",
-       "        (intermediate): BertIntermediate(\n",
-       "          (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "        )\n",
-       "        (output): BertOutput(\n",
-       "          (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "          (LayerNorm): BertLayerNorm()\n",
-       "          (dropout): Dropout(p=0.1)\n",
-       "        )\n",
-       "      )\n",
-       "      (2): BertLayer(\n",
-       "        (attention): BertAttention(\n",
-       "          (self): BertSelfAttention(\n",
-       "            (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "          (output): BertSelfOutput(\n",
-       "            (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (LayerNorm): BertLayerNorm()\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "        )\n",
-       "        (intermediate): BertIntermediate(\n",
-       "          (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "        )\n",
-       "        (output): BertOutput(\n",
-       "          (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "          (LayerNorm): BertLayerNorm()\n",
-       "          (dropout): Dropout(p=0.1)\n",
-       "        )\n",
-       "      )\n",
-       "      (3): BertLayer(\n",
-       "        (attention): BertAttention(\n",
-       "          (self): BertSelfAttention(\n",
-       "            (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "          (output): BertSelfOutput(\n",
-       "            (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (LayerNorm): BertLayerNorm()\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "        )\n",
-       "        (intermediate): BertIntermediate(\n",
-       "          (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "        )\n",
-       "        (output): BertOutput(\n",
-       "          (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "          (LayerNorm): BertLayerNorm()\n",
-       "          (dropout): Dropout(p=0.1)\n",
-       "        )\n",
-       "      )\n",
-       "      (4): BertLayer(\n",
-       "        (attention): BertAttention(\n",
-       "          (self): BertSelfAttention(\n",
-       "            (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "          (output): BertSelfOutput(\n",
-       "            (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (LayerNorm): BertLayerNorm()\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "        )\n",
-       "        (intermediate): BertIntermediate(\n",
-       "          (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "        )\n",
-       "        (output): BertOutput(\n",
-       "          (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "          (LayerNorm): BertLayerNorm()\n",
-       "          (dropout): Dropout(p=0.1)\n",
-       "        )\n",
-       "      )\n",
-       "      (5): BertLayer(\n",
-       "        (attention): BertAttention(\n",
-       "          (self): BertSelfAttention(\n",
-       "            (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "          (output): BertSelfOutput(\n",
-       "            (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (LayerNorm): BertLayerNorm()\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "        )\n",
-       "        (intermediate): BertIntermediate(\n",
-       "          (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "        )\n",
-       "        (output): BertOutput(\n",
-       "          (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "          (LayerNorm): BertLayerNorm()\n",
-       "          (dropout): Dropout(p=0.1)\n",
-       "        )\n",
-       "      )\n",
-       "      (6): BertLayer(\n",
-       "        (attention): BertAttention(\n",
-       "          (self): BertSelfAttention(\n",
-       "            (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "          (output): BertSelfOutput(\n",
-       "            (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (LayerNorm): BertLayerNorm()\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "        )\n",
-       "        (intermediate): BertIntermediate(\n",
-       "          (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "        )\n",
-       "        (output): BertOutput(\n",
-       "          (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "          (LayerNorm): BertLayerNorm()\n",
-       "          (dropout): Dropout(p=0.1)\n",
-       "        )\n",
-       "      )\n",
-       "      (7): BertLayer(\n",
-       "        (attention): BertAttention(\n",
-       "          (self): BertSelfAttention(\n",
-       "            (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "          (output): BertSelfOutput(\n",
-       "            (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (LayerNorm): BertLayerNorm()\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "        )\n",
-       "        (intermediate): BertIntermediate(\n",
-       "          (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "        )\n",
-       "        (output): BertOutput(\n",
-       "          (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "          (LayerNorm): BertLayerNorm()\n",
-       "          (dropout): Dropout(p=0.1)\n",
-       "        )\n",
-       "      )\n",
-       "      (8): BertLayer(\n",
-       "        (attention): BertAttention(\n",
-       "          (self): BertSelfAttention(\n",
-       "            (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "          (output): BertSelfOutput(\n",
-       "            (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (LayerNorm): BertLayerNorm()\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "        )\n",
-       "        (intermediate): BertIntermediate(\n",
-       "          (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "        )\n",
-       "        (output): BertOutput(\n",
-       "          (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "          (LayerNorm): BertLayerNorm()\n",
-       "          (dropout): Dropout(p=0.1)\n",
-       "        )\n",
-       "      )\n",
-       "      (9): BertLayer(\n",
-       "        (attention): BertAttention(\n",
-       "          (self): BertSelfAttention(\n",
-       "            (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "          (output): BertSelfOutput(\n",
-       "            (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (LayerNorm): BertLayerNorm()\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "        )\n",
-       "        (intermediate): BertIntermediate(\n",
-       "          (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "        )\n",
-       "        (output): BertOutput(\n",
-       "          (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "          (LayerNorm): BertLayerNorm()\n",
-       "          (dropout): Dropout(p=0.1)\n",
-       "        )\n",
-       "      )\n",
-       "      (10): BertLayer(\n",
-       "        (attention): BertAttention(\n",
-       "          (self): BertSelfAttention(\n",
-       "            (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "          (output): BertSelfOutput(\n",
-       "            (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (LayerNorm): BertLayerNorm()\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "        )\n",
-       "        (intermediate): BertIntermediate(\n",
-       "          (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "        )\n",
-       "        (output): BertOutput(\n",
-       "          (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "          (LayerNorm): BertLayerNorm()\n",
-       "          (dropout): Dropout(p=0.1)\n",
-       "        )\n",
-       "      )\n",
-       "      (11): BertLayer(\n",
-       "        (attention): BertAttention(\n",
-       "          (self): BertSelfAttention(\n",
-       "            (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "          (output): BertSelfOutput(\n",
-       "            (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (LayerNorm): BertLayerNorm()\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "        )\n",
-       "        (intermediate): BertIntermediate(\n",
-       "          (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "        )\n",
-       "        (output): BertOutput(\n",
-       "          (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "          (LayerNorm): BertLayerNorm()\n",
-       "          (dropout): Dropout(p=0.1)\n",
-       "        )\n",
-       "      )\n",
-       "    )\n",
-       "  )\n",
-       "  (pooler): BertPooler(\n",
-       "    (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "    (activation): Tanh()\n",
-       "  )\n",
-       ")"
-      ]
-     },
-     "execution_count": 26,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "device = torch.device(\"cpu\")\n",
-    "model = ppb.BertModel.from_pretrained(init_checkpoint_pt)\n",
-    "model.to(device)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 27,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-15T15:21:26.963427Z",
-     "start_time": "2018-11-15T15:21:26.922494Z"
-    },
-    "code_folding": []
-   },
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "BertModel(\n",
-       "  (embeddings): BertEmbeddings(\n",
-       "    (word_embeddings): Embedding(30522, 768)\n",
-       "    (position_embeddings): Embedding(512, 768)\n",
-       "    (token_type_embeddings): Embedding(2, 768)\n",
-       "    (LayerNorm): BertLayerNorm()\n",
-       "    (dropout): Dropout(p=0.1)\n",
-       "  )\n",
-       "  (encoder): BertEncoder(\n",
-       "    (layer): ModuleList(\n",
-       "      (0): BertLayer(\n",
-       "        (attention): BertAttention(\n",
-       "          (self): BertSelfAttention(\n",
-       "            (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "          (output): BertSelfOutput(\n",
-       "            (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (LayerNorm): BertLayerNorm()\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "        )\n",
-       "        (intermediate): BertIntermediate(\n",
-       "          (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "        )\n",
-       "        (output): BertOutput(\n",
-       "          (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "          (LayerNorm): BertLayerNorm()\n",
-       "          (dropout): Dropout(p=0.1)\n",
-       "        )\n",
-       "      )\n",
-       "      (1): BertLayer(\n",
-       "        (attention): BertAttention(\n",
-       "          (self): BertSelfAttention(\n",
-       "            (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "          (output): BertSelfOutput(\n",
-       "            (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (LayerNorm): BertLayerNorm()\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "        )\n",
-       "        (intermediate): BertIntermediate(\n",
-       "          (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "        )\n",
-       "        (output): BertOutput(\n",
-       "          (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "          (LayerNorm): BertLayerNorm()\n",
-       "          (dropout): Dropout(p=0.1)\n",
-       "        )\n",
-       "      )\n",
-       "      (2): BertLayer(\n",
-       "        (attention): BertAttention(\n",
-       "          (self): BertSelfAttention(\n",
-       "            (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "          (output): BertSelfOutput(\n",
-       "            (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (LayerNorm): BertLayerNorm()\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "        )\n",
-       "        (intermediate): BertIntermediate(\n",
-       "          (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "        )\n",
-       "        (output): BertOutput(\n",
-       "          (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "          (LayerNorm): BertLayerNorm()\n",
-       "          (dropout): Dropout(p=0.1)\n",
-       "        )\n",
-       "      )\n",
-       "      (3): BertLayer(\n",
-       "        (attention): BertAttention(\n",
-       "          (self): BertSelfAttention(\n",
-       "            (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "          (output): BertSelfOutput(\n",
-       "            (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (LayerNorm): BertLayerNorm()\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "        )\n",
-       "        (intermediate): BertIntermediate(\n",
-       "          (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "        )\n",
-       "        (output): BertOutput(\n",
-       "          (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "          (LayerNorm): BertLayerNorm()\n",
-       "          (dropout): Dropout(p=0.1)\n",
-       "        )\n",
-       "      )\n",
-       "      (4): BertLayer(\n",
-       "        (attention): BertAttention(\n",
-       "          (self): BertSelfAttention(\n",
-       "            (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "          (output): BertSelfOutput(\n",
-       "            (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (LayerNorm): BertLayerNorm()\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "        )\n",
-       "        (intermediate): BertIntermediate(\n",
-       "          (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "        )\n",
-       "        (output): BertOutput(\n",
-       "          (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "          (LayerNorm): BertLayerNorm()\n",
-       "          (dropout): Dropout(p=0.1)\n",
-       "        )\n",
-       "      )\n",
-       "      (5): BertLayer(\n",
-       "        (attention): BertAttention(\n",
-       "          (self): BertSelfAttention(\n",
-       "            (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "          (output): BertSelfOutput(\n",
-       "            (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (LayerNorm): BertLayerNorm()\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "        )\n",
-       "        (intermediate): BertIntermediate(\n",
-       "          (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "        )\n",
-       "        (output): BertOutput(\n",
-       "          (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "          (LayerNorm): BertLayerNorm()\n",
-       "          (dropout): Dropout(p=0.1)\n",
-       "        )\n",
-       "      )\n",
-       "      (6): BertLayer(\n",
-       "        (attention): BertAttention(\n",
-       "          (self): BertSelfAttention(\n",
-       "            (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "          (output): BertSelfOutput(\n",
-       "            (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (LayerNorm): BertLayerNorm()\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "        )\n",
-       "        (intermediate): BertIntermediate(\n",
-       "          (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "        )\n",
-       "        (output): BertOutput(\n",
-       "          (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "          (LayerNorm): BertLayerNorm()\n",
-       "          (dropout): Dropout(p=0.1)\n",
-       "        )\n",
-       "      )\n",
-       "      (7): BertLayer(\n",
-       "        (attention): BertAttention(\n",
-       "          (self): BertSelfAttention(\n",
-       "            (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "          (output): BertSelfOutput(\n",
-       "            (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (LayerNorm): BertLayerNorm()\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "        )\n",
-       "        (intermediate): BertIntermediate(\n",
-       "          (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "        )\n",
-       "        (output): BertOutput(\n",
-       "          (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "          (LayerNorm): BertLayerNorm()\n",
-       "          (dropout): Dropout(p=0.1)\n",
-       "        )\n",
-       "      )\n",
-       "      (8): BertLayer(\n",
-       "        (attention): BertAttention(\n",
-       "          (self): BertSelfAttention(\n",
-       "            (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "          (output): BertSelfOutput(\n",
-       "            (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (LayerNorm): BertLayerNorm()\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "        )\n",
-       "        (intermediate): BertIntermediate(\n",
-       "          (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "        )\n",
-       "        (output): BertOutput(\n",
-       "          (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "          (LayerNorm): BertLayerNorm()\n",
-       "          (dropout): Dropout(p=0.1)\n",
-       "        )\n",
-       "      )\n",
-       "      (9): BertLayer(\n",
-       "        (attention): BertAttention(\n",
-       "          (self): BertSelfAttention(\n",
-       "            (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "          (output): BertSelfOutput(\n",
-       "            (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (LayerNorm): BertLayerNorm()\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "        )\n",
-       "        (intermediate): BertIntermediate(\n",
-       "          (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "        )\n",
-       "        (output): BertOutput(\n",
-       "          (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "          (LayerNorm): BertLayerNorm()\n",
-       "          (dropout): Dropout(p=0.1)\n",
-       "        )\n",
-       "      )\n",
-       "      (10): BertLayer(\n",
-       "        (attention): BertAttention(\n",
-       "          (self): BertSelfAttention(\n",
-       "            (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "          (output): BertSelfOutput(\n",
-       "            (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (LayerNorm): BertLayerNorm()\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "        )\n",
-       "        (intermediate): BertIntermediate(\n",
-       "          (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "        )\n",
-       "        (output): BertOutput(\n",
-       "          (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "          (LayerNorm): BertLayerNorm()\n",
-       "          (dropout): Dropout(p=0.1)\n",
-       "        )\n",
-       "      )\n",
-       "      (11): BertLayer(\n",
-       "        (attention): BertAttention(\n",
-       "          (self): BertSelfAttention(\n",
-       "            (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "          (output): BertSelfOutput(\n",
-       "            (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (LayerNorm): BertLayerNorm()\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "        )\n",
-       "        (intermediate): BertIntermediate(\n",
-       "          (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "        )\n",
-       "        (output): BertOutput(\n",
-       "          (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "          (LayerNorm): BertLayerNorm()\n",
-       "          (dropout): Dropout(p=0.1)\n",
-       "        )\n",
-       "      )\n",
-       "    )\n",
-       "  )\n",
-       "  (pooler): BertPooler(\n",
-       "    (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "    (activation): Tanh()\n",
-       "  )\n",
-       ")"
-      ]
-     },
-     "execution_count": 27,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)\n",
-    "all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)\n",
-    "all_input_type_ids = torch.tensor([f.input_type_ids for f in features], dtype=torch.long)\n",
-    "all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)\n",
-    "\n",
-    "eval_data = TensorDataset(all_input_ids, all_input_mask, all_input_type_ids, all_example_index)\n",
-    "eval_sampler = SequentialSampler(eval_data)\n",
-    "eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=1)\n",
-    "\n",
-    "model.eval()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 28,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-15T15:21:30.718724Z",
-     "start_time": "2018-11-15T15:21:30.329205Z"
-    }
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "tensor([[  101,  2040,  2001,  3958, 27227,  1029,   102,  3958, 27227,  2001,\n",
-      "          1037, 13997, 11510,   102,     0,     0,     0,     0,     0,     0,\n",
-      "             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,\n",
-      "             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,\n",
-      "             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,\n",
-      "             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,\n",
-      "             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,\n",
-      "             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,\n",
-      "             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,\n",
-      "             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,\n",
-      "             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,\n",
-      "             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,\n",
-      "             0,     0,     0,     0,     0,     0,     0,     0]])\n",
-      "tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
-      "         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
-      "         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
-      "         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
-      "         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
-      "         0, 0, 0, 0, 0, 0, 0, 0]])\n",
-      "tensor([0])\n",
-      "layer 0 0\n",
-      "layer 1 1\n",
-      "layer 2 2\n",
-      "layer 3 3\n",
-      "layer 4 4\n",
-      "layer 5 5\n",
-      "layer 6 6\n",
-      "layer 7 7\n",
-      "layer 8 8\n",
-      "layer 9 9\n",
-      "layer 10 10\n",
-      "layer 11 11\n"
-     ]
-    }
-   ],
-   "source": [
-    "layer_indexes = list(range(12))\n",
-    "\n",
-    "pytorch_all_out = []\n",
-    "for input_ids, input_mask, input_type_ids, example_indices in eval_dataloader:\n",
-    "    print(input_ids)\n",
-    "    print(input_mask)\n",
-    "    print(example_indices)\n",
-    "    input_ids = input_ids.to(device)\n",
-    "    input_mask = input_mask.to(device)\n",
-    "\n",
-    "    all_encoder_layers, _ = model(input_ids, token_type_ids=input_type_ids, attention_mask=input_mask)\n",
-    "\n",
-    "    for b, example_index in enumerate(example_indices):\n",
-    "        feature = features[example_index.item()]\n",
-    "        unique_id = int(feature.unique_id)\n",
-    "        # feature = unique_id_to_feature[unique_id]\n",
-    "        output_json = collections.OrderedDict()\n",
-    "        output_json[\"linex_index\"] = unique_id\n",
-    "        all_out_features = []\n",
-    "        # for (i, token) in enumerate(feature.tokens):\n",
-    "        all_layers = []\n",
-    "        for (j, layer_index) in enumerate(layer_indexes):\n",
-    "            print(\"layer\", j, layer_index)\n",
-    "            layer_output = all_encoder_layers[int(layer_index)].detach().cpu().numpy()\n",
-    "            layer_output = layer_output[b]\n",
-    "            layers = collections.OrderedDict()\n",
-    "            layers[\"index\"] = layer_index\n",
-    "            layer_output = layer_output\n",
-    "            layers[\"values\"] = layer_output if not isinstance(layer_output, (int, float)) else [layer_output]\n",
-    "            all_layers.append(layers)\n",
-    "\n",
-    "            out_features = collections.OrderedDict()\n",
-    "            out_features[\"layers\"] = all_layers\n",
-    "            all_out_features.append(out_features)\n",
-    "        output_json[\"features\"] = all_out_features\n",
-    "        pytorch_all_out.append(output_json)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 29,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-15T15:21:35.703615Z",
-     "start_time": "2018-11-15T15:21:35.666150Z"
-    }
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "1\n",
-      "2\n",
-      "odict_keys(['linex_index', 'features'])\n",
-      "number of tokens 1\n",
-      "number of layers 12\n",
-      "hidden_size 128\n"
-     ]
-    },
-    {
-     "data": {
-      "text/plain": [
-       "(128, 768)"
-      ]
-     },
-     "execution_count": 29,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "print(len(pytorch_all_out))\n",
-    "print(len(pytorch_all_out[0]))\n",
-    "print(pytorch_all_out[0].keys())\n",
-    "print(\"number of tokens\", len(pytorch_all_out))\n",
-    "print(\"number of layers\", len(pytorch_all_out[0]['features'][0]['layers']))\n",
-    "print(\"hidden_size\", len(pytorch_all_out[0]['features'][0]['layers'][0]['values']))\n",
-    "pytorch_all_out[0]['features'][0]['layers'][0]['values'].shape"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 30,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-15T15:21:36.999073Z",
-     "start_time": "2018-11-15T15:21:36.966762Z"
-    }
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "(128, 768)\n",
-      "(128, 768)\n"
-     ]
-    }
-   ],
-   "source": [
-    "pytorch_outputs = list(pytorch_all_out[0]['features'][0]['layers'][t]['values'] for t in layer_indexes)\n",
-    "print(pytorch_outputs[0].shape)\n",
-    "print(pytorch_outputs[1].shape)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 31,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-15T15:21:37.936522Z",
-     "start_time": "2018-11-15T15:21:37.905269Z"
-    }
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "(128, 768)\n",
-      "(128, 768)\n"
-     ]
-    }
-   ],
-   "source": [
-    "print(tensorflow_outputs[0].shape)\n",
-    "print(tensorflow_outputs[1].shape)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## 3/ Comparing the standard deviation on the last layer of both models"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 32,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-15T15:21:39.437137Z",
-     "start_time": "2018-11-15T15:21:39.406150Z"
-    }
-   },
-   "outputs": [],
-   "source": [
-    "import numpy as np"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 33,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-15T15:21:40.181870Z",
-     "start_time": "2018-11-15T15:21:40.137023Z"
-    }
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "shape tensorflow layer, shape pytorch layer, standard deviation\n",
-      "((128, 768), (128, 768), 1.5258875e-07)\n",
-      "((128, 768), (128, 768), 2.342731e-07)\n",
-      "((128, 768), (128, 768), 2.801949e-07)\n",
-      "((128, 768), (128, 768), 3.5904986e-07)\n",
-      "((128, 768), (128, 768), 4.2842768e-07)\n",
-      "((128, 768), (128, 768), 5.127951e-07)\n",
-      "((128, 768), (128, 768), 6.14668e-07)\n",
-      "((128, 768), (128, 768), 7.063922e-07)\n",
-      "((128, 768), (128, 768), 7.906173e-07)\n",
-      "((128, 768), (128, 768), 8.475192e-07)\n",
-      "((128, 768), (128, 768), 8.975489e-07)\n",
-      "((128, 768), (128, 768), 4.1671223e-07)\n"
-     ]
-    }
-   ],
-   "source": [
-    "print('shape tensorflow layer, shape pytorch layer, standard deviation')\n",
-    "print('\\n'.join(list(str((np.array(tensorflow_outputs[i]).shape,\n",
-    "                          np.array(pytorch_outputs[i]).shape, \n",
-    "                          np.sqrt(np.mean((np.array(tensorflow_outputs[i]) - np.array(pytorch_outputs[i]))**2.0)))) for i in range(12))))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "hide_input": false,
-  "kernelspec": {
-   "display_name": "Python [default]",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.6.7"
-  },
-  "toc": {
-   "colors": {
-    "hover_highlight": "#DAA520",
-    "running_highlight": "#FF0000",
-    "selected_highlight": "#FFD700"
-   },
-   "moveMenuLeft": true,
-   "nav_menu": {
-    "height": "48px",
-    "width": "252px"
-   },
-   "navigate_menu": true,
-   "number_sections": true,
-   "sideBar": true,
-   "threshold": 4,
-   "toc_cell": false,
-   "toc_section_display": "block",
-   "toc_window_display": false
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
diff --git a/notebooks/README.md b/notebooks/README.md
new file mode 100644
index 0000000000..894b2e7c82
--- /dev/null
+++ b/notebooks/README.md
@@ -0,0 +1,17 @@
+# Transformers Notebooks
+
+You can find here a list of the official notebooks provided by Hugging Face.
+
+Also, we would like to list here interesting content created by the community. 
+If you wrote some notebook(s) leveraging transformers and would like to be listed here, please open a 
+Pull Request and we'll review it so it can be included here. 
+
+
+## Hugging Face's notebooks :hugs:
+
+| Notebook     |      Description      |   |
+|:----------|:-------------:|------:|
+| [Getting Started Tokenizers](01-training_tokenizers.ipynb)  | How to train and use your very own tokenizer  |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/transformers/blob/docker-notebooks/notebooks/01-training-tokenizers.ipynb) |
+| [Getting Started Transformers](02-transformers.ipynb)   | How to easily start using transformers  | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/transformers/blob/docker-notebooks/notebooks/01-training-tokenizers.ipynb) |
+| [How to use Pipelines](03-pipelines.ipynb)  | Simple and efficient way to use State-of-the-Art models on downstream tasks through transformers | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/transformers/blob/docker-notebooks/notebooks/01-training-tokenizers.ipynb) |
+| [How to train a language model](https://github.com/huggingface/blog/blob/master/notebooks/01_how_to_train.ipynb)| Highlight all the steps to effectively train Transformer model on custom data | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/vochicong/blog/blob/fix-notebook-add-tokenizer-config/notebooks/01_how_to_train.ipynb)|
\ No newline at end of file

From 006097f8ad63636b1eb0ebc67a5b921d22d4c57f Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Wed, 4 Mar 2020 18:01:17 +0100
Subject: [PATCH 54/80] rename variables named 'word' to 'token' in generate fn
 (#3119)

* fix conflicts

* fixed naming bug

* make style
---
 src/transformers/modeling_utils.py | 56 ++++++++++++++++--------------
 1 file changed, 29 insertions(+), 27 deletions(-)

diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py
index 3dc0f245c9..435e6c132b 100644
--- a/src/transformers/modeling_utils.py
+++ b/src/transformers/modeling_utils.py
@@ -242,7 +242,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin):
         # initialize all new embeddings (in particular added tokens)
         self._init_weights(new_embeddings)
 
-        # Copy word embeddings from the previous weights
+        # Copy token embeddings from the previous weights
         num_tokens_to_copy = min(old_num_tokens, new_num_tokens)
         new_embeddings.weight.data[:num_tokens_to_copy, :] = old_embeddings.weight.data[:num_tokens_to_copy, :]
 
@@ -558,7 +558,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin):
                         model.__class__.__name__, "\n\t".join(error_msgs)
                     )
                 )
-        model.tie_weights()  # make sure word embedding weights are still tied if needed
+        model.tie_weights()  # make sure token embedding weights are still tied if needed
 
         # Set model in evaluation mode to desactivate DropOut modules by default
         model.eval()
@@ -843,8 +843,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin):
         """ Generate sequences for each example without beam search (num_beams == 1).
             All returned sequence are generated independantly.
         """
-        # current position / max lengths / length of generated sentences / unfinished sentences
-
+        # length of generated sentences / unfinished sentences
         unfinished_sents = input_ids.new(batch_size).fill_(1)
         sent_lengths = input_ids.new(batch_size).fill_(max_length)
 
@@ -934,7 +933,6 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin):
         """
 
         # Expand input to num beams
-        # assert input_ids.shape == (batch_size * num_beams, cur_len)
         input_ids = input_ids.unsqueeze(1).expand(batch_size, num_beams, cur_len)
         input_ids = input_ids.contiguous().view(batch_size * num_beams, cur_len)  # (batch_size * num_beams, cur_len)
 
@@ -946,7 +944,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin):
         # scores for each sentence in the beam
         beam_scores = torch.zeros((batch_size, num_beams), dtype=torch.float, device=input_ids.device)
 
-        # Greedy decoding it is made sure that only words of the first beam are considered to avoid sampling the exact same words three times
+        # Greedy decoding it is made sure that only tokens of the first beam are considered to avoid sampling the exact same tokens three times
         if do_sample is False:
             beam_scores[:, 1:] = -1e9
         beam_scores = beam_scores.view(-1)  # shape (batch_size * num_beams,)
@@ -960,7 +958,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin):
         while cur_len < max_length:
             model_inputs = self.prepare_inputs_for_generation(input_ids, past=past)
             outputs = self(**model_inputs)  # (batch_size * num_beams, cur_len, vocab_size)
-            scores = outputs[0][:, -1, :]  # (batch_size * num_beams, vocab_size)
+            next_token_logits = outputs[0][:, -1, :]  # (batch_size * num_beams, vocab_size)
 
             # if model has past, then set the past variable to speed up decoding
             if self._do_output_past(outputs):
@@ -968,14 +966,16 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin):
 
             # repetition penalty (from CTRL paper https://arxiv.org/abs/1909.05858)
             if repetition_penalty != 1.0:
-                self.enforce_repetition_penalty_(scores, batch_size, num_beams, input_ids, repetition_penalty)
+                self.enforce_repetition_penalty_(
+                    next_token_logits, batch_size, num_beams, input_ids, repetition_penalty
+                )
 
             if do_sample:
                 # Temperature (higher temperature => more likely to sample low probability tokens)
                 if temperature != 1.0:
-                    scores = scores / temperature
+                    next_token_logits = next_token_logits / temperature
 
-                scores = F.log_softmax(scores, dim=-1)  # (batch_size * num_beams, vocab_size)
+                scores = F.log_softmax(next_token_logits, dim=-1)  # (batch_size * num_beams, vocab_size)
                 _scores = scores + beam_scores[:, None].expand_as(scores)  # (batch_size * num_beams, vocab_size)
 
                 # Top-p/top-k filtering
@@ -988,25 +988,27 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin):
                     batch_size, num_beams * vocab_size
                 )  # (batch_size, num_beams * vocab_size)
 
-                # Sample 2 next words for each beam (so we have some spare tokens and match output of greedy beam search)
-                next_words = torch.multinomial(
+                # Sample 2 next tokens for each beam (so we have some spare tokens and match output of greedy beam search)
+                next_tokens = torch.multinomial(
                     F.softmax(_scores, dim=-1), num_samples=2 * num_beams
                 )  # (batch_size, num_beams * 2)
 
                 # Compute next scores
-                next_scores = torch.gather(_scores, -1, next_words)  # (batch_size, num_beams * 2)
+                next_scores = torch.gather(_scores, -1, next_tokens)  # (batch_size, num_beams * 2)
 
             else:
                 # do greedy beam search
-                scores = F.log_softmax(scores, dim=-1)  # (batch_size * num_beams, vocab_size)
+                scores = F.log_softmax(next_token_logits, dim=-1)  # (batch_size * num_beams, vocab_size)
                 assert scores.size() == (batch_size * num_beams, vocab_size)
                 # Add the log prob of the new beams to the log prob of the beginning of the sequence (sum of logs == log of the product)
-                _scores = scores + beam_scores[:, None].expand_as(scores)  # (batch_size * num_beams, vocab_size)
+                next_scores = scores + beam_scores[:, None].expand_as(scores)  # (batch_size * num_beams, vocab_size)
                 # re-organize to group the beam together (we are keeping top hypothesis accross beams)
-                _scores = _scores.view(batch_size, num_beams * vocab_size)  # (batch_size, num_beams * vocab_size)
-                next_scores, next_words = torch.topk(_scores, 2 * num_beams, dim=1, largest=True, sorted=True)
+                next_scores = next_scores.view(
+                    batch_size, num_beams * vocab_size
+                )  # (batch_size, num_beams * vocab_size)
+                next_scores, next_tokens = torch.topk(next_scores, 2 * num_beams, dim=1, largest=True, sorted=True)
 
-            assert next_scores.size() == next_words.size() == (batch_size, 2 * num_beams)
+            assert next_scores.size() == next_tokens.size() == (batch_size, 2 * num_beams)
 
             # next batch beam content
             # list of (batch_size * num_beams) tuple(next hypothesis score, next word, current position in the batch)
@@ -1032,21 +1034,21 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin):
                 # next sentence beam content
                 next_sent_beam = []
 
-                # next words for this sentence
-                for idx, score in zip(next_words[batch_idx], next_scores[batch_idx]):
+                # next tokens for this sentence
+                for idx, score in zip(next_tokens[batch_idx], next_scores[batch_idx]):
 
                     # get beam and word IDs
                     beam_id = idx // vocab_size
-                    word_id = idx % vocab_size
+                    token_id = idx % vocab_size
 
                     # add to generated hypotheses if end of sentence or last iteration
-                    if eos_token_ids is not None and word_id.item() in eos_token_ids:
+                    if eos_token_ids is not None and token_id.item() in eos_token_ids:
                         generated_hyps[batch_idx].add(
                             input_ids[batch_idx * num_beams + beam_id, :cur_len].clone(), score.item(),
                         )
                     else:
                         # add next predicted word if it is not eos_token
-                        next_sent_beam.append((score, word_id, batch_idx * num_beams + beam_id))
+                        next_sent_beam.append((score, token_id, batch_idx * num_beams + beam_id))
 
                     # the beam for next step is full
                     if len(next_sent_beam) == num_beams:
@@ -1060,12 +1062,12 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin):
             # sanity check / prepare next batch
             assert len(next_batch_beam) == batch_size * num_beams
             beam_scores = beam_scores.new([x[0] for x in next_batch_beam])
-            beam_words = input_ids.new([x[1] for x in next_batch_beam])
+            beam_tokens = input_ids.new([x[1] for x in next_batch_beam])
             beam_idx = input_ids.new([x[2] for x in next_batch_beam])
 
             # re-order batch
             input_ids = input_ids[beam_idx, :]
-            input_ids = torch.cat([input_ids, beam_words.unsqueeze(1)], dim=-1)
+            input_ids = torch.cat([input_ids, beam_tokens.unsqueeze(1)], dim=-1)
 
             # re-order internal states
             if past:
@@ -1081,11 +1083,11 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin):
         for batch_idx in range(batch_size):
             # Add all open beam hypothesis to generated_hyps
             if not done[batch_idx]:
-                for idx, score in zip(next_words[batch_idx], next_scores[batch_idx]):
+                for idx, score in zip(next_tokens[batch_idx], next_scores[batch_idx]):
 
                     # get beam and word IDs
                     beam_id = idx // vocab_size
-                    word_id = idx % vocab_size
+                    token_id = idx % vocab_size
                     generated_hyps[batch_idx].add(
                         input_ids[batch_idx * num_beams + beam_id, :cur_len].clone(), score.item()
                     )

From 256cbbc4a286a2fa4c1794870bcfb57994d76e3b Mon Sep 17 00:00:00 2001
From: Julien Chaumond <chaumond@gmail.com>
Date: Wed, 4 Mar 2020 12:01:45 -0500
Subject: [PATCH 55/80] [doc] Fix link to how-to-train Colab

---
 notebooks/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/notebooks/README.md b/notebooks/README.md
index 894b2e7c82..a515fb1347 100644
--- a/notebooks/README.md
+++ b/notebooks/README.md
@@ -14,4 +14,4 @@ Pull Request and we'll review it so it can be included here.
 | [Getting Started Tokenizers](01-training_tokenizers.ipynb)  | How to train and use your very own tokenizer  |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/transformers/blob/docker-notebooks/notebooks/01-training-tokenizers.ipynb) |
 | [Getting Started Transformers](02-transformers.ipynb)   | How to easily start using transformers  | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/transformers/blob/docker-notebooks/notebooks/01-training-tokenizers.ipynb) |
 | [How to use Pipelines](03-pipelines.ipynb)  | Simple and efficient way to use State-of-the-Art models on downstream tasks through transformers | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/transformers/blob/docker-notebooks/notebooks/01-training-tokenizers.ipynb) |
-| [How to train a language model](https://github.com/huggingface/blog/blob/master/notebooks/01_how_to_train.ipynb)| Highlight all the steps to effectively train Transformer model on custom data | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/vochicong/blog/blob/fix-notebook-add-tokenizer-config/notebooks/01_how_to_train.ipynb)|
\ No newline at end of file
+| [How to train a language model](https://github.com/huggingface/blog/blob/master/notebooks/01_how_to_train.ipynb)| Highlight all the steps to effectively train Transformer model on custom data | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/blog/blob/master/notebooks/01_how_to_train.ipynb)|

From 932eab943df207e3c0040c7bfd4f1334e7c6c99e Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Wed, 4 Mar 2020 18:03:46 +0100
Subject: [PATCH 56/80] include tf gpt2 tests for attn mask and past variable
 (#3122)

---
 tests/test_modeling_tf_gpt2.py | 76 ++++++++++++++++++++++++++++++++++
 1 file changed, 76 insertions(+)

diff --git a/tests/test_modeling_tf_gpt2.py b/tests/test_modeling_tf_gpt2.py
index 362f9e3162..8bb33898dc 100644
--- a/tests/test_modeling_tf_gpt2.py
+++ b/tests/test_modeling_tf_gpt2.py
@@ -30,6 +30,7 @@ if is_tf_available():
         TFGPT2LMHeadModel,
         TFGPT2DoubleHeadsModel,
         TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP,
+        shape_list,
     )
 
 
@@ -167,6 +168,73 @@ class TFGPT2ModelTest(TFModelTesterMixin, unittest.TestCase):
                 list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size],
             )
 
+        def create_and_check_gpt2_model_past(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
+            model = TFGPT2Model(config=config)
+
+            # first forward pass; `past` is reused below for the single-token call
+            output, past = model(input_ids, token_type_ids=token_type_ids)
+
+            # create a hypothetical next token and next token type (one per batch row)
+            next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)
+            next_token_types = ids_tensor([self.batch_size, 1], self.type_vocab_size)
+
+            # append them to input_ids and token_type_ids to build the full-length inputs
+            next_input_ids = tf.concat([input_ids, next_tokens], axis=-1)
+            next_token_type_ids = tf.concat([token_type_ids, next_token_types], axis=-1)
+
+            output_from_no_past, _ = model(next_input_ids, token_type_ids=next_token_type_ids)
+            output_from_past, _ = model(next_tokens, token_type_ids=next_token_types, past=past)
+
+            # select a random index along the last dimension of the output
+            random_slice_idx = int(ids_tensor((1,), shape_list(output_from_past)[-1]))
+            output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx]
+            output_from_past_slice = output_from_past[:, 0, random_slice_idx]
+
+            # test that the new token's output is the same with and without `past`
+            tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, rtol=1e-12)
+
+        def create_and_check_gpt2_model_attention_mask_past(
+            self, config, input_ids, input_mask, head_mask, token_type_ids, *args
+        ):
+            model = TFGPT2Model(config=config)
+
+            # create attention mask: ones for the first half of the sequence, zeros for the rest
+            half_seq_length = self.seq_length // 2
+            attn_mask_begin = tf.ones((self.batch_size, half_seq_length), dtype=tf.int32)
+            attn_mask_end = tf.zeros((self.batch_size, self.seq_length - half_seq_length), dtype=tf.int32)
+            attn_mask = tf.concat([attn_mask_begin, attn_mask_end], axis=1)
+
+            # first forward pass; `past` is reused below for the single-token call
+            output, past = model(input_ids, attention_mask=attn_mask)
+
+            # create a hypothetical next token (one per batch row) to extend input_ids
+            next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)
+
+            # replace the tokens at one random masked position of input_ids (same position for every batch row)
+            random_seq_idx_to_change = ids_tensor((1,), half_seq_length).numpy() + 1
+            random_other_next_tokens = ids_tensor((self.batch_size, self.seq_length), config.vocab_size)
+            vector_condition = tf.range(self.seq_length) == (self.seq_length - random_seq_idx_to_change)
+            condition = tf.transpose(
+                tf.broadcast_to(tf.expand_dims(vector_condition, -1), (self.seq_length, self.batch_size))
+            )
+            input_ids = tf.where(condition, random_other_next_tokens, input_ids)
+
+            # append the next token to input_ids and a column of ones to attn_mask
+            next_input_ids = tf.concat([input_ids, next_tokens], axis=-1)
+            attn_mask = tf.concat([attn_mask, tf.ones((shape_list(attn_mask)[0], 1), dtype=tf.int32)], axis=1)
+
+            # get two different outputs: full-sequence pass vs. single-token pass with `past`
+            output_from_no_past, _ = model(next_input_ids, attention_mask=attn_mask)
+            output_from_past, _ = model(next_tokens, past=past, attention_mask=attn_mask)
+
+            # select a random index along the last dimension of the output
+            random_slice_idx = int(ids_tensor((1,), shape_list(output_from_past)[-1]))
+            output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx]
+            output_from_past_slice = output_from_past[:, 0, random_slice_idx]
+
+            # test that the new token's output is the same with and without `past`
+            tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, rtol=1e-12)
+
         def create_and_check_gpt2_lm_head(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
             model = TFGPT2LMHeadModel(config=config)
             inputs = {
@@ -237,6 +305,14 @@ class TFGPT2ModelTest(TFModelTesterMixin, unittest.TestCase):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_gpt2_model(*config_and_inputs)
 
+    def test_gpt2_model_past(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_gpt2_model_past(*config_and_inputs)
+
+    def test_gpt2_model_att_mask_past(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_gpt2_model_attention_mask_past(*config_and_inputs)
+
     def test_gpt2_lm_head(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_gpt2_lm_head(*config_and_inputs)

From 6a143bf282a7ddf059c80ba4cb3bc770b93ec132 Mon Sep 17 00:00:00 2001
From: Wissam Antoun <44616226+WissamAntoun@users.noreply.github.com>
Date: Wed, 4 Mar 2020 20:04:39 +0300
Subject: [PATCH 57/80] model cards for both aubmindlab/bert-base-arabert
 models (#3113)

* Added readme for AraBERTv0.1

* Added readme to AraBERT
---
 .../aubmindlab/bert-base-arabert/README.md    | 89 +++++++++++++++++++
 .../aubmindlab/bert-base-arabertv01/README.md | 89 +++++++++++++++++++
 2 files changed, 178 insertions(+)
 create mode 100644 model_cards/aubmindlab/bert-base-arabert/README.md
 create mode 100644 model_cards/aubmindlab/bert-base-arabertv01/README.md

diff --git a/model_cards/aubmindlab/bert-base-arabert/README.md b/model_cards/aubmindlab/bert-base-arabert/README.md
new file mode 100644
index 0000000000..75d967fcd0
--- /dev/null
+++ b/model_cards/aubmindlab/bert-base-arabert/README.md
@@ -0,0 +1,89 @@
+# AraBERT : Pre-training BERT for Arabic Language Understanding
+
+**AraBERT** is an Arabic pretrained language model based on [Google's BERT architecture](https://github.com/google-research/bert). AraBERT uses the same BERT-Base config.
+
+There are two versions of the model, AraBERTv0.1 and AraBERTv1, with the difference being that AraBERTv1 uses pre-segmented text where prefixes and suffixes were split using the [Farasa Segmenter](http://alt.qcri.org/farasa/segmenter.html).
+
+The model was trained on ~70M sentences or ~23GB of Arabic text with ~3B words. The training corpora are a collection of publicly available large scale raw Arabic text ([Arabic Wikidumps](https://archive.org/details/arwiki-20190201), [The 1.5B words Arabic Corpus](https://www.semanticscholar.org/paper/1.5-billion-words-Arabic-Corpus-El-Khair/f3eeef4afb81223df96575adadf808fe7fe440b4), [The OSIAN Corpus](https://www.aclweb.org/anthology/W19-4619), Assafir news articles, and 4 other manually crawled news websites (Al-Akhbar, Annahar, AL-Ahram, AL-Wafd) from [the Wayback Machine](http://web.archive.org/)).
+
+We evaluate both AraBERT models on different downstream tasks and compare them to [mBERT](https://github.com/google-research/bert/blob/master/multilingual.md), and other state of the art models (*To the best of our knowledge*). The Tasks were Sentiment Analysis on 6 different datasets ([HARD](https://github.com/elnagara/HARD-Arabic-Dataset), [ASTD-Balanced](https://www.aclweb.org/anthology/D15-1299), [ArsenTD-Lev](https://staff.aub.edu.lb/~we07/Publications/ArSentD-LEV_Sentiment_Corpus.pdf), [LABR](https://github.com/mohamedadaly/LABR), [ArSaS](http://lrec-conf.org/workshops/lrec2018/W30/pdf/22_W30.pdf)), Named Entity Recognition with the [ANERcorp](http://curtis.ml.cmu.edu/w/courses/index.php/ANERcorp), and Arabic Question Answering on [Arabic-SQuAD and ARCD](https://github.com/husseinmozannar/SOQAL).
+
+## Results (Acc.)
+Task | prev. SOTA | mBERT | AraBERTv0.1 | AraBERTv1
+---|:---:|:---:|:---:|:---:
+HARD |95.7 [ElJundi et.al.](https://www.aclweb.org/anthology/W19-4608/)|95.7|96.2|96.1
+ASTD |86.5 [ElJundi et.al.](https://www.aclweb.org/anthology/W19-4608/)| 80.1|92.2|92.6
+ArsenTD-Lev|52.4 [ElJundi et.al.](https://www.aclweb.org/anthology/W19-4608/)|51|58.9|59.4
+AJGT|93 [Dahou et.al.](https://dl.acm.org/doi/fullHtml/10.1145/3314941)| 83.6|94.1|93.8
+LABR|87.5 [Dahou et.al.](https://dl.acm.org/doi/fullHtml/10.1145/3314941)|83|85.9|86.7
+ANERcorp|81.7 (BiLSTM-CRF)|78.4|84.2|81.9
+ARCD|mBERT|EM:34.2 F1: 61.3|EM:30.1 F1:61.2|EM:30.6 F1: 62.7
+
+*We would be extremely thankful if everyone can contribute to the Results table by adding more scores on different datasets*
+
+## How to use
+
+You can easily use AraBERT since it is almost fully compatible with existing codebases (You can use this repo instead of the official BERT one, the only difference is in the ```tokenization.py``` file where we modify the _is_punctuation function to make it compatible with the "+" symbol and the "[" and "]" characters)
+
+To use HuggingFace's Transformer repository you only need to provide a list of tokens that forces the model to not split them, also make sure that the text is pre-segmented:
+
+```python
+from transformers import AutoModel, AutoTokenizer
+from preprocess_arabert import never_split_tokens
+
+arabert_tokenizer = AutoTokenizer.from_pretrained(
+    "aubmindlab/bert-base-arabert",
+    do_lower_case=False,
+    do_basic_tokenize=True,
+    never_split=never_split_tokens)
+arabert_model = AutoModel.from_pretrained("aubmindlab/bert-base-arabert")
+
+arabert_tokenizer.tokenize("و+ لن نبالغ إذا قل +نا إن هاتف أو كمبيوتر ال+ مكتب في زمن +نا هذا ضروري")
+
+>>> ['و+', 'لن', 'نبال', '##غ', 'إذا', 'قل', '+نا', 'إن', 'هاتف', 'أو', 'كمبيوتر', 'ال+', 'مكتب', 'في', 'زمن', '+نا', 'هذا', 'ضروري']
+```
+
+**AraBERTv0.1 is compatible with all existing libraries, since it needs no pre-segmentation.**
+```python
+from transformers import AutoModel, AutoTokenizer
+from preprocess_arabert import never_split_tokens
+
+arabert_tokenizer = AutoTokenizer.from_pretrained("aubmindlab/bert-base-arabertv01",do_lower_case=False)
+arabert_model = AutoModel.from_pretrained("aubmindlab/bert-base-arabertv01")
+
+arabert_tokenizer.tokenize("ولن نبالغ إذا قلنا إن هاتف أو كمبيوتر المكتب في زمننا هذا ضروري")
+
+>>> ['ولن', 'ن', '##بالغ', 'إذا', 'قلنا', 'إن', 'هاتف', 'أو', 'كمبيوتر', 'المكتب', 'في', 'زمن', '##ن', '##ا', 'هذا', 'ضروري']
+```
+
+
+The ```araBERT_(initial_Demo_TF)_.ipynb``` Notebook is a small demo using the AJGT dataset using TensorFlow (GPU and TPU compatible).
+
+## Model Weights and Vocab Download
+Models | AraBERTv0.1 | AraBERTv1
+---|:---:|:---:
+TensorFlow|[Drive Link](https://drive.google.com/open?id=1-kVmTUZZ4DP2rzeHNjTPkY8OjnQCpomO) | [Drive Link](https://drive.google.com/open?id=1-d7-9ljKgDJP5mx73uBtio-TuUZCqZnt)
+PyTorch| [Drive_Link](https://drive.google.com/open?id=1-_3te42mQCPD8SxwZ3l-VBL7yaJH-IOv)| [Drive_Link](https://drive.google.com/open?id=1-69s6Pxqbi63HOQ1M9wTcr-Ovc6PWLLo)
+
+**You can find the PyTorch models in HuggingFace's Transformer Library under the ```aubmindlab``` username**
+
+## If you used this model please cite us as:
+```
+@misc{antoun2020arabert,
+    title={AraBERT: Transformer-based Model for Arabic Language Understanding},
+    author={Wissam Antoun and Fady Baly and Hazem Hajj},
+    year={2020},
+    eprint={2003.00104},
+    archivePrefix={arXiv},
+    primaryClass={cs.CL}
+}
+```
+## Acknowledgments 
+Thanks to TensorFlow Research Cloud (TFRC) for the free access to Cloud TPUs, couldn't have done it without this program, and to the [AUB MIND Lab](https://sites.aub.edu.lb/mindlab/) Members for the continuous support. Also thanks to [Yakshof](https://www.yakshof.com/#/) and Assafir for data and storage access.
+
+## Contacts
+**Wissam Antoun**: [Linkedin](https://www.linkedin.com/in/giulio-ravasio-3a81a9110/) | [Twitter](https://twitter.com/wissam_antoun) | [Github](https://github.com/WissamAntoun) | <wfa07@mail.aub.edu> | <wissam.antoun@gmail.com>
+
+**Fady Baly**: [Linkedin](https://www.linkedin.com/in/fadybaly/) | [Twitter](https://twitter.com/BalyFady) | [Github](https://github.com/fadybaly) | <fgb06@mail.aub.edu> | <baly.fady@gmail.com>
+
+***We are looking for sponsors to train BERT-Large and other Transformer models, the sponsor only needs to cover the data storage and compute cost of generating the pretraining data***
diff --git a/model_cards/aubmindlab/bert-base-arabertv01/README.md b/model_cards/aubmindlab/bert-base-arabertv01/README.md
new file mode 100644
index 0000000000..75d967fcd0
--- /dev/null
+++ b/model_cards/aubmindlab/bert-base-arabertv01/README.md
@@ -0,0 +1,89 @@
+# AraBERT : Pre-training BERT for Arabic Language Understanding
+
+**AraBERT** is an Arabic pretrained language model based on [Google's BERT architecture](https://github.com/google-research/bert). AraBERT uses the same BERT-Base config.
+
+There are two versions of the model, AraBERTv0.1 and AraBERTv1, with the difference being that AraBERTv1 uses pre-segmented text where prefixes and suffixes were split using the [Farasa Segmenter](http://alt.qcri.org/farasa/segmenter.html).
+
+The model was trained on ~70M sentences or ~23GB of Arabic text with ~3B words. The training corpora are a collection of publicly available large scale raw Arabic text ([Arabic Wikidumps](https://archive.org/details/arwiki-20190201), [The 1.5B words Arabic Corpus](https://www.semanticscholar.org/paper/1.5-billion-words-Arabic-Corpus-El-Khair/f3eeef4afb81223df96575adadf808fe7fe440b4), [The OSIAN Corpus](https://www.aclweb.org/anthology/W19-4619), Assafir news articles, and 4 other manually crawled news websites (Al-Akhbar, Annahar, AL-Ahram, AL-Wafd) from [the Wayback Machine](http://web.archive.org/)).
+
+We evaluate both AraBERT models on different downstream tasks and compare them to [mBERT](https://github.com/google-research/bert/blob/master/multilingual.md), and other state of the art models (*To the best of our knowledge*). The Tasks were Sentiment Analysis on 6 different datasets ([HARD](https://github.com/elnagara/HARD-Arabic-Dataset), [ASTD-Balanced](https://www.aclweb.org/anthology/D15-1299), [ArsenTD-Lev](https://staff.aub.edu.lb/~we07/Publications/ArSentD-LEV_Sentiment_Corpus.pdf), [LABR](https://github.com/mohamedadaly/LABR), [ArSaS](http://lrec-conf.org/workshops/lrec2018/W30/pdf/22_W30.pdf)), Named Entity Recognition with the [ANERcorp](http://curtis.ml.cmu.edu/w/courses/index.php/ANERcorp), and Arabic Question Answering on [Arabic-SQuAD and ARCD](https://github.com/husseinmozannar/SOQAL).
+
+## Results (Acc.)
+Task | prev. SOTA | mBERT | AraBERTv0.1 | AraBERTv1
+---|:---:|:---:|:---:|:---:
+HARD |95.7 [ElJundi et.al.](https://www.aclweb.org/anthology/W19-4608/)|95.7|96.2|96.1
+ASTD |86.5 [ElJundi et.al.](https://www.aclweb.org/anthology/W19-4608/)| 80.1|92.2|92.6
+ArsenTD-Lev|52.4 [ElJundi et.al.](https://www.aclweb.org/anthology/W19-4608/)|51|58.9|59.4
+AJGT|93 [Dahou et.al.](https://dl.acm.org/doi/fullHtml/10.1145/3314941)| 83.6|94.1|93.8
+LABR|87.5 [Dahou et.al.](https://dl.acm.org/doi/fullHtml/10.1145/3314941)|83|85.9|86.7
+ANERcorp|81.7 (BiLSTM-CRF)|78.4|84.2|81.9
+ARCD|mBERT|EM:34.2 F1: 61.3|EM:30.1 F1:61.2|EM:30.6 F1: 62.7
+
+*We would be extremely thankful if everyone can contribute to the Results table by adding more scores on different datasets*
+
+## How to use
+
+You can easily use AraBERT since it is almost fully compatible with existing codebases (You can use this repo instead of the official BERT one, the only difference is in the ```tokenization.py``` file where we modify the _is_punctuation function to make it compatible with the "+" symbol and the "[" and "]" characters)
+
+To use HuggingFace's Transformer repository you only need to provide a list of tokens that forces the model to not split them, also make sure that the text is pre-segmented:
+
+```python
+from transformers import AutoModel, AutoTokenizer
+from preprocess_arabert import never_split_tokens
+
+arabert_tokenizer = AutoTokenizer.from_pretrained(
+    "aubmindlab/bert-base-arabert",
+    do_lower_case=False,
+    do_basic_tokenize=True,
+    never_split=never_split_tokens)
+arabert_model = AutoModel.from_pretrained("aubmindlab/bert-base-arabert")
+
+arabert_tokenizer.tokenize("و+ لن نبالغ إذا قل +نا إن هاتف أو كمبيوتر ال+ مكتب في زمن +نا هذا ضروري")
+
+>>> ['و+', 'لن', 'نبال', '##غ', 'إذا', 'قل', '+نا', 'إن', 'هاتف', 'أو', 'كمبيوتر', 'ال+', 'مكتب', 'في', 'زمن', '+نا', 'هذا', 'ضروري']
+```
+
+**AraBERTv0.1 is compatible with all existing libraries, since it needs no pre-segmentation.**
+```python
+from transformers import AutoModel, AutoTokenizer
+from preprocess_arabert import never_split_tokens
+
+arabert_tokenizer = AutoTokenizer.from_pretrained("aubmindlab/bert-base-arabertv01",do_lower_case=False)
+arabert_model = AutoModel.from_pretrained("aubmindlab/bert-base-arabertv01")
+
+arabert_tokenizer.tokenize("ولن نبالغ إذا قلنا إن هاتف أو كمبيوتر المكتب في زمننا هذا ضروري")
+
+>>> ['ولن', 'ن', '##بالغ', 'إذا', 'قلنا', 'إن', 'هاتف', 'أو', 'كمبيوتر', 'المكتب', 'في', 'زمن', '##ن', '##ا', 'هذا', 'ضروري']
+```
+
+
+The ```araBERT_(initial_Demo_TF)_.ipynb``` Notebook is a small demo using the AJGT dataset using TensorFlow (GPU and TPU compatible).
+
+## Model Weights and Vocab Download
+Models | AraBERTv0.1 | AraBERTv1
+---|:---:|:---:
+TensorFlow|[Drive Link](https://drive.google.com/open?id=1-kVmTUZZ4DP2rzeHNjTPkY8OjnQCpomO) | [Drive Link](https://drive.google.com/open?id=1-d7-9ljKgDJP5mx73uBtio-TuUZCqZnt)
+PyTorch| [Drive_Link](https://drive.google.com/open?id=1-_3te42mQCPD8SxwZ3l-VBL7yaJH-IOv)| [Drive_Link](https://drive.google.com/open?id=1-69s6Pxqbi63HOQ1M9wTcr-Ovc6PWLLo)
+
+**You can find the PyTorch models in HuggingFace's Transformer Library under the ```aubmindlab``` username**
+
+## If you used this model please cite us as:
+```
+@misc{antoun2020arabert,
+    title={AraBERT: Transformer-based Model for Arabic Language Understanding},
+    author={Wissam Antoun and Fady Baly and Hazem Hajj},
+    year={2020},
+    eprint={2003.00104},
+    archivePrefix={arXiv},
+    primaryClass={cs.CL}
+}
+```
+## Acknowledgments 
+Thanks to TensorFlow Research Cloud (TFRC) for the free access to Cloud TPUs, couldn't have done it without this program, and to the [AUB MIND Lab](https://sites.aub.edu.lb/mindlab/) Members for the continuous support. Also thanks to [Yakshof](https://www.yakshof.com/#/) and Assafir for data and storage access.
+
+## Contacts
+**Wissam Antoun**: [Linkedin](https://www.linkedin.com/in/giulio-ravasio-3a81a9110/) | [Twitter](https://twitter.com/wissam_antoun) | [Github](https://github.com/WissamAntoun) | <wfa07@mail.aub.edu> | <wissam.antoun@gmail.com>
+
+**Fady Baly**: [Linkedin](https://www.linkedin.com/in/fadybaly/) | [Twitter](https://twitter.com/BalyFady) | [Github](https://github.com/fadybaly) | <fgb06@mail.aub.edu> | <baly.fady@gmail.com>
+
+***We are looking for sponsors to train BERT-Large and other Transformer models, the sponsor only needs to cover the data storage and compute cost of generating the pretraining data***

From ec60e0ae7a88e46ac2bfbf6234d14381a01be06a Mon Sep 17 00:00:00 2001
From: Manuel Romero <mrm8488@gmail.com>
Date: Tue, 3 Mar 2020 22:38:01 +0100
Subject: [PATCH 58/80] Create README.md

---
 .../bert-uncased-finetuned-qnli/README.md     | 62 +++++++++++++++++++
 1 file changed, 62 insertions(+)
 create mode 100644 model_cards/mrm8488/bert-uncased-finetuned-qnli/README.md

diff --git a/model_cards/mrm8488/bert-uncased-finetuned-qnli/README.md b/model_cards/mrm8488/bert-uncased-finetuned-qnli/README.md
new file mode 100644
index 0000000000..d13f4e106a
--- /dev/null
+++ b/model_cards/mrm8488/bert-uncased-finetuned-qnli/README.md
@@ -0,0 +1,62 @@
+---
+language: english
+thumbnail:
+---
+
+# [BERT](https://huggingface.co/deepset/bert-base-cased-squad2) fine tuned on [QNLI](https://github.com/rhythmcao/QNLI)+ compression ([BERT-of-Theseus](https://github.com/JetRunner/BERT-of-Theseus))
+
+I used a [Bert model fine tuned on **SQUAD v2**](https://huggingface.co/deepset/bert-base-cased-squad2) and then I fine tuned it on **QNLI** using **compression** (with a constant replacing rate) as proposed in **BERT-of-Theseus**
+
+## Details of the downstream task (QNLI):
+
+### Getting the dataset
+```bash
+wget https://raw.githubusercontent.com/rhythmcao/QNLI/master/data/QNLI/train.tsv
+wget https://raw.githubusercontent.com/rhythmcao/QNLI/master/data/QNLI/test.tsv
+wget https://raw.githubusercontent.com/rhythmcao/QNLI/master/data/QNLI/dev.tsv
+
+mkdir QNLI_dataset
+mv *.tsv QNLI_dataset
+```
+
+### Model training
+
+The model was trained on a Tesla P100 GPU and 25GB of RAM with the following command:
+
+```bash
+!python /content/BERT-of-Theseus/run_glue.py \
+  --model_name_or_path deepset/bert-base-cased-squad2 \
+  --task_name qnli \
+  --do_train \
+  --do_eval \
+  --do_lower_case \
+  --data_dir /content/QNLI_dataset \
+  --max_seq_length 128 \
+  --per_gpu_train_batch_size 32 \
+  --per_gpu_eval_batch_size 32 \
+  --learning_rate 2e-5 \
+  --save_steps 2000 \
+  --num_train_epochs 50 \
+  --output_dir /content/ouput_dir \
+  --evaluate_during_training \
+  --replacing_rate 0.7 \
+  --steps_for_replacing 2500 
+```
+
+## Metrics:
+
+| Model          | Accuracy |
+|-----------------|------|
+| BERT-base       | 91.2 |
+| BERT-of-Theseus | 88.8 |
+| [bert-uncased-finetuned-qnli](https://huggingface.co/mrm8488/bert-uncased-finetuned-qnli) | 87.2 |
+| DistilBERT      | 85.3 |
+
+
+
+
+> [See all my models](https://huggingface.co/models?search=mrm8488)
+
+> Created by [Manuel Romero/@mrm8488](https://twitter.com/mrm8488)
+
+> Made with <span style="color: #e25555;">&hearts;</span> in Spain

From a43c388abb949215f3f3735f19417b939eeced1b Mon Sep 17 00:00:00 2001
From: Julien Chaumond <chaumond@gmail.com>
Date: Wed, 4 Mar 2020 12:53:02 -0500
Subject: [PATCH 59/80] [model_cards] Add card by @djstrong

(the current way to submit a model card to have it displayed on the website is to open a PR on the `transformers` repo itself)

Thanks for sharing!
---
 model_cards/djstrong/bg_cs_pl_ru_cased_L-12_H-768_A-12/README.md | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 model_cards/djstrong/bg_cs_pl_ru_cased_L-12_H-768_A-12/README.md

diff --git a/model_cards/djstrong/bg_cs_pl_ru_cased_L-12_H-768_A-12/README.md b/model_cards/djstrong/bg_cs_pl_ru_cased_L-12_H-768_A-12/README.md
new file mode 100644
index 0000000000..e29aab4e36
--- /dev/null
+++ b/model_cards/djstrong/bg_cs_pl_ru_cased_L-12_H-768_A-12/README.md
@@ -0,0 +1 @@
+Slavic BERT from https://github.com/deepmipt/Slavic-BERT-NER http://files.deeppavlov.ai/deeppavlov_data/bg_cs_pl_ru_cased_L-12_H-768_A-12.tar.gz

From 76111a3d3acbd7210229c01190cefe3e54dfa03c Mon Sep 17 00:00:00 2001
From: Julien Chaumond <chaumond@gmail.com>
Date: Wed, 4 Mar 2020 12:55:20 -0500
Subject: [PATCH 60/80] [model_cards] Add card by @lvwerra

(the current way to submit a model card to have it displayed on the website is to open a PR on the `transformers` repo itself)

Thanks for sharing!
---
 model_cards/lvwerra/gpt2-medium-taboo/README.md | 12 ++++++++++++
 1 file changed, 12 insertions(+)
 create mode 100644 model_cards/lvwerra/gpt2-medium-taboo/README.md

diff --git a/model_cards/lvwerra/gpt2-medium-taboo/README.md b/model_cards/lvwerra/gpt2-medium-taboo/README.md
new file mode 100644
index 0000000000..c9bb56e54b
--- /dev/null
+++ b/model_cards/lvwerra/gpt2-medium-taboo/README.md
@@ -0,0 +1,12 @@
+# GPT-2 (medium) Taboo
+
+## What is it?
+A fine-tuned GPT-2 version for Taboo cards generation.
+
+## Training setting
+
+The model was trained on ~900 Taboo cards in the following format for 100 epochs:
+```
+Describe the word Glitch without using the words Problem, Unexpected, Technology, Minor, Outage.
+````
+

From 189113d8910308b9f3509c6946b2147ce57a0bf7 Mon Sep 17 00:00:00 2001
From: Manuel Romero <mrm8488@gmail.com>
Date: Wed, 4 Mar 2020 18:12:50 +0100
Subject: [PATCH 61/80] Create README.md

---
 .../README.md                                 | 137 ++++++++++++++++++
 1 file changed, 137 insertions(+)
 create mode 100644 model_cards/mrm8488/bert-multi-cased-finetuned-xquadv1/README.md

diff --git a/model_cards/mrm8488/bert-multi-cased-finetuned-xquadv1/README.md b/model_cards/mrm8488/bert-multi-cased-finetuned-xquadv1/README.md
new file mode 100644
index 0000000000..55b5b3e9c0
--- /dev/null
+++ b/model_cards/mrm8488/bert-multi-cased-finetuned-xquadv1/README.md
@@ -0,0 +1,137 @@
+---
+language: multilingual
+thumbnail:
+---
+
+# BERT (base-multilingual-cased) fine-tuned on XQuAD
+
+This model was created by [Google](https://github.com/google-research/bert/blob/master/multilingual.md) and fine-tuned on [XQuAD](https://github.com/deepmind/xquad) for multilingual (`11 different languages`) **Q&A** downstream task.
+
+## Details of the language model('bert-base-multilingual-cased')
+
+[Language model](https://github.com/google-research/bert/blob/master/multilingual.md)
+
+| Languages | Heads | Layers | Hidden | Params |
+| --------- | ----- | ------ | ------ | ------ |
+| 104       | 12    | 12     | 768    | 100 M  |
+
+## Details of the downstream task (multilingual Q&A) - Dataset
+
+Deepmind [XQuAD](https://github.com/deepmind/xquad)
+
+Languages covered:
+
+- Arabic: `ar`
+- German: `de`
+- Greek: `el`
+- English: `en`
+- Spanish: `es`
+- Hindi: `hi`
+- Russian: `ru`
+- Thai: `th`
+- Turkish: `tr`
+- Vietnamese: `vi`
+- Chinese: `zh`
+
+As the dataset is based on SQuAD v1.1, there are no unanswerable questions in the data. We chose this
+setting so that models can focus on cross-lingual transfer.
+
+We show the average number of tokens per paragraph, question, and answer for each language in the
+table below. The statistics were obtained using [Jieba](https://github.com/fxsjy/jieba) for Chinese
+and the [Moses tokenizer](https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/tokenizer.perl)
+for the other languages.
+
+|           |  en   |  es   |  de   |  el   |  ru   |  tr   |  ar   |  vi   |  th   |  zh   |  hi   |
+| --------- | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: |
+| Paragraph | 142.4 | 160.7 | 139.5 | 149.6 | 133.9 | 126.5 | 128.2 | 191.2 | 158.7 | 147.6 | 232.4 |
+| Question  | 11.5  | 13.4  | 11.0  | 11.7  | 10.0  |  9.8  | 10.7  | 14.8  | 11.5  | 10.5  | 18.7  |
+| Answer    |  3.1  |  3.6  |  3.0  |  3.3  |  3.1  |  3.1  |  3.1  |  4.5  |  4.1  |  3.5  |  5.6  |
+
+Citation:
+
+<details>
+```
+@article{Artetxe:etal:2019,
+      author    = {Mikel Artetxe and Sebastian Ruder and Dani Yogatama},
+      title     = {On the cross-lingual transferability of monolingual representations},
+      journal   = {CoRR},
+      volume    = {abs/1910.11856},
+      year      = {2019},
+      archivePrefix = {arXiv},
+      eprint    = {1910.11856}
+}
+```
+</details>
+
+I used `Data augmentation techniques` and split the dataset in order to have a train and test set. The test set was created in a way that contains the same number of samples for each language. Finally, I got:
+
+| Dataset     | # samples |
+| ----------- | --------- |
+| XQUAD train | 50 K      |
+| XQUAD test  | 8 K       |
+
+## Model training
+
+The model was trained on a Tesla P100 GPU and 25GB of RAM.
+The script for fine tuning can be found [here](https://github.com/huggingface/transformers/blob/master/examples/distillation/run_squad_w_distillation.py)
+
+## Results:
+
+| Metric    | # Value     |
+| --------- | ----------- |
+| **Exact** | **91.43** |
+| **F1**    | **94.14** |
+
+
+
+## Model in action
+
+Fast usage with **pipelines**:
+
+```python
+from transformers import pipeline
+
+from transformers import pipeline
+
+qa_pipeline = pipeline(
+    "question-answering",
+    model="mrm8488/bert-multi-cased-finetuned-xquadv1",
+    tokenizer="mrm8488/bert-multi-cased-finetuned-xquadv1"
+)
+
+
+# context: Coronavirus is seeding panic in the West because it expands so fast.
+
+# question: Where is seeding panic Coronavirus?
+qa_pipeline({
+    'context': "कोरोनावायरस पश्चिम में आतंक बो रहा है क्योंकि यह इतनी तेजी से फैलता है।",
+    'question': "कोरोनावायरस घबराहट कहां है?"
+    
+})
+# output: {'answer': 'पश्चिम', 'end': 18, 'score': 0.7037217439689059, 'start': 12}
+
+qa_pipeline({
+    'context': "Manuel Romero has been working hardly in the repository hugginface/transformers lately",
+    'question': "Who has been working hard for hugginface/transformers lately?"
+    
+})
+# output: {'answer': 'Manuel Romero', 'end': 13, 'score': 0.7254485993702389, 'start': 0}
+
+qa_pipeline({
+    'context': "Manuel Romero a travaillé à peine dans le référentiel hugginface / transformers ces derniers temps",
+    'question': "Pour quel référentiel a travaillé Manuel Romero récemment?"
+    
+})
+#output: {'answer': 'hugginface / transformers', 'end': 79, 'score': 0.6482061613915384, 'start': 54}
+```
+![model in action](https://media.giphy.com/media/MBlire8Wj7ng73VBQ5/giphy.gif)
+
+Try it on a Colab:
+
+<a href="https://colab.research.google.com/github/mrm8488/shared_colab_notebooks/blob/master/Try_mrm8488_xquad_finetuned_model.ipynb" target="_parent"><img src="https://camo.githubusercontent.com/52feade06f2fecbf006889a904d221e6a730c194/68747470733a2f2f636f6c61622e72657365617263682e676f6f676c652e636f6d2f6173736574732f636f6c61622d62616467652e737667" alt="Open In Colab" data-canonical-src="https://colab.research.google.com/assets/colab-badge.svg"></a>
+
+
+
+> Created by [Manuel Romero/@mrm8488](https://twitter.com/mrm8488)
+
+> Made with <span style="color: #e25555;">&hearts;</span> in Spain

From 1bca97ec7f8037ff6b9f855193628c5091dbf114 Mon Sep 17 00:00:00 2001
From: Morgan Funtowicz <morgan@huggingface.co>
Date: Wed, 4 Mar 2020 21:19:33 +0100
Subject: [PATCH 62/80] Update notebook link and fix few working issues.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>
---
 notebooks/02-transformers.ipynb | 7 +++----
 notebooks/README.md             | 2 +-
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/notebooks/02-transformers.ipynb b/notebooks/02-transformers.ipynb
index fcd9db55cd..44655c1e4a 100644
--- a/notebooks/02-transformers.ipynb
+++ b/notebooks/02-transformers.ipynb
@@ -32,7 +32,7 @@
     "was not well suited for the kind of hardware we're currently leveraging due to bad parallelization capabilities. \n",
     "\n",
     "Some extensions were provided by the academic community, such as Bidirectional RNN ([Schuster & Paliwal., 1997](https://www.researchgate.net/publication/3316656_Bidirectional_recurrent_neural_networks), [Graves & al., 2005](https://mediatum.ub.tum.de/doc/1290195/file.pdf)), \n",
-    "which can be seen as a concatenation of two sequential process, on going forward, the other one going backward over the sequence input.\n",
+    "which can be seen as a concatenation of two sequential processes, one going forward, the other one going backward over the sequence input.\n",
     "\n",
     "![birnn](https://miro.medium.com/max/764/1*6QnPUSv_t9BY9Fv8_aLb-Q.png)\n",
     "\n",
@@ -49,8 +49,7 @@
     "on translation tasks but it quickly extended to almost all the tasks RNNs were State-of-the-Art at that time.\n",
     "\n",
     "One advantage of Transformer over its RNN counterpart was its non sequential attention model. Remember, the RNNs had to\n",
-    "iterate over each element of the input sequence one-by-one and carry an \"updatable-state\" between each hop. With Transformer\n",
-    "the, the model is able to look at every position in the sequence, at the same time, in one operation.\n",
+    "iterate over each element of the input sequence one-by-one and carry an \"updatable-state\" between each hop. With Transformer, the model is able to look at every position in the sequence, at the same time, in one operation.\n",
     "\n",
     "For a deep-dive into the Transformer architecture, [The Annotated Transformer](https://nlp.seas.harvard.edu/2018/04/03/attention.html#encoder-and-decoder-stacks) \n",
     "will drive you along all the details of the paper.\n",
@@ -68,7 +67,7 @@
    "source": [
     "## Getting started with transformers\n",
     "\n",
-    "For the rest of this notebook, we will use a BERT model, as it's the most simple and there are plenty of content about it\n",
+    "For the rest of this notebook, we will use the [BERT (Devlin & al., 2018)](https://arxiv.org/abs/1810.04805) architecture, as it's the simplest and there is plenty of content about it\n",
     "over the internet, it will be easy to dig more over this architecture if you want to.\n",
     "\n",
     "The transformers library allows you to benefits from large, pretrained language models without requiring a huge and costly computational\n",
diff --git a/notebooks/README.md b/notebooks/README.md
index a515fb1347..9a7d3a4511 100644
--- a/notebooks/README.md
+++ b/notebooks/README.md
@@ -11,7 +11,7 @@ Pull Request and we'll review it so it can be included here.
 
 | Notebook     |      Description      |   |
 |:----------|:-------------:|------:|
-| [Getting Started Tokenizers](01-training_tokenizers.ipynb)  | How to train and use your very own tokenizer  |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/transformers/blob/docker-notebooks/notebooks/01-training-tokenizers.ipynb) |
+| [Getting Started Tokenizers](01-training-tokenizers.ipynb)  | How to train and use your very own tokenizer  |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/transformers/blob/docker-notebooks/notebooks/01-training-tokenizers.ipynb) |
 | [Getting Started Transformers](02-transformers.ipynb)   | How to easily start using transformers  | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/transformers/blob/docker-notebooks/notebooks/01-training-tokenizers.ipynb) |
 | [How to use Pipelines](03-pipelines.ipynb)  | Simple and efficient way to use State-of-the-Art models on downstream tasks through transformers | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/transformers/blob/docker-notebooks/notebooks/01-training-tokenizers.ipynb) |
 | [How to train a language model](https://github.com/huggingface/blog/blob/master/notebooks/01_how_to_train.ipynb)| Highlight all the steps to effectively train Transformer model on custom data | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/blog/blob/master/notebooks/01_how_to_train.ipynb)|

From c440030e99a8ebd3974894040f047a49854b8f69 Mon Sep 17 00:00:00 2001
From: Julien Chaumond <chaumond@gmail.com>
Date: Wed, 4 Mar 2020 16:33:10 -0500
Subject: [PATCH 63/80] [model_cards] Tag AR model languages

---
 model_cards/asafaya/bert-base-arabic/README.md        | 4 ++++
 model_cards/aubmindlab/bert-base-arabert/README.md    | 4 ++++
 model_cards/aubmindlab/bert-base-arabertv01/README.md | 4 ++++
 3 files changed, 12 insertions(+)

diff --git a/model_cards/asafaya/bert-base-arabic/README.md b/model_cards/asafaya/bert-base-arabic/README.md
index fc071dfae0..2c370537c9 100644
--- a/model_cards/asafaya/bert-base-arabic/README.md
+++ b/model_cards/asafaya/bert-base-arabic/README.md
@@ -1,3 +1,7 @@
+---
+language: arabic
+---
+
 # Arabic BERT Model
 
 Pretrained BERT base language model for Arabic
diff --git a/model_cards/aubmindlab/bert-base-arabert/README.md b/model_cards/aubmindlab/bert-base-arabert/README.md
index 75d967fcd0..a9ce0fd57e 100644
--- a/model_cards/aubmindlab/bert-base-arabert/README.md
+++ b/model_cards/aubmindlab/bert-base-arabert/README.md
@@ -1,3 +1,7 @@
+---
+language: arabic
+---
+
 # AraBERT : Pre-training BERT for Arabic Language Understanding
 
 **AraBERT** is an Arabic pretrained lanaguage model based on [Google's BERT architechture](https://github.com/google-research/bert). AraBERT uses the same BERT-Base config.
diff --git a/model_cards/aubmindlab/bert-base-arabertv01/README.md b/model_cards/aubmindlab/bert-base-arabertv01/README.md
index 75d967fcd0..a9ce0fd57e 100644
--- a/model_cards/aubmindlab/bert-base-arabertv01/README.md
+++ b/model_cards/aubmindlab/bert-base-arabertv01/README.md
@@ -1,3 +1,7 @@
+---
+language: arabic
+---
+
 # AraBERT : Pre-training BERT for Arabic Language Understanding
 
 **AraBERT** is an Arabic pretrained lanaguage model based on [Google's BERT architechture](https://github.com/google-research/bert). AraBERT uses the same BERT-Base config.

From 07a79db505253cd8196c2d00ad2ba498e8514944 Mon Sep 17 00:00:00 2001
From: Lysandre <lysandre.debut@reseau.eseo.fr>
Date: Wed, 4 Mar 2020 19:11:31 -0500
Subject: [PATCH 64/80] Fix failing doc samples

---
 docs/source/multilingual.rst          | 4 ++++
 src/transformers/modeling_flaubert.py | 5 ++++-
 tests/test_doc_samples.py             | 8 ++++++--
 3 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/docs/source/multilingual.rst b/docs/source/multilingual.rst
index f6f72b2434..781222962b 100644
--- a/docs/source/multilingual.rst
+++ b/docs/source/multilingual.rst
@@ -47,6 +47,7 @@ The different languages this model/tokenizer handles, as well as the ids of thes
 
 .. code-block::
 
+    # Continuation of the previous script
     print(tokenizer.lang2id)  # {'en': 0, 'fr': 1}
 
 
@@ -54,6 +55,7 @@ These ids should be used when passing a language parameter during a model pass.
 
 .. code-block::
 
+    # Continuation of the previous script
     input_ids = torch.tensor([tokenizer.encode("Wikipedia was used to")]) # batch size of 1
 
 
@@ -62,6 +64,7 @@ filled with the appropriate language ids, of the same size as input_ids. For eng
 
 .. code-block::
 
+    # Continuation of the previous script
     language_id = tokenizer.lang2id['en']  # 0
     langs = torch.tensor([language_id] * input_ids.shape[1])  # torch.tensor([0, 0, 0, ..., 0])
 
@@ -73,6 +76,7 @@ You can then feed it all as input to your model:
 
 .. code-block::
 
+    # Continuation of the previous script
     outputs = model(input_ids, langs=langs)
 
 
diff --git a/src/transformers/modeling_flaubert.py b/src/transformers/modeling_flaubert.py
index 56c3ce17a9..7236e44a16 100644
--- a/src/transformers/modeling_flaubert.py
+++ b/src/transformers/modeling_flaubert.py
@@ -148,9 +148,12 @@ class FlaubertModel(XLMModel):
 
     Examples::
 
+        from transformers import FlaubertTokenizer, FlaubertModel
+        import torch
+
         tokenizer = FlaubertTokenizer.from_pretrained('flaubert-base-cased')
         model = FlaubertModel.from_pretrained('flaubert-base-cased')
-        input_ids = torch.tensor(tokenizer.encode("Le chat manges une pomme.", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
+        input_ids = torch.tensor(tokenizer.encode("Le chat mange une pomme.", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
         outputs = model(input_ids)
         last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
 
diff --git a/tests/test_doc_samples.py b/tests/test_doc_samples.py
index c97af35200..efed437d9d 100644
--- a/tests/test_doc_samples.py
+++ b/tests/test_doc_samples.py
@@ -78,6 +78,7 @@ class TestCodeExamples(unittest.TestCase):
 
         for file in files:
             # Open all files
+            print("Testing", file, end=" ")
             with open(os.path.join(directory, file)) as f:
                 # Retrieve examples
                 examples = get_examples_from_file(f)
@@ -99,7 +100,7 @@ class TestCodeExamples(unittest.TestCase):
                             joined_examples.append(example)
                             joined_examples_index += 1
 
-                print("Testing", file, str(len(joined_examples)) + "/" + str(len(joined_examples)))
+                print(str(len(joined_examples)) + "/" + str(len(joined_examples)))
 
                 # Execute sub tests with every example.
                 for index, code_example in enumerate(joined_examples):
@@ -114,7 +115,8 @@ class TestCodeExamples(unittest.TestCase):
 
     def test_main_doc_examples(self):
         doc_directory = "docs/source"
-        self.analyze_directory(doc_directory)
+        ignore_files = ["favicon.ico"]
+        self.analyze_directory(doc_directory, ignore_files=ignore_files)
 
     def test_modeling_examples(self):
         transformers_directory = "src/transformers"
@@ -125,5 +127,7 @@ class TestCodeExamples(unittest.TestCase):
             "modeling_tf_auto.py",
             "modeling_utils.py",
             "modeling_tf_t5.py",
+            "modeling_bart.py",
+            "modeling_tf_utils.py"
         ]
         self.analyze_directory(transformers_directory, identifier=modeling_files, ignore_files=ignore_files)

From ff9e79ba3a3dd35c1a7edbd669cf78e082b2f7dc Mon Sep 17 00:00:00 2001
From: Julien Chaumond <chaumond@gmail.com>
Date: Wed, 4 Mar 2020 20:18:07 -0500
Subject: [PATCH 65/80] make style

---
 tests/test_doc_samples.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_doc_samples.py b/tests/test_doc_samples.py
index efed437d9d..9861b2b550 100644
--- a/tests/test_doc_samples.py
+++ b/tests/test_doc_samples.py
@@ -128,6 +128,6 @@ class TestCodeExamples(unittest.TestCase):
             "modeling_utils.py",
             "modeling_tf_t5.py",
             "modeling_bart.py",
-            "modeling_tf_utils.py"
+            "modeling_tf_utils.py",
         ]
         self.analyze_directory(transformers_directory, identifier=modeling_files, ignore_files=ignore_files)

From 30624f7056ae3b607ba1d02f474f2c7986e87dff Mon Sep 17 00:00:00 2001
From: Morgan Funtowicz <morgan@huggingface.co>
Date: Thu, 5 Mar 2020 11:40:15 +0100
Subject: [PATCH 66/80] Fix Colab links + install dependencies first.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>
---
 notebooks/01-training-tokenizers.ipynb | 210 +++++++++++++------------
 notebooks/02-transformers.ipynb        |  14 ++
 notebooks/03-pipelines.ipynb           |  14 ++
 notebooks/README.md                    |   6 +-
 4 files changed, 138 insertions(+), 106 deletions(-)

diff --git a/notebooks/01-training-tokenizers.ipynb b/notebooks/01-training-tokenizers.ipynb
index 554d25d3ff..1a56594961 100644
--- a/notebooks/01-training-tokenizers.ipynb
+++ b/notebooks/01-training-tokenizers.ipynb
@@ -2,6 +2,12 @@
  "cells": [
   {
    "cell_type": "markdown",
+   "metadata": {
+    "pycharm": {
+     "is_executing": false,
+     "name": "#%% md\n"
+    }
+   },
    "source": [
     "## Tokenization doesn't have to be slow !\n",
     "\n",
@@ -81,34 +87,46 @@
     "\n",
     "All of these building blocks can be combined to create working tokenization pipelines. \n",
     "In the next section we will go over our first pipeline."
-   ],
-   "metadata": {
-    "collapsed": false,
-    "pycharm": {
-     "name": "#%% md\n",
-     "is_executing": false
-    }
-   }
+   ]
   },
   {
    "cell_type": "markdown",
+   "metadata": {
+    "pycharm": {
+     "name": "#%% md\n"
+    }
+   },
    "source": [
     "Alright, now we are ready to implement our first tokenization pipeline through `tokenizers`. \n",
     "\n",
     "For this, we will train a Byte-Pair Encoding (BPE) tokenizer on a quite small input for the purpose of this notebook.\n",
-    "We will work with [the file from peter Norving](https://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&uact=8&ved=2ahUKEwjYp9Ppru_nAhUBzIUKHfbUAG8QFjAAegQIBhAB&url=https%3A%2F%2Fnorvig.com%2Fbig.txt&usg=AOvVaw2ed9iwhcP1RKUiEROs15Dz).\n",
-    "This file contains around 130.000 lines of raw text that will be processed by the library to generate a working tokenizer."
-   ],
+    "We will work with [the file from Peter Norvig](https://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&uact=8&ved=2ahUKEwjYp9Ppru_nAhUBzIUKHfbUAG8QFjAAegQIBhAB&url=https%3A%2F%2Fnorvig.com%2Fbig.txt&usg=AOvVaw2ed9iwhcP1RKUiEROs15Dz).\n",
+    "This file contains around 130.000 lines of raw text that will be processed by the library to generate a working tokenizer.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
    "metadata": {
-    "collapsed": false,
     "pycharm": {
-     "name": "#%% md\n"
+     "is_executing": false,
+     "name": "#%% code\n"
     }
-   }
+   },
+   "outputs": [],
+   "source": [
+    "!pip install tokenizers"
+   ]
   },
   {
    "cell_type": "code",
    "execution_count": 2,
+   "metadata": {
+    "pycharm": {
+     "is_executing": false,
+     "name": "#%% code\n"
+    }
+   },
    "outputs": [],
    "source": [
     "BIG_FILE_URL = 'https://raw.githubusercontent.com/dscape/spell/master/test/resources/big.txt'\n",
@@ -122,33 +140,31 @@
     "        big_f.write(response.content)\n",
     "    else:\n",
     "        print(\"Unable to get the file: {}\".format(response.reason))\n"
-   ],
-   "metadata": {
-    "collapsed": false,
-    "pycharm": {
-     "name": "#%% code\n",
-     "is_executing": false
-    }
-   }
+   ]
   },
   {
    "cell_type": "markdown",
+   "metadata": {
+    "pycharm": {
+     "is_executing": false,
+     "name": "#%% md\n"
+    }
+   },
    "source": [
     " \n",
     "Now that we have our training data we need to create the overall pipeline for the tokenizer\n",
     " "
-   ],
-   "metadata": {
-    "collapsed": false,
-    "pycharm": {
-     "name": "#%% md\n",
-     "is_executing": false
-    }
-   }
+   ]
   },
   {
    "cell_type": "code",
    "execution_count": 10,
+   "metadata": {
+    "pycharm": {
+     "is_executing": false,
+     "name": "#%% code\n"
+    }
+   },
    "outputs": [],
    "source": [
     "# For the user's convenience `tokenizers` provides some very high-level classes encapsulating\n",
@@ -165,49 +181,47 @@
     "tokenizer = Tokenizer(BPE.empty())\n",
     "\n",
     "# Then we enable lower-casing and unicode-normalization\n",
-    "# The Sequence normalizer allows us to combine multiple Normalizer, that will be\n",
-    "# executed in sequence.\n",
+    "# The Sequence normalizer allows us to combine multiple Normalizer that will be\n",
+    "# executed in order.\n",
     "tokenizer.normalizer = Sequence([\n",
     "    NFKC(),\n",
     "    Lowercase()\n",
     "])\n",
     "\n",
-    "# Out tokenizer also needs a pre-tokenizer responsible for converting the input to a ByteLevel representation.\n",
+    "# Our tokenizer also needs a pre-tokenizer responsible for converting the input to a ByteLevel representation.\n",
     "tokenizer.pre_tokenizer = ByteLevel()\n",
     "\n",
     "# And finally, let's plug a decoder so we can recover from a tokenized input to the original one\n",
     "tokenizer.decoder = ByteLevelDecoder()"
-   ],
-   "metadata": {
-    "collapsed": false,
-    "pycharm": {
-     "name": "#%% code\n",
-     "is_executing": false
-    }
-   }
+   ]
   },
   {
    "cell_type": "markdown",
-   "source": [
-    "The overall pipeline is now ready to be trained on the corpus we downloaded earlier in this notebook."
-   ],
    "metadata": {
-    "collapsed": false,
     "pycharm": {
      "name": "#%% md\n"
     }
-   }
+   },
+   "source": [
+    "The overall pipeline is now ready to be trained on the corpus we downloaded earlier in this notebook."
+   ]
   },
   {
    "cell_type": "code",
    "execution_count": 11,
+   "metadata": {
+    "pycharm": {
+     "is_executing": false,
+     "name": "#%% code\n"
+    }
+   },
    "outputs": [
     {
      "name": "stdout",
+     "output_type": "stream",
      "text": [
       "Trained vocab size: 25000\n"
-     ],
-     "output_type": "stream"
+     ]
     }
    ],
    "source": [
@@ -218,79 +232,77 @@
     "tokenizer.train(trainer, [\"big.txt\"])\n",
     "\n",
     "print(\"Trained vocab size: {}\".format(tokenizer.get_vocab_size()))"
-   ],
-   "metadata": {
-    "collapsed": false,
-    "pycharm": {
-     "name": "#%% code\n",
-     "is_executing": false
-    }
-   }
+   ]
   },
   {
    "cell_type": "markdown",
+   "metadata": {
+    "pycharm": {
+     "name": "#%% md\n"
+    }
+   },
    "source": [
     "Et voilà ! You trained your very first tokenizer from scratch using `tokenizers`. Of course, this \n",
     "covers only the basics, and you may want to have a look at the `add_special_tokens` or `special_tokens` parameters\n",
     "on the `Trainer` class, but the overall process should be very similar.\n",
     "\n",
     "We can save the content of the model to reuse it later."
-   ],
-   "metadata": {
-    "collapsed": false,
-    "pycharm": {
-     "name": "#%% md\n"
-    }
-   }
+   ]
   },
   {
    "cell_type": "code",
    "execution_count": 12,
+   "metadata": {
+    "pycharm": {
+     "is_executing": false,
+     "name": "#%% code\n"
+    }
+   },
    "outputs": [
     {
      "data": {
-      "text/plain": "['./vocab.json', './merges.txt']"
+      "text/plain": [
+       "['./vocab.json', './merges.txt']"
+      ]
      },
+     "execution_count": 12,
      "metadata": {},
-     "output_type": "execute_result",
-     "execution_count": 12
+     "output_type": "execute_result"
     }
    ],
    "source": [
     "# You will see the generated files in the output.\n",
     "tokenizer.model.save('.')"
-   ],
-   "metadata": {
-    "collapsed": false,
-    "pycharm": {
-     "name": "#%% code\n",
-     "is_executing": false
-    }
-   }
+   ]
   },
   {
    "cell_type": "markdown",
-   "source": [
-    "Now, let load the trained model and start using out newly trained tokenizer"
-   ],
    "metadata": {
-    "collapsed": false,
     "pycharm": {
      "name": "#%% md\n"
     }
-   }
+   },
+   "source": [
+    "Now, let's load the trained model and start using our newly trained tokenizer"
+   ]
   },
   {
    "cell_type": "code",
    "execution_count": 13,
+   "metadata": {
+    "pycharm": {
+     "is_executing": false,
+     "name": "#%% code\n"
+    }
+   },
    "outputs": [
     {
      "name": "stdout",
+     "output_type": "stream",
      "text": [
       "Encoded string: ['Ġthis', 'Ġis', 'Ġa', 'Ġsimple', 'Ġin', 'put', 'Ġto', 'Ġbe', 'Ġtoken', 'ized']\n",
       "Decoded string:  this is a simple input to be tokenized\n"
-     ],
-     "output_type": "stream"
+     ]
     }
    ],
    "source": [
@@ -302,17 +314,15 @@
     "\n",
     "decoded = tokenizer.decode(encoding.ids)\n",
     "print(\"Decoded string: {}\".format(decoded))"
-   ],
-   "metadata": {
-    "collapsed": false,
-    "pycharm": {
-     "name": "#%% code\n",
-     "is_executing": false
-    }
-   }
+   ]
   },
   {
    "cell_type": "markdown",
+   "metadata": {
+    "pycharm": {
+     "name": "#%% md\n"
+    }
+   },
    "source": [
     "The Encoding structure exposes multiple properties which are useful when working with transformers models\n",
     "\n",
@@ -324,13 +334,7 @@
     "- special_token_mask: If your input contains special tokens such as [CLS], [SEP], [MASK], [PAD], then this would be a vector with 1 in places where a special token has been added.\n",
     "- type_ids: If your was made of multiple \"parts\" such as (question, context), then this would be a vector with for each token the segment it belongs to.\n",
     "- overflowing: If your has been truncated into multiple subparts because of a length limit (for BERT for example the sequence length is limited to 512), this will contain all the remaining overflowing parts."
-   ],
-   "metadata": {
-    "collapsed": false,
-    "pycharm": {
-     "name": "#%% md\n"
-    }
-   }
+   ]
   }
  ],
  "metadata": {
@@ -342,25 +346,25 @@
   "language_info": {
    "codemirror_mode": {
     "name": "ipython",
-    "version": 2
+    "version": 3
    },
    "file_extension": ".py",
    "mimetype": "text/x-python",
    "name": "python",
    "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython2",
-   "version": "2.7.6"
+   "pygments_lexer": "ipython3",
+   "version": "3.7.6"
   },
   "pycharm": {
    "stem_cell": {
     "cell_type": "raw",
-    "source": [],
     "metadata": {
      "collapsed": false
-    }
+    },
+    "source": []
    }
   }
  },
  "nbformat": 4,
- "nbformat_minor": 0
-}
\ No newline at end of file
+ "nbformat_minor": 1
+}
diff --git a/notebooks/02-transformers.ipynb b/notebooks/02-transformers.ipynb
index 44655c1e4a..e02d19c5a6 100644
--- a/notebooks/02-transformers.ipynb
+++ b/notebooks/02-transformers.ipynb
@@ -75,6 +75,20 @@
     "in PyTorch and TensorFlow in a transparent and interchangeable way. "
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "outputs": [],
+   "source": [
+    "!pip install transformers"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "pycharm": {
+     "name": "#%% code\n"
+    }
+   }
+  },
   {
    "cell_type": "code",
    "execution_count": 74,
diff --git a/notebooks/03-pipelines.ipynb b/notebooks/03-pipelines.ipynb
index 9a5b3f7c4f..ddaffcee06 100644
--- a/notebooks/03-pipelines.ipynb
+++ b/notebooks/03-pipelines.ipynb
@@ -51,6 +51,20 @@
     "```"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "outputs": [],
+   "source": [
+    "!pip install transformers"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "pycharm": {
+     "name": "#%% code\n"
+    }
+   }
+  },
   {
    "cell_type": "code",
    "execution_count": 29,
diff --git a/notebooks/README.md b/notebooks/README.md
index 9a7d3a4511..234a6cf8ed 100644
--- a/notebooks/README.md
+++ b/notebooks/README.md
@@ -11,7 +11,7 @@ Pull Request and we'll review it so it can be included here.
 
 | Notebook     |      Description      |   |
 |:----------|:-------------:|------:|
-| [Getting Started Tokenizers](01-training-tokenizers.ipynb)  | How to train and use your very own tokenizer  |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/transformers/blob/docker-notebooks/notebooks/01-training-tokenizers.ipynb) |
-| [Getting Started Transformers](02-transformers.ipynb)   | How to easily start using transformers  | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/transformers/blob/docker-notebooks/notebooks/01-training-tokenizers.ipynb) |
-| [How to use Pipelines](03-pipelines.ipynb)  | Simple and efficient way to use State-of-the-Art models on downstream tasks through transformers | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/transformers/blob/docker-notebooks/notebooks/01-training-tokenizers.ipynb) |
+| [Getting Started Tokenizers](01-training-tokenizers.ipynb)  | How to train and use your very own tokenizer  |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/transformers/blob/master/notebooks/01-training-tokenizers.ipynb) |
+| [Getting Started Transformers](02-transformers.ipynb)   | How to easily start using transformers  | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/transformers/blob/master/notebooks/02-transformers.ipynb) |
+| [How to use Pipelines](03-pipelines.ipynb)  | Simple and efficient way to use State-of-the-Art models on downstream tasks through transformers | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/transformers/blob/master/notebooks/03-pipelines.ipynb) |
 | [How to train a language model](https://github.com/huggingface/blog/blob/master/notebooks/01_how_to_train.ipynb)| Highlight all the steps to effectively train Transformer model on custom data | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/blog/blob/master/notebooks/01_how_to_train.ipynb)|

From c47394b0c9ce5ef360bee9efd07779afe06f48a8 Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Thu, 5 Mar 2020 13:12:50 +0100
Subject: [PATCH 67/80] refactoring and bug fixing beam search generate

---
 src/transformers/modeling_utils.py | 80 ++++++++++++++++++------------
 1 file changed, 49 insertions(+), 31 deletions(-)

diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py
index 435e6c132b..8052c1cf50 100644
--- a/src/transformers/modeling_utils.py
+++ b/src/transformers/modeling_utils.py
@@ -15,11 +15,11 @@
 # limitations under the License.
 """PyTorch BERT model."""
 
-
 import logging
 import os
 import typing
 
+import ipdb
 import torch
 from torch import nn
 from torch.nn import CrossEntropyLoss
@@ -758,6 +758,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin):
         else:
             assert input_ids.dim() == 2, "Input prompt should be of shape (batch_size, sequence length)."
 
+        # do not allow duplicate outputs when greedy decoding
         if do_sample is False:
             if num_beams == 1:
                 # no_beam_search greedy generation conditions
@@ -781,15 +782,21 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin):
         cur_len = input_ids.shape[1]
         vocab_size = self.config.vocab_size
 
-        if num_return_sequences != 1 and do_sample:
-            # Expand input to num return sequences
-            input_ids = input_ids.unsqueeze(1).expand(batch_size, num_return_sequences, cur_len)
-            input_ids = input_ids.contiguous().view(
-                batch_size * num_return_sequences, cur_len
-            )  # shape: (batch_size * num_return_sequences, cur_len)
+        # set effective batch size and effective batch multiplier according to do_sample
+        if do_sample:
             effective_batch_size = batch_size * num_return_sequences
+            effective_batch_mult = num_return_sequences
         else:
             effective_batch_size = batch_size
+            effective_batch_mult = 1
+
+        # Expand input ids if num_beams > 1 or num_return_sequences > 1
+        if num_return_sequences > 1 or num_beams > 1:
+            input_ids_len = input_ids.shape[-1]
+            input_ids = input_ids.unsqueeze(1).expand(batch_size, effective_batch_mult * num_beams, input_ids_len)
+            input_ids = input_ids.contiguous().view(
+                effective_batch_size * num_beams, input_ids_len
+            )  # shape: (batch_size * num_return_sequences * num_beams, cur_len)
 
         if num_beams > 1:
             output = self._generate_beam_search(
@@ -892,12 +899,12 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin):
                     # unfinished_sents is set to zero if eos in sentence
                     unfinished_sents.mul_((~eos_in_sents).long())
 
-            cur_len = cur_len + 1
-
             # stop when there is a </s> in each sentence, or if we exceed the maximul length
             if unfinished_sents.max() == 0:
                 break
 
+            cur_len = cur_len + 1
+
         # if there are different sentences lengths in the batch, some batches have to be padded
         if sent_lengths.min().item() != sent_lengths.max().item():
             assert pad_token_id is not None, "`Pad_token_id` has to be defined if batches have different lengths"
@@ -932,10 +939,6 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin):
         """ Generate sequences for each example with beam search.
         """
 
-        # Expand input to num beams
-        input_ids = input_ids.unsqueeze(1).expand(batch_size, num_beams, cur_len)
-        input_ids = input_ids.contiguous().view(batch_size * num_beams, cur_len)  # (batch_size * num_beams, cur_len)
-
         # generated hypotheses
         generated_hyps = [
             BeamHypotheses(num_beams, max_length, length_penalty, early_stopping=False) for _ in range(batch_size)
@@ -945,8 +948,8 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin):
         beam_scores = torch.zeros((batch_size, num_beams), dtype=torch.float, device=input_ids.device)
 
         # Greedy decoding it is made sure that only tokens of the first beam are considered to avoid sampling the exact same tokens three times
-        if do_sample is False:
-            beam_scores[:, 1:] = -1e9
+        #        if do_sample is False:
+        beam_scores[:, 1:] = -1e9
         beam_scores = beam_scores.view(-1)  # shape (batch_size * num_beams,)
 
         # cache compute states
@@ -996,6 +999,9 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin):
                 # Compute next scores
                 next_scores = torch.gather(_scores, -1, next_tokens)  # (batch_size, num_beams * 2)
 
+                # sort the sampled vector to make sure that the first num_beams samples are the best
+                next_scores, next_scores_indices = torch.sort(next_scores, descending=True, dim=1)
+                next_tokens = torch.gather(next_tokens, -1, next_scores_indices)  # (batch_size, num_beams * 2)
             else:
                 # do greedy beam search
                 scores = F.log_softmax(next_token_logits, dim=-1)  # (batch_size * num_beams, vocab_size)
@@ -1006,6 +1012,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin):
                 next_scores = next_scores.view(
                     batch_size, num_beams * vocab_size
                 )  # (batch_size, num_beams * vocab_size)
+
                 next_scores, next_tokens = torch.topk(next_scores, 2 * num_beams, dim=1, largest=True, sorted=True)
 
             assert next_scores.size() == next_tokens.size() == (batch_size, 2 * num_beams)
@@ -1041,14 +1048,15 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin):
                     beam_id = idx // vocab_size
                     token_id = idx % vocab_size
 
-                    # add to generated hypotheses if end of sentence or last iteration
+                    effective_beam_id = batch_idx * num_beams + beam_id
+                    # add to generated hypotheses if end of sentence
                     if eos_token_ids is not None and token_id.item() in eos_token_ids:
                         generated_hyps[batch_idx].add(
-                            input_ids[batch_idx * num_beams + beam_id, :cur_len].clone(), score.item(),
+                            input_ids[effective_beam_id].clone(), score.item(),
                         )
                     else:
                         # add next predicted word if it is not eos_token
-                        next_sent_beam.append((score, token_id, batch_idx * num_beams + beam_id))
+                        next_sent_beam.append((score, token_id, effective_beam_id))
 
                     # the beam for next step is full
                     if len(next_sent_beam) == num_beams:
@@ -1073,24 +1081,34 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin):
             if past:
                 past = self._reorder_cache(past, beam_idx)
 
-            # update current length
-            cur_len = cur_len + 1
-
             # stop when we are done with each sentence
             if all(done):
                 break
 
-        for batch_idx in range(batch_size):
-            # Add all open beam hypothesis to generated_hyps
-            if not done[batch_idx]:
-                for idx, score in zip(next_tokens[batch_idx], next_scores[batch_idx]):
+            # update current length
+            cur_len = cur_len + 1
 
-                    # get beam and word IDs
-                    beam_id = idx // vocab_size
-                    token_id = idx % vocab_size
-                    generated_hyps[batch_idx].add(
-                        input_ids[batch_idx * num_beams + beam_id, :cur_len].clone(), score.item()
-                    )
+        # finalize all open beam hypotheses and add them to generated hypotheses
+        for batch_idx in range(batch_size):
+            if done[batch_idx]:
+                continue
+
+            # test that beam scores match previously calculated scores if not eos and batch_idx not done
+            if eos_token_ids is not None and all(
+                (token_id % vocab_size).item() not in eos_token_ids for token_id in next_tokens[batch_idx]
+            ):
+                assert torch.all(
+                    next_scores[batch_idx, :num_beams] == beam_scores.view(batch_size, num_beams)[batch_idx]
+                ), "If batch_idx is not done, final next scores: {} have to equal to accumulated beam_scores: {}".format(
+                    next_scores[:, :num_beams][batch_idx], beam_scores.view(batch_size, num_beams)[batch_idx]
+                )
+
+            # need to add best num_beams hypotheses to generated hyps
+            for beam_id in range(num_beams):
+                effective_beam_id = batch_idx * num_beams + beam_id
+                final_score = beam_scores[effective_beam_id].item()
+                final_tokens = input_ids[effective_beam_id]
+                generated_hyps[batch_idx].add(final_tokens, final_score)
 
         # depending on whether greedy generation is wanted or not define different output_batch_size and output_num_return_sequences_per_batch
         output_batch_size = batch_size if do_sample else batch_size * num_return_sequences

From 4220fd52b92cd96da6ac461c8b6e99912dca8dfc Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Thu, 5 Mar 2020 13:36:21 +0100
Subject: [PATCH 68/80] remove ipdb

---
 src/transformers/modeling_utils.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py
index 8052c1cf50..60b4fa53ab 100644
--- a/src/transformers/modeling_utils.py
+++ b/src/transformers/modeling_utils.py
@@ -19,7 +19,6 @@ import logging
 import os
 import typing
 
-import ipdb
 import torch
 from torch import nn
 from torch.nn import CrossEntropyLoss

From e33ed12c3b45677faf8d64dd42aa9cd5d8630a55 Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Thu, 5 Mar 2020 13:41:04 +0100
Subject: [PATCH 69/80] uncomment expression

---
 src/transformers/modeling_utils.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py
index 60b4fa53ab..7dd0e873dc 100644
--- a/src/transformers/modeling_utils.py
+++ b/src/transformers/modeling_utils.py
@@ -945,10 +945,9 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin):
 
         # scores for each sentence in the beam
         beam_scores = torch.zeros((batch_size, num_beams), dtype=torch.float, device=input_ids.device)
-
         # Greedy decoding it is made sure that only tokens of the first beam are considered to avoid sampling the exact same tokens three times
-        #        if do_sample is False:
-        beam_scores[:, 1:] = -1e9
+        if do_sample is False:
+            beam_scores[:, 1:] = -1e9
         beam_scores = beam_scores.view(-1)  # shape (batch_size * num_beams,)
 
         # cache compute states

From 012cbdb0f5759a89b357027a84590d6df3007fcf Mon Sep 17 00:00:00 2001
From: Morgan Funtowicz <morgan@huggingface.co>
Date: Thu, 5 Mar 2020 15:34:15 +0100
Subject: [PATCH 70/80] Updating colab links in notebooks README.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>
---
 notebooks/README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/notebooks/README.md b/notebooks/README.md
index 234a6cf8ed..832df1ad6a 100644
--- a/notebooks/README.md
+++ b/notebooks/README.md
@@ -12,6 +12,6 @@ Pull Request and we'll review it so it can be included here.
 | Notebook     |      Description      |   |
 |:----------|:-------------:|------:|
 | [Getting Started Tokenizers](01-training-tokenizers.ipynb)  | How to train and use your very own tokenizer  |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/transformers/blob/master/notebooks/01-training-tokenizers.ipynb) |
-| [Getting Started Transformers](02-transformers.ipynb)   | How to easily start using transformers  | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/transformers/blob/master/notebooks/01-training-tokenizers.ipynb) |
-| [How to use Pipelines](03-pipelines.ipynb)  | Simple and efficient way to use State-of-the-Art models on downstream tasks through transformers | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/transformers/blob/master/notebooks/01-training-tokenizers.ipynb) |
+| [Getting Started Transformers](02-transformers.ipynb)   | How to easily start using transformers  | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/transformers/blob/master/notebooks/02-transformers.ipynb) |
+| [How to use Pipelines](03-pipelines.ipynb)  | Simple and efficient way to use State-of-the-Art models on downstream tasks through transformers | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/transformers/blob/master/notebooks/03-pipelines.ipynb) |
 | [How to train a language model](https://github.com/huggingface/blog/blob/master/notebooks/01_how_to_train.ipynb)| Highlight all the steps to effectively train Transformer model on custom data | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/blog/blob/master/notebooks/01_how_to_train.ipynb)|

From 8a2d9bc9ef38452e80ce872505a5ad5623c12657 Mon Sep 17 00:00:00 2001
From: Julien Chaumond <chaumond@gmail.com>
Date: Thu, 5 Mar 2020 09:34:43 -0500
Subject: [PATCH 71/80] Add model cards for DeepPavlov models (#3138)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* add empty model cards for every current DeepPavlov model

* fix: replace cyrillic `с` with `c`

* docs: add model cards for current DeepPavlov BERT models

* docs: add links for arXiv preprints
---
 .../bert-base-bg-cs-pl-ru-cased/README.md     | 18 +++++++++++++++
 .../bert-base-cased-conversational/README.md  | 23 +++++++++++++++++++
 .../README.md                                 | 22 ++++++++++++++++++
 .../README.md                                 | 18 +++++++++++++++
 .../rubert-base-cased-sentence/README.md      | 21 +++++++++++++++++
 .../DeepPavlov/rubert-base-cased/README.md    | 14 +++++++++++
 6 files changed, 116 insertions(+)
 create mode 100644 model_cards/DeepPavlov/bert-base-bg-cs-pl-ru-cased/README.md
 create mode 100644 model_cards/DeepPavlov/bert-base-cased-conversational/README.md
 create mode 100644 model_cards/DeepPavlov/bert-base-multilingual-cased-sentence/README.md
 create mode 100644 model_cards/DeepPavlov/rubert-base-cased-conversational/README.md
 create mode 100644 model_cards/DeepPavlov/rubert-base-cased-sentence/README.md
 create mode 100644 model_cards/DeepPavlov/rubert-base-cased/README.md

diff --git a/model_cards/DeepPavlov/bert-base-bg-cs-pl-ru-cased/README.md b/model_cards/DeepPavlov/bert-base-bg-cs-pl-ru-cased/README.md
new file mode 100644
index 0000000000..7e4aa0c461
--- /dev/null
+++ b/model_cards/DeepPavlov/bert-base-bg-cs-pl-ru-cased/README.md
@@ -0,0 +1,18 @@
+---
+language:
+- bulgarian
+- czech
+- polish
+- russian
+---
+
+# bert-base-bg-cs-pl-ru-cased
+
+SlavicBERT\[1\] \(Slavic \(bg, cs, pl, ru\), cased, 12-layer, 768-hidden, 12-heads, 180M parameters\) was trained
+on Russian News and four Wikipedias: Bulgarian, Czech, Polish, and Russian.
+Subtoken vocabulary was built using this data. Multilingual BERT was used as an initialization for SlavicBERT.
+
+
+\[1\]: Arkhipov M., Trofimova M., Kuratov Y., Sorokin A. \(2019\).
+[Tuning Multilingual Transformers for Language-Specific Named Entity Recognition](https://www.aclweb.org/anthology/W19-3712/).
+ACL anthology W19-3712.
diff --git a/model_cards/DeepPavlov/bert-base-cased-conversational/README.md b/model_cards/DeepPavlov/bert-base-cased-conversational/README.md
new file mode 100644
index 0000000000..357527d232
--- /dev/null
+++ b/model_cards/DeepPavlov/bert-base-cased-conversational/README.md
@@ -0,0 +1,23 @@
+---
+language:
+- english
+---
+
+# bert-base-cased-conversational
+
+Conversational BERT \(English, cased, 12-layer, 768-hidden, 12-heads, 110M parameters\) was trained
+on the English part of Twitter, Reddit, DailyDialogues\[1\], OpenSubtitles\[2\], Debates\[3\], Blogs\[4\],
+Facebook News Comments. We used this training data to build the vocabulary of English subtokens and took
+the English cased version of BERT-base as an initialization for English Conversational BERT.
+
+
+\[1\]: Yanran Li, Hui Su, Xiaoyu Shen, Wenjie Li, Ziqiang Cao, and Shuzi Niu. DailyDialog: A Manually Labelled
+Multi-turn Dialogue Dataset. IJCNLP 2017.
+
+\[2\]: P. Lison and J. Tiedemann, 2016, OpenSubtitles2016: Extracting Large Parallel Corpora from Movie and TV Subtitles.
+In Proceedings of the 10th International Conference on Language Resources and Evaluation \(LREC 2016\)
+
+\[3\]: Justine Zhang, Ravi Kumar, Sujith Ravi, Cristian Danescu-Niculescu-Mizil. Conversational Flow in Oxford-style Debates. Proceedings of NAACL, 2016.
+
+\[4\]: J. Schler, M. Koppel, S. Argamon and J. Pennebaker \(2006\). Effects of Age and Gender on Blogging
+in Proceedings of 2006 AAAI Spring Symposium on Computational Approaches for Analyzing Weblogs.
diff --git a/model_cards/DeepPavlov/bert-base-multilingual-cased-sentence/README.md b/model_cards/DeepPavlov/bert-base-multilingual-cased-sentence/README.md
new file mode 100644
index 0000000000..1e07210e77
--- /dev/null
+++ b/model_cards/DeepPavlov/bert-base-multilingual-cased-sentence/README.md
@@ -0,0 +1,22 @@
+---
+language:
+- multilingual
+---
+
+# bert-base-multilingual-cased-sentence
+
+Sentence Multilingual BERT \(101 languages, cased, 12-layer, 768-hidden, 12-heads, 180M parameters\)
+is a representation-based sentence encoder for 101 languages of Multilingual BERT.
+It is initialized with Multilingual BERT and then fine-tuned on English MultiNLI\[1\] and on the dev set
+of multilingual XNLI\[2\].
+Sentence representations are mean pooled token embeddings in the same manner as in Sentence-BERT\[3\].
+
+
+\[1\]: Williams A., Nangia N. & Bowman S. \(2017\) A Broad-Coverage Challenge Corpus for Sentence Understanding
+through Inference. arXiv preprint [arXiv:1704.05426](https://arxiv.org/abs/1704.05426)
+
+\[2\]: Conneau A., Rinott R., Lample G., Williams A., Bowman S., Schwenk H., Stoyanov V. \(2018\) XNLI: Evaluating Cross-lingual Sentence Representations.
+arXiv preprint [arXiv:1809.05053](https://arxiv.org/abs/1809.05053)
+
+\[3\]: N. Reimers, I. Gurevych \(2019\) Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks.
+arXiv preprint [arXiv:1908.10084](https://arxiv.org/abs/1908.10084)
diff --git a/model_cards/DeepPavlov/rubert-base-cased-conversational/README.md b/model_cards/DeepPavlov/rubert-base-cased-conversational/README.md
new file mode 100644
index 0000000000..4ea20c2cd1
--- /dev/null
+++ b/model_cards/DeepPavlov/rubert-base-cased-conversational/README.md
@@ -0,0 +1,18 @@
+---
+language:
+- russian
+---
+
+# rubert-base-cased-conversational
+
+Conversational RuBERT \(Russian, cased, 12-layer, 768-hidden, 12-heads, 180M parameters\) was trained
+on OpenSubtitles\[1\], [Dirty](https://d3.ru/), [Pikabu](https://pikabu.ru/),
+and a Social Media segment of Taiga corpus\[2\]. We assembled a new vocabulary for Conversational RuBERT model
+on this data and initialized the model with [RuBERT](../rubert-base-cased).
+
+
+\[1\]: P. Lison and J. Tiedemann, 2016, OpenSubtitles2016: Extracting Large Parallel Corpora from Movie and TV Subtitles.
+In Proceedings of the 10th International Conference on Language Resources and Evaluation \(LREC 2016\)
+
+\[2\]: Shavrina T., Shapovalova O. \(2017\) TO THE METHODOLOGY OF CORPUS CONSTRUCTION FOR MACHINE LEARNING:
+«TAIGA» SYNTAX TREE CORPUS AND PARSER. In Proc. of “CORPORA2017”, international conference, Saint-Petersbourg, 2017.
diff --git a/model_cards/DeepPavlov/rubert-base-cased-sentence/README.md b/model_cards/DeepPavlov/rubert-base-cased-sentence/README.md
new file mode 100644
index 0000000000..9bac38460f
--- /dev/null
+++ b/model_cards/DeepPavlov/rubert-base-cased-sentence/README.md
@@ -0,0 +1,21 @@
+---
+language:
+- russian
+---
+
+# rubert-base-cased-sentence
+
+Sentence RuBERT \(Russian, cased, 12-layer, 768-hidden, 12-heads, 180M parameters\)
+is a representation-based sentence encoder for Russian. It is initialized with RuBERT and fine-tuned on SNLI\[1\]
+Google-translated to Russian and on the Russian part of the XNLI dev set\[2\]. Sentence representations are mean pooled
+token embeddings in the same manner as in Sentence-BERT\[3\].
+
+
+\[1\]: S. R. Bowman, G. Angeli, C. Potts, and C. D. Manning. \(2015\) A large annotated corpus for learning
+natural language inference. arXiv preprint [arXiv:1508.05326](https://arxiv.org/abs/1508.05326)
+
+\[2\]: Conneau A., Rinott R., Lample G., Williams A., Bowman S., Schwenk H., Stoyanov V. \(2018\) XNLI: Evaluating Cross-lingual Sentence Representations.
+arXiv preprint [arXiv:1809.05053](https://arxiv.org/abs/1809.05053)
+
+\[3\]: N. Reimers, I. Gurevych \(2019\) Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks.
+arXiv preprint [arXiv:1908.10084](https://arxiv.org/abs/1908.10084)
diff --git a/model_cards/DeepPavlov/rubert-base-cased/README.md b/model_cards/DeepPavlov/rubert-base-cased/README.md
new file mode 100644
index 0000000000..36e12cdeff
--- /dev/null
+++ b/model_cards/DeepPavlov/rubert-base-cased/README.md
@@ -0,0 +1,14 @@
+---
+language:
+- russian
+---
+
+# rubert-base-cased
+
+RuBERT \(Russian, cased, 12-layer, 768-hidden, 12-heads, 180M parameters\) was trained on the Russian part of Wikipedia
+and news data. We used this training data to build a vocabulary of Russian subtokens and took a multilingual version
+of BERT-base as an initialization for RuBERT\[1\].
+
+
+\[1\]: Kuratov, Y., Arkhipov, M. \(2019\). Adaptation of Deep Bidirectional Multilingual Transformers for Russian Language.
+arXiv preprint [arXiv:1905.07213](https://arxiv.org/abs/1905.07213).

From be02176a4b70570049a00427f61132c8e897da19 Mon Sep 17 00:00:00 2001
From: Morgan Funtowicz <morgan@huggingface.co>
Date: Thu, 5 Mar 2020 16:00:38 +0100
Subject: [PATCH 72/80] Fixing sentiment pipeline in 03-pipelines notebook.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>
---
 notebooks/03-pipelines.ipynb | 239 +++++++++++------------------------
 1 file changed, 71 insertions(+), 168 deletions(-)

diff --git a/notebooks/03-pipelines.ipynb b/notebooks/03-pipelines.ipynb
index ddaffcee06..483fbe758f 100644
--- a/notebooks/03-pipelines.ipynb
+++ b/notebooks/03-pipelines.ipynb
@@ -67,27 +67,16 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 29,
+   "execution_count": 6,
    "metadata": {
     "pycharm": {
      "is_executing": false,
      "name": "#%% code \n"
     }
    },
-   "outputs": [
-    {
-     "ename": "SyntaxError",
-     "evalue": "from __future__ imports must occur at the beginning of the file (<ipython-input-29-c3a037bd4c55>, line 5)",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;36m  File \u001b[0;32m\"<ipython-input-29-c3a037bd4c55>\"\u001b[0;36m, line \u001b[0;32m5\u001b[0m\n\u001b[0;31m    from transformers import pipeline\u001b[0m\n\u001b[0m           ^\u001b[0m\n\u001b[0;31mSyntaxError\u001b[0m\u001b[0;31m:\u001b[0m from __future__ imports must occur at the beginning of the file\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
-    "import numpy as np\n",
     "from __future__ import print_function\n",
-    "from ipywidgets import interact, interactive, fixed, interact_manual\n",
     "import ipywidgets as widgets\n",
     "from transformers import pipeline"
    ]
@@ -105,7 +94,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 8,
    "metadata": {
     "pycharm": {
      "is_executing": false,
@@ -115,40 +104,35 @@
    "outputs": [
     {
      "data": {
+      "text/plain": "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=230.0, style=ProgressStyle(description_…",
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "6aeccfdf51994149bdd1f3d3533e380f",
        "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=230.0, style=ProgressStyle(description_…"
-      ]
+       "version_minor": 0,
+       "model_id": "c9db53f30b9446c0af03268633a966c0"
+      }
      },
      "metadata": {},
      "output_type": "display_data"
     },
     {
      "name": "stdout",
-     "output_type": "stream",
      "text": [
       "\n"
-     ]
+     ],
+     "output_type": "stream"
     },
     {
      "data": {
-      "text/plain": [
-       "[{'label': 'POSITIVE', 'score': 0.800251},\n",
-       " {'label': 'NEGATIVE', 'score': 1.2489903}]"
-      ]
+      "text/plain": "[{'label': 'POSITIVE', 'score': 0.9997656}]"
      },
-     "execution_count": 6,
      "metadata": {},
-     "output_type": "execute_result"
+     "output_type": "execute_result",
+     "execution_count": 8
     }
    ],
    "source": [
     "nlp_sentence_classif = pipeline('sentiment-analysis')\n",
-    "nlp_sentence_classif(['Such a nice weather outside !', 'This movie was kind of boring.'])"
+    "nlp_sentence_classif('Such a nice weather outside !')"
    ]
   },
   {
@@ -164,7 +148,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 9,
    "metadata": {
     "pycharm": {
      "is_executing": false,
@@ -174,40 +158,30 @@
    "outputs": [
     {
      "data": {
+      "text/plain": "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=230.0, style=ProgressStyle(description_…",
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "b5549c53c27346a899af553c977f00bc",
        "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=230.0, style=ProgressStyle(description_…"
-      ]
+       "version_minor": 0,
+       "model_id": "1e300789e22644f1aed66a5ed60e75c4"
+      }
      },
      "metadata": {},
      "output_type": "display_data"
     },
     {
      "name": "stdout",
-     "output_type": "stream",
      "text": [
       "\n"
-     ]
+     ],
+     "output_type": "stream"
     },
     {
      "data": {
-      "text/plain": [
-       "[{'word': 'Hu', 'score': 0.9970937967300415, 'entity': 'I-ORG'},\n",
-       " {'word': '##gging', 'score': 0.9345750212669373, 'entity': 'I-ORG'},\n",
-       " {'word': 'Face', 'score': 0.9787060022354126, 'entity': 'I-ORG'},\n",
-       " {'word': 'French', 'score': 0.9981995820999146, 'entity': 'I-MISC'},\n",
-       " {'word': 'New', 'score': 0.9983047246932983, 'entity': 'I-LOC'},\n",
-       " {'word': '-', 'score': 0.8913455009460449, 'entity': 'I-LOC'},\n",
-       " {'word': 'York', 'score': 0.9979523420333862, 'entity': 'I-LOC'}]"
-      ]
+      "text/plain": "[{'word': 'Hu', 'score': 0.9970937967300415, 'entity': 'I-ORG'},\n {'word': '##gging', 'score': 0.9345750212669373, 'entity': 'I-ORG'},\n {'word': 'Face', 'score': 0.9787060022354126, 'entity': 'I-ORG'},\n {'word': 'French', 'score': 0.9981995820999146, 'entity': 'I-MISC'},\n {'word': 'New', 'score': 0.9983047246932983, 'entity': 'I-LOC'},\n {'word': '-', 'score': 0.8913455009460449, 'entity': 'I-LOC'},\n {'word': 'York', 'score': 0.9979523420333862, 'entity': 'I-LOC'}]"
      },
-     "execution_count": 16,
      "metadata": {},
-     "output_type": "execute_result"
+     "output_type": "execute_result",
+     "execution_count": 9
     }
    ],
    "source": [
@@ -224,7 +198,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 10,
    "metadata": {
     "pycharm": {
      "is_executing": false,
@@ -234,42 +208,38 @@
    "outputs": [
     {
      "data": {
+      "text/plain": "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=230.0, style=ProgressStyle(description_…",
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "6e56a8edcef44ec2ae838711ecd22d3a",
        "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=230.0, style=ProgressStyle(description_…"
-      ]
+       "version_minor": 0,
+       "model_id": "82aca58f1ea24b4cb37f16402e8a5923"
+      }
      },
      "metadata": {},
      "output_type": "display_data"
     },
     {
      "name": "stdout",
-     "output_type": "stream",
      "text": [
       "\n"
-     ]
+     ],
+     "output_type": "stream"
     },
     {
      "name": "stderr",
-     "output_type": "stream",
      "text": [
-      "convert squad examples to features: 100%|██████████| 1/1 [00:00<00:00, 53.05it/s]\n",
-      "add example index and unique id: 100%|██████████| 1/1 [00:00<00:00, 2673.23it/s]\n"
-     ]
+      "convert squad examples to features: 100%|██████████| 1/1 [00:00<00:00, 225.51it/s]\n",
+      "add example index and unique id: 100%|██████████| 1/1 [00:00<00:00, 2158.67it/s]\n"
+     ],
+     "output_type": "stream"
     },
     {
      "data": {
-      "text/plain": [
-       "{'score': 0.9632966867654424, 'start': 42, 'end': 50, 'answer': 'New-York.'}"
-      ]
+      "text/plain": "{'score': 0.9632966867654424, 'start': 42, 'end': 50, 'answer': 'New-York.'}"
      },
-     "execution_count": 18,
      "metadata": {},
-     "output_type": "execute_result"
+     "output_type": "execute_result",
+     "execution_count": 10
     }
    ],
    "source": [
@@ -286,7 +256,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 20,
+   "execution_count": 11,
    "metadata": {
     "pycharm": {
      "is_executing": false,
@@ -296,48 +266,30 @@
    "outputs": [
     {
      "data": {
+      "text/plain": "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=230.0, style=ProgressStyle(description_…",
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "1930695ea2d24ca98c6d7c13842d377f",
        "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=230.0, style=ProgressStyle(description_…"
-      ]
+       "version_minor": 0,
+       "model_id": "49df2227b4fa4eb28dcdcfc3d9261d0f"
+      }
      },
      "metadata": {},
      "output_type": "display_data"
     },
     {
      "name": "stdout",
-     "output_type": "stream",
      "text": [
       "\n"
-     ]
+     ],
+     "output_type": "stream"
     },
     {
      "data": {
-      "text/plain": [
-       "[{'sequence': '<s> Hugging Face is a French company based in Paris</s>',\n",
-       "  'score': 0.25288480520248413,\n",
-       "  'token': 2201},\n",
-       " {'sequence': '<s> Hugging Face is a French company based in Lyon</s>',\n",
-       "  'score': 0.07639515399932861,\n",
-       "  'token': 12790},\n",
-       " {'sequence': '<s> Hugging Face is a French company based in Brussels</s>',\n",
-       "  'score': 0.055500105023384094,\n",
-       "  'token': 6497},\n",
-       " {'sequence': '<s> Hugging Face is a French company based in Geneva</s>',\n",
-       "  'score': 0.04264815151691437,\n",
-       "  'token': 11559},\n",
-       " {'sequence': '<s> Hugging Face is a French company based in France</s>',\n",
-       "  'score': 0.03868963569402695,\n",
-       "  'token': 1470}]"
-      ]
+      "text/plain": "[{'sequence': '<s> Hugging Face is a French company based in Paris</s>',\n  'score': 0.23106691241264343,\n  'token': 2201},\n {'sequence': '<s> Hugging Face is a French company based in Lyon</s>',\n  'score': 0.0819825753569603,\n  'token': 12790},\n {'sequence': '<s> Hugging Face is a French company based in Geneva</s>',\n  'score': 0.04769463092088699,\n  'token': 11559},\n {'sequence': '<s> Hugging Face is a French company based in Brussels</s>',\n  'score': 0.047622501850128174,\n  'token': 6497},\n {'sequence': '<s> Hugging Face is a French company based in France</s>',\n  'score': 0.04130595177412033,\n  'token': 1470}]"
      },
-     "execution_count": 20,
      "metadata": {},
-     "output_type": "execute_result"
+     "output_type": "execute_result",
+     "execution_count": 11
     }
    ],
    "source": [
@@ -354,7 +306,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 32,
+   "execution_count": 12,
    "metadata": {
     "pycharm": {
      "is_executing": false,
@@ -364,34 +316,30 @@
    "outputs": [
     {
      "data": {
+      "text/plain": "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=230.0, style=ProgressStyle(description_…",
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "92fa4d67290f49a3943dc0abd7529892",
        "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=230.0, style=ProgressStyle(description_…"
-      ]
+       "version_minor": 0,
+       "model_id": "2af4cfb19e3243dda014d0f56b48f4b2"
+      }
      },
      "metadata": {},
      "output_type": "display_data"
     },
     {
      "name": "stdout",
-     "output_type": "stream",
      "text": [
       "\n"
-     ]
+     ],
+     "output_type": "stream"
     },
     {
      "data": {
-      "text/plain": [
-       "(1, 12, 768)"
-      ]
+      "text/plain": "(1, 12, 768)"
      },
-     "execution_count": 32,
      "metadata": {},
-     "output_type": "execute_result"
+     "output_type": "execute_result",
+     "execution_count": 12
     }
    ],
    "source": [
@@ -417,7 +365,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 40,
+   "execution_count": 13,
    "metadata": {
     "pycharm": {
      "is_executing": false,
@@ -427,41 +375,27 @@
    "outputs": [
     {
      "data": {
+      "text/plain": "Dropdown(description='Task:', index=1, options=('sentiment-analysis', 'ner', 'fill_mask'), value='ner')",
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "261ae9fa30e84d1d84a3b0d9682ac477",
        "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Dropdown(description='Task:', index=1, options=('sentiment-analysis', 'ner', 'fill_mask'), value='ner')"
-      ]
+       "version_minor": 0,
+       "model_id": "10bac065d46f4e4d9a8498dcc8104ecd"
+      }
      },
      "metadata": {},
      "output_type": "display_data"
     },
     {
      "data": {
+      "text/plain": "Text(value='', description='Your input:', placeholder='Enter something')",
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "ddc51b71c6eb40e5ab60998664e6a857",
        "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Text(value='', description='Your input:', placeholder='Enter something')"
-      ]
+       "version_minor": 0,
+       "model_id": "2c5f1411f7a94714bc00f01b0e3b27b2"
+      }
      },
      "metadata": {},
      "output_type": "display_data"
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[{'word': 'Paris', 'score': 0.9991844296455383, 'entity': 'I-LOC'}]\n",
-      "[{'sequence': '<s> I\\'m from Paris.\"</s>', 'score': 0.224044069647789, 'token': 72}, {'sequence': \"<s> I'm from Paris.)</s>\", 'score': 0.16959427297115326, 'token': 1592}, {'sequence': \"<s> I'm from Paris.]</s>\", 'score': 0.10994981974363327, 'token': 21838}, {'sequence': '<s> I\\'m from Paris!\"</s>', 'score': 0.0706234946846962, 'token': 2901}, {'sequence': \"<s> I'm from Paris.</s>\", 'score': 0.0698278620839119, 'token': 4}]\n",
-      "[{'sequence': \"<s> I'm from Paris and London</s>\", 'score': 0.12238534539937973, 'token': 928}, {'sequence': \"<s> I'm from Paris and Brussels</s>\", 'score': 0.07107886672019958, 'token': 6497}, {'sequence': \"<s> I'm from Paris and Belgium</s>\", 'score': 0.040912602096796036, 'token': 7320}, {'sequence': \"<s> I'm from Paris and Berlin</s>\", 'score': 0.039884064346551895, 'token': 5459}, {'sequence': \"<s> I'm from Paris and Melbourne</s>\", 'score': 0.038133684545755386, 'token': 5703}]\n",
-      "[{'sequence': '<s> I like go to sleep</s>', 'score': 0.08942786604166031, 'token': 3581}, {'sequence': '<s> I like go to bed</s>', 'score': 0.07789064943790436, 'token': 3267}, {'sequence': '<s> I like go to concerts</s>', 'score': 0.06356740742921829, 'token': 12858}, {'sequence': '<s> I like go to school</s>', 'score': 0.03660670667886734, 'token': 334}, {'sequence': '<s> I like go to dinner</s>', 'score': 0.032155368477106094, 'token': 3630}]\n"
-     ]
     }
    ],
    "source": [
@@ -498,7 +432,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 43,
+   "execution_count": 14,
    "metadata": {
     "pycharm": {
      "is_executing": false,
@@ -508,46 +442,15 @@
    "outputs": [
     {
      "data": {
+      "text/plain": "Textarea(value='Einstein is famous for the general theory of relativity', description='Context:', placeholder=…",
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "5ae68677bd8a41f990355aa43840d3f8",
        "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Textarea(value='Einstein is famous for the general theory of relativity', description='Context:', placeholder=…"
-      ]
+       "version_minor": 0,
+       "model_id": "019fde2343634e94b6f32d04f6350ec1"
+      }
      },
      "metadata": {},
      "output_type": "display_data"
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "14bcfd9a2c5a47e6b1383989ab7632c8",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Text(value='Why is Einstein famous for ?', description='Question:', placeholder='Enter something')"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "convert squad examples to features: 100%|██████████| 1/1 [00:00<00:00, 168.83it/s]\n",
-      "add example index and unique id: 100%|██████████| 1/1 [00:00<00:00, 1919.59it/s]\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "{'score': 0.40340670623875496, 'start': 27, 'end': 54, 'answer': 'general theory of relativity'}\n"
-     ]
     }
    ],
    "source": [

From 7ac47bfe69f25fc7381be65870b2f4e5cdb8cb6a Mon Sep 17 00:00:00 2001
From: Morgan Funtowicz <morgan@huggingface.co>
Date: Thu, 5 Mar 2020 16:07:43 +0100
Subject: [PATCH 73/80] Updated notebook dependencies for Colab.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>
---
 notebooks/02-transformers.ipynb | 185 +++++++++++++++++++++++---------
 1 file changed, 132 insertions(+), 53 deletions(-)

diff --git a/notebooks/02-transformers.ipynb b/notebooks/02-transformers.ipynb
index e02d19c5a6..40ba7bc2bc 100644
--- a/notebooks/02-transformers.ipynb
+++ b/notebooks/02-transformers.ipynb
@@ -77,21 +77,93 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "outputs": [],
-   "source": [
-    "!pip install transformers"
-   ],
+   "execution_count": 1,
    "metadata": {
-    "collapsed": false,
     "pycharm": {
+     "is_executing": false,
      "name": "#%% code\n"
+    },
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Requirement already satisfied: transformers in /usr/local/Caskroom/miniconda/base/envs/huggingface/lib/python3.7/site-packages (2.5.1)\n",
+      "Requirement already satisfied: filelock in /usr/local/Caskroom/miniconda/base/envs/huggingface/lib/python3.7/site-packages (from transformers) (3.0.12)\n",
+      "Requirement already satisfied: sentencepiece in /usr/local/Caskroom/miniconda/base/envs/huggingface/lib/python3.7/site-packages (from transformers) (0.1.83)\n",
+      "Requirement already satisfied: boto3 in /usr/local/Caskroom/miniconda/base/envs/huggingface/lib/python3.7/site-packages (from transformers) (1.12.0)\n",
+      "Requirement already satisfied: requests in /usr/local/Caskroom/miniconda/base/envs/huggingface/lib/python3.7/site-packages (from transformers) (2.22.0)\n",
+      "Requirement already satisfied: numpy in /usr/local/Caskroom/miniconda/base/envs/huggingface/lib/python3.7/site-packages (from transformers) (1.18.1)\n",
+      "Requirement already satisfied: sacremoses in /usr/local/Caskroom/miniconda/base/envs/huggingface/lib/python3.7/site-packages (from transformers) (0.0.35)\n",
+      "Requirement already satisfied: tokenizers==0.5.2 in /usr/local/Caskroom/miniconda/base/envs/huggingface/lib/python3.7/site-packages (from transformers) (0.5.2)\n",
+      "Requirement already satisfied: regex!=2019.12.17 in /usr/local/Caskroom/miniconda/base/envs/huggingface/lib/python3.7/site-packages (from transformers) (2020.1.8)\n",
+      "Requirement already satisfied: tqdm>=4.27 in /usr/local/Caskroom/miniconda/base/envs/huggingface/lib/python3.7/site-packages (from transformers) (4.42.1)\n",
+      "Requirement already satisfied: s3transfer<0.4.0,>=0.3.0 in /usr/local/Caskroom/miniconda/base/envs/huggingface/lib/python3.7/site-packages (from boto3->transformers) (0.3.3)\n",
+      "Requirement already satisfied: botocore<1.16.0,>=1.15.0 in /usr/local/Caskroom/miniconda/base/envs/huggingface/lib/python3.7/site-packages (from boto3->transformers) (1.15.0)\n",
+      "Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /usr/local/Caskroom/miniconda/base/envs/huggingface/lib/python3.7/site-packages (from boto3->transformers) (0.9.4)\n",
+      "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/Caskroom/miniconda/base/envs/huggingface/lib/python3.7/site-packages (from requests->transformers) (2019.11.28)\n",
+      "Requirement already satisfied: idna<2.9,>=2.5 in /usr/local/Caskroom/miniconda/base/envs/huggingface/lib/python3.7/site-packages (from requests->transformers) (2.8)\n",
+      "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/Caskroom/miniconda/base/envs/huggingface/lib/python3.7/site-packages (from requests->transformers) (1.25.8)\n",
+      "Requirement already satisfied: chardet<3.1.0,>=3.0.2 in /usr/local/Caskroom/miniconda/base/envs/huggingface/lib/python3.7/site-packages (from requests->transformers) (3.0.4)\n",
+      "Requirement already satisfied: joblib in /usr/local/Caskroom/miniconda/base/envs/huggingface/lib/python3.7/site-packages (from sacremoses->transformers) (0.14.0)\n",
+      "Requirement already satisfied: click in /usr/local/Caskroom/miniconda/base/envs/huggingface/lib/python3.7/site-packages (from sacremoses->transformers) (7.0)\n",
+      "Requirement already satisfied: six in /usr/local/Caskroom/miniconda/base/envs/huggingface/lib/python3.7/site-packages (from sacremoses->transformers) (1.14.0)\n",
+      "Requirement already satisfied: docutils<0.16,>=0.10 in /usr/local/Caskroom/miniconda/base/envs/huggingface/lib/python3.7/site-packages (from botocore<1.16.0,>=1.15.0->boto3->transformers) (0.15.2)\n",
+      "Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /usr/local/Caskroom/miniconda/base/envs/huggingface/lib/python3.7/site-packages (from botocore<1.16.0,>=1.15.0->boto3->transformers) (2.8.1)\n",
+      "Requirement already satisfied: tensorflow==2.1.0 in /usr/local/Caskroom/miniconda/base/envs/huggingface/lib/python3.7/site-packages (2.1.0)\n",
+      "Requirement already satisfied: termcolor>=1.1.0 in /usr/local/Caskroom/miniconda/base/envs/huggingface/lib/python3.7/site-packages (from tensorflow==2.1.0) (1.1.0)\n",
+      "Requirement already satisfied: keras-preprocessing>=1.1.0 in /usr/local/Caskroom/miniconda/base/envs/huggingface/lib/python3.7/site-packages (from tensorflow==2.1.0) (1.1.0)\n",
+      "Requirement already satisfied: opt-einsum>=2.3.2 in /usr/local/Caskroom/miniconda/base/envs/huggingface/lib/python3.7/site-packages (from tensorflow==2.1.0) (3.1.0)\n",
+      "Requirement already satisfied: protobuf>=3.8.0 in /usr/local/Caskroom/miniconda/base/envs/huggingface/lib/python3.7/site-packages (from tensorflow==2.1.0) (3.11.4)\n",
+      "Requirement already satisfied: numpy<2.0,>=1.16.0 in /usr/local/Caskroom/miniconda/base/envs/huggingface/lib/python3.7/site-packages (from tensorflow==2.1.0) (1.18.1)\n",
+      "Requirement already satisfied: tensorboard<2.2.0,>=2.1.0 in /usr/local/Caskroom/miniconda/base/envs/huggingface/lib/python3.7/site-packages (from tensorflow==2.1.0) (2.1.0)\n",
+      "Requirement already satisfied: keras-applications>=1.0.8 in /usr/local/Caskroom/miniconda/base/envs/huggingface/lib/python3.7/site-packages (from tensorflow==2.1.0) (1.0.8)\n",
+      "Requirement already satisfied: wrapt>=1.11.1 in /usr/local/Caskroom/miniconda/base/envs/huggingface/lib/python3.7/site-packages (from tensorflow==2.1.0) (1.11.2)\n",
+      "Requirement already satisfied: six>=1.12.0 in /usr/local/Caskroom/miniconda/base/envs/huggingface/lib/python3.7/site-packages (from tensorflow==2.1.0) (1.14.0)\n",
+      "Requirement already satisfied: tensorflow-estimator<2.2.0,>=2.1.0rc0 in /usr/local/Caskroom/miniconda/base/envs/huggingface/lib/python3.7/site-packages (from tensorflow==2.1.0) (2.1.0)\n",
+      "Requirement already satisfied: scipy==1.4.1; python_version >= \"3\" in /usr/local/Caskroom/miniconda/base/envs/huggingface/lib/python3.7/site-packages (from tensorflow==2.1.0) (1.4.1)\n",
+      "Requirement already satisfied: google-pasta>=0.1.6 in /usr/local/Caskroom/miniconda/base/envs/huggingface/lib/python3.7/site-packages (from tensorflow==2.1.0) (0.1.8)\n",
+      "Requirement already satisfied: wheel>=0.26; python_version >= \"3\" in /usr/local/Caskroom/miniconda/base/envs/huggingface/lib/python3.7/site-packages (from tensorflow==2.1.0) (0.34.2)\n",
+      "Requirement already satisfied: grpcio>=1.8.6 in /usr/local/Caskroom/miniconda/base/envs/huggingface/lib/python3.7/site-packages (from tensorflow==2.1.0) (1.16.1)\n",
+      "Requirement already satisfied: absl-py>=0.7.0 in /usr/local/Caskroom/miniconda/base/envs/huggingface/lib/python3.7/site-packages (from tensorflow==2.1.0) (0.9.0)\n",
+      "Requirement already satisfied: gast==0.2.2 in /usr/local/Caskroom/miniconda/base/envs/huggingface/lib/python3.7/site-packages (from tensorflow==2.1.0) (0.2.2)\n",
+      "Requirement already satisfied: astor>=0.6.0 in /usr/local/Caskroom/miniconda/base/envs/huggingface/lib/python3.7/site-packages (from tensorflow==2.1.0) (0.8.0)\n",
+      "Requirement already satisfied: setuptools in /usr/local/Caskroom/miniconda/base/envs/huggingface/lib/python3.7/site-packages (from protobuf>=3.8.0->tensorflow==2.1.0) (45.2.0.post20200210)\n",
+      "Requirement already satisfied: google-auth<2,>=1.6.3 in /usr/local/Caskroom/miniconda/base/envs/huggingface/lib/python3.7/site-packages (from tensorboard<2.2.0,>=2.1.0->tensorflow==2.1.0) (1.11.2)\n",
+      "Requirement already satisfied: google-auth-oauthlib<0.5,>=0.4.1 in /usr/local/Caskroom/miniconda/base/envs/huggingface/lib/python3.7/site-packages (from tensorboard<2.2.0,>=2.1.0->tensorflow==2.1.0) (0.4.1)\n",
+      "Requirement already satisfied: markdown>=2.6.8 in /usr/local/Caskroom/miniconda/base/envs/huggingface/lib/python3.7/site-packages (from tensorboard<2.2.0,>=2.1.0->tensorflow==2.1.0) (3.1.1)\n",
+      "Requirement already satisfied: werkzeug>=0.11.15 in /usr/local/Caskroom/miniconda/base/envs/huggingface/lib/python3.7/site-packages (from tensorboard<2.2.0,>=2.1.0->tensorflow==2.1.0) (1.0.0)\n",
+      "Requirement already satisfied: requests<3,>=2.21.0 in /usr/local/Caskroom/miniconda/base/envs/huggingface/lib/python3.7/site-packages (from tensorboard<2.2.0,>=2.1.0->tensorflow==2.1.0) (2.22.0)\n",
+      "Requirement already satisfied: h5py in /usr/local/Caskroom/miniconda/base/envs/huggingface/lib/python3.7/site-packages (from keras-applications>=1.0.8->tensorflow==2.1.0) (2.10.0)\n",
+      "Requirement already satisfied: rsa<4.1,>=3.1.4 in /usr/local/Caskroom/miniconda/base/envs/huggingface/lib/python3.7/site-packages (from google-auth<2,>=1.6.3->tensorboard<2.2.0,>=2.1.0->tensorflow==2.1.0) (4.0)\n",
+      "Requirement already satisfied: cachetools<5.0,>=2.0.0 in /usr/local/Caskroom/miniconda/base/envs/huggingface/lib/python3.7/site-packages (from google-auth<2,>=1.6.3->tensorboard<2.2.0,>=2.1.0->tensorflow==2.1.0) (4.0.0)\n",
+      "Requirement already satisfied: pyasn1-modules>=0.2.1 in /usr/local/Caskroom/miniconda/base/envs/huggingface/lib/python3.7/site-packages (from google-auth<2,>=1.6.3->tensorboard<2.2.0,>=2.1.0->tensorflow==2.1.0) (0.2.8)\n",
+      "Requirement already satisfied: requests-oauthlib>=0.7.0 in /usr/local/Caskroom/miniconda/base/envs/huggingface/lib/python3.7/site-packages (from google-auth-oauthlib<0.5,>=0.4.1->tensorboard<2.2.0,>=2.1.0->tensorflow==2.1.0) (1.3.0)\n",
+      "Requirement already satisfied: idna<2.9,>=2.5 in /usr/local/Caskroom/miniconda/base/envs/huggingface/lib/python3.7/site-packages (from requests<3,>=2.21.0->tensorboard<2.2.0,>=2.1.0->tensorflow==2.1.0) (2.8)\n",
+      "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/Caskroom/miniconda/base/envs/huggingface/lib/python3.7/site-packages (from requests<3,>=2.21.0->tensorboard<2.2.0,>=2.1.0->tensorflow==2.1.0) (2019.11.28)\n",
+      "Requirement already satisfied: chardet<3.1.0,>=3.0.2 in /usr/local/Caskroom/miniconda/base/envs/huggingface/lib/python3.7/site-packages (from requests<3,>=2.21.0->tensorboard<2.2.0,>=2.1.0->tensorflow==2.1.0) (3.0.4)\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/Caskroom/miniconda/base/envs/huggingface/lib/python3.7/site-packages (from requests<3,>=2.21.0->tensorboard<2.2.0,>=2.1.0->tensorflow==2.1.0) (1.25.8)\r\n",
+      "Requirement already satisfied: pyasn1>=0.1.3 in /usr/local/Caskroom/miniconda/base/envs/huggingface/lib/python3.7/site-packages (from rsa<4.1,>=3.1.4->google-auth<2,>=1.6.3->tensorboard<2.2.0,>=2.1.0->tensorflow==2.1.0) (0.4.8)\r\n",
+      "Requirement already satisfied: oauthlib>=3.0.0 in /usr/local/Caskroom/miniconda/base/envs/huggingface/lib/python3.7/site-packages (from requests-oauthlib>=0.7.0->google-auth-oauthlib<0.5,>=0.4.1->tensorboard<2.2.0,>=2.1.0->tensorflow==2.1.0) (3.1.0)\r\n"
+     ]
     }
-   }
+   ],
+   "source": [
+    "!pip install transformers\n",
+    "!pip install tensorflow==2.1.0"
+   ]
   },
   {
    "cell_type": "code",
-   "execution_count": 74,
+   "execution_count": 2,
    "metadata": {
     "pycharm": {
      "is_executing": false,
@@ -102,10 +174,10 @@
     {
      "data": {
       "text/plain": [
-       "<torch.autograd.grad_mode.set_grad_enabled at 0x1af62fd450>"
+       "<torch.autograd.grad_mode.set_grad_enabled at 0x102c0ce10>"
       ]
      },
-     "execution_count": 74,
+     "execution_count": 2,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -119,7 +191,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 75,
+   "execution_count": 3,
    "metadata": {
     "pycharm": {
      "is_executing": false,
@@ -133,7 +205,7 @@
     "\n",
     "# We need to create the model and tokenizer\n",
     "model = AutoModel.from_pretrained(MODEL_NAME)\n",
-    "tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) "
+    "tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)"
    ]
   },
   {
@@ -151,7 +223,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 76,
+   "execution_count": 4,
    "metadata": {
     "pycharm": {
      "is_executing": false,
@@ -163,10 +235,10 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Tokens: ['[CLS]', 'This', 'is', 'an', 'input', 'example', '[SEP]']\n",
-      "Tokens id: [101, 1188, 1110, 1126, 7758, 1859, 102]\n",
+      "Tokens: ['This', 'is', 'an', 'input', 'example']\n",
+      "Tokens id: [1188, 1110, 1126, 7758, 1859]\n",
       "Tokens PyTorch: tensor([[ 101, 1188, 1110, 1126, 7758, 1859,  102]])\n",
-      "Tokenwise output: torch.Size([1, 7, 768]), Pooled output: torch.Size([1, 768])\n"
+      "Tokenw   ise output: torch.Size([1, 7, 768]), Pooled output: torch.Size([1, 768])\n"
      ]
     }
    ],
@@ -180,13 +252,16 @@
     "tokens_ids = tokenizer.convert_tokens_to_ids(tokens)\n",
     "print(\"Tokens id: {}\".format(tokens_ids))\n",
     "\n",
+    "# Add the required special tokens\n",
+    "tokens_ids = tokenizer.build_inputs_with_special_tokens(tokens_ids)\n",
+    "\n",
     "# We need to convert to a Deep Learning framework specific format, let's use PyTorch for now.\n",
     "tokens_pt = torch.tensor([tokens_ids])\n",
     "print(\"Tokens PyTorch: {}\".format(tokens_pt))\n",
     "\n",
     "# Now we're ready to go through BERT with out input\n",
     "outputs, pooled = model(tokens_pt)\n",
-    "print(\"Tokenwise output: {}, Pooled output: {}\".format(outputs.shape, pooled.shape))"
+    "print(\"Tokenw   ise output: {}, Pooled output: {}\".format(outputs.shape, pooled.shape))"
    ]
   },
   {
@@ -223,7 +298,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 77,
+   "execution_count": 5,
    "metadata": {
     "pycharm": {
      "is_executing": false,
@@ -275,8 +350,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 78,
-   "metadata": {},
+   "execution_count": 6,
+   "metadata": {
+    "pycharm": {
+     "is_executing": false
+    }
+   },
    "outputs": [
     {
      "name": "stdout",
@@ -312,8 +391,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 79,
-   "metadata": {},
+   "execution_count": 7,
+   "metadata": {
+    "pycharm": {
+     "is_executing": false
+    }
+   },
    "outputs": [
     {
      "name": "stdout",
@@ -358,8 +441,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 80,
-   "metadata": {},
+   "execution_count": null,
+   "metadata": {
+    "pycharm": {
+     "is_executing": false
+    }
+   },
    "outputs": [],
    "source": [
     "from transformers import TFBertModel, BertModel\n",
@@ -371,18 +458,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 81,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "output differences: 2.971128560602665e-05\n",
-      "pooled differences: -8.576549589633942e-06\n"
-     ]
+   "execution_count": null,
+   "metadata": {
+    "pycharm": {
+     "is_executing": false
     }
-   ],
+   },
+   "outputs": [],
    "source": [
     "# transformers generates a ready to use dictionary with all the required parameters for the specific framework.\n",
     "input_tf = tokenizer.encode_plus(\"This is a sample input\", return_tensors=\"tf\")\n",
@@ -422,20 +504,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 82,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "CPU times: user 57.1 ms, sys: 2.44 ms, total: 59.5 ms\n",
-      "Wall time: 35.5 ms\n",
-      "CPU times: user 98.8 ms, sys: 725 µs, total: 99.5 ms\n",
-      "Wall time: 50 ms\n"
-     ]
+   "execution_count": null,
+   "metadata": {
+    "pycharm": {
+     "is_executing": false
     }
-   ],
+   },
+   "outputs": [],
    "source": [
     "from transformers import DistilBertModel\n",
     "\n",
@@ -466,8 +541,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 83,
-   "metadata": {},
+   "execution_count": null,
+   "metadata": {
+    "pycharm": {
+     "is_executing": false
+    }
+   },
    "outputs": [],
    "source": [
     "# Let's load German BERT from the Bavarian State Library\n",
@@ -503,13 +582,13 @@
   "pycharm": {
    "stem_cell": {
     "cell_type": "raw",
-    "source": [],
     "metadata": {
      "collapsed": false
-    }
+    },
+    "source": []
    }
   }
  },
  "nbformat": 4,
  "nbformat_minor": 1
-}
\ No newline at end of file
+}

From c36fdc88d48c4b82cd3885083f738ddf1e69a782 Mon Sep 17 00:00:00 2001
From: sshleifer <sshleifer@gmail.com>
Date: Thu, 5 Mar 2020 12:33:08 -0500
Subject: [PATCH 74/80] tests pass

---
 .../summarization/bertabs/modeling_bertabs.py   |  7 +++++--
 src/transformers/modeling_bart.py               | 11 ++++++++---
 tests/test_modeling_bart.py                     | 17 ++++++++++++-----
 3 files changed, 25 insertions(+), 10 deletions(-)

diff --git a/examples/summarization/bertabs/modeling_bertabs.py b/examples/summarization/bertabs/modeling_bertabs.py
index 0691403186..e314ff122b 100644
--- a/examples/summarization/bertabs/modeling_bertabs.py
+++ b/examples/summarization/bertabs/modeling_bertabs.py
@@ -15,11 +15,14 @@
 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+
+
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 import copy
+
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 import math
 
 import numpy as np
diff --git a/src/transformers/modeling_bart.py b/src/transformers/modeling_bart.py
index 21c51f971e..d689756f69 100644
--- a/src/transformers/modeling_bart.py
+++ b/src/transformers/modeling_bart.py
@@ -640,9 +640,10 @@ class SelfAttention(nn.Module):
             reshaped = key_padding_mask.unsqueeze(1).unsqueeze(2).to(torch.bool)
             attn_weights = attn_weights.masked_fill(reshaped, float("-inf"))
             attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
-        attn_weights_float = F.softmax(attn_weights, dim=-1, dtype=torch.float32)
-        attn_weights = attn_weights_float.type_as(attn_weights)
+        attn_weights_float = F.softmax(attn_weights, dim=-1)
         attn_probs = F.dropout(attn_weights_float, p=self.dropout, training=self.training,)
+        attn_weights = attn_weights_float.type_as(attn_weights)
+
         assert v is not None
         attn_output = torch.bmm(attn_probs, v)
         assert attn_output.size() == (bsz * self.num_heads, tgt_len, self.head_dim)
@@ -696,8 +697,12 @@ class SelfAttention(nn.Module):
         elif prev_key_padding_mask is not None:
             filler = torch.zeros(batch_size, src_len - prev_key_padding_mask.size(1))
             if prev_key_padding_mask.is_cuda:
-                filler = filler.cuda()
+                filler = filler.to(prev_key_padding_mask.device)
             new_key_padding_mask = torch.cat([prev_key_padding_mask.float(), filler.float()], dim=1)
+            print(new_key_padding_mask.device, new_key_padding_mask.dtype)
+            import ipdb
+
+            ipdb.set_trace()
         elif key_padding_mask is not None:
             filler = torch.zeros(batch_size, src_len - key_padding_mask.size(1))
             if key_padding_mask.is_cuda:
diff --git a/tests/test_modeling_bart.py b/tests/test_modeling_bart.py
index 559046f66b..ccb1946080 100644
--- a/tests/test_modeling_bart.py
+++ b/tests/test_modeling_bart.py
@@ -243,15 +243,15 @@ class BartHeadTests(unittest.TestCase):
             decoder_ffn_dim=32,
             max_position_embeddings=48,
         )
-        lm_model = BartForMaskedLM(config)
-        context = torch.Tensor([[71, 82, 18, 33, 46, 91, 2], [68, 34, 26, 58, 30, 2, 1]]).long()
-        summary = torch.Tensor([[82, 71, 82, 18, 2], [58, 68, 2, 1, 1]]).long()
+        lm_model = BartForMaskedLM(config).to(torch_device)
+        context = _long_tensor([[71, 82, 18, 33, 46, 91, 2], [68, 34, 26, 58, 30, 2, 1]])
+        summary = _long_tensor([[82, 71, 82, 18, 2], [58, 68, 2, 1, 1]])
         logits, enc_features = lm_model.forward(input_ids=context, decoder_input_ids=summary)
         expected_shape = (*summary.shape, config.vocab_size)
         self.assertEqual(logits.shape, expected_shape)
 
     def test_generate_beam_search(self):
-        input_ids = torch.Tensor([[71, 82, 2], [68, 34, 2]]).long()
+        input_ids = _long_tensor([[71, 82, 2], [68, 34, 2]])
         config = BartConfig(
             vocab_size=self.vocab_size,
             d_model=24,
@@ -264,7 +264,7 @@ class BartHeadTests(unittest.TestCase):
             max_position_embeddings=48,
             output_past=True,
         )
-        lm_model = BartForMaskedLM(config)
+        lm_model = BartForMaskedLM(config).to(torch_device)
         lm_model.eval()
 
         new_input_ids = lm_model.generate(
@@ -294,6 +294,13 @@ class BartHeadTests(unittest.TestCase):
             bart_toks = tokenizer.encode(ex, return_tensors="pt")
             _assert_tensors_equal(desired_result.long(), bart_toks, prefix=ex)
 
+    @unittest.skipIf(torch_device == "cpu", "Cant do half precision")
+    def test_generate_fp16(self):
+        config, input_ids, batch_size = self._get_config_and_data(output_past=True)
+        attention_mask = input_ids.ne(1)
+        lm_model = BartForMaskedLM(config).eval().to(torch_device).half()
+        lm_model.generate(input_ids, attention_mask)
+
 
 def _assert_tensors_equal(a, b, atol=1e-12, prefix=""):
     """If tensors not close, or a and b arent both tensors, raise a nice Assertion error."""

From c203509d5b0f002a7833382a03ffe7802aa14e91 Mon Sep 17 00:00:00 2001
From: sshleifer <sshleifer@gmail.com>
Date: Thu, 5 Mar 2020 12:34:08 -0500
Subject: [PATCH 75/80] undo chg

---
 examples/summarization/bertabs/modeling_bertabs.py | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/examples/summarization/bertabs/modeling_bertabs.py b/examples/summarization/bertabs/modeling_bertabs.py
index e314ff122b..0691403186 100644
--- a/examples/summarization/bertabs/modeling_bertabs.py
+++ b/examples/summarization/bertabs/modeling_bertabs.py
@@ -15,14 +15,11 @@
 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-
-
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 import copy
-
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 import math
 
 import numpy as np

From 810079de1f7517489dda8992859048a1c1004261 Mon Sep 17 00:00:00 2001
From: sshleifer <sshleifer@gmail.com>
Date: Thu, 5 Mar 2020 12:48:14 -0500
Subject: [PATCH 76/80] no ipdb

---
 src/transformers/modeling_bart.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/src/transformers/modeling_bart.py b/src/transformers/modeling_bart.py
index d689756f69..a52196ce99 100644
--- a/src/transformers/modeling_bart.py
+++ b/src/transformers/modeling_bart.py
@@ -688,6 +688,7 @@ class SelfAttention(nn.Module):
         static_kv: bool,
     ) -> Optional[Tensor]:
         # saved key padding masks have shape (bsz, seq_len)
+
         if prev_key_padding_mask is not None and static_kv:
             new_key_padding_mask = prev_key_padding_mask
         elif prev_key_padding_mask is not None and key_padding_mask is not None:
@@ -699,10 +700,6 @@ class SelfAttention(nn.Module):
             if prev_key_padding_mask.is_cuda:
                 filler = filler.to(prev_key_padding_mask.device)
             new_key_padding_mask = torch.cat([prev_key_padding_mask.float(), filler.float()], dim=1)
-            print(new_key_padding_mask.device, new_key_padding_mask.dtype)
-            import ipdb
-
-            ipdb.set_trace()
         elif key_padding_mask is not None:
             filler = torch.zeros(batch_size, src_len - key_padding_mask.size(1))
             if key_padding_mask.is_cuda:

From 1360dacaa3a30cca05635ef095ec339525bdf3f0 Mon Sep 17 00:00:00 2001
From: sshleifer <sshleifer@gmail.com>
Date: Thu, 5 Mar 2020 12:57:42 -0500
Subject: [PATCH 77/80] cleanup deltas

---
 src/transformers/modeling_bart.py |  5 ++---
 tests/test_modeling_bart.py       | 10 +++++-----
 2 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/src/transformers/modeling_bart.py b/src/transformers/modeling_bart.py
index a52196ce99..a851d649be 100644
--- a/src/transformers/modeling_bart.py
+++ b/src/transformers/modeling_bart.py
@@ -640,9 +640,8 @@ class SelfAttention(nn.Module):
             reshaped = key_padding_mask.unsqueeze(1).unsqueeze(2).to(torch.bool)
             attn_weights = attn_weights.masked_fill(reshaped, float("-inf"))
             attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
-        attn_weights_float = F.softmax(attn_weights, dim=-1)
-        attn_probs = F.dropout(attn_weights_float, p=self.dropout, training=self.training,)
-        attn_weights = attn_weights_float.type_as(attn_weights)
+        attn_weights = F.softmax(attn_weights, dim=-1)
+        attn_probs = F.dropout(attn_weights, p=self.dropout, training=self.training,)
 
         assert v is not None
         attn_output = torch.bmm(attn_probs, v)
diff --git a/tests/test_modeling_bart.py b/tests/test_modeling_bart.py
index ccb1946080..f588d445b2 100644
--- a/tests/test_modeling_bart.py
+++ b/tests/test_modeling_bart.py
@@ -243,15 +243,15 @@ class BartHeadTests(unittest.TestCase):
             decoder_ffn_dim=32,
             max_position_embeddings=48,
         )
-        lm_model = BartForMaskedLM(config).to(torch_device)
-        context = _long_tensor([[71, 82, 18, 33, 46, 91, 2], [68, 34, 26, 58, 30, 2, 1]])
-        summary = _long_tensor([[82, 71, 82, 18, 2], [58, 68, 2, 1, 1]])
+        lm_model = BartForMaskedLM(config)
+        context = torch.Tensor([[71, 82, 18, 33, 46, 91, 2], [68, 34, 26, 58, 30, 2, 1]]).long()
+        summary = torch.Tensor([[82, 71, 82, 18, 2], [58, 68, 2, 1, 1]]).long()
         logits, enc_features = lm_model.forward(input_ids=context, decoder_input_ids=summary)
         expected_shape = (*summary.shape, config.vocab_size)
         self.assertEqual(logits.shape, expected_shape)
 
     def test_generate_beam_search(self):
-        input_ids = _long_tensor([[71, 82, 2], [68, 34, 2]])
+        input_ids = torch.Tensor([[71, 82, 2], [68, 34, 2]]).long()
         config = BartConfig(
             vocab_size=self.vocab_size,
             d_model=24,
@@ -264,7 +264,7 @@ class BartHeadTests(unittest.TestCase):
             max_position_embeddings=48,
             output_past=True,
         )
-        lm_model = BartForMaskedLM(config).to(torch_device)
+        lm_model = BartForMaskedLM(config)
         lm_model.eval()
 
         new_input_ids = lm_model.generate(

From 14d40584b2df8bfd6a90d835e086412fef38cceb Mon Sep 17 00:00:00 2001
From: sshleifer <sshleifer@gmail.com>
Date: Thu, 5 Mar 2020 13:06:35 -0500
Subject: [PATCH 78/80] remove newline

---
 src/transformers/modeling_bart.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/transformers/modeling_bart.py b/src/transformers/modeling_bart.py
index a851d649be..3e5a27b9b9 100644
--- a/src/transformers/modeling_bart.py
+++ b/src/transformers/modeling_bart.py
@@ -687,7 +687,6 @@ class SelfAttention(nn.Module):
         static_kv: bool,
     ) -> Optional[Tensor]:
         # saved key padding masks have shape (bsz, seq_len)
-
         if prev_key_padding_mask is not None and static_kv:
             new_key_padding_mask = prev_key_padding_mask
         elif prev_key_padding_mask is not None and key_padding_mask is not None:

From 0001d056861bb1ec7bd6a825006f578629a101fc Mon Sep 17 00:00:00 2001
From: Lysandre Debut <lysandre@huggingface.co>
Date: Thu, 5 Mar 2020 17:01:54 -0500
Subject: [PATCH 79/80] Correct missing keys + test (#3143)

---
 src/transformers/modeling_utils.py |  9 +++++++++
 tests/test_modeling_common.py      | 15 +++++++++++++++
 2 files changed, 24 insertions(+)

diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py
index 7dd0e873dc..203d5e8057 100644
--- a/src/transformers/modeling_utils.py
+++ b/src/transformers/modeling_utils.py
@@ -539,6 +539,15 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin):
                 model_to_load = getattr(model, cls.base_model_prefix)
 
             load(model_to_load, prefix=start_prefix)
+
+            if model.__class__.__name__ != model_to_load.__class__.__name__:
+                base_model_state_dict = model_to_load.state_dict().keys()
+                head_model_state_dict_without_base_prefix = [
+                    key.split(cls.base_model_prefix + ".")[-1] for key in model.state_dict().keys()
+                ]
+
+                missing_keys.extend(head_model_state_dict_without_base_prefix - base_model_state_dict)
+
             if len(missing_keys) > 0:
                 logger.info(
                     "Weights of {} not initialized from pretrained model: {}".format(
diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py
index 9ba00d2421..a52d746947 100644
--- a/tests/test_modeling_common.py
+++ b/tests/test_modeling_common.py
@@ -526,6 +526,21 @@ class ModelTesterMixin:
             x = model.get_output_embeddings()
             self.assertTrue(x is None or isinstance(x, torch.nn.Linear))
 
+    def test_correct_missing_keys(self):
+        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
+
+        for model_class in self.all_model_classes:
+            model = model_class(config)
+            base_model_prefix = model.base_model_prefix
+
+            if hasattr(model, base_model_prefix):
+                with tempfile.TemporaryDirectory() as temp_dir_name:
+                    model.base_model.save_pretrained(temp_dir_name)
+                    model, loading_info = model_class.from_pretrained(temp_dir_name, output_loading_info=True)
+
+                    with self.subTest(msg="Missing keys for {}".format(model.__class__.__name__)):
+                        self.assertGreater(len(loading_info["missing_keys"]), 0)
+
     def test_tie_model_weights(self):
         if not self.test_torchscript:
             return

From b623ddc0002aebe32e2b7a1203a6acbed61bf9a8 Mon Sep 17 00:00:00 2001
From: Lysandre Debut <lysandre@huggingface.co>
Date: Thu, 5 Mar 2020 17:16:57 -0500
Subject: [PATCH 80/80] Pass kwargs to configuration (#3147)

* Pass kwargs to configuration

* Setter

* test
---
 src/transformers/configuration_utils.py | 12 ++++++++++++
 tests/test_configuration_common.py      | 10 ++++++++++
 2 files changed, 22 insertions(+)

diff --git a/src/transformers/configuration_utils.py b/src/transformers/configuration_utils.py
index d8cd0fe3e9..5ce23e2c88 100644
--- a/src/transformers/configuration_utils.py
+++ b/src/transformers/configuration_utils.py
@@ -98,6 +98,18 @@ class PretrainedConfig(object):
                 logger.error("Can't set {} with value {} for {}".format(key, value, self))
                 raise err
 
+    @property
+    def num_labels(self):
+        return self._num_labels
+
+    @num_labels.setter
+    def num_labels(self, num_labels):
+        self._num_labels = num_labels
+        self.id2label = {i: "LABEL_{}".format(i) for i in range(self.num_labels)}
+        self.id2label = dict((int(key), value) for key, value in self.id2label.items())
+        self.label2id = dict(zip(self.id2label.values(), self.id2label.keys()))
+        self.label2id = dict((key, int(value)) for key, value in self.label2id.items())
+
     def save_pretrained(self, save_directory):
         """
         Save a configuration object to the directory `save_directory`, so that it
diff --git a/tests/test_configuration_common.py b/tests/test_configuration_common.py
index 471f0f012d..7498ae6caf 100644
--- a/tests/test_configuration_common.py
+++ b/tests/test_configuration_common.py
@@ -57,8 +57,18 @@ class ConfigTester(object):
 
         self.parent.assertEqual(config_second.to_dict(), config_first.to_dict())
 
+    def create_and_test_config_with_num_labels(self):
+        config = self.config_class(**self.inputs_dict, num_labels=5)
+        self.parent.assertEqual(len(config.id2label), 5)
+        self.parent.assertEqual(len(config.label2id), 5)
+
+        config.num_labels = 3
+        self.parent.assertEqual(len(config.id2label), 3)
+        self.parent.assertEqual(len(config.label2id), 3)
+
     def run_common_tests(self):
         self.create_and_test_config_common_properties()
         self.create_and_test_config_to_json_string()
         self.create_and_test_config_to_json_file()
         self.create_and_test_config_from_and_save_pretrained()
+        self.create_and_test_config_with_num_labels()