From ea52f82455a7ca0f979768204dfeb38b5fff13ad Mon Sep 17 00:00:00 2001
From: Lysandre <lysandre.debut@reseau.eseo.fr>
Date: Mon, 18 Nov 2019 14:42:59 -0500
Subject: [PATCH 01/91] Moved some SQuAD logic to /data

---
 transformers/__init__.py                 |   3 +-
 transformers/data/__init__.py            |   3 +-
 transformers/data/processors/__init__.py |   1 +
 transformers/data/processors/squad.py    | 318 +++++++++++++++++++++++
 4 files changed, 323 insertions(+), 2 deletions(-)
 create mode 100644 transformers/data/processors/squad.py

diff --git a/transformers/__init__.py b/transformers/__init__.py
index 5c7b0a6197..b859e18c53 100644
--- a/transformers/__init__.py
+++ b/transformers/__init__.py
@@ -25,7 +25,8 @@ from .file_utils import (TRANSFORMERS_CACHE, PYTORCH_TRANSFORMERS_CACHE, PYTORCH
 from .data import (is_sklearn_available,
                    InputExample, InputFeatures, DataProcessor,
                    glue_output_modes, glue_convert_examples_to_features,
-                   glue_processors, glue_tasks_num_labels)
+                   glue_processors, glue_tasks_num_labels,
+                   squad_convert_examples_to_features, SquadFeatures)
 
 if is_sklearn_available():
     from .data import glue_compute_metrics
diff --git a/transformers/data/__init__.py b/transformers/data/__init__.py
index e910d6da2e..827d96ed29 100644
--- a/transformers/data/__init__.py
+++ b/transformers/data/__init__.py
@@ -1,5 +1,6 @@
-from .processors import InputExample, InputFeatures, DataProcessor
+from .processors import InputExample, InputFeatures, DataProcessor, SquadFeatures
 from .processors import glue_output_modes, glue_processors, glue_tasks_num_labels, glue_convert_examples_to_features
+from .processors import squad_convert_examples_to_features
 
 from .metrics import is_sklearn_available
 if is_sklearn_available():
diff --git a/transformers/data/processors/__init__.py b/transformers/data/processors/__init__.py
index af38c54beb..4e322a2ca8 100644
--- a/transformers/data/processors/__init__.py
+++ b/transformers/data/processors/__init__.py
@@ -1,3 +1,4 @@
 from .utils import InputExample, InputFeatures, DataProcessor
 from .glue import glue_output_modes, glue_processors, glue_tasks_num_labels, glue_convert_examples_to_features
+from .squad import squad_convert_examples_to_features, SquadFeatures
 
diff --git a/transformers/data/processors/squad.py b/transformers/data/processors/squad.py
new file mode 100644
index 0000000000..c1a1034f17
--- /dev/null
+++ b/transformers/data/processors/squad.py
@@ -0,0 +1,318 @@
+from tqdm import tqdm
+import collections
+import logging
+import os
+
+from .utils import DataProcessor, InputExample, InputFeatures
+from ...file_utils import is_tf_available
+
+if is_tf_available():
+    import tensorflow as tf
+
+logger = logging.getLogger(__name__)
+
+def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
+                                       doc_stride, max_query_length, is_training,
+                                       cls_token_at_end=False,
+                                       cls_token='[CLS]', sep_token='[SEP]', pad_token=0,
+                                       sequence_a_segment_id=0, sequence_b_segment_id=1,
+                                       cls_token_segment_id=0, pad_token_segment_id=0,
+                                       mask_padding_with_zero=True,
+                                       sequence_a_is_doc=False):
+    """Converts SQuAD examples into a list of `SquadFeatures`, one per sliding-window document span."""
+
+    # Defining helper methods
+    def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer,
+                         orig_answer_text):
+        """Returns tokenized answer spans that better match the annotated answer."""
+        tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text))
+
+        for new_start in range(input_start, input_end + 1):
+            for new_end in range(input_end, new_start - 1, -1):
+                text_span = " ".join(doc_tokens[new_start:(new_end + 1)])
+                if text_span == tok_answer_text:
+                    return (new_start, new_end)
+
+        return (input_start, input_end)
+    def _check_is_max_context(doc_spans, cur_span_index, position):
+        """Check if this is the 'max context' doc span for the token."""
+        best_score = None
+        best_span_index = None
+        for (span_index, doc_span) in enumerate(doc_spans):
+            end = doc_span.start + doc_span.length - 1
+            if position < doc_span.start:
+                continue
+            if position > end:
+                continue
+            num_left_context = position - doc_span.start
+            num_right_context = end - position
+            score = min(num_left_context, num_right_context) + 0.01 * doc_span.length
+            if best_score is None or score > best_score:
+                best_score = score
+                best_span_index = span_index
+
+        return cur_span_index == best_span_index
+    
+    unique_id = 1000000000
+
+    features = []
+    for (example_index, example) in enumerate(tqdm(examples)):
+        query_tokens = tokenizer.tokenize(example.question_text)
+
+        if len(query_tokens) > max_query_length:
+            query_tokens = query_tokens[0:max_query_length]
+
+        tok_to_orig_index = []
+        orig_to_tok_index = []
+        all_doc_tokens = []
+        for (i, token) in enumerate(example.doc_tokens):
+            orig_to_tok_index.append(len(all_doc_tokens))
+            sub_tokens = tokenizer.tokenize(token)
+            for sub_token in sub_tokens:
+                tok_to_orig_index.append(i)
+                all_doc_tokens.append(sub_token)
+
+        tok_start_position = None
+        tok_end_position = None
+        if is_training and example.is_impossible:
+            tok_start_position = -1
+            tok_end_position = -1
+        if is_training and not example.is_impossible:
+            tok_start_position = orig_to_tok_index[example.start_position]
+            if example.end_position < len(example.doc_tokens) - 1:
+                tok_end_position = orig_to_tok_index[example.end_position + 1] - 1
+            else:
+                tok_end_position = len(all_doc_tokens) - 1
+            (tok_start_position, tok_end_position) = _improve_answer_span(
+                all_doc_tokens, tok_start_position, tok_end_position, tokenizer,
+                example.orig_answer_text)
+
+        # The -3 accounts for [CLS], [SEP] and [SEP]
+        max_tokens_for_doc = max_seq_length - len(query_tokens) - 3
+
+        # We can have documents that are longer than the maximum sequence length.
+        # To deal with this we do a sliding window approach, where we take chunks
+        # of up to our max length with a stride of `doc_stride`.
+        _DocSpan = collections.namedtuple(  # pylint: disable=invalid-name
+            "DocSpan", ["start", "length"])
+        doc_spans = []
+        start_offset = 0
+        while start_offset < len(all_doc_tokens):
+            length = len(all_doc_tokens) - start_offset
+            if length > max_tokens_for_doc:
+                length = max_tokens_for_doc
+            doc_spans.append(_DocSpan(start=start_offset, length=length))
+            if start_offset + length == len(all_doc_tokens):
+                break
+            start_offset += min(length, doc_stride)  # NOTE(review): makes no progress if doc_stride <= 0 or max_tokens_for_doc <= 0
+
+        for (doc_span_index, doc_span) in enumerate(doc_spans):
+            tokens = []
+            token_to_orig_map = {}
+            token_is_max_context = {}
+            segment_ids = []
+
+            # p_mask: mask with 1 for tokens that cannot be in the answer (0 for tokens which can be in an answer)
+            # The original TF implementation also keeps the classification token unmasked (set to 0) (reason unclear)
+            p_mask = []
+
+            # CLS token at the beginning
+            if not cls_token_at_end:
+                tokens.append(cls_token)
+                segment_ids.append(cls_token_segment_id)
+                p_mask.append(0)
+                cls_index = 0
+
+            # XLNet: P SEP Q SEP CLS
+            # Others: CLS Q SEP P SEP
+            if not sequence_a_is_doc:
+                # Query
+                tokens += query_tokens
+                segment_ids += [sequence_a_segment_id] * len(query_tokens)
+                p_mask += [1] * len(query_tokens)
+
+                # SEP token
+                tokens.append(sep_token)
+                segment_ids.append(sequence_a_segment_id)
+                p_mask.append(1)
+
+            # Paragraph
+            for i in range(doc_span.length):
+                split_token_index = doc_span.start + i
+                token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index]
+
+                is_max_context = _check_is_max_context(doc_spans, doc_span_index,
+                                                       split_token_index)
+                token_is_max_context[len(tokens)] = is_max_context
+                tokens.append(all_doc_tokens[split_token_index])
+                if not sequence_a_is_doc:
+                    segment_ids.append(sequence_b_segment_id)
+                else:
+                    segment_ids.append(sequence_a_segment_id)
+                p_mask.append(0)
+            paragraph_len = doc_span.length
+
+            if sequence_a_is_doc:
+                # SEP token
+                tokens.append(sep_token)
+                segment_ids.append(sequence_a_segment_id)
+                p_mask.append(1)
+
+                tokens += query_tokens
+                segment_ids += [sequence_b_segment_id] * len(query_tokens)
+                p_mask += [1] * len(query_tokens)
+
+            # SEP token
+            tokens.append(sep_token)
+            segment_ids.append(sequence_b_segment_id)
+            p_mask.append(1)
+
+            # CLS token at the end
+            if cls_token_at_end:
+                tokens.append(cls_token)
+                segment_ids.append(cls_token_segment_id)
+                p_mask.append(0)
+                cls_index = len(tokens) - 1  # Index of classification token
+
+            input_ids = tokenizer.convert_tokens_to_ids(tokens)
+
+            # With `mask_padding_with_zero` the mask has 1 for real tokens and 0 for
+            # padding tokens; the two values are swapped when the flag is False.
+            input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)
+
+            # Pad with `pad_token` up to the sequence length.
+            while len(input_ids) < max_seq_length:
+                input_ids.append(pad_token)
+                input_mask.append(0 if mask_padding_with_zero else 1)
+                segment_ids.append(pad_token_segment_id)
+                p_mask.append(1)
+
+            assert len(input_ids) == max_seq_length
+            assert len(input_mask) == max_seq_length
+            assert len(segment_ids) == max_seq_length
+
+            span_is_impossible = example.is_impossible
+            start_position = None
+            end_position = None
+            if is_training and not span_is_impossible:
+                # For training, if our document chunk does not fully contain the answer
+                # span we label it as impossible (positions then point at the CLS token).
+                doc_start = doc_span.start
+                doc_end = doc_span.start + doc_span.length - 1
+                out_of_span = False
+                if not (tok_start_position >= doc_start and
+                        tok_end_position <= doc_end):
+                    out_of_span = True
+                if out_of_span:
+                    start_position = 0
+                    end_position = 0
+                    span_is_impossible = True
+                else:
+                    if sequence_a_is_doc:
+                        doc_offset = 0
+                    else:
+                        doc_offset = len(query_tokens) + 2
+                    start_position = tok_start_position - doc_start + doc_offset
+                    end_position = tok_end_position - doc_start + doc_offset
+
+            if is_training and span_is_impossible:
+                start_position = cls_index
+                end_position = cls_index
+
+            if example_index < 20:
+                logger.info("*** Example ***")
+                logger.info("unique_id: %s" % (unique_id))
+                logger.info("example_index: %s" % (example_index))
+                logger.info("doc_span_index: %s" % (doc_span_index))
+                logger.info("tokens: %s" % " ".join(tokens))
+                logger.info("token_to_orig_map: %s" % " ".join([
+                    "%d:%d" % (x, y) for (x, y) in token_to_orig_map.items()]))
+                logger.info("token_is_max_context: %s" % " ".join([
+                    "%d:%s" % (x, y) for (x, y) in token_is_max_context.items()
+                ]))
+                logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
+                logger.info(
+                    "input_mask: %s" % " ".join([str(x) for x in input_mask]))
+                logger.info(
+                    "segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
+                if is_training and span_is_impossible:
+                    logger.info("impossible example")
+                if is_training and not span_is_impossible:
+                    answer_text = " ".join(tokens[start_position:(end_position + 1)])
+                    logger.info("start_position: %d" % (start_position))
+                    logger.info("end_position: %d" % (end_position))
+                    logger.info(
+                        "answer: %s" % (answer_text))
+
+            features.append(
+                SquadFeatures(
+                    unique_id=unique_id,
+                    example_index=example_index,
+                    doc_span_index=doc_span_index,
+                    tokens=tokens,
+                    token_to_orig_map=token_to_orig_map,
+                    token_is_max_context=token_is_max_context,
+                    input_ids=input_ids,
+                    input_mask=input_mask,
+                    segment_ids=segment_ids,
+                    cls_index=cls_index,
+                    p_mask=p_mask,
+                    paragraph_len=paragraph_len,
+                    start_position=start_position,
+                    end_position=end_position,
+                    is_impossible=span_is_impossible))
+            unique_id += 1
+
+    return features
+
+class SquadFeatures(object):
+    """A single set of features of data."""
+
+    def __init__(self,
+                 unique_id,
+                 example_index,
+                 doc_span_index,
+                 tokens,
+                 token_to_orig_map,
+                 token_is_max_context,
+                 input_ids,
+                 input_mask,
+                 segment_ids,
+                 cls_index,
+                 p_mask,
+                 paragraph_len,
+                 start_position=None,
+                 end_position=None,
+                 is_impossible=None):
+        self.unique_id = unique_id
+        self.example_index = example_index
+        self.doc_span_index = doc_span_index
+        self.tokens = tokens
+        self.token_to_orig_map = token_to_orig_map
+        self.token_is_max_context = token_is_max_context
+        self.input_ids = input_ids
+        self.input_mask = input_mask
+        self.segment_ids = segment_ids
+        self.cls_index = cls_index  # index of the classification token within `tokens`
+        self.p_mask = p_mask  # 1 for tokens that cannot be part of an answer, 0 otherwise
+        self.paragraph_len = paragraph_len
+        self.start_position = start_position
+        self.end_position = end_position
+        self.is_impossible = is_impossible
+
+    def __eq__(self, other):  # compares every attribute; no __hash__ is defined, so instances are unhashable on Python 3
+        return self.cls_index == other.cls_index and \
+                self.doc_span_index == other.doc_span_index and \
+                self.end_position == other.end_position and \
+                self.example_index == other.example_index and \
+                self.input_ids == other.input_ids and \
+                self.input_mask == other.input_mask and \
+                self.is_impossible == other.is_impossible and \
+                self.p_mask == other.p_mask and \
+                self.paragraph_len == other.paragraph_len and \
+                self.segment_ids == other.segment_ids and \
+                self.start_position == other.start_position and \
+                self.token_is_max_context == other.token_is_max_context and \
+                self.token_to_orig_map == other.token_to_orig_map and \
+                self.tokens == other.tokens and \
+                self.unique_id == other.unique_id
\ No newline at end of file

From 72e506b22e90feab6c410136bacc27f3d65284b9 Mon Sep 17 00:00:00 2001
From: Lysandre <lysandre.debut@reseau.eseo.fr>
Date: Tue, 19 Nov 2019 09:49:55 -0500
Subject: [PATCH 02/91] wip

---
 examples/run_squad.py                    |  29 +++++-
 transformers/__init__.py                 |   3 +-
 transformers/data/__init__.py            |   2 +-
 transformers/data/processors/__init__.py |   2 +-
 transformers/data/processors/squad.py    | 122 +++++++++++++++++++++++
 transformers/tokenization_utils.py       |   4 +
 6 files changed, 157 insertions(+), 5 deletions(-)

diff --git a/examples/run_squad.py b/examples/run_squad.py
index 69088d73c3..d4219c3096 100644
--- a/examples/run_squad.py
+++ b/examples/run_squad.py
@@ -23,7 +23,6 @@ import os
 import random
 import glob
 import timeit
-
 import numpy as np
 import torch
 from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
@@ -45,7 +44,7 @@ from transformers import (WEIGHTS_NAME, BertConfig,
                                   XLNetTokenizer,
                                   DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer)
 
-from transformers import AdamW, get_linear_schedule_with_warmup
+from transformers import AdamW, get_linear_schedule_with_warmup, squad_convert_examples_to_features, read_squad_examples as sread_squad_examples
 
 from utils_squad import (read_squad_examples, convert_examples_to_features,
                          RawResult, write_predictions,
@@ -309,6 +308,8 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal
         examples = read_squad_examples(input_file=input_file,
                                                 is_training=not evaluate,
                                                 version_2_with_negative=args.version_2_with_negative)
+
+        examples = examples[:10]
         features = convert_examples_to_features(examples=examples,
                                                 tokenizer=tokenizer,
                                                 max_seq_length=args.max_seq_length,
@@ -319,6 +320,30 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal
                                                 pad_token_segment_id=3 if args.model_type in ['xlnet'] else 0,
                                                 cls_token_at_end=True if args.model_type in ['xlnet'] else False,
                                                 sequence_a_is_doc=True if args.model_type in ['xlnet'] else False)
+
+        exampless = sread_squad_examples(input_file=input_file,
+                                                is_training=not evaluate,
+                                                version_2_with_negative=args.version_2_with_negative)
+        exampless = exampless[:10]
+        features2 = squad_convert_examples_to_features(examples=exampless,
+                                                tokenizer=tokenizer,
+                                                max_seq_length=args.max_seq_length,
+                                                doc_stride=args.doc_stride,
+                                                max_query_length=args.max_query_length,
+                                                is_training=not evaluate,
+                                                cls_token_segment_id=2 if args.model_type in ['xlnet'] else 0,
+                                                pad_token_segment_id=3 if args.model_type in ['xlnet'] else 0,
+                                                cls_token_at_end=True if args.model_type in ['xlnet'] else False,
+                                                sequence_a_is_doc=True if args.model_type in ['xlnet'] else False)
+
+        print(features2)
+
+        for i in range(len(features)):
+            assert features[i] == features2[i]
+            print("Equal")
+
+        print("DONE")
+        
         if args.local_rank in [-1, 0]:
             logger.info("Saving features into cached file %s", cached_features_file)
             torch.save(features, cached_features_file)
diff --git a/transformers/__init__.py b/transformers/__init__.py
index b859e18c53..9a767913b3 100644
--- a/transformers/__init__.py
+++ b/transformers/__init__.py
@@ -26,7 +26,8 @@ from .data import (is_sklearn_available,
                    InputExample, InputFeatures, DataProcessor,
                    glue_output_modes, glue_convert_examples_to_features,
                    glue_processors, glue_tasks_num_labels,
-                   squad_convert_examples_to_features, SquadFeatures)
+                   squad_convert_examples_to_features, SquadFeatures, 
+                   SquadExample, read_squad_examples)
 
 if is_sklearn_available():
     from .data import glue_compute_metrics
diff --git a/transformers/data/__init__.py b/transformers/data/__init__.py
index 827d96ed29..50f2e768f4 100644
--- a/transformers/data/__init__.py
+++ b/transformers/data/__init__.py
@@ -1,6 +1,6 @@
 from .processors import InputExample, InputFeatures, DataProcessor, SquadFeatures
 from .processors import glue_output_modes, glue_processors, glue_tasks_num_labels, glue_convert_examples_to_features
-from .processors import squad_convert_examples_to_features
+from .processors import squad_convert_examples_to_features, SquadExample, read_squad_examples
 
 from .metrics import is_sklearn_available
 if is_sklearn_available():
diff --git a/transformers/data/processors/__init__.py b/transformers/data/processors/__init__.py
index 4e322a2ca8..924b4a1245 100644
--- a/transformers/data/processors/__init__.py
+++ b/transformers/data/processors/__init__.py
@@ -1,4 +1,4 @@
 from .utils import InputExample, InputFeatures, DataProcessor
 from .glue import glue_output_modes, glue_processors, glue_tasks_num_labels, glue_convert_examples_to_features
-from .squad import squad_convert_examples_to_features, SquadFeatures
+from .squad import squad_convert_examples_to_features, SquadFeatures, SquadExample, read_squad_examples
 
diff --git a/transformers/data/processors/squad.py b/transformers/data/processors/squad.py
index c1a1034f17..1900e9f0ce 100644
--- a/transformers/data/processors/squad.py
+++ b/transformers/data/processors/squad.py
@@ -2,7 +2,9 @@ from tqdm import tqdm
 import collections
 import logging
 import os
+import json
 
+from ...tokenization_bert import BasicTokenizer, whitespace_tokenize
 from .utils import DataProcessor, InputExample, InputFeatures
 from ...file_utils import is_tf_available
 
@@ -11,6 +13,7 @@ if is_tf_available():
 
 logger = logging.getLogger(__name__)
 
+
 def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
                                        doc_stride, max_query_length, is_training,
                                        cls_token_at_end=False,
@@ -265,6 +268,125 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
 
     return features
 
+
+def read_squad_examples(input_file, is_training, version_2_with_negative):
+    """Read a SQuAD json file into a list of SquadExample; answers are only parsed when `is_training` is True."""
+    with open(input_file, "r", encoding='utf-8') as reader:
+        input_data = json.load(reader)["data"]
+
+    def is_whitespace(c):
+        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:  # 0x202F: narrow no-break space
+            return True
+        return False
+
+    examples = []
+    for entry in input_data:
+        for paragraph in entry["paragraphs"]:
+            paragraph_text = paragraph["context"]
+            doc_tokens = []
+            char_to_word_offset = []
+            prev_is_whitespace = True
+            for c in paragraph_text:
+                if is_whitespace(c):
+                    prev_is_whitespace = True
+                else:
+                    if prev_is_whitespace:
+                        doc_tokens.append(c)
+                    else:
+                        doc_tokens[-1] += c
+                    prev_is_whitespace = False
+                char_to_word_offset.append(len(doc_tokens) - 1)  # one entry per character, whitespace included
+
+            for qa in paragraph["qas"]:
+                qas_id = qa["id"]
+                question_text = qa["question"]
+                start_position = None
+                end_position = None
+                orig_answer_text = None
+                is_impossible = False
+                if is_training:
+                    if version_2_with_negative:
+                        is_impossible = qa["is_impossible"]
+                    if (len(qa["answers"]) != 1) and (not is_impossible):
+                        raise ValueError(
+                            "For training, each question should have exactly 1 answer.")
+                    if not is_impossible:
+                        answer = qa["answers"][0]
+                        orig_answer_text = answer["text"]
+                        answer_offset = answer["answer_start"]
+                        answer_length = len(orig_answer_text)
+                        start_position = char_to_word_offset[answer_offset]
+                        end_position = char_to_word_offset[answer_offset + answer_length - 1]
+                        # Only add answers where the text can be exactly recovered from the
+                        # document. If this CAN'T happen it's likely due to weird Unicode
+                        # stuff so we will just skip the example.
+                        #
+                        # Note that this means for training mode, every example is NOT
+                        # guaranteed to be preserved.
+                        actual_text = " ".join(doc_tokens[start_position:(end_position + 1)])
+                        cleaned_answer_text = " ".join(
+                            whitespace_tokenize(orig_answer_text))
+                        if actual_text.find(cleaned_answer_text) == -1:
+                            logger.warning("Could not find answer: '%s' vs. '%s'",
+                                           actual_text, cleaned_answer_text)
+                            continue  # drops only this question; the rest of the paragraph is still read
+                    else:
+                        start_position = -1
+                        end_position = -1
+                        orig_answer_text = ""
+
+                example = SquadExample(
+                    qas_id=qas_id,
+                    question_text=question_text,
+                    doc_tokens=doc_tokens,
+                    orig_answer_text=orig_answer_text,
+                    start_position=start_position,
+                    end_position=end_position,
+                    is_impossible=is_impossible)
+                examples.append(example)
+    return examples
+
+
+class SquadExample(object):
+    """
+    A single training/test example for the Squad dataset: one question over a list of document word tokens.
+    For examples without an answer, the start and end position are -1; both default to None when not supplied.
+    """
+
+    def __init__(self,
+                 qas_id,
+                 question_text,
+                 doc_tokens,
+                 orig_answer_text=None,
+                 start_position=None,
+                 end_position=None,
+                 is_impossible=None):
+        self.qas_id = qas_id
+        self.question_text = question_text
+        self.doc_tokens = doc_tokens
+        self.orig_answer_text = orig_answer_text
+        self.start_position = start_position
+        self.end_position = end_position
+        self.is_impossible = is_impossible
+
+    def __str__(self):
+        return self.__repr__()
+
+    def __repr__(self):
+        s = ""
+        s += "qas_id: %s" % (self.qas_id)
+        s += ", question_text: %s" % (
+            self.question_text)
+        s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens))
+        if self.start_position is not None:  # explicit None test so a valid position 0 is still shown
+            s += ", start_position: %d" % (self.start_position)
+        if self.end_position is not None:  # same reasoning as for start_position
+            s += ", end_position: %d" % (self.end_position)
+        if self.is_impossible:
+            s += ", is_impossible: %r" % (self.is_impossible)
+        return s
+
+
 class SquadFeatures(object):
     """A single set of features of data."""
 
diff --git a/transformers/tokenization_utils.py b/transformers/tokenization_utils.py
index 4fa26a26f8..ba10e6b311 100644
--- a/transformers/tokenization_utils.py
+++ b/transformers/tokenization_utils.py
@@ -605,6 +605,10 @@ class PreTrainedTokenizer(object):
             vocabularies (BPE/SentencePieces/WordPieces).
 
             Take care of added tokens.
+
+            text: The sequence to be encoded.
+            return_tokens_mapped_to_origin: (optional) Set to True to return the index of each token in the initial whitespace tokenization. (default False).
+            **kwargs: passed to the child `self.tokenize()` method
         """
         def split_on_token(tok, text):
             result = []

From 9f374c8252330bffd669c43749b5e937ed31d90a Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Fri, 22 Nov 2019 16:27:15 -0500
Subject: [PATCH 03/91] `encode` and `encode_plus` handle attention masks and
 padding

---
 .../tests/tokenization_tests_commons.py       | 51 ++++++++++++
 transformers/tokenization_utils.py            | 77 ++++++++++++++++++-
 transformers/tokenization_xlnet.py            |  1 +
 3 files changed, 127 insertions(+), 2 deletions(-)

diff --git a/transformers/tests/tokenization_tests_commons.py b/transformers/tests/tokenization_tests_commons.py
index fdaf8cc137..d5b70d5266 100644
--- a/transformers/tests/tokenization_tests_commons.py
+++ b/transformers/tests/tokenization_tests_commons.py
@@ -335,3 +335,54 @@ class CommonTestCases:
             special_tokens_mask = tokenizer.get_special_tokens_mask(encoded_sequence_w_special, already_has_special_tokens=True)
             self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special))
             self.assertEqual(special_tokens_mask_orig, special_tokens_mask)
+
+        def test_padding_to_max_length(self):  # encode() must right-pad with pad_token_id only when max_length is given
+            tokenizer = self.get_tokenizer()
+
+            sequence = "Sequence"
+            padding_size = 10
+            padding_idx = tokenizer.pad_token_id
+
+            # Check that it correctly pads when a maximum length is specified along with the padding flag set to True
+            encoded_sequence = tokenizer.encode(sequence)
+            sequence_length = len(encoded_sequence)
+            padded_sequence = tokenizer.encode(sequence, max_length=sequence_length + padding_size, pad_to_max_length=True)
+            padded_sequence_length = len(padded_sequence)
+            self.assertEqual(sequence_length + padding_size, padded_sequence_length)
+            self.assertEqual(encoded_sequence + [padding_idx] * padding_size, padded_sequence)
+
+            # Check that nothing is done when a maximum length is not specified
+            encoded_sequence = tokenizer.encode(sequence)
+            sequence_length = len(encoded_sequence)
+            padded_sequence = tokenizer.encode(sequence, pad_to_max_length=True)
+            padded_sequence_length = len(padded_sequence)
+            self.assertEqual(sequence_length, padded_sequence_length)
+            self.assertEqual(encoded_sequence, padded_sequence)
+
+        def test_encode_plus_with_padding(self):  # every list returned by encode_plus() must be right-padded to max_length
+            tokenizer = self.get_tokenizer()
+
+            sequence = "Sequence"
+            padding_size = 10
+            padding_idx = tokenizer.pad_token_id
+            token_type_padding_idx = tokenizer.pad_token_type_id
+
+            encoded_sequence = tokenizer.encode_plus(sequence, return_special_tokens_mask=True)
+            input_ids = encoded_sequence['input_ids']
+            token_type_ids = encoded_sequence['token_type_ids']
+            attention_mask = encoded_sequence['attention_mask']
+            special_tokens_mask = encoded_sequence['special_tokens_mask']
+            sequence_length = len(input_ids)
+
+            padded_sequence = tokenizer.encode_plus(sequence, max_length=sequence_length + padding_size, pad_to_max_length=True, return_special_tokens_mask=True)
+            padded_input_ids = padded_sequence['input_ids']
+            padded_token_type_ids = padded_sequence['token_type_ids']
+            padded_attention_mask = padded_sequence['attention_mask']
+            padded_special_tokens_mask = padded_sequence['special_tokens_mask']
+            padded_sequence_length = len(padded_input_ids)
+
+            self.assertEqual(sequence_length + padding_size, padded_sequence_length)
+            self.assertEqual(input_ids + [padding_idx] * padding_size, padded_input_ids)
+            self.assertEqual(token_type_ids + [token_type_padding_idx] * padding_size, padded_token_type_ids)
+            self.assertEqual(attention_mask + [0] * padding_size, padded_attention_mask)  # padding is expected to be masked out with 0
+            self.assertEqual(special_tokens_mask + [1] * padding_size, padded_special_tokens_mask)  # padding is expected to be flagged as special (1)
\ No newline at end of file
diff --git a/transformers/tokenization_utils.py b/transformers/tokenization_utils.py
index ba10e6b311..3214699e12 100644
--- a/transformers/tokenization_utils.py
+++ b/transformers/tokenization_utils.py
@@ -190,6 +190,11 @@ class PreTrainedTokenizer(object):
         """ Id of the padding token in the vocabulary. Log an error if used while not having been set. """
         return self.convert_tokens_to_ids(self.pad_token)
 
+    @property
+    def pad_token_type_id(self):
+        """ Id of the padding token type in the vocabulary. Defaults to 0 unless overridden by the tokenizer. """
+        return self._pad_token_type_id
+
     @property
     def cls_token_id(self):
         """ Id of the classification token in the vocabulary. E.g. to extract a summary of an input sequence leveraging self-attention along the full depth of the model. Log an error if used while not having been set. """
@@ -213,6 +218,7 @@ class PreTrainedTokenizer(object):
         self._pad_token = None
         self._cls_token = None
         self._mask_token = None
+        self._pad_token_type_id = 0
         self._additional_special_tokens = []
 
         self.max_len = max_len if max_len is not None else int(1e12)
@@ -696,6 +702,7 @@ class PreTrainedTokenizer(object):
                max_length=None,
                stride=0,
                truncation_strategy='longest_first',
+               pad_to_max_length=False,
                return_tensors=None,
                **kwargs):
         """
@@ -722,6 +729,8 @@ class PreTrainedTokenizer(object):
                 - 'only_first': Only truncate the first sequence
                 - 'only_second': Only truncate the second sequence
                 - 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length)
+            pad_to_max_length: if set to `True`, the returned sequences will be padded according to the model's 
+                padding index, up to their max length. If no max length is specified, no padding is done.
             return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant
                 or PyTorch torch.Tensor instead of a list of python integers.
             **kwargs: passed to the `self.tokenize()` method
@@ -732,6 +741,7 @@ class PreTrainedTokenizer(object):
                                           add_special_tokens=add_special_tokens,
                                           stride=stride,
                                           truncation_strategy=truncation_strategy,
+                                          pad_to_max_length=pad_to_max_length,
                                           return_tensors=return_tensors,
                                           **kwargs)
 
@@ -744,7 +754,12 @@ class PreTrainedTokenizer(object):
                     max_length=None,
                     stride=0,
                     truncation_strategy='longest_first',
+                    pad_to_max_length=False,
                     return_tensors=None,
+                    return_token_type_ids=True,
+                    return_attention_mask=True,
+                    return_overflowing_tokens=False,
+                    return_special_tokens_mask=False,
                     **kwargs):
         """
         Returns a dictionary containing the encoded sequence or sequence pair and additional informations:
@@ -769,9 +784,37 @@ class PreTrainedTokenizer(object):
                 - 'only_first': Only truncate the first sequence
                 - 'only_second': Only truncate the second sequence
                 - 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length)
+            pad_to_max_length: if set to `True`, the returned sequences will be padded according to the model's 
+                padding index, up to their max length. If no max length is specified, no padding is done.
             return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant
                 or PyTorch torch.Tensor instead of a list of python integers.
+            return_token_type_ids: (optional) Set to False to avoid returning token_type_ids (default True).
+            return_attention_mask: (optional) Set to False to avoid returning attention mask (default True).
+            return_overflowing_tokens: (optional) Set to True to return overflowing token information (default False).
+            return_special_tokens_mask: (optional) Set to True to return special tokens mask information (default False).
             **kwargs: passed to the `self.tokenize()` method
+
+        Return:
+            A Dictionary of shape::
+
+                {
+                    input_ids: list[int],
+                    token_type_ids: list[int] if return_token_type_ids is True (default)
+                    attention_mask: list[int] if return_attention_mask is True (default)
+                    overflowing_tokens: list[int] if a ``max_length`` is specified and return_overflowing_tokens is True
+                    num_truncated_tokens: int if a ``max_length`` is specified and return_overflowing_tokens is True
+                    special_tokens_mask: list[int] if ``add_special_tokens`` is set to ``True`` and return_special_tokens_mask is True
+                }
+
+            With the fields:
+                ``input_ids``: list of token ids to be fed to a model
+                ``token_type_ids``: list of token type ids to be fed to a model
+                ``attention_mask``: list of indices specifying which tokens should be attended to by the model
+
+                ``overflowing_tokens``: list of overflowing tokens if a max length is specified.
+                ``num_truncated_tokens``: number of overflowing tokens if a ``max_length`` is specified
+                ``special_tokens_mask``: if adding special tokens, this is a list of [0, 1], with 1 specifying special added
+                tokens and 0 specifying sequence tokens.
         """
 
         def get_input_ids(text):
@@ -790,13 +833,24 @@ class PreTrainedTokenizer(object):
         return self.prepare_for_model(first_ids,
                                       pair_ids=second_ids,
                                       max_length=max_length,
+                                      pad_to_max_length=pad_to_max_length,
                                       add_special_tokens=add_special_tokens,
                                       stride=stride,
                                       truncation_strategy=truncation_strategy,
-                                      return_tensors=return_tensors)
+                                      return_tensors=return_tensors,
+                                      return_attention_mask=return_attention_mask,
+                                      return_token_type_ids=return_token_type_ids,
+                                      return_overflowing_tokens=return_overflowing_tokens,
+                                      return_special_tokens_mask=return_special_tokens_mask)
 
     def prepare_for_model(self, ids, pair_ids=None, max_length=None, add_special_tokens=True, stride=0,
-                          truncation_strategy='longest_first', return_tensors=None):
+                          truncation_strategy='longest_first',
+                          pad_to_max_length=False,
+                          return_tensors=None,
+                          return_token_type_ids=True,
+                          return_attention_mask=True,
+                          return_overflowing_tokens=False,
+                          return_special_tokens_mask=False):
         """
         Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model.
         It adds special tokens, truncates
@@ -819,8 +873,14 @@ class PreTrainedTokenizer(object):
                 - 'only_first': Only truncate the first sequence
                 - 'only_second': Only truncate the second sequence
                 - 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length)
+            pad_to_max_length: if set to `True`, the returned sequences will be padded according to the model's 
+                padding index, up to their max length. If no max length is specified, no padding is done.
             return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant
                 or PyTorch torch.Tensor instead of a list of python integers.
+            return_token_type_ids: (optional) Set to False to avoid returning token_type_ids (default True).
+            return_attention_mask: (optional) Set to False to avoid returning attention mask (default True).
+            return_overflowing_tokens: (optional) Set to True to return overflowing token information (default False).
+            return_special_tokens_mask: (optional) Set to True to return special tokens mask information (default False).
 
         Return:
             A Dictionary of shape::
@@ -883,6 +943,19 @@ class PreTrainedTokenizer(object):
                            "for this model ({} > {}). Running this sequence through the model will result in "
                            "indexing errors".format(len(ids), self.max_len))
                            
+        if pad_to_max_length and max_length and len(encoded_inputs["input_ids"]) < max_length:
+            difference = max_length - len(encoded_inputs["input_ids"])
+            if return_attention_mask:
+                encoded_inputs["attention_mask"] = [1] * len(encoded_inputs["input_ids"]) + [0] * difference
+            if return_token_type_ids:
+                encoded_inputs["token_type_ids"] += [self.pad_token_type_id] * difference
+            if return_special_tokens_mask:
+                encoded_inputs["special_tokens_mask"] += [1] * difference
+
+            encoded_inputs["input_ids"] += [self.pad_token_id] * difference
+        elif return_attention_mask:
+            encoded_inputs["attention_mask"] = [1] * len(encoded_inputs["input_ids"])
+
         return encoded_inputs
 
     def truncate_sequences(self, ids, pair_ids=None, num_tokens_to_remove=0, truncation_strategy='longest_first', stride=0):
diff --git a/transformers/tokenization_xlnet.py b/transformers/tokenization_xlnet.py
index a4f1a6e3ba..3ea71f4438 100644
--- a/transformers/tokenization_xlnet.py
+++ b/transformers/tokenization_xlnet.py
@@ -74,6 +74,7 @@ class XLNetTokenizer(PreTrainedTokenizer):
 
         self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
         self.max_len_sentences_pair = self.max_len - 3  # take into account special tokens
+        self._pad_token_type_id = 3
 
         try:
             import sentencepiece as spm

From a7dafe2f41222469797f1a67232961d67bd2e519 Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Thu, 21 Nov 2019 11:30:40 -0500
Subject: [PATCH 04/91] Padding strategy (left and right) rather than boolean
 flag

---
 .../tests/tokenization_tests_commons.py       | 43 +++++++++++---
 transformers/tokenization_utils.py            | 58 ++++++++++++++-----
 2 files changed, 77 insertions(+), 24 deletions(-)

diff --git a/transformers/tests/tokenization_tests_commons.py b/transformers/tests/tokenization_tests_commons.py
index d5b70d5266..40d68d0ab2 100644
--- a/transformers/tests/tokenization_tests_commons.py
+++ b/transformers/tests/tokenization_tests_commons.py
@@ -343,21 +343,33 @@ class CommonTestCases:
             padding_size = 10
             padding_idx = tokenizer.pad_token_id
 
-            # Check that it correctly pads when a maximum length is specified along with the padding flag set to True
+            # RIGHT PADDING - Check that it correctly pads when a maximum length is specified along with padding_strategy='right'
             encoded_sequence = tokenizer.encode(sequence)
             sequence_length = len(encoded_sequence)
-            padded_sequence = tokenizer.encode(sequence, max_length=sequence_length + padding_size, pad_to_max_length=True)
+            padded_sequence = tokenizer.encode(sequence, max_length=sequence_length + padding_size, padding_strategy='right')
             padded_sequence_length = len(padded_sequence)
             assert sequence_length + padding_size == padded_sequence_length
             assert encoded_sequence + [padding_idx] * padding_size == padded_sequence
 
-            # Check that nothing is done when a maximum length is not specified
+            # LEFT PADDING - Check that it correctly pads when a maximum length is specified along with padding_strategy='left'
             encoded_sequence = tokenizer.encode(sequence)
             sequence_length = len(encoded_sequence)
-            padded_sequence = tokenizer.encode(sequence, pad_to_max_length=True)
+            padded_sequence = tokenizer.encode(sequence, max_length=sequence_length + padding_size, padding_strategy='left')
             padded_sequence_length = len(padded_sequence)
-            assert sequence_length == padded_sequence_length
-            assert encoded_sequence == padded_sequence
+            assert sequence_length + padding_size == padded_sequence_length
+            assert [padding_idx] * padding_size + encoded_sequence == padded_sequence
+
+            # RIGHT & LEFT PADDING - Check that nothing is done when a maximum length is not specified
+            encoded_sequence = tokenizer.encode(sequence)
+            sequence_length = len(encoded_sequence)
+            padded_sequence_right = tokenizer.encode(sequence, padding_strategy='right')
+            padded_sequence_right_length = len(padded_sequence_right)
+            padded_sequence_left = tokenizer.encode(sequence, padding_strategy='left')
+            padded_sequence_left_length = len(padded_sequence_left)
+            assert sequence_length == padded_sequence_right_length
+            assert encoded_sequence == padded_sequence_right
+            assert sequence_length == padded_sequence_left_length
+            assert encoded_sequence == padded_sequence_left
 
         def test_encode_plus_with_padding(self):
             tokenizer = self.get_tokenizer()
@@ -374,7 +386,8 @@ class CommonTestCases:
             special_tokens_mask = encoded_sequence['special_tokens_mask']
             sequence_length = len(input_ids)
 
-            padded_sequence = tokenizer.encode_plus(sequence, max_length=sequence_length + padding_size, pad_to_max_length=True, return_special_tokens_mask=True)
+            # Test right padding
+            padded_sequence = tokenizer.encode_plus(sequence, max_length=sequence_length + padding_size, padding_strategy='right', return_special_tokens_mask=True)
             padded_input_ids = padded_sequence['input_ids']
             padded_token_type_ids = padded_sequence['token_type_ids']
             padded_attention_mask = padded_sequence['attention_mask']
@@ -385,4 +398,18 @@ class CommonTestCases:
             assert input_ids + [padding_idx] * padding_size == padded_input_ids
             assert token_type_ids + [token_type_padding_idx] * padding_size == padded_token_type_ids
             assert attention_mask + [0] * padding_size == padded_attention_mask 
-            assert special_tokens_mask + [1] * padding_size == padded_special_tokens_mask 
\ No newline at end of file
+            assert special_tokens_mask + [1] * padding_size == padded_special_tokens_mask 
+
+            # Test left padding
+            padded_sequence = tokenizer.encode_plus(sequence, max_length=sequence_length + padding_size, padding_strategy='left', return_special_tokens_mask=True)
+            padded_input_ids = padded_sequence['input_ids']
+            padded_token_type_ids = padded_sequence['token_type_ids']
+            padded_attention_mask = padded_sequence['attention_mask']
+            padded_special_tokens_mask = padded_sequence['special_tokens_mask']
+            padded_sequence_length = len(padded_input_ids)
+
+            assert sequence_length + padding_size == padded_sequence_length
+            assert [padding_idx] * padding_size + input_ids == padded_input_ids
+            assert [token_type_padding_idx] * padding_size + token_type_ids == padded_token_type_ids
+            assert [0] * padding_size + attention_mask == padded_attention_mask 
+            assert [1] * padding_size + special_tokens_mask == padded_special_tokens_mask 
\ No newline at end of file
diff --git a/transformers/tokenization_utils.py b/transformers/tokenization_utils.py
index 3214699e12..dbbabd0e1a 100644
--- a/transformers/tokenization_utils.py
+++ b/transformers/tokenization_utils.py
@@ -702,7 +702,7 @@ class PreTrainedTokenizer(object):
                max_length=None,
                stride=0,
                truncation_strategy='longest_first',
-               pad_to_max_length=False,
+               padding_strategy=None,
                return_tensors=None,
                **kwargs):
         """
@@ -729,8 +729,12 @@ class PreTrainedTokenizer(object):
                 - 'only_first': Only truncate the first sequence
                 - 'only_second': Only truncate the second sequence
                 - 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length)
-            pad_to_max_length: if set to `True`, the returned sequences will be padded according to the model's 
+            padding_strategy: if set to a strategy, the returned sequences will be padded according to the model's 
                 padding index, up to their max length. If no max length is specified, no padding is done.
+                The strategies are handled by the following strings:
+                - 'left': pads on the left of the sequences
+                - 'right': pads on the right of the sequences   
+                Defaults to None: no padding.
             return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant
                 or PyTorch torch.Tensor instead of a list of python integers.
             **kwargs: passed to the `self.tokenize()` method
@@ -741,7 +745,7 @@ class PreTrainedTokenizer(object):
                                           add_special_tokens=add_special_tokens,
                                           stride=stride,
                                           truncation_strategy=truncation_strategy,
-                                          pad_to_max_length=pad_to_max_length,
+                                          padding_strategy=padding_strategy,
                                           return_tensors=return_tensors,
                                           **kwargs)
 
@@ -754,7 +758,7 @@ class PreTrainedTokenizer(object):
                     max_length=None,
                     stride=0,
                     truncation_strategy='longest_first',
-                    pad_to_max_length=False,
+                    padding_strategy=None,
                     return_tensors=None,
                     return_token_type_ids=True,
                     return_attention_mask=True,
@@ -784,8 +788,12 @@ class PreTrainedTokenizer(object):
                 - 'only_first': Only truncate the first sequence
                 - 'only_second': Only truncate the second sequence
                 - 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length)
-            pad_to_max_length: if set to `True`, the returned sequences will be padded according to the model's 
+            padding_strategy: if set to a strategy, the returned sequences will be padded according to the model's 
                 padding index, up to their max length. If no max length is specified, no padding is done.
+                The strategies are handled by the following strings:
+                - 'left': pads on the left of the sequences
+                - 'right': pads on the right of the sequences   
+                Defaults to None: no padding.
             return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant
                 or PyTorch torch.Tensor instead of a list of python integers.
             return_token_type_ids: (optional) Set to False to avoid returning token_type_ids (default True).
@@ -833,7 +841,7 @@ class PreTrainedTokenizer(object):
         return self.prepare_for_model(first_ids,
                                       pair_ids=second_ids,
                                       max_length=max_length,
-                                      pad_to_max_length=pad_to_max_length,
+                                      padding_strategy=padding_strategy,
                                       add_special_tokens=add_special_tokens,
                                       stride=stride,
                                       truncation_strategy=truncation_strategy,
@@ -845,7 +853,7 @@ class PreTrainedTokenizer(object):
 
     def prepare_for_model(self, ids, pair_ids=None, max_length=None, add_special_tokens=True, stride=0,
                           truncation_strategy='longest_first',
-                          pad_to_max_length=False,
+                          padding_strategy=None,
                           return_tensors=None,
                           return_token_type_ids=True,
                           return_attention_mask=True,
@@ -873,8 +881,12 @@ class PreTrainedTokenizer(object):
                 - 'only_first': Only truncate the first sequence
                 - 'only_second': Only truncate the second sequence
                 - 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length)
-            pad_to_max_length: if set to `True`, the returned sequences will be padded according to the model's 
+            padding_strategy: if set to a strategy, the returned sequences will be padded according to the model's 
                 padding index, up to their max length. If no max length is specified, no padding is done.
+                The strategies are handled by the following strings:
+                - 'left': pads on the left of the sequences
+                - 'right': pads on the right of the sequences            
+                Defaults to None: no padding.
             return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant
                 or PyTorch torch.Tensor instead of a list of python integers.
             return_token_type_ids: (optional) Set to False to avoid returning token_type_ids (default True).
@@ -943,16 +955,30 @@ class PreTrainedTokenizer(object):
                            "for this model ({} > {}). Running this sequence through the model will result in "
                            "indexing errors".format(len(ids), self.max_len))
                            
-        if pad_to_max_length and max_length and len(encoded_inputs["input_ids"]) < max_length:
+        if padding_strategy is not None and max_length and len(encoded_inputs["input_ids"]) < max_length:
             difference = max_length - len(encoded_inputs["input_ids"])
-            if return_attention_mask:
-                encoded_inputs["attention_mask"] = [1] * len(encoded_inputs["input_ids"]) + [0] * difference
-            if return_token_type_ids:
-                encoded_inputs["token_type_ids"] += [self.pad_token_type_id] * difference
-            if return_special_tokens_mask:
-                encoded_inputs["special_tokens_mask"] += [1] * difference
 
-            encoded_inputs["input_ids"] += [self.pad_token_id] * difference
+            if padding_strategy == 'right':
+                if return_attention_mask:
+                    encoded_inputs["attention_mask"] = [1] * len(encoded_inputs["input_ids"]) + [0] * difference
+                if return_token_type_ids:
+                    encoded_inputs["token_type_ids"] = encoded_inputs["token_type_ids"] + [self.pad_token_type_id] * difference
+                if return_special_tokens_mask:
+                    encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
+                encoded_inputs["input_ids"] = encoded_inputs["input_ids"] + [self.pad_token_id] * difference
+
+            elif padding_strategy == 'left':
+                if return_attention_mask:
+                    encoded_inputs["attention_mask"] =  [0] * difference + [1] * len(encoded_inputs["input_ids"])
+                if return_token_type_ids:
+                    encoded_inputs["token_type_ids"] = [self.pad_token_type_id] * difference + encoded_inputs["token_type_ids"]
+                if return_special_tokens_mask:
+                    encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
+                encoded_inputs["input_ids"] = [self.pad_token_id] * difference + encoded_inputs["input_ids"]
+
+            else:
+                raise ValueError("Invalid padding strategy:" + str(padding_strategy))
+            
         elif return_attention_mask:
             encoded_inputs["attention_mask"] = [1] * len(encoded_inputs["input_ids"])
 

From a5a8a6175fb5cc1e993366add026ba06386bde10 Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Thu, 21 Nov 2019 19:18:20 -0500
Subject: [PATCH 05/91] Works for BERT

---
 transformers/data/processors/squad.py | 507 ++++++++++++++++++++++----
 1 file changed, 432 insertions(+), 75 deletions(-)

diff --git a/transformers/data/processors/squad.py b/transformers/data/processors/squad.py
index 1900e9f0ce..a0f2408a16 100644
--- a/transformers/data/processors/squad.py
+++ b/transformers/data/processors/squad.py
@@ -3,6 +3,7 @@ import collections
 import logging
 import os
 import json
+import numpy as np
 
 from ...tokenization_bert import BasicTokenizer, whitespace_tokenize
 from .utils import DataProcessor, InputExample, InputFeatures
@@ -13,10 +14,68 @@ if is_tf_available():
 
 logger = logging.getLogger(__name__)
 
+def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer,
+                        orig_answer_text):
+    """Returns tokenized answer spans that better match the annotated answer."""
+    tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text))
+
+    for new_start in range(input_start, input_end + 1):
+        for new_end in range(input_end, new_start - 1, -1):
+            text_span = " ".join(doc_tokens[new_start:(new_end + 1)])
+            if text_span == tok_answer_text:
+                return (new_start, new_end)
+
+    return (input_start, input_end)
+
+def _check_is_max_context(doc_spans, cur_span_index, position):
+    """Check if this is the 'max context' doc span for the token."""
+    best_score = None
+    best_span_index = None
+    for (span_index, doc_span) in enumerate(doc_spans):
+        end = doc_span.start + doc_span.length - 1
+        if position < doc_span.start:
+            continue
+        if position > end:
+            continue
+        num_left_context = position - doc_span.start
+        num_right_context = end - position
+        score = min(num_left_context, num_right_context) + 0.01 * doc_span.length
+        if best_score is None or score > best_score:
+            best_score = score
+            best_span_index = span_index
+
+    return cur_span_index == best_span_index
+
+
+def _new_check_is_max_context(doc_spans, cur_span_index, position):
+    """Check if this is the 'max context' doc span for the token."""
+    # if len(doc_spans) == 1:
+        # return True
+    best_score = None
+    best_span_index = None
+    for (span_index, doc_span) in enumerate(doc_spans):
+        end = doc_span["start"] + doc_span["length"] - 1
+        if position < doc_span["start"]:
+            continue
+        if position > end:
+            continue
+        num_left_context = position - doc_span["start"]
+        num_right_context = end - position
+        score = min(num_left_context, num_right_context) + 0.01 * doc_span["length"]
+        if best_score is None or score > best_score:
+            best_score = score
+            best_span_index = span_index
+
+    return cur_span_index == best_span_index
+
+def _is_whitespace(c):
+    if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
+        return True
+    return False
 
 def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
                                        doc_stride, max_query_length, is_training,
-                                       cls_token_at_end=False,
+                                       cls_token_at_end=True,
                                        cls_token='[CLS]', sep_token='[SEP]', pad_token=0,
                                        sequence_a_segment_id=0, sequence_b_segment_id=1,
                                        cls_token_segment_id=0, pad_token_segment_id=0,
@@ -24,57 +83,184 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
                                        sequence_a_is_doc=False):
     """Loads a data file into a list of `InputBatch`s."""
 
-    # Defining helper methods
-    def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer,
-                         orig_answer_text):
-        """Returns tokenized answer spans that better match the annotated answer."""
-        tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text))
-
-        for new_start in range(input_start, input_end + 1):
-            for new_end in range(input_end, new_start - 1, -1):
-                text_span = " ".join(doc_tokens[new_start:(new_end + 1)])
-                if text_span == tok_answer_text:
-                    return (new_start, new_end)
-
-        return (input_start, input_end)
-    def _check_is_max_context(doc_spans, cur_span_index, position):
-        """Check if this is the 'max context' doc span for the token."""
-        best_score = None
-        best_span_index = None
-        for (span_index, doc_span) in enumerate(doc_spans):
-            end = doc_span.start + doc_span.length - 1
-            if position < doc_span.start:
-                continue
-            if position > end:
-                continue
-            num_left_context = position - doc_span.start
-            num_right_context = end - position
-            score = min(num_left_context, num_right_context) + 0.01 * doc_span.length
-            if best_score is None or score > best_score:
-                best_score = score
-                best_span_index = span_index
-
-        return cur_span_index == best_span_index
-    
+    # Defining helper methods    
     unique_id = 1000000000
 
     features = []
+    new_features = []
     for (example_index, example) in enumerate(tqdm(examples)):
-        query_tokens = tokenizer.tokenize(example.question_text)
 
-        if len(query_tokens) > max_query_length:
-            query_tokens = query_tokens[0:max_query_length]
+        doc_tokens = []
+        char_to_word_offset = []
+        prev_is_whitespace = True
+
+        # Split on whitespace so that different tokens may be attributed to their original position.
+        for c in example.context_text:
+            if _is_whitespace(c):
+                prev_is_whitespace = True
+            else:
+                if prev_is_whitespace:
+                    doc_tokens.append(c)
+                else:
+                    doc_tokens[-1] += c
+                prev_is_whitespace = False
+            char_to_word_offset.append(len(doc_tokens) - 1)
+
+        if is_training:
+            # Get start and end position
+            answer_length = len(example.answer_text)
+            start_position = char_to_word_offset[example.start_position]
+            end_position = char_to_word_offset[example.start_position + answer_length - 1]
+
+            # If the answer cannot be found in the text, then skip this example.
+            actual_text = " ".join(doc_tokens[start_position:(end_position + 1)])
+            cleaned_answer_text = " ".join(whitespace_tokenize(example.answer_text))
+            if actual_text.find(cleaned_answer_text) == -1:
+                logger.warning("Could not find answer: '%s' vs. '%s'", actual_text, cleaned_answer_text)
+                continue
 
         tok_to_orig_index = []
         orig_to_tok_index = []
         all_doc_tokens = []
-        for (i, token) in enumerate(example.doc_tokens):
+        for (i, token) in enumerate(doc_tokens):
             orig_to_tok_index.append(len(all_doc_tokens))
             sub_tokens = tokenizer.tokenize(token)
             for sub_token in sub_tokens:
                 tok_to_orig_index.append(i)
                 all_doc_tokens.append(sub_token)
 
+        spans = []
+        
+        truncated_query = tokenizer.encode(example.question_text, add_special_tokens=False, max_length=max_query_length)
+        sequence_added_tokens = tokenizer.max_len - tokenizer.max_len_single_sentence 
+        sequence_pair_added_tokens = tokenizer.max_len - tokenizer.max_len_sentences_pair 
+
+        encoded_dict = tokenizer.encode_plus(
+            truncated_query, 
+            all_doc_tokens, 
+            max_length=max_seq_length, 
+            padding_strategy='right',
+            stride=max_seq_length - doc_stride - len(truncated_query) - sequence_pair_added_tokens,
+            return_overflowing_tokens=True, 
+            truncation_strategy='only_second'
+        )
+
+        ids = encoded_dict['input_ids']
+        print("Ids computes; position of the first padding", ids.index(tokenizer.pad_token_id) if tokenizer.pad_token_id in ids else None)
+        non_padded_ids = ids[:ids.index(tokenizer.pad_token_id)] if tokenizer.pad_token_id in ids else ids
+        paragraph_len = min(len(all_doc_tokens) - len(spans) * doc_stride, max_seq_length - len(truncated_query) - sequence_pair_added_tokens)
+        tokens = tokenizer.convert_ids_to_tokens(non_padded_ids)
+
+        token_to_orig_map = {}
+        for i in range(paragraph_len):
+            token_to_orig_map[len(truncated_query) + sequence_added_tokens + i] = tok_to_orig_index[0 + i]
+
+        encoded_dict["paragraph_len"] = paragraph_len
+        encoded_dict["tokens"] = tokens
+        encoded_dict["token_to_orig_map"] = token_to_orig_map
+        encoded_dict["truncated_query_with_special_tokens_length"] = len(truncated_query) + sequence_added_tokens
+        encoded_dict["token_is_max_context"] = {}
+        encoded_dict["start"] = 0
+        encoded_dict["length"] = paragraph_len
+
+        spans.append(encoded_dict)
+        print("YESSIR", len(spans) * doc_stride < len(all_doc_tokens), "overflowing_tokens" in encoded_dict)
+        while len(spans) * doc_stride < len(all_doc_tokens) and "overflowing_tokens" in encoded_dict:
+            
+            overflowing_tokens = encoded_dict['overflowing_tokens']
+
+            print("OVERFLOW", len(overflowing_tokens))
+
+            encoded_dict = tokenizer.encode_plus(
+                truncated_query, 
+                overflowing_tokens, 
+                max_length=max_seq_length, 
+                return_overflowing_tokens=True, 
+                padding_strategy='right',
+                stride=max_seq_length - doc_stride - len(truncated_query) - sequence_pair_added_tokens,
+                truncation_strategy='only_second'
+            )
+
+            ids = encoded_dict['input_ids']
+            print("Ids computes; position of the first padding", ids.index(tokenizer.pad_token_id) if tokenizer.pad_token_id in ids else None)
+
+            # Length of the document without the query
+            paragraph_len = min(len(all_doc_tokens) - len(spans) * doc_stride, max_seq_length - len(truncated_query) - sequence_pair_added_tokens)
+
+            non_padded_ids = encoded_dict['input_ids'][:encoded_dict['input_ids'].index(tokenizer.pad_token_id)]
+            tokens = tokenizer.convert_ids_to_tokens(non_padded_ids)
+
+            token_to_orig_map = {}
+            for i in range(paragraph_len):
+                token_to_orig_map[len(truncated_query) + sequence_added_tokens + i] = tok_to_orig_index[len(spans) * doc_stride + i]
+
+            encoded_dict["paragraph_len"] = paragraph_len
+            encoded_dict["tokens"] = tokens
+            encoded_dict["token_to_orig_map"] = token_to_orig_map
+            encoded_dict["truncated_query_with_special_tokens_length"] = len(truncated_query) + sequence_added_tokens
+            encoded_dict["token_is_max_context"] = {}
+            encoded_dict["start"] = len(spans) * doc_stride
+            encoded_dict["length"] = paragraph_len
+
+            # split_token_index = doc_span.start + i
+            # token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index]
+
+            # is_max_context = _check_is_max_context(doc_spans, doc_span_index,
+            #                                         split_token_index)
+            # token_is_max_context[len(tokens)] = is_max_context
+            # tokens.append(all_doc_tokens[split_token_index])
+
+            spans.append(encoded_dict)
+
+        for doc_span_index in range(len(spans)):
+            for j in range(spans[doc_span_index]["paragraph_len"]):
+                is_max_context = _new_check_is_max_context(spans, doc_span_index, doc_span_index * doc_stride + j)
+                index = spans[doc_span_index]["truncated_query_with_special_tokens_length"] + j
+                spans[doc_span_index]["token_is_max_context"][index] = is_max_context
+
+        print("new span", len(spans))
+        for span in spans:
+            # Identify the position of the CLS token
+            cls_index = span['input_ids'].index(tokenizer.cls_token_id)
+
+            # p_mask: mask with 1 for token than cannot be in the answer (0 for token which can be in an answer)
+            # Original TF implem also keep the classification token (set to 0) (not sure why...)
+            p_mask = np.array(span['token_type_ids'])
+
+            # Convert all SEP indices to '0' before inversion
+            p_mask[np.where(np.array(span["input_ids"]) == tokenizer.sep_token_id)[0]] = 0
+
+            # Limit positive values to one
+            p_mask = 1 - np.minimum(p_mask, 1)
+
+            # Set the CLS index to '0'
+            p_mask[cls_index] = 0
+
+            print("new features length", len(new_features))
+
+            new_features.append(NewSquadFeatures(
+                span['input_ids'],
+                span['attention_mask'],
+                span['token_type_ids'],
+                cls_index,
+                p_mask.tolist(),
+
+                example_index=example_index,
+                unique_id=unique_id,
+                paragraph_len=span['paragraph_len'],
+                token_is_max_context=span["token_is_max_context"],
+                tokens=span["tokens"],
+                token_to_orig_map=span["token_to_orig_map"]
+            ))
+
+            unique_id += 1
+
+        # tokenize ...
+        query_tokens = tokenizer.tokenize(example.question_text)
+
+        if len(query_tokens) > max_query_length:
+            query_tokens = query_tokens[0:max_query_length]
+
         tok_start_position = None
         tok_end_position = None
         if is_training and example.is_impossible:
@@ -82,7 +268,7 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
             tok_end_position = -1
         if is_training and not example.is_impossible:
             tok_start_position = orig_to_tok_index[example.start_position]
-            if example.end_position < len(example.doc_tokens) - 1:
+            if example.end_position < len(doc_tokens) - 1:
                 tok_end_position = orig_to_tok_index[example.end_position + 1] - 1
             else:
                 tok_end_position = len(all_doc_tokens) - 1
@@ -101,14 +287,19 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
         doc_spans = []
         start_offset = 0
         while start_offset < len(all_doc_tokens):
+            print("OLD DOC CREATION BEGIN", start_offset, len(all_doc_tokens))
             length = len(all_doc_tokens) - start_offset
             if length > max_tokens_for_doc:
                 length = max_tokens_for_doc
             doc_spans.append(_DocSpan(start=start_offset, length=length))
             if start_offset + length == len(all_doc_tokens):
+                print("Done with this doc span, breaking out.", start_offset, length)
                 break
+            print("CHOOSING OFFSET", length, doc_stride)
             start_offset += min(length, doc_stride)
+            print("OLD DOC CREATION END", start_offset)
 
+        print("old span", len(doc_spans))
         for (doc_span_index, doc_span) in enumerate(doc_spans):
             tokens = []
             token_to_orig_map = {}
@@ -183,18 +374,20 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
             # tokens are attended to.
             input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)
 
+
+            
             # Zero-pad up to the sequence length.
             while len(input_ids) < max_seq_length:
                 input_ids.append(pad_token)
                 input_mask.append(0 if mask_padding_with_zero else 1)
                 segment_ids.append(pad_token_segment_id)
                 p_mask.append(1)
-
+            print("[OLD] Ids computed; position of the first padding", input_ids.index(tokenizer.pad_token_id) if tokenizer.pad_token_id in input_ids else None)
             assert len(input_ids) == max_seq_length
             assert len(input_mask) == max_seq_length
             assert len(segment_ids) == max_seq_length
 
-            span_is_impossible = example.is_impossible
+            span_is_impossible = example.is_impossible if hasattr(example, "is_impossible") else False
             start_position = None
             end_position = None
             if is_training and not span_is_impossible:
@@ -222,31 +415,32 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
                 start_position = cls_index
                 end_position = cls_index
 
-            if example_index < 20:
-                logger.info("*** Example ***")
-                logger.info("unique_id: %s" % (unique_id))
-                logger.info("example_index: %s" % (example_index))
-                logger.info("doc_span_index: %s" % (doc_span_index))
-                logger.info("tokens: %s" % " ".join(tokens))
-                logger.info("token_to_orig_map: %s" % " ".join([
-                    "%d:%d" % (x, y) for (x, y) in token_to_orig_map.items()]))
-                logger.info("token_is_max_context: %s" % " ".join([
-                    "%d:%s" % (x, y) for (x, y) in token_is_max_context.items()
-                ]))
-                logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
-                logger.info(
-                    "input_mask: %s" % " ".join([str(x) for x in input_mask]))
-                logger.info(
-                    "segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
-                if is_training and span_is_impossible:
-                    logger.info("impossible example")
-                if is_training and not span_is_impossible:
-                    answer_text = " ".join(tokens[start_position:(end_position + 1)])
-                    logger.info("start_position: %d" % (start_position))
-                    logger.info("end_position: %d" % (end_position))
-                    logger.info(
-                        "answer: %s" % (answer_text))
+            # if example_index < 20:
+            #     logger.info("*** Example ***")
+            #     logger.info("unique_id: %s" % (unique_id))
+            #     logger.info("example_index: %s" % (example_index))
+            #     logger.info("doc_span_index: %s" % (doc_span_index))
+            #     logger.info("tokens: %s" % str(tokens))
+            #     logger.info("token_to_orig_map: %s" % " ".join([
+            #         "%d:%d" % (x, y) for (x, y) in token_to_orig_map.items()]))
+            #     logger.info("token_is_max_context: %s" % " ".join([
+            #         "%d:%s" % (x, y) for (x, y) in token_is_max_context.items()
+            #     ]))
+            #     logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
+            #     logger.info(
+            #         "input_mask: %s" % " ".join([str(x) for x in input_mask]))
+            #     logger.info(
+            #         "segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
+            #     if is_training and span_is_impossible:
+            #         logger.info("impossible example")
+            #     if is_training and not span_is_impossible:
+            #         answer_text = " ".join(tokens[start_position:(end_position + 1)])
+            #         logger.info("start_position: %d" % (start_position))
+            #         logger.info("end_position: %d" % (end_position))
+            #         logger.info(
+            #             "answer: %s" % (answer_text))
 
+            print("features length", len(features))
             features.append(
                 SquadFeatures(
                     unique_id=unique_id,
@@ -266,7 +460,48 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
                     is_impossible=span_is_impossible))
             unique_id += 1
 
-    return features
+        assert len(features) == len(new_features)
+
+    assert len(features) == len(new_features)
+    for i in range(len(features)):
+        print(i)
+        feature, new_feature = features[i], new_features[i]
+        
+        input_ids = feature.input_ids
+        input_mask = feature.input_mask
+        segment_ids = feature.segment_ids
+        cls_index = feature.cls_index
+        p_mask = feature.p_mask
+        example_index = feature.example_index
+        paragraph_len = feature.paragraph_len
+        token_is_max_context = feature.token_is_max_context
+        tokens = feature.tokens
+        token_to_orig_map = feature.token_to_orig_map
+              
+        new_input_ids = new_feature.input_ids
+        new_input_mask = new_feature.attention_mask
+        new_segment_ids = new_feature.token_type_ids
+        new_cls_index = new_feature.cls_index
+        new_p_mask = new_feature.p_mask
+        new_example_index = new_feature.example_index
+        new_paragraph_len = new_feature.paragraph_len
+        new_token_is_max_context = new_feature.token_is_max_context
+        new_tokens = new_feature.tokens
+        new_token_to_orig_map = new_feature.token_to_orig_map
+
+        assert input_ids == new_input_ids
+        assert input_mask == new_input_mask
+        assert segment_ids == new_segment_ids
+        assert cls_index == new_cls_index
+        assert p_mask == new_p_mask
+        assert example_index == new_example_index
+        assert paragraph_len == new_paragraph_len
+        assert token_is_max_context == new_token_is_max_context
+        assert tokens == new_tokens
+        assert token_to_orig_map == new_token_to_orig_map
+
+
+    return new_features
 
 
 def read_squad_examples(input_file, is_training, version_2_with_negative):
@@ -347,6 +582,124 @@ def read_squad_examples(input_file, is_training, version_2_with_negative):
     return examples
 
 
+class SquadV1Processor(DataProcessor):
+    """Processor for the SQuAD data set."""
+
+    def get_example_from_tensor_dict(self, tensor_dict):
+        """See base class."""
+        return NewSquadExample(
+            tensor_dict['id'].numpy(),
+            tensor_dict['question'].numpy().decode('utf-8'),
+            tensor_dict['context'].numpy().decode('utf-8'),
+            tensor_dict['answers']['text'].numpy().decode('utf-8'),
+            tensor_dict['answers']['answers_start'].numpy().decode('utf-8'),
+            tensor_dict['title'].numpy().decode('utf-8')
+        )
+
+    def get_train_examples(self, data_dir):
+        """See base class."""
+        with open(os.path.join(data_dir, "train-v1.1.json"), "r", encoding='utf-8') as reader:
+            input_data = json.load(reader)["data"]
+        return self._create_examples(input_data, "train")
+
+    def get_dev_examples(self, data_dir):
+        """See base class."""
+        with open(os.path.join(data_dir, "dev-v1.1.json"), "r", encoding='utf-8') as reader:
+            input_data = json.load(reader)["data"]
+        return self._create_examples(input_data, "dev")
+
+    def get_labels(self):
+        """See base class."""
+        return ["0", "1"]
+
+    def _create_examples(self, input_data, set_type):
+        """Creates examples for the training and dev sets."""
+        
+        is_training = set_type == "train"
+        examples = []
+        for entry in input_data:
+            title = entry['title']
+            for paragraph in entry["paragraphs"]:
+                context_text = paragraph["context"]
+                for qa in paragraph["qas"]:
+                    qas_id = qa["id"]
+                    question_text = qa["question"]
+                    start_position = None
+                    answer_text = None
+                    if is_training:
+                        if (len(qa["answers"]) != 1):
+                            raise ValueError(
+                                "For training, each question should have exactly 1 answer.")
+                        answer = qa["answers"][0]
+                        answer_text = answer['text']
+                        start_position = answer['answer_start']
+
+                    example = NewSquadExample(
+                        qas_id=qas_id,
+                        question_text=question_text,
+                        context_text=context_text,
+                        answer_text=answer_text,
+                        start_position=start_position,
+                        title=title
+                    )
+                    examples.append(example)
+        return examples
+        
+
+
+class NewSquadExample(object):
+    """
+    A single training/test example for the Squad dataset, as loaded from disk.
+    """
+
+    def __init__(self,
+                 qas_id,
+                 question_text,
+                 context_text,
+                 answer_text,
+                 start_position,
+                 title):
+        self.qas_id = qas_id
+        self.question_text = question_text
+        self.context_text = context_text
+        self.answer_text = answer_text
+        self.start_position = start_position
+        self.title = title
+
+
+class NewSquadFeatures(object):
+    """
+    Single squad example features to be fed to a model.
+    Those features are model-specific.
+    """
+
+    def __init__(self,
+                 input_ids,
+                 attention_mask,
+                 token_type_ids,
+                 cls_index,
+                 p_mask,
+                 
+                 example_index,
+                 unique_id,
+                 paragraph_len,
+                 token_is_max_context,
+                 tokens,
+                 token_to_orig_map
+        ):
+        self.input_ids = input_ids 
+        self.attention_mask = attention_mask
+        self.token_type_ids = token_type_ids
+        self.cls_index = cls_index
+        self.p_mask = p_mask
+
+        self.example_index = example_index
+        self.unique_id = unique_id
+        self.paragraph_len = paragraph_len
+        self.token_is_max_context = token_is_max_context
+        self.tokens = tokens
+        self.token_to_orig_map = token_to_orig_map
+
 class SquadExample(object):
     """
     A single training/test example for the Squad dataset.
@@ -423,18 +776,22 @@ class SquadFeatures(object):
         self.is_impossible = is_impossible
 
     def __eq__(self, other):
-        return self.cls_index == other.cls_index and \
-                self.doc_span_index == other.doc_span_index and \
-                self.end_position == other.end_position and \
-                self.example_index == other.example_index and \
+        print(self.example_index == other.example_index)
+        print(self.input_ids == other.input_ids)
+        print(self.input_mask == other.attention_mask)
+        print(self.p_mask == other.p_mask)
+        print(self.paragraph_len == other.paragraph_len)
+        print(self.segment_ids == other.token_type_ids)
+        print(self.token_is_max_context == other.token_is_max_context)
+        print(self.token_to_orig_map == other.token_to_orig_map)
+        print(self.tokens == other.tokens)
+
+        return self.example_index == other.example_index and \
                 self.input_ids == other.input_ids and \
-                self.input_mask == other.input_mask and \
-                self.is_impossible == other.is_impossible and \
+                self.input_mask == other.attention_mask and \
                 self.p_mask == other.p_mask and \
                 self.paragraph_len == other.paragraph_len and \
-                self.segment_ids == other.segment_ids and \
-                self.start_position == other.start_position and \
+                self.segment_ids == other.token_type_ids and \
                 self.token_is_max_context == other.token_is_max_context and \
                 self.token_to_orig_map == other.token_to_orig_map and \
-                self.tokens == other.tokens and \
-                self.unique_id == other.unique_id
\ No newline at end of file
+                self.tokens == other.tokens
\ No newline at end of file

From c3ba6452377f085d0f59e15b97ac247bca24367e Mon Sep 17 00:00:00 2001
From: Lysandre <lysandre.debut@reseau.eseo.fr>
Date: Fri, 22 Nov 2019 14:36:49 -0500
Subject: [PATCH 06/91] Works for XLNet

---
 examples/run_squad.py                 | 38 ++++--------
 transformers/data/processors/squad.py | 84 +++++++++++++--------------
 2 files changed, 50 insertions(+), 72 deletions(-)

diff --git a/examples/run_squad.py b/examples/run_squad.py
index d4219c3096..634b566a46 100644
--- a/examples/run_squad.py
+++ b/examples/run_squad.py
@@ -16,6 +16,7 @@
 """ Finetuning the library models for question-answering on SQuAD (DistilBERT, Bert, XLM, XLNet)."""
 
 from __future__ import absolute_import, division, print_function
+from transformers.data.processors.squad import SquadV1Processor
 
 import argparse
 import logging
@@ -46,8 +47,7 @@ from transformers import (WEIGHTS_NAME, BertConfig,
 
 from transformers import AdamW, get_linear_schedule_with_warmup, squad_convert_examples_to_features, read_squad_examples as sread_squad_examples
 
-from utils_squad import (read_squad_examples, convert_examples_to_features,
-                         RawResult, write_predictions,
+from utils_squad import (RawResult, write_predictions,
                          RawResultExtended, write_predictions_extended)
 
 # The follwing import is the official SQuAD evaluation script (2.0).
@@ -289,7 +289,6 @@ def evaluate(args, model, tokenizer, prefix=""):
     results = evaluate_on_squad(evaluate_options)
     return results
 
-
 def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False):
     if args.local_rank not in [-1, 0] and not evaluate:
         torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache
@@ -308,9 +307,11 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal
         examples = read_squad_examples(input_file=input_file,
                                                 is_training=not evaluate,
                                                 version_2_with_negative=args.version_2_with_negative)
-
-        examples = examples[:10]
-        features = convert_examples_to_features(examples=examples,
+        keep_n_examples = 1000
+        processor = SquadV1Processor()
+        values = processor.get_dev_examples("examples/squad")
+        examples = values[:keep_n_examples]
+        features = squad_convert_examples_to_features(examples=exampless,
                                                 tokenizer=tokenizer,
                                                 max_seq_length=args.max_seq_length,
                                                 doc_stride=args.doc_stride,
@@ -320,29 +321,10 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal
                                                 pad_token_segment_id=3 if args.model_type in ['xlnet'] else 0,
                                                 cls_token_at_end=True if args.model_type in ['xlnet'] else False,
                                                 sequence_a_is_doc=True if args.model_type in ['xlnet'] else False)
-
-        exampless = sread_squad_examples(input_file=input_file,
-                                                is_training=not evaluate,
-                                                version_2_with_negative=args.version_2_with_negative)
-        exampless = exampless[:10]
-        features2 = squad_convert_examples_to_features(examples=exampless,
-                                                tokenizer=tokenizer,
-                                                max_seq_length=args.max_seq_length,
-                                                doc_stride=args.doc_stride,
-                                                max_query_length=args.max_query_length,
-                                                is_training=not evaluate,
-                                                cls_token_segment_id=2 if args.model_type in ['xlnet'] else 0,
-                                                pad_token_segment_id=3 if args.model_type in ['xlnet'] else 0,
-                                                cls_token_at_end=True if args.model_type in ['xlnet'] else False,
-                                                sequence_a_is_doc=True if args.model_type in ['xlnet'] else False)
-
-        print(features2)
-
-        for i in range(len(features)):
-            assert features[i] == features2[i]
-            print("Equal")
-
         print("DONE")
+
+        import sys
+        sys.exit()
         
         if args.local_rank in [-1, 0]:
             logger.info("Saving features into cached file %s", cached_features_file)
diff --git a/transformers/data/processors/squad.py b/transformers/data/processors/squad.py
index a0f2408a16..fb3d2ae4d4 100644
--- a/transformers/data/processors/squad.py
+++ b/transformers/data/processors/squad.py
@@ -83,6 +83,9 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
                                        sequence_a_is_doc=False):
     """Loads a data file into a list of `InputBatch`s."""
 
+    cls_token = tokenizer.cls_token
+    sep_token = tokenizer.sep_token
+
     # Defining helper methods    
     unique_id = 1000000000
 
@@ -136,24 +139,24 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
         sequence_pair_added_tokens = tokenizer.max_len - tokenizer.max_len_sentences_pair 
 
         encoded_dict = tokenizer.encode_plus(
-            truncated_query, 
-            all_doc_tokens, 
+            truncated_query if not sequence_a_is_doc else all_doc_tokens, 
+            all_doc_tokens if not sequence_a_is_doc else truncated_query, 
             max_length=max_seq_length, 
             padding_strategy='right',
             stride=max_seq_length - doc_stride - len(truncated_query) - sequence_pair_added_tokens,
             return_overflowing_tokens=True, 
-            truncation_strategy='only_second'
+            truncation_strategy='only_second' if not sequence_a_is_doc else 'only_first'
         )
 
         ids = encoded_dict['input_ids']
-        print("Ids computes; position of the first padding", ids.index(tokenizer.pad_token_id) if tokenizer.pad_token_id in ids else None)
         non_padded_ids = ids[:ids.index(tokenizer.pad_token_id)] if tokenizer.pad_token_id in ids else ids
         paragraph_len = min(len(all_doc_tokens) - len(spans) * doc_stride, max_seq_length - len(truncated_query) - sequence_pair_added_tokens)
         tokens = tokenizer.convert_ids_to_tokens(non_padded_ids)
 
         token_to_orig_map = {}
         for i in range(paragraph_len):
-            token_to_orig_map[len(truncated_query) + sequence_added_tokens + i] = tok_to_orig_index[0 + i]
+            index = len(truncated_query) + sequence_added_tokens + i if not sequence_a_is_doc else i 
+            token_to_orig_map[index] = tok_to_orig_index[0 + i]
 
         encoded_dict["paragraph_len"] = paragraph_len
         encoded_dict["tokens"] = tokens
@@ -164,35 +167,40 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
         encoded_dict["length"] = paragraph_len
 
         spans.append(encoded_dict)
-        print("YESSIR", len(spans) * doc_stride < len(all_doc_tokens), "overflowing_tokens" in encoded_dict)
+        # print("YESSIR", len(spans) * doc_stride < len(all_doc_tokens), "overflowing_tokens" in encoded_dict)
+
         while len(spans) * doc_stride < len(all_doc_tokens) and "overflowing_tokens" in encoded_dict:
-            
-            overflowing_tokens = encoded_dict['overflowing_tokens']
-
-            print("OVERFLOW", len(overflowing_tokens))
-
+            overflowing_tokens = encoded_dict["overflowing_tokens"]
             encoded_dict = tokenizer.encode_plus(
-                truncated_query, 
-                overflowing_tokens, 
+                truncated_query if not sequence_a_is_doc else overflowing_tokens, 
+                overflowing_tokens if not sequence_a_is_doc else truncated_query, 
                 max_length=max_seq_length, 
                 return_overflowing_tokens=True, 
                 padding_strategy='right',
                 stride=max_seq_length - doc_stride - len(truncated_query) - sequence_pair_added_tokens,
-                truncation_strategy='only_second'
+                truncation_strategy='only_second' if not sequence_a_is_doc else 'only_first'
             )
-
             ids = encoded_dict['input_ids']
-            print("Ids computes; position of the first padding", ids.index(tokenizer.pad_token_id) if tokenizer.pad_token_id in ids else None)
+            # print("Ids computes; position of the first padding", ids.index(tokenizer.pad_token_id) if tokenizer.pad_token_id in ids else None)
+
+            # print(encoded_dict["input_ids"].index(tokenizer.pad_token_id) if tokenizer.pad_token_id in encoded_dict["input_ids"] else None)
+            # print(len(spans) * doc_stride, len(all_doc_tokens))
+            
 
             # Length of the document without the query
             paragraph_len = min(len(all_doc_tokens) - len(spans) * doc_stride, max_seq_length - len(truncated_query) - sequence_pair_added_tokens)
 
-            non_padded_ids = encoded_dict['input_ids'][:encoded_dict['input_ids'].index(tokenizer.pad_token_id)]
+            if tokenizer.pad_token_id in encoded_dict['input_ids']: 
+                non_padded_ids = encoded_dict['input_ids'][:encoded_dict['input_ids'].index(tokenizer.pad_token_id)]
+            else:
+                non_padded_ids = encoded_dict['input_ids']
+
             tokens = tokenizer.convert_ids_to_tokens(non_padded_ids)
 
             token_to_orig_map = {}
             for i in range(paragraph_len):
-                token_to_orig_map[len(truncated_query) + sequence_added_tokens + i] = tok_to_orig_index[len(spans) * doc_stride + i]
+                index = len(truncated_query) + sequence_added_tokens + i if not sequence_a_is_doc else i 
+                token_to_orig_map[index] = tok_to_orig_index[len(spans) * doc_stride + i]
 
             encoded_dict["paragraph_len"] = paragraph_len
             encoded_dict["tokens"] = tokens
@@ -202,23 +210,14 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
             encoded_dict["start"] = len(spans) * doc_stride
             encoded_dict["length"] = paragraph_len
 
-            # split_token_index = doc_span.start + i
-            # token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index]
-
-            # is_max_context = _check_is_max_context(doc_spans, doc_span_index,
-            #                                         split_token_index)
-            # token_is_max_context[len(tokens)] = is_max_context
-            # tokens.append(all_doc_tokens[split_token_index])
-
             spans.append(encoded_dict)
 
         for doc_span_index in range(len(spans)):
             for j in range(spans[doc_span_index]["paragraph_len"]):
                 is_max_context = _new_check_is_max_context(spans, doc_span_index, doc_span_index * doc_stride + j)
-                index = spans[doc_span_index]["truncated_query_with_special_tokens_length"] + j
+                index = j if sequence_a_is_doc else spans[doc_span_index]["truncated_query_with_special_tokens_length"] + j
                 spans[doc_span_index]["token_is_max_context"][index] = is_max_context
 
-        print("new span", len(spans))
         for span in spans:
             # Identify the position of the CLS token
             cls_index = span['input_ids'].index(tokenizer.cls_token_id)
@@ -227,17 +226,17 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
             # Original TF implem also keep the classification token (set to 0) (not sure why...)
             p_mask = np.array(span['token_type_ids'])
 
-            # Convert all SEP indices to '0' before inversion
-            p_mask[np.where(np.array(span["input_ids"]) == tokenizer.sep_token_id)[0]] = 0
+            p_mask = np.minimum(p_mask, 1)
 
-            # Limit positive values to one
-            p_mask = 1 - np.minimum(p_mask, 1)
+            if not sequence_a_is_doc:
+                # Limit positive values to one
+                p_mask = 1 - p_mask
+
+            p_mask[np.where(np.array(span["input_ids"]) == tokenizer.sep_token_id)[0]] = 1
 
             # Set the CLS index to '0'
             p_mask[cls_index] = 0
 
-            print("new features length", len(new_features))
-
             new_features.append(NewSquadFeatures(
                 span['input_ids'],
                 span['attention_mask'],
@@ -287,19 +286,15 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
         doc_spans = []
         start_offset = 0
         while start_offset < len(all_doc_tokens):
-            print("OLD DOC CREATION BEGIN", start_offset, len(all_doc_tokens))
             length = len(all_doc_tokens) - start_offset
             if length > max_tokens_for_doc:
                 length = max_tokens_for_doc
+            # print("Start offset is", start_offset, len(all_doc_tokens), "length is", length)
             doc_spans.append(_DocSpan(start=start_offset, length=length))
             if start_offset + length == len(all_doc_tokens):
-                print("Done with this doc span, breaking out.", start_offset, length)
                 break
-            print("CHOOSING OFFSET", length, doc_stride)
             start_offset += min(length, doc_stride)
-            print("OLD DOC CREATION END", start_offset)
 
-        print("old span", len(doc_spans))
         for (doc_span_index, doc_span) in enumerate(doc_spans):
             tokens = []
             token_to_orig_map = {}
@@ -382,7 +377,7 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
                 input_mask.append(0 if mask_padding_with_zero else 1)
                 segment_ids.append(pad_token_segment_id)
                 p_mask.append(1)
-            print("[OLD] Ids computed; position of the first padding", input_ids.index(tokenizer.pad_token_id) if tokenizer.pad_token_id in input_ids else None)
+
             assert len(input_ids) == max_seq_length
             assert len(input_mask) == max_seq_length
             assert len(segment_ids) == max_seq_length
@@ -440,7 +435,6 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
             #         logger.info(
             #             "answer: %s" % (answer_text))
 
-            print("features length", len(features))
             features.append(
                 SquadFeatures(
                     unique_id=unique_id,
@@ -464,10 +458,9 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
 
     assert len(features) == len(new_features)
     for i in range(len(features)):
-        print(i)
         feature, new_feature = features[i], new_features[i]
         
-        input_ids = feature.input_ids
+        input_ids = [f if f not in [3,4,5] else 0 for f in feature.input_ids ]
         input_mask = feature.input_mask
         segment_ids = feature.segment_ids
         cls_index = feature.cls_index
@@ -478,7 +471,7 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
         tokens = feature.tokens
         token_to_orig_map = feature.token_to_orig_map
               
-        new_input_ids = new_feature.input_ids
+        new_input_ids = [f if f not in [3,4,5] else 0 for f in new_feature.input_ids]
         new_input_mask = new_feature.attention_mask
         new_segment_ids = new_feature.token_type_ids
         new_cls_index = new_feature.cls_index
@@ -497,6 +490,9 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
         assert example_index == new_example_index
         assert paragraph_len == new_paragraph_len
         assert token_is_max_context == new_token_is_max_context
+
+        tokens = [t if tokenizer.convert_tokens_to_ids(t) is not tokenizer.unk_token_id else tokenizer.unk_token for t in tokens]
+
         assert tokens == new_tokens
         assert token_to_orig_map == new_token_to_orig_map
 

From e0e55bc550a16289763b4f656790e30ed86e428f Mon Sep 17 00:00:00 2001
From: Lysandre <lysandre.debut@reseau.eseo.fr>
Date: Fri, 22 Nov 2019 16:18:18 -0500
Subject: [PATCH 07/91] Manage training example & refactor the refactor

---
 transformers/data/processors/squad.py | 368 ++++----------------------
 1 file changed, 51 insertions(+), 317 deletions(-)

diff --git a/transformers/data/processors/squad.py b/transformers/data/processors/squad.py
index fb3d2ae4d4..3d8f48c1bb 100644
--- a/transformers/data/processors/squad.py
+++ b/transformers/data/processors/squad.py
@@ -92,31 +92,14 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
     features = []
     new_features = []
     for (example_index, example) in enumerate(tqdm(examples)):
-
-        doc_tokens = []
-        char_to_word_offset = []
-        prev_is_whitespace = True
-
-        # Split on whitespace so that different tokens may be attributed to their original position.
-        for c in example.context_text:
-            if _is_whitespace(c):
-                prev_is_whitespace = True
-            else:
-                if prev_is_whitespace:
-                    doc_tokens.append(c)
-                else:
-                    doc_tokens[-1] += c
-                prev_is_whitespace = False
-            char_to_word_offset.append(len(doc_tokens) - 1)
-
         if is_training:
             # Get start and end position
             answer_length = len(example.answer_text)
-            start_position = char_to_word_offset[example.start_position]
-            end_position = char_to_word_offset[example.start_position + answer_length - 1]
+            start_position = example.start_position
+            end_position = example.end_position
 
             # If the answer cannot be found in the text, then skip this example.
-            actual_text = " ".join(doc_tokens[start_position:(end_position + 1)])
+            actual_text = " ".join(example.doc_tokens[start_position:(end_position + 1)])
             cleaned_answer_text = " ".join(whitespace_tokenize(example.answer_text))
             if actual_text.find(cleaned_answer_text) == -1:
                 logger.warning("Could not find answer: '%s' vs. '%s'", actual_text, cleaned_answer_text)
@@ -125,7 +108,7 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
         tok_to_orig_index = []
         orig_to_tok_index = []
         all_doc_tokens = []
-        for (i, token) in enumerate(doc_tokens):
+        for (i, token) in enumerate(example.doc_tokens):
             orig_to_tok_index.append(len(all_doc_tokens))
             sub_tokens = tokenizer.tokenize(token)
             for sub_token in sub_tokens:
@@ -138,56 +121,19 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
         sequence_added_tokens = tokenizer.max_len - tokenizer.max_len_single_sentence 
         sequence_pair_added_tokens = tokenizer.max_len - tokenizer.max_len_sentences_pair 
 
-        encoded_dict = tokenizer.encode_plus(
-            truncated_query if not sequence_a_is_doc else all_doc_tokens, 
-            all_doc_tokens if not sequence_a_is_doc else truncated_query, 
-            max_length=max_seq_length, 
-            padding_strategy='right',
-            stride=max_seq_length - doc_stride - len(truncated_query) - sequence_pair_added_tokens,
-            return_overflowing_tokens=True, 
-            truncation_strategy='only_second' if not sequence_a_is_doc else 'only_first'
-        )
-
-        ids = encoded_dict['input_ids']
-        non_padded_ids = ids[:ids.index(tokenizer.pad_token_id)] if tokenizer.pad_token_id in ids else ids
-        paragraph_len = min(len(all_doc_tokens) - len(spans) * doc_stride, max_seq_length - len(truncated_query) - sequence_pair_added_tokens)
-        tokens = tokenizer.convert_ids_to_tokens(non_padded_ids)
-
-        token_to_orig_map = {}
-        for i in range(paragraph_len):
-            index = len(truncated_query) + sequence_added_tokens + i if not sequence_a_is_doc else i 
-            token_to_orig_map[index] = tok_to_orig_index[0 + i]
-
-        encoded_dict["paragraph_len"] = paragraph_len
-        encoded_dict["tokens"] = tokens
-        encoded_dict["token_to_orig_map"] = token_to_orig_map
-        encoded_dict["truncated_query_with_special_tokens_length"] = len(truncated_query) + sequence_added_tokens
-        encoded_dict["token_is_max_context"] = {}
-        encoded_dict["start"] = 0
-        encoded_dict["length"] = paragraph_len
-
-        spans.append(encoded_dict)
-        # print("YESSIR", len(spans) * doc_stride < len(all_doc_tokens), "overflowing_tokens" in encoded_dict)
-
-        while len(spans) * doc_stride < len(all_doc_tokens) and "overflowing_tokens" in encoded_dict:
-            overflowing_tokens = encoded_dict["overflowing_tokens"]
+        span_doc_tokens = all_doc_tokens
+        while len(spans) * doc_stride < len(all_doc_tokens):
+            
             encoded_dict = tokenizer.encode_plus(
-                truncated_query if not sequence_a_is_doc else overflowing_tokens, 
-                overflowing_tokens if not sequence_a_is_doc else truncated_query, 
+                truncated_query if not sequence_a_is_doc else span_doc_tokens, 
+                span_doc_tokens if not sequence_a_is_doc else truncated_query, 
                 max_length=max_seq_length, 
                 return_overflowing_tokens=True, 
                 padding_strategy='right',
                 stride=max_seq_length - doc_stride - len(truncated_query) - sequence_pair_added_tokens,
                 truncation_strategy='only_second' if not sequence_a_is_doc else 'only_first'
             )
-            ids = encoded_dict['input_ids']
-            # print("Ids computes; position of the first padding", ids.index(tokenizer.pad_token_id) if tokenizer.pad_token_id in ids else None)
 
-            # print(encoded_dict["input_ids"].index(tokenizer.pad_token_id) if tokenizer.pad_token_id in encoded_dict["input_ids"] else None)
-            # print(len(spans) * doc_stride, len(all_doc_tokens))
-            
-
-            # Length of the document without the query
             paragraph_len = min(len(all_doc_tokens) - len(spans) * doc_stride, max_seq_length - len(truncated_query) - sequence_pair_added_tokens)
 
             if tokenizer.pad_token_id in encoded_dict['input_ids']: 
@@ -212,6 +158,10 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
 
             spans.append(encoded_dict)
 
+            if "overflowing_tokens" not in encoded_dict:
+                break
+            span_doc_tokens = encoded_dict["overflowing_tokens"]
+
         for doc_span_index in range(len(spans)):
             for j in range(spans[doc_span_index]["paragraph_len"]):
                 is_max_context = _new_check_is_max_context(spans, doc_span_index, doc_span_index * doc_stride + j)
@@ -254,249 +204,6 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
 
             unique_id += 1
 
-        # tokenize ...
-        query_tokens = tokenizer.tokenize(example.question_text)
-
-        if len(query_tokens) > max_query_length:
-            query_tokens = query_tokens[0:max_query_length]
-
-        tok_start_position = None
-        tok_end_position = None
-        if is_training and example.is_impossible:
-            tok_start_position = -1
-            tok_end_position = -1
-        if is_training and not example.is_impossible:
-            tok_start_position = orig_to_tok_index[example.start_position]
-            if example.end_position < len(doc_tokens) - 1:
-                tok_end_position = orig_to_tok_index[example.end_position + 1] - 1
-            else:
-                tok_end_position = len(all_doc_tokens) - 1
-            (tok_start_position, tok_end_position) = _improve_answer_span(
-                all_doc_tokens, tok_start_position, tok_end_position, tokenizer,
-                example.orig_answer_text)
-
-        # The -3 accounts for [CLS], [SEP] and [SEP]
-        max_tokens_for_doc = max_seq_length - len(query_tokens) - 3
-
-        # We can have documents that are longer than the maximum sequence length.
-        # To deal with this we do a sliding window approach, where we take chunks
-        # of the up to our max length with a stride of `doc_stride`.
-        _DocSpan = collections.namedtuple(  # pylint: disable=invalid-name
-            "DocSpan", ["start", "length"])
-        doc_spans = []
-        start_offset = 0
-        while start_offset < len(all_doc_tokens):
-            length = len(all_doc_tokens) - start_offset
-            if length > max_tokens_for_doc:
-                length = max_tokens_for_doc
-            # print("Start offset is", start_offset, len(all_doc_tokens), "length is", length)
-            doc_spans.append(_DocSpan(start=start_offset, length=length))
-            if start_offset + length == len(all_doc_tokens):
-                break
-            start_offset += min(length, doc_stride)
-
-        for (doc_span_index, doc_span) in enumerate(doc_spans):
-            tokens = []
-            token_to_orig_map = {}
-            token_is_max_context = {}
-            segment_ids = []
-
-            # p_mask: mask with 1 for token than cannot be in the answer (0 for token which can be in an answer)
-            # Original TF implem also keep the classification token (set to 0) (not sure why...)
-            p_mask = []
-
-            # CLS token at the beginning
-            if not cls_token_at_end:
-                tokens.append(cls_token)
-                segment_ids.append(cls_token_segment_id)
-                p_mask.append(0)
-                cls_index = 0
-
-            # XLNet: P SEP Q SEP CLS
-            # Others: CLS Q SEP P SEP
-            if not sequence_a_is_doc:
-                # Query
-                tokens += query_tokens
-                segment_ids += [sequence_a_segment_id] * len(query_tokens)
-                p_mask += [1] * len(query_tokens)
-
-                # SEP token
-                tokens.append(sep_token)
-                segment_ids.append(sequence_a_segment_id)
-                p_mask.append(1)
-
-            # Paragraph
-            for i in range(doc_span.length):
-                split_token_index = doc_span.start + i
-                token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index]
-
-                is_max_context = _check_is_max_context(doc_spans, doc_span_index,
-                                                       split_token_index)
-                token_is_max_context[len(tokens)] = is_max_context
-                tokens.append(all_doc_tokens[split_token_index])
-                if not sequence_a_is_doc:
-                    segment_ids.append(sequence_b_segment_id)
-                else:
-                    segment_ids.append(sequence_a_segment_id)
-                p_mask.append(0)
-            paragraph_len = doc_span.length
-
-            if sequence_a_is_doc:
-                # SEP token
-                tokens.append(sep_token)
-                segment_ids.append(sequence_a_segment_id)
-                p_mask.append(1)
-
-                tokens += query_tokens
-                segment_ids += [sequence_b_segment_id] * len(query_tokens)
-                p_mask += [1] * len(query_tokens)
-
-            # SEP token
-            tokens.append(sep_token)
-            segment_ids.append(sequence_b_segment_id)
-            p_mask.append(1)
-
-            # CLS token at the end
-            if cls_token_at_end:
-                tokens.append(cls_token)
-                segment_ids.append(cls_token_segment_id)
-                p_mask.append(0)
-                cls_index = len(tokens) - 1  # Index of classification token
-
-            input_ids = tokenizer.convert_tokens_to_ids(tokens)
-
-            # The mask has 1 for real tokens and 0 for padding tokens. Only real
-            # tokens are attended to.
-            input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)
-
-
-            
-            # Zero-pad up to the sequence length.
-            while len(input_ids) < max_seq_length:
-                input_ids.append(pad_token)
-                input_mask.append(0 if mask_padding_with_zero else 1)
-                segment_ids.append(pad_token_segment_id)
-                p_mask.append(1)
-
-            assert len(input_ids) == max_seq_length
-            assert len(input_mask) == max_seq_length
-            assert len(segment_ids) == max_seq_length
-
-            span_is_impossible = example.is_impossible if hasattr(example, "is_impossible") else False
-            start_position = None
-            end_position = None
-            if is_training and not span_is_impossible:
-                # For training, if our document chunk does not contain an annotation
-                # we throw it out, since there is nothing to predict.
-                doc_start = doc_span.start
-                doc_end = doc_span.start + doc_span.length - 1
-                out_of_span = False
-                if not (tok_start_position >= doc_start and
-                        tok_end_position <= doc_end):
-                    out_of_span = True
-                if out_of_span:
-                    start_position = 0
-                    end_position = 0
-                    span_is_impossible = True
-                else:
-                    if sequence_a_is_doc:
-                        doc_offset = 0
-                    else:
-                        doc_offset = len(query_tokens) + 2
-                    start_position = tok_start_position - doc_start + doc_offset
-                    end_position = tok_end_position - doc_start + doc_offset
-
-            if is_training and span_is_impossible:
-                start_position = cls_index
-                end_position = cls_index
-
-            # if example_index < 20:
-            #     logger.info("*** Example ***")
-            #     logger.info("unique_id: %s" % (unique_id))
-            #     logger.info("example_index: %s" % (example_index))
-            #     logger.info("doc_span_index: %s" % (doc_span_index))
-            #     logger.info("tokens: %s" % str(tokens))
-            #     logger.info("token_to_orig_map: %s" % " ".join([
-            #         "%d:%d" % (x, y) for (x, y) in token_to_orig_map.items()]))
-            #     logger.info("token_is_max_context: %s" % " ".join([
-            #         "%d:%s" % (x, y) for (x, y) in token_is_max_context.items()
-            #     ]))
-            #     logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
-            #     logger.info(
-            #         "input_mask: %s" % " ".join([str(x) for x in input_mask]))
-            #     logger.info(
-            #         "segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
-            #     if is_training and span_is_impossible:
-            #         logger.info("impossible example")
-            #     if is_training and not span_is_impossible:
-            #         answer_text = " ".join(tokens[start_position:(end_position + 1)])
-            #         logger.info("start_position: %d" % (start_position))
-            #         logger.info("end_position: %d" % (end_position))
-            #         logger.info(
-            #             "answer: %s" % (answer_text))
-
-            features.append(
-                SquadFeatures(
-                    unique_id=unique_id,
-                    example_index=example_index,
-                    doc_span_index=doc_span_index,
-                    tokens=tokens,
-                    token_to_orig_map=token_to_orig_map,
-                    token_is_max_context=token_is_max_context,
-                    input_ids=input_ids,
-                    input_mask=input_mask,
-                    segment_ids=segment_ids,
-                    cls_index=cls_index,
-                    p_mask=p_mask,
-                    paragraph_len=paragraph_len,
-                    start_position=start_position,
-                    end_position=end_position,
-                    is_impossible=span_is_impossible))
-            unique_id += 1
-
-        assert len(features) == len(new_features)
-
-    assert len(features) == len(new_features)
-    for i in range(len(features)):
-        feature, new_feature = features[i], new_features[i]
-        
-        input_ids = [f if f not in [3,4,5] else 0 for f in feature.input_ids ]
-        input_mask = feature.input_mask
-        segment_ids = feature.segment_ids
-        cls_index = feature.cls_index
-        p_mask = feature.p_mask
-        example_index = feature.example_index
-        paragraph_len = feature.paragraph_len
-        token_is_max_context = feature.token_is_max_context
-        tokens = feature.tokens
-        token_to_orig_map = feature.token_to_orig_map
-              
-        new_input_ids = [f if f not in [3,4,5] else 0 for f in new_feature.input_ids]
-        new_input_mask = new_feature.attention_mask
-        new_segment_ids = new_feature.token_type_ids
-        new_cls_index = new_feature.cls_index
-        new_p_mask = new_feature.p_mask
-        new_example_index = new_feature.example_index
-        new_paragraph_len = new_feature.paragraph_len
-        new_token_is_max_context = new_feature.token_is_max_context
-        new_tokens = new_feature.tokens
-        new_token_to_orig_map = new_feature.token_to_orig_map
-
-        assert input_ids == new_input_ids
-        assert input_mask == new_input_mask
-        assert segment_ids == new_segment_ids
-        assert cls_index == new_cls_index
-        assert p_mask == new_p_mask
-        assert example_index == new_example_index
-        assert paragraph_len == new_paragraph_len
-        assert token_is_max_context == new_token_is_max_context
-
-        tokens = [t if tokenizer.convert_tokens_to_ids(t) is not tokenizer.unk_token_id else tokenizer.unk_token for t in tokens]
-
-        assert tokens == new_tokens
-        assert token_to_orig_map == new_token_to_orig_map
-
-
     return new_features
 
 
@@ -592,35 +299,35 @@ class SquadV1Processor(DataProcessor):
             tensor_dict['title'].numpy().decode('utf-8')
         )
 
-    def get_train_examples(self, data_dir):
+    def get_train_examples(self, data_dir, only_first=None):
         """See base class."""
         with open(os.path.join(data_dir, "train-v1.1.json"), "r", encoding='utf-8') as reader:
             input_data = json.load(reader)["data"]
-        return self._create_examples(input_data, "train")
+        return self._create_examples(input_data, "train", only_first)
 
-    def get_dev_examples(self, data_dir):
+    def get_dev_examples(self, data_dir, only_first=None):
         """See base class."""
         with open(os.path.join(data_dir, "dev-v1.1.json"), "r", encoding='utf-8') as reader:
             input_data = json.load(reader)["data"]
-        return self._create_examples(input_data, "dev")
+        return self._create_examples(input_data, "dev", only_first)
 
     def get_labels(self):
         """See base class."""
         return ["0", "1"]
 
-    def _create_examples(self, input_data, set_type):
+    def _create_examples(self, input_data, set_type, only_first=None):
         """Creates examples for the training and dev sets."""
         
         is_training = set_type == "train"
         examples = []
-        for entry in input_data:
+        for entry in tqdm(input_data):
             title = entry['title']
             for paragraph in entry["paragraphs"]:
                 context_text = paragraph["context"]
                 for qa in paragraph["qas"]:
                     qas_id = qa["id"]
                     question_text = qa["question"]
-                    start_position = None
+                    start_position_character = None
                     answer_text = None
                     if is_training:
                         if (len(qa["answers"]) != 1):
@@ -628,17 +335,20 @@ class SquadV1Processor(DataProcessor):
                                 "For training, each question should have exactly 1 answer.")
                         answer = qa["answers"][0]
                         answer_text = answer['text']
-                        start_position = answer['answer_start']
+                        start_position_character = answer['answer_start']
 
                     example = NewSquadExample(
                         qas_id=qas_id,
                         question_text=question_text,
                         context_text=context_text,
                         answer_text=answer_text,
-                        start_position=start_position,
+                        start_position_character=start_position_character,
                         title=title
                     )
                     examples.append(example)
+
+                    if only_first is not None and len(examples) > only_first:
+                        return examples
         return examples
         
 
@@ -653,14 +363,38 @@ class NewSquadExample(object):
                  question_text,
                  context_text,
                  answer_text,
-                 start_position,
+                 start_position_character,
                  title):
         self.qas_id = qas_id
         self.question_text = question_text
         self.context_text = context_text
         self.answer_text = answer_text
-        self.start_position = start_position
         self.title = title
+        self.is_impossible = False
+
+        doc_tokens = []
+        char_to_word_offset = []
+        prev_is_whitespace = True
+
+        # Split on whitespace so that different tokens may be attributed to their original position.
+        for c in self.context_text:
+            if _is_whitespace(c):
+                prev_is_whitespace = True
+            else:
+                if prev_is_whitespace:
+                    doc_tokens.append(c)
+                else:
+                    doc_tokens[-1] += c
+                prev_is_whitespace = False
+            char_to_word_offset.append(len(doc_tokens) - 1)
+
+        self.doc_tokens = doc_tokens
+        self.char_to_word_offset = char_to_word_offset
+
+        # Start end end positions only has a value during evaluation.
+        if start_position_character is not None:
+            self.start_position = char_to_word_offset[start_position_character]
+            self.end_position = char_to_word_offset[start_position_character + len(answer_text) - 1]
 
 
 class NewSquadFeatures(object):

From 0669c1fcd15051ec6fe2d950079886faccf2fb33 Mon Sep 17 00:00:00 2001
From: Lysandre <lysandre.debut@reseau.eseo.fr>
Date: Mon, 25 Nov 2019 19:22:21 -0500
Subject: [PATCH 08/91] SQuAD v2 BERT + XLNet

---
 transformers/__init__.py                 |   2 +-
 transformers/data/__init__.py            |   2 +-
 transformers/data/processors/__init__.py |   2 +-
 transformers/data/processors/squad.py    | 180 +++++++++++------------
 4 files changed, 92 insertions(+), 94 deletions(-)

diff --git a/transformers/__init__.py b/transformers/__init__.py
index 9a767913b3..f3f81f1dbe 100644
--- a/transformers/__init__.py
+++ b/transformers/__init__.py
@@ -27,7 +27,7 @@ from .data import (is_sklearn_available,
                    glue_output_modes, glue_convert_examples_to_features,
                    glue_processors, glue_tasks_num_labels,
                    squad_convert_examples_to_features, SquadFeatures, 
-                   SquadExample, read_squad_examples)
+                   SquadExample)
 
 if is_sklearn_available():
     from .data import glue_compute_metrics
diff --git a/transformers/data/__init__.py b/transformers/data/__init__.py
index 50f2e768f4..b351bf625e 100644
--- a/transformers/data/__init__.py
+++ b/transformers/data/__init__.py
@@ -1,6 +1,6 @@
 from .processors import InputExample, InputFeatures, DataProcessor, SquadFeatures
 from .processors import glue_output_modes, glue_processors, glue_tasks_num_labels, glue_convert_examples_to_features
-from .processors import squad_convert_examples_to_features, SquadExample, read_squad_examples
+from .processors import squad_convert_examples_to_features, SquadExample
 
 from .metrics import is_sklearn_available
 if is_sklearn_available():
diff --git a/transformers/data/processors/__init__.py b/transformers/data/processors/__init__.py
index 924b4a1245..1e52776629 100644
--- a/transformers/data/processors/__init__.py
+++ b/transformers/data/processors/__init__.py
@@ -1,4 +1,4 @@
 from .utils import InputExample, InputFeatures, DataProcessor
 from .glue import glue_output_modes, glue_processors, glue_tasks_num_labels, glue_convert_examples_to_features
-from .squad import squad_convert_examples_to_features, SquadFeatures, SquadExample, read_squad_examples
+from .squad import squad_convert_examples_to_features, SquadFeatures, SquadExample
 
diff --git a/transformers/data/processors/squad.py b/transformers/data/processors/squad.py
index 3d8f48c1bb..39ee00ae56 100644
--- a/transformers/data/processors/squad.py
+++ b/transformers/data/processors/squad.py
@@ -46,7 +46,6 @@ def _check_is_max_context(doc_spans, cur_span_index, position):
 
     return cur_span_index == best_span_index
 
-
 def _new_check_is_max_context(doc_spans, cur_span_index, position):
     """Check if this is the 'max context' doc span for the token."""
     # if len(doc_spans) == 1:
@@ -92,7 +91,7 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
     features = []
     new_features = []
     for (example_index, example) in enumerate(tqdm(examples)):
-        if is_training:
+        if is_training and not example.is_impossible:
             # Get start and end position
             answer_length = len(example.answer_text)
             start_position = example.start_position
@@ -105,6 +104,7 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
                 logger.warning("Could not find answer: '%s' vs. '%s'", actual_text, cleaned_answer_text)
                 continue
 
+
         tok_to_orig_index = []
         orig_to_tok_index = []
         all_doc_tokens = []
@@ -115,6 +115,18 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
                 tok_to_orig_index.append(i)
                 all_doc_tokens.append(sub_token)
 
+
+        if is_training and not example.is_impossible:
+            tok_start_position = orig_to_tok_index[example.start_position]
+            if example.end_position < len(example.doc_tokens) - 1:
+                tok_end_position = orig_to_tok_index[example.end_position + 1] - 1
+            else:
+                tok_end_position = len(all_doc_tokens) - 1
+
+            (tok_start_position, tok_end_position) = _improve_answer_span(
+                all_doc_tokens, tok_start_position, tok_end_position, tokenizer, example.answer_text
+            )
+
         spans = []
         
         truncated_query = tokenizer.encode(example.question_text, add_special_tokens=False, max_length=max_query_length)
@@ -187,6 +199,34 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
             # Set the CLS index to '0'
             p_mask[cls_index] = 0
 
+
+            span_is_impossible = example.is_impossible
+            start_position = 0
+            end_position = 0
+            if is_training and not span_is_impossible:
+                # For training, if our document chunk does not contain an annotation
+                # we throw it out, since there is nothing to predict.
+                doc_start = span["start"]
+                doc_end = span["start"] + span["length"] - 1
+                out_of_span = False
+
+                if not (tok_start_position >= doc_start and tok_end_position <= doc_end):
+                    out_of_span = True
+
+                if out_of_span:
+                    start_position = cls_index
+                    end_position = cls_index
+                    span_is_impossible = True
+                else:
+                    if sequence_a_is_doc:
+                        doc_offset = 0
+                    else:
+                        doc_offset = len(truncated_query) + sequence_added_tokens
+                        
+                    start_position = tok_start_position - doc_start + doc_offset
+                    end_position = tok_end_position - doc_start + doc_offset
+
+
             new_features.append(NewSquadFeatures(
                 span['input_ids'],
                 span['attention_mask'],
@@ -199,7 +239,10 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
                 paragraph_len=span['paragraph_len'],
                 token_is_max_context=span["token_is_max_context"],
                 tokens=span["tokens"],
-                token_to_orig_map=span["token_to_orig_map"]
+                token_to_orig_map=span["token_to_orig_map"],
+                
+                start_position=start_position,
+                end_position=end_position
             ))
 
             unique_id += 1
@@ -207,86 +250,10 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
     return new_features
 
 
-def read_squad_examples(input_file, is_training, version_2_with_negative):
-    """Read a SQuAD json file into a list of SquadExample."""
-    with open(input_file, "r", encoding='utf-8') as reader:
-        input_data = json.load(reader)["data"]
-
-    def is_whitespace(c):
-        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
-            return True
-        return False
-
-    examples = []
-    for entry in input_data:
-        for paragraph in entry["paragraphs"]:
-            paragraph_text = paragraph["context"]
-            doc_tokens = []
-            char_to_word_offset = []
-            prev_is_whitespace = True
-            for c in paragraph_text:
-                if is_whitespace(c):
-                    prev_is_whitespace = True
-                else:
-                    if prev_is_whitespace:
-                        doc_tokens.append(c)
-                    else:
-                        doc_tokens[-1] += c
-                    prev_is_whitespace = False
-                char_to_word_offset.append(len(doc_tokens) - 1)
-
-            for qa in paragraph["qas"]:
-                qas_id = qa["id"]
-                question_text = qa["question"]
-                start_position = None
-                end_position = None
-                orig_answer_text = None
-                is_impossible = False
-                if is_training:
-                    if version_2_with_negative:
-                        is_impossible = qa["is_impossible"]
-                    if (len(qa["answers"]) != 1) and (not is_impossible):
-                        raise ValueError(
-                            "For training, each question should have exactly 1 answer.")
-                    if not is_impossible:
-                        answer = qa["answers"][0]
-                        orig_answer_text = answer["text"]
-                        answer_offset = answer["answer_start"]
-                        answer_length = len(orig_answer_text)
-                        start_position = char_to_word_offset[answer_offset]
-                        end_position = char_to_word_offset[answer_offset + answer_length - 1]
-                        # Only add answers where the text can be exactly recovered from the
-                        # document. If this CAN'T happen it's likely due to weird Unicode
-                        # stuff so we will just skip the example.
-                        #
-                        # Note that this means for training mode, every example is NOT
-                        # guaranteed to be preserved.
-                        actual_text = " ".join(doc_tokens[start_position:(end_position + 1)])
-                        cleaned_answer_text = " ".join(
-                            whitespace_tokenize(orig_answer_text))
-                        if actual_text.find(cleaned_answer_text) == -1:
-                            logger.warning("Could not find answer: '%s' vs. '%s'",
-                                           actual_text, cleaned_answer_text)
-                            continue
-                    else:
-                        start_position = -1
-                        end_position = -1
-                        orig_answer_text = ""
-
-                example = SquadExample(
-                    qas_id=qas_id,
-                    question_text=question_text,
-                    doc_tokens=doc_tokens,
-                    orig_answer_text=orig_answer_text,
-                    start_position=start_position,
-                    end_position=end_position,
-                    is_impossible=is_impossible)
-                examples.append(example)
-    return examples
-
-
-class SquadV1Processor(DataProcessor):
+class SquadProcessor(DataProcessor):
     """Processor for the SQuAD data set."""
+    train_file = None
+    dev_file = None
 
     def get_example_from_tensor_dict(self, tensor_dict):
         """See base class."""
@@ -301,13 +268,19 @@ class SquadV1Processor(DataProcessor):
 
     def get_train_examples(self, data_dir, only_first=None):
         """See base class."""
-        with open(os.path.join(data_dir, "train-v1.1.json"), "r", encoding='utf-8') as reader:
+        if self.train_file is None:
+            raise ValueError("SquadProcessor should be instantiated via SquadV1Processor or SquadV2Processor")
+
+        with open(os.path.join(data_dir, self.train_file), "r", encoding='utf-8') as reader:
             input_data = json.load(reader)["data"]
         return self._create_examples(input_data, "train", only_first)
 
     def get_dev_examples(self, data_dir, only_first=None):
         """See base class."""
-        with open(os.path.join(data_dir, "dev-v1.1.json"), "r", encoding='utf-8') as reader:
+        if self.dev_file is None:
+            raise ValueError("SquadProcessor should be instantiated via SquadV1Processor or SquadV2Processor")
+        
+        with open(os.path.join(data_dir, self.dev_file), "r", encoding='utf-8') as reader:
             input_data = json.load(reader)["data"]
         return self._create_examples(input_data, "dev", only_first)
 
@@ -329,7 +302,13 @@ class SquadV1Processor(DataProcessor):
                     question_text = qa["question"]
                     start_position_character = None
                     answer_text = None
-                    if is_training:
+                    
+                    if "is_impossible" in qa:
+                        is_impossible = qa["is_impossible"]
+                    else:
+                        is_impossible = False
+
+                    if not is_impossible and is_training:
                         if (len(qa["answers"]) != 1):
                             raise ValueError(
                                 "For training, each question should have exactly 1 answer.")
@@ -343,15 +322,25 @@ class SquadV1Processor(DataProcessor):
                         context_text=context_text,
                         answer_text=answer_text,
                         start_position_character=start_position_character,
-                        title=title
+                        title=title,
+                        is_impossible=is_impossible
                     )
+
                     examples.append(example)
 
                     if only_first is not None and len(examples) > only_first:
                         return examples
         return examples
-        
 
+class SquadV1Processor(SquadProcessor):
+    train_file = "train-v1.1.json"
+    dev_file = "dev-v1.1.json"
+
+
+class SquadV2Processor(SquadProcessor):
+    train_file = "train-v2.0.json"
+    dev_file = "dev-v2.0.json"
+    
 
 class NewSquadExample(object):
     """
@@ -364,13 +353,16 @@ class NewSquadExample(object):
                  context_text,
                  answer_text,
                  start_position_character,
-                 title):
+                 title,
+                 is_impossible=False):
         self.qas_id = qas_id
         self.question_text = question_text
         self.context_text = context_text
         self.answer_text = answer_text
         self.title = title
-        self.is_impossible = False
+        self.is_impossible = is_impossible 
+
+        self.start_position, self.end_position = 0, 0
 
         doc_tokens = []
         char_to_word_offset = []
@@ -392,7 +384,7 @@ class NewSquadExample(object):
         self.char_to_word_offset = char_to_word_offset
 
         # Start end end positions only has a value during evaluation.
-        if start_position_character is not None:
+        if start_position_character is not None and not is_impossible:
             self.start_position = char_to_word_offset[start_position_character]
             self.end_position = char_to_word_offset[start_position_character + len(answer_text) - 1]
 
@@ -415,7 +407,10 @@ class NewSquadFeatures(object):
                  paragraph_len,
                  token_is_max_context,
                  tokens,
-                 token_to_orig_map
+                 token_to_orig_map,
+
+                 start_position,
+                 end_position
         ):
         self.input_ids = input_ids 
         self.attention_mask = attention_mask
@@ -430,6 +425,9 @@ class NewSquadFeatures(object):
         self.tokens = tokens
         self.token_to_orig_map = token_to_orig_map
 
+        self.start_position = start_position
+        self.end_position = end_position
+
 class SquadExample(object):
     """
     A single training/test example for the Squad dataset.

From bd41e8292a4bd7db10eb036112019d93c50adcf5 Mon Sep 17 00:00:00 2001
From: Lysandre <lysandre.debut@reseau.eseo.fr>
Date: Thu, 28 Nov 2019 16:03:56 -0500
Subject: [PATCH 09/91] Cleanup & Evaluation now works

---
 examples/run_squad.py                 | 44 +++++++++++----------------
 transformers/data/processors/squad.py | 14 ++-------
 2 files changed, 20 insertions(+), 38 deletions(-)

diff --git a/examples/run_squad.py b/examples/run_squad.py
index 634b566a46..545c3ad55a 100644
--- a/examples/run_squad.py
+++ b/examples/run_squad.py
@@ -16,7 +16,7 @@
 """ Finetuning the library models for question-answering on SQuAD (DistilBERT, Bert, XLM, XLNet)."""
 
 from __future__ import absolute_import, division, print_function
-from transformers.data.processors.squad import SquadV1Processor
+from transformers.data.processors.squad import SquadV1Processor, SquadV2Processor
 
 import argparse
 import logging
@@ -45,9 +45,9 @@ from transformers import (WEIGHTS_NAME, BertConfig,
                                   XLNetTokenizer,
                                   DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer)
 
-from transformers import AdamW, get_linear_schedule_with_warmup, squad_convert_examples_to_features, read_squad_examples as sread_squad_examples
+from transformers import AdamW, get_linear_schedule_with_warmup, squad_convert_examples_to_features
 
-from utils_squad import (RawResult, write_predictions,
+from utils_squad import (convert_examples_to_features as old_convert, read_squad_examples as old_read, RawResult, write_predictions,
                          RawResultExtended, write_predictions_extended)
 
 # The follwing import is the official SQuAD evaluation script (2.0).
@@ -304,28 +304,20 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal
         features = torch.load(cached_features_file)
     else:
         logger.info("Creating features from dataset file at %s", input_file)
-        examples = read_squad_examples(input_file=input_file,
-                                                is_training=not evaluate,
-                                                version_2_with_negative=args.version_2_with_negative)
-        keep_n_examples = 1000
-        processor = SquadV1Processor()
-        values = processor.get_dev_examples("examples/squad")
-        examples = values[:keep_n_examples]
-        features = squad_convert_examples_to_features(examples=exampless,
-                                                tokenizer=tokenizer,
-                                                max_seq_length=args.max_seq_length,
-                                                doc_stride=args.doc_stride,
-                                                max_query_length=args.max_query_length,
-                                                is_training=not evaluate,
-                                                cls_token_segment_id=2 if args.model_type in ['xlnet'] else 0,
-                                                pad_token_segment_id=3 if args.model_type in ['xlnet'] else 0,
-                                                cls_token_at_end=True if args.model_type in ['xlnet'] else False,
-                                                sequence_a_is_doc=True if args.model_type in ['xlnet'] else False)
-        print("DONE")
 
-        import sys
-        sys.exit()
-        
+        processor = SquadV2Processor()
+        examples = processor.get_dev_examples("examples/squad") if evaluate else processor.get_train_examples("examples/squad")
+        features = squad_convert_examples_to_features(
+            examples=examples,
+            tokenizer=tokenizer,
+            max_seq_length=args.max_seq_length,
+            doc_stride=args.doc_stride,
+            max_query_length=args.max_query_length,
+            is_training=not evaluate,
+            sequence_a_is_doc=True if args.model_type in ['xlnet'] else False
+        )
+
+
         if args.local_rank in [-1, 0]:
             logger.info("Saving features into cached file %s", cached_features_file)
             torch.save(features, cached_features_file)
@@ -335,8 +327,8 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal
 
     # Convert to Tensors and build dataset
     all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
-    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
-    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
+    all_input_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
+    all_segment_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
     all_cls_index = torch.tensor([f.cls_index for f in features], dtype=torch.long)
     all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float)
     if evaluate:
diff --git a/transformers/data/processors/squad.py b/transformers/data/processors/squad.py
index 39ee00ae56..3d5a3eca80 100644
--- a/transformers/data/processors/squad.py
+++ b/transformers/data/processors/squad.py
@@ -74,26 +74,16 @@ def _is_whitespace(c):
 
 def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
                                        doc_stride, max_query_length, is_training,
-                                       cls_token_at_end=True,
-                                       cls_token='[CLS]', sep_token='[SEP]', pad_token=0,
-                                       sequence_a_segment_id=0, sequence_b_segment_id=1,
-                                       cls_token_segment_id=0, pad_token_segment_id=0,
-                                       mask_padding_with_zero=True,
                                        sequence_a_is_doc=False):
     """Loads a data file into a list of `InputBatch`s."""
 
-    cls_token = tokenizer.cls_token
-    sep_token = tokenizer.sep_token
-
     # Defining helper methods    
     unique_id = 1000000000
 
     features = []
-    new_features = []
     for (example_index, example) in enumerate(tqdm(examples)):
         if is_training and not example.is_impossible:
             # Get start and end position
-            answer_length = len(example.answer_text)
             start_position = example.start_position
             end_position = example.end_position
 
@@ -227,7 +217,7 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
                     end_position = tok_end_position - doc_start + doc_offset
 
 
-            new_features.append(NewSquadFeatures(
+            features.append(NewSquadFeatures(
                 span['input_ids'],
                 span['attention_mask'],
                 span['token_type_ids'],
@@ -247,7 +237,7 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
 
             unique_id += 1
 
-    return new_features
+    return features
 
 
 class SquadProcessor(DataProcessor):

From f671997ef74199823db83ed7b43340764888e129 Mon Sep 17 00:00:00 2001
From: Lysandre <lysandre.debut@reseau.eseo.fr>
Date: Thu, 28 Nov 2019 17:17:20 -0500
Subject: [PATCH 10/91] Interface with TFDS

---
 transformers/data/processors/squad.py | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/transformers/data/processors/squad.py b/transformers/data/processors/squad.py
index 3d5a3eca80..52c2c28add 100644
--- a/transformers/data/processors/squad.py
+++ b/transformers/data/processors/squad.py
@@ -246,16 +246,24 @@ class SquadProcessor(DataProcessor):
     dev_file = None
 
     def get_example_from_tensor_dict(self, tensor_dict):
-        """See base class."""
         return NewSquadExample(
-            tensor_dict['id'].numpy(),
+            tensor_dict['id'].numpy().decode("utf-8"),
             tensor_dict['question'].numpy().decode('utf-8'),
             tensor_dict['context'].numpy().decode('utf-8'),
-            tensor_dict['answers']['text'].numpy().decode('utf-8'),
-            tensor_dict['answers']['answers_start'].numpy().decode('utf-8'),
+            tensor_dict['answers']['text'][0].numpy().decode('utf-8'),
+            tensor_dict['answers']['answer_start'][0].numpy(),
             tensor_dict['title'].numpy().decode('utf-8')
         )
 
+    def get_examples_from_dataset(self, dataset):
+        """See base class."""
+
+        examples = []
+        for tensor_dict in tqdm(dataset):
+            examples.append(self.get_example_from_tensor_dict(tensor_dict)) 
+
+        return examples
+
     def get_train_examples(self, data_dir, only_first=None):
         """See base class."""
         if self.train_file is None:

From 0b84b9fd8a728ca46e4109aa38a11b25f87a09bf Mon Sep 17 00:00:00 2001
From: Lysandre <lysandre.debut@reseau.eseo.fr>
Date: Thu, 28 Nov 2019 17:38:52 -0500
Subject: [PATCH 11/91] Add processors to __init__

---
 transformers/__init__.py                 | 2 +-
 transformers/data/__init__.py            | 2 +-
 transformers/data/processors/__init__.py | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/transformers/__init__.py b/transformers/__init__.py
index f3f81f1dbe..aefa3f1921 100644
--- a/transformers/__init__.py
+++ b/transformers/__init__.py
@@ -27,7 +27,7 @@ from .data import (is_sklearn_available,
                    glue_output_modes, glue_convert_examples_to_features,
                    glue_processors, glue_tasks_num_labels,
                    squad_convert_examples_to_features, SquadFeatures, 
-                   SquadExample)
+                   SquadExample, SquadV1Processor, SquadV2Processor)
 
 if is_sklearn_available():
     from .data import glue_compute_metrics
diff --git a/transformers/data/__init__.py b/transformers/data/__init__.py
index b351bf625e..ea3a4e9fbb 100644
--- a/transformers/data/__init__.py
+++ b/transformers/data/__init__.py
@@ -1,6 +1,6 @@
 from .processors import InputExample, InputFeatures, DataProcessor, SquadFeatures
 from .processors import glue_output_modes, glue_processors, glue_tasks_num_labels, glue_convert_examples_to_features
-from .processors import squad_convert_examples_to_features, SquadExample
+from .processors import squad_convert_examples_to_features, SquadExample, SquadV1Processor, SquadV2Processor
 
 from .metrics import is_sklearn_available
 if is_sklearn_available():
diff --git a/transformers/data/processors/__init__.py b/transformers/data/processors/__init__.py
index 1e52776629..2470e7a06d 100644
--- a/transformers/data/processors/__init__.py
+++ b/transformers/data/processors/__init__.py
@@ -1,4 +1,4 @@
 from .utils import InputExample, InputFeatures, DataProcessor
 from .glue import glue_output_modes, glue_processors, glue_tasks_num_labels, glue_convert_examples_to_features
-from .squad import squad_convert_examples_to_features, SquadFeatures, SquadExample
+from .squad import squad_convert_examples_to_features, SquadFeatures, SquadExample, SquadV1Processor, SquadV2Processor
 

From 1e9ac5a7cfeb48ff6a1cf20e07941fc8c59b391d Mon Sep 17 00:00:00 2001
From: Lysandre <lysandre.debut@reseau.eseo.fr>
Date: Thu, 28 Nov 2019 17:43:47 -0500
Subject: [PATCH 12/91] New -> normal

---
 transformers/data/processors/squad.py | 106 ++------------------------
 1 file changed, 5 insertions(+), 101 deletions(-)

diff --git a/transformers/data/processors/squad.py b/transformers/data/processors/squad.py
index 52c2c28add..f414d41925 100644
--- a/transformers/data/processors/squad.py
+++ b/transformers/data/processors/squad.py
@@ -217,7 +217,7 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
                     end_position = tok_end_position - doc_start + doc_offset
 
 
-            features.append(NewSquadFeatures(
+            features.append(SquadFeatures(
                 span['input_ids'],
                 span['attention_mask'],
                 span['token_type_ids'],
@@ -246,7 +246,7 @@ class SquadProcessor(DataProcessor):
     dev_file = None
 
     def get_example_from_tensor_dict(self, tensor_dict):
-        return NewSquadExample(
+        return SquadExample(
             tensor_dict['id'].numpy().decode("utf-8"),
             tensor_dict['question'].numpy().decode('utf-8'),
             tensor_dict['context'].numpy().decode('utf-8'),
@@ -314,7 +314,7 @@ class SquadProcessor(DataProcessor):
                         answer_text = answer['text']
                         start_position_character = answer['answer_start']
 
-                    example = NewSquadExample(
+                    example = SquadExample(
                         qas_id=qas_id,
                         question_text=question_text,
                         context_text=context_text,
@@ -340,7 +340,7 @@ class SquadV2Processor(SquadProcessor):
     dev_file = "dev-v2.0.json"
     
 
-class NewSquadExample(object):
+class SquadExample(object):
     """
     A single training/test example for the Squad dataset, as loaded from disk.
     """
@@ -387,7 +387,7 @@ class NewSquadExample(object):
             self.end_position = char_to_word_offset[start_position_character + len(answer_text) - 1]
 
 
-class NewSquadFeatures(object):
+class SquadFeatures(object):
     """
     Single squad example features to be fed to a model.
     Those features are model-specific.
@@ -425,99 +425,3 @@ class NewSquadFeatures(object):
 
         self.start_position = start_position
         self.end_position = end_position
-
-class SquadExample(object):
-    """
-    A single training/test example for the Squad dataset.
-    For examples without an answer, the start and end position are -1.
-    """
-
-    def __init__(self,
-                 qas_id,
-                 question_text,
-                 doc_tokens,
-                 orig_answer_text=None,
-                 start_position=None,
-                 end_position=None,
-                 is_impossible=None):
-        self.qas_id = qas_id
-        self.question_text = question_text
-        self.doc_tokens = doc_tokens
-        self.orig_answer_text = orig_answer_text
-        self.start_position = start_position
-        self.end_position = end_position
-        self.is_impossible = is_impossible
-
-    def __str__(self):
-        return self.__repr__()
-
-    def __repr__(self):
-        s = ""
-        s += "qas_id: %s" % (self.qas_id)
-        s += ", question_text: %s" % (
-            self.question_text)
-        s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens))
-        if self.start_position:
-            s += ", start_position: %d" % (self.start_position)
-        if self.end_position:
-            s += ", end_position: %d" % (self.end_position)
-        if self.is_impossible:
-            s += ", is_impossible: %r" % (self.is_impossible)
-        return s
-
-
-class SquadFeatures(object):
-    """A single set of features of data."""
-
-    def __init__(self,
-                 unique_id,
-                 example_index,
-                 doc_span_index,
-                 tokens,
-                 token_to_orig_map,
-                 token_is_max_context,
-                 input_ids,
-                 input_mask,
-                 segment_ids,
-                 cls_index,
-                 p_mask,
-                 paragraph_len,
-                 start_position=None,
-                 end_position=None,
-                 is_impossible=None):
-        self.unique_id = unique_id
-        self.example_index = example_index
-        self.doc_span_index = doc_span_index
-        self.tokens = tokens
-        self.token_to_orig_map = token_to_orig_map
-        self.token_is_max_context = token_is_max_context
-        self.input_ids = input_ids
-        self.input_mask = input_mask
-        self.segment_ids = segment_ids
-        self.cls_index = cls_index
-        self.p_mask = p_mask
-        self.paragraph_len = paragraph_len
-        self.start_position = start_position
-        self.end_position = end_position
-        self.is_impossible = is_impossible
-
-    def __eq__(self, other):
-        print(self.example_index == other.example_index)
-        print(self.input_ids == other.input_ids)
-        print(self.input_mask == other.attention_mask)
-        print(self.p_mask == other.p_mask)
-        print(self.paragraph_len == other.paragraph_len)
-        print(self.segment_ids == other.token_type_ids)
-        print(self.token_is_max_context == other.token_is_max_context)
-        print(self.token_to_orig_map == other.token_to_orig_map)
-        print(self.tokens == other.tokens)
-
-        return self.example_index == other.example_index and \
-                self.input_ids == other.input_ids and \
-                self.input_mask == other.attention_mask and \
-                self.p_mask == other.p_mask and \
-                self.paragraph_len == other.paragraph_len and \
-                self.segment_ids == other.token_type_ids and \
-                self.token_is_max_context == other.token_is_max_context and \
-                self.token_to_orig_map == other.token_to_orig_map and \
-                self.tokens == other.tokens
\ No newline at end of file

From 285b1241e38cdafb6b0dadd1d1afc19493318074 Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Tue, 3 Dec 2019 15:00:49 -0500
Subject: [PATCH 13/91] Added SquadResult

---
 transformers/data/processors/squad.py | 71 +++++++++++++++++++++++++++
 1 file changed, 71 insertions(+)

diff --git a/transformers/data/processors/squad.py b/transformers/data/processors/squad.py
index f414d41925..afbe4270f5 100644
--- a/transformers/data/processors/squad.py
+++ b/transformers/data/processors/squad.py
@@ -425,3 +425,74 @@ class SquadFeatures(object):
 
         self.start_position = start_position
         self.end_position = end_position
+
+
+
+class SquadResult(object):
+    """
+    Constructs a SquadResult which can be used to evaluate a model's output on the SQuAD dataset.
+
+    Args:
+        result: The result output by a model on a SQuAD inference. These results may be complex (5 values) as the ones output by
+            XLNet or XLM or may be simple like the other models (2 values). They may be passed as a list or as a dict, with the 
+            following accepted formats:
+
+            `dict` output by a simple model:
+                {
+                    "start_logits": int,
+                    "end_logits": int,
+                    "unique_id": string
+                }
+            `list` output by a simple model:
+                [start_logits, end_logits, unique_id]
+
+            `dict` output by a complex model:
+                {
+                    "start_top_log_probs": float,
+                    "start_top_index": int,
+                    "end_top_log_probs": float,
+                    "end_top_index": int,
+                    "cls_logits": int,
+                    "unique_id": string
+                }
+            `list` output by a complex model:
+                [start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits, unique_id]
+
+            See `run_squad.py` for an example.
+    """
+    def __init__(self, result):
+        if isinstance(result, dict):
+            if "start_logits" in result and "end_logits" in result:
+                self.start_logits = result["start_logits"]
+                self.end_logits = result["end_logits"]
+
+            elif "start_top_log_probs" in result and "start_top_index" in result:
+                self.start_top_log_probs  = result["start_top_log_probs"]
+                self.start_top_index      = result["start_top_index"]
+                self.end_top_log_probs    = result["end_top_log_probs"]
+                self.end_top_index        = result["end_top_index"]
+                self.cls_logits           = result["cls_logits"]
+
+            else:
+                raise ValueError("SquadResult instantiated with wrong values.")
+
+            self.unique_id = result["unique_id"]
+        elif isinstance(result, list):            
+            if len(result) == 3:
+                self.start_logits = result[0]
+                self.end_logits = result[1]
+
+            elif len(result) == 6:
+                self.start_top_log_probs  = result[0]
+                self.start_top_index      = result[1]
+                self.end_top_log_probs    = result[2]
+                self.end_top_index        = result[3]
+                self.cls_logits           = result[4]
+
+            else:
+                raise ValueError("SquadResult instantiated with wrong values.")
+
+            self.unique_id = result[-1]
+
+        else:
+            raise ValueError("SquadResult instantiated with wrong values. Should be a dictionary or a list.")

From c835bc85c2f51f4da5eab4f1481a25b052bf6d61 Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Tue, 3 Dec 2019 15:28:16 -0500
Subject: [PATCH 14/91] Compute predictions

---
 transformers/data/metrics/squad_metrics.py | 335 +++++++++++++++++++++
 1 file changed, 335 insertions(+)
 create mode 100644 transformers/data/metrics/squad_metrics.py

diff --git a/transformers/data/metrics/squad_metrics.py b/transformers/data/metrics/squad_metrics.py
new file mode 100644
index 0000000000..d4c5a8ec5b
--- /dev/null
+++ b/transformers/data/metrics/squad_metrics.py
@@ -0,0 +1,335 @@
+import json
+import logging
+import math
+import collections
+from io import open
+from tqdm import tqdm
+
+from transformers.tokenization_bert import BasicTokenizer, whitespace_tokenize
+
+logger = logging.getLogger(__name__)
+
+
+def compute_predictions(all_examples, all_features, all_results, n_best_size,
+                      max_answer_length, do_lower_case, output_prediction_file,
+                      output_nbest_file, output_null_log_odds_file, verbose_logging,
+                      version_2_with_negative, null_score_diff_threshold):
+    """Write final predictions to the json file and log-odds of null if needed."""
+    logger.info("Writing predictions to: %s" % (output_prediction_file))
+    logger.info("Writing nbest to: %s" % (output_nbest_file))
+
+    example_index_to_features = collections.defaultdict(list)
+    for feature in all_features:
+        example_index_to_features[feature.example_index].append(feature)
+
+    unique_id_to_result = {}
+    for result in all_results:
+        unique_id_to_result[result.unique_id] = result
+
+    _PrelimPrediction = collections.namedtuple(  # pylint: disable=invalid-name
+        "PrelimPrediction",
+        ["feature_index", "start_index", "end_index", "start_logit", "end_logit"])
+
+    all_predictions = collections.OrderedDict()
+    all_nbest_json = collections.OrderedDict()
+    scores_diff_json = collections.OrderedDict()
+
+    for (example_index, example) in enumerate(all_examples):
+        features = example_index_to_features[example_index]
+
+        prelim_predictions = []
+        # keep track of the minimum score of null start+end of position 0
+        score_null = 1000000  # large and positive
+        min_null_feature_index = 0  # the paragraph slice with min null score
+        null_start_logit = 0  # the start logit at the slice with min null score
+        null_end_logit = 0  # the end logit at the slice with min null score
+        for (feature_index, feature) in enumerate(features):
+            result = unique_id_to_result[feature.unique_id]
+            start_indexes = _get_best_indexes(result.start_logits, n_best_size)
+            end_indexes = _get_best_indexes(result.end_logits, n_best_size)
+            # if we could have irrelevant answers, get the min score of irrelevant
+            if version_2_with_negative:
+                feature_null_score = result.start_logits[0] + result.end_logits[0]
+                if feature_null_score < score_null:
+                    score_null = feature_null_score
+                    min_null_feature_index = feature_index
+                    null_start_logit = result.start_logits[0]
+                    null_end_logit = result.end_logits[0]
+            for start_index in start_indexes:
+                for end_index in end_indexes:
+                    # We could hypothetically create invalid predictions, e.g., predict
+                    # that the start of the span is in the question. We throw out all
+                    # invalid predictions.
+                    if start_index >= len(feature.tokens):
+                        continue
+                    if end_index >= len(feature.tokens):
+                        continue
+                    if start_index not in feature.token_to_orig_map:
+                        continue
+                    if end_index not in feature.token_to_orig_map:
+                        continue
+                    if not feature.token_is_max_context.get(start_index, False):
+                        continue
+                    if end_index < start_index:
+                        continue
+                    length = end_index - start_index + 1
+                    if length > max_answer_length:
+                        continue
+                    prelim_predictions.append(
+                        _PrelimPrediction(
+                            feature_index=feature_index,
+                            start_index=start_index,
+                            end_index=end_index,
+                            start_logit=result.start_logits[start_index],
+                            end_logit=result.end_logits[end_index]))
+        if version_2_with_negative:
+            prelim_predictions.append(
+                _PrelimPrediction(
+                    feature_index=min_null_feature_index,
+                    start_index=0,
+                    end_index=0,
+                    start_logit=null_start_logit,
+                    end_logit=null_end_logit))
+        prelim_predictions = sorted(
+            prelim_predictions,
+            key=lambda x: (x.start_logit + x.end_logit),
+            reverse=True)
+
+        _NbestPrediction = collections.namedtuple(  # pylint: disable=invalid-name
+            "NbestPrediction", ["text", "start_logit", "end_logit"])
+
+        seen_predictions = {}
+        nbest = []
+        for pred in prelim_predictions:
+            if len(nbest) >= n_best_size:
+                break
+            feature = features[pred.feature_index]
+            if pred.start_index > 0:  # this is a non-null prediction
+                tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1)]
+                orig_doc_start = feature.token_to_orig_map[pred.start_index]
+                orig_doc_end = feature.token_to_orig_map[pred.end_index]
+                orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end + 1)]
+                tok_text = " ".join(tok_tokens)
+
+                # De-tokenize WordPieces that have been split off.
+                tok_text = tok_text.replace(" ##", "")
+                tok_text = tok_text.replace("##", "")
+
+                # Clean whitespace
+                tok_text = tok_text.strip()
+                tok_text = " ".join(tok_text.split())
+                orig_text = " ".join(orig_tokens)
+
+                final_text = get_final_text(tok_text, orig_text, do_lower_case, verbose_logging)
+                if final_text in seen_predictions:
+                    continue
+
+                seen_predictions[final_text] = True
+            else:
+                final_text = ""
+                seen_predictions[final_text] = True
+
+            nbest.append(
+                _NbestPrediction(
+                    text=final_text,
+                    start_logit=pred.start_logit,
+                    end_logit=pred.end_logit))
+        # if we didn't include the empty option in the n-best, include it
+        if version_2_with_negative:
+            if "" not in seen_predictions:
+                nbest.append(
+                    _NbestPrediction(
+                        text="",
+                        start_logit=null_start_logit,
+                        end_logit=null_end_logit))
+                
+            # In very rare edge cases we could only have single null prediction.
+            # So we just create a nonce prediction in this case to avoid failure.
+            if len(nbest)==1:
+                nbest.insert(0,
+                    _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0))
+
+        # In very rare edge cases we could have no valid predictions. So we
+        # just create a nonce prediction in this case to avoid failure.
+        if not nbest:
+            nbest.append(
+                _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0))
+
+        assert len(nbest) >= 1
+
+        total_scores = []
+        best_non_null_entry = None
+        for entry in nbest:
+            total_scores.append(entry.start_logit + entry.end_logit)
+            if not best_non_null_entry:
+                if entry.text:
+                    best_non_null_entry = entry
+
+        probs = _compute_softmax(total_scores)
+
+        nbest_json = []
+        for (i, entry) in enumerate(nbest):
+            output = collections.OrderedDict()
+            output["text"] = entry.text
+            output["probability"] = probs[i]
+            output["start_logit"] = entry.start_logit
+            output["end_logit"] = entry.end_logit
+            nbest_json.append(output)
+
+        assert len(nbest_json) >= 1
+
+        if not version_2_with_negative:
+            all_predictions[example.qas_id] = nbest_json[0]["text"]
+        else:
+            # predict "" iff the null score - the score of best non-null > threshold
+            score_diff = score_null - best_non_null_entry.start_logit - (
+                best_non_null_entry.end_logit)
+            scores_diff_json[example.qas_id] = score_diff
+            if score_diff > null_score_diff_threshold:
+                all_predictions[example.qas_id] = ""
+            else:
+                all_predictions[example.qas_id] = best_non_null_entry.text
+        all_nbest_json[example.qas_id] = nbest_json
+
+    with open(output_prediction_file, "w") as writer:
+        writer.write(json.dumps(all_predictions, indent=4) + "\n")
+
+    with open(output_nbest_file, "w") as writer:
+        writer.write(json.dumps(all_nbest_json, indent=4) + "\n")
+
+    if version_2_with_negative:
+        with open(output_null_log_odds_file, "w") as writer:
+            writer.write(json.dumps(scores_diff_json, indent=4) + "\n")
+
+    return all_predictions
+
+
+def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False):
+    """Project the tokenized prediction back to the original text."""
+
+    # When we created the data, we kept track of the alignment between original
+    # (whitespace tokenized) tokens and our WordPiece tokenized tokens. So
+    # now `orig_text` contains the span of our original text corresponding to the
+    # span that we predicted.
+    #
+    # However, `orig_text` may contain extra characters that we don't want in
+    # our prediction.
+    #
+    # For example, let's say:
+    #   pred_text = steve smith
+    #   orig_text = Steve Smith's
+    #
+    # We don't want to return `orig_text` because it contains the extra "'s".
+    #
+    # We don't want to return `pred_text` because it's already been normalized
+    # (the SQuAD eval script also does punctuation stripping/lower casing but
+    # our tokenizer does additional normalization like stripping accent
+    # characters).
+    #
+    # What we really want to return is "Steve Smith".
+    #
+    # Therefore, we have to apply a semi-complicated alignment heuristic between
+    # `pred_text` and `orig_text` to get a character-to-character alignment. This
+    # can fail in certain cases in which case we just return `orig_text`.
+
+    def _strip_spaces(text):
+        ns_chars = []
+        ns_to_s_map = collections.OrderedDict()
+        for (i, c) in enumerate(text):
+            if c == " ":
+                continue
+            ns_to_s_map[len(ns_chars)] = i
+            ns_chars.append(c)
+        ns_text = "".join(ns_chars)
+        return (ns_text, ns_to_s_map)
+
+    # We first tokenize `orig_text`, strip whitespace from the result
+    # and `pred_text`, and check if they are the same length. If they are
+    # NOT the same length, the heuristic has failed. If they are the same
+    # length, we assume the characters are one-to-one aligned.
+    tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
+
+    tok_text = " ".join(tokenizer.tokenize(orig_text))
+
+    start_position = tok_text.find(pred_text)
+    if start_position == -1:
+        if verbose_logging:
+            logger.info(
+                "Unable to find text: '%s' in '%s'" % (pred_text, orig_text))
+        return orig_text
+    end_position = start_position + len(pred_text) - 1
+
+    (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
+    (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)
+
+    if len(orig_ns_text) != len(tok_ns_text):
+        if verbose_logging:
+            logger.info("Length not equal after stripping spaces: '%s' vs '%s'",
+                        orig_ns_text, tok_ns_text)
+        return orig_text
+
+    # We then project the characters in `pred_text` back to `orig_text` using
+    # the character-to-character alignment.
+    tok_s_to_ns_map = {}
+    for (i, tok_index) in tok_ns_to_s_map.items():
+        tok_s_to_ns_map[tok_index] = i
+
+    orig_start_position = None
+    if start_position in tok_s_to_ns_map:
+        ns_start_position = tok_s_to_ns_map[start_position]
+        if ns_start_position in orig_ns_to_s_map:
+            orig_start_position = orig_ns_to_s_map[ns_start_position]
+
+    if orig_start_position is None:
+        if verbose_logging:
+            logger.info("Couldn't map start position")
+        return orig_text
+
+    orig_end_position = None
+    if end_position in tok_s_to_ns_map:
+        ns_end_position = tok_s_to_ns_map[end_position]
+        if ns_end_position in orig_ns_to_s_map:
+            orig_end_position = orig_ns_to_s_map[ns_end_position]
+
+    if orig_end_position is None:
+        if verbose_logging:
+            logger.info("Couldn't map end position")
+        return orig_text
+
+    output_text = orig_text[orig_start_position:(orig_end_position + 1)]
+    return output_text
+
+
+def _get_best_indexes(logits, n_best_size):
+    """Get the n-best logits from a list."""
+    index_and_score = sorted(enumerate(logits), key=lambda x: x[1], reverse=True)
+
+    best_indexes = []
+    for i in range(len(index_and_score)):
+        if i >= n_best_size:
+            break
+        best_indexes.append(index_and_score[i][0])
+    return best_indexes
+
+
+def _compute_softmax(scores):
+    """Compute softmax probability over raw logits."""
+    if not scores:
+        return []
+
+    max_score = None
+    for score in scores:
+        if max_score is None or score > max_score:
+            max_score = score
+
+    exp_scores = []
+    total_sum = 0.0
+    for score in scores:
+        x = math.exp(score - max_score)
+        exp_scores.append(x)
+        total_sum += x
+
+    probs = []
+    for score in exp_scores:
+        probs.append(score / total_sum)
+    return probs

From de276de1c1a469a58a25383a35a239d02459a978 Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Tue, 3 Dec 2019 17:15:51 -0500
Subject: [PATCH 15/91] Working evaluation

---
 examples/run_squad.py                      |  43 +-
 transformers/data/metrics/squad_metrics.py | 588 +++++++++++++++++----
 transformers/data/processors/squad.py      |  19 +-
 3 files changed, 507 insertions(+), 143 deletions(-)

diff --git a/examples/run_squad.py b/examples/run_squad.py
index 545c3ad55a..b7952487dc 100644
--- a/examples/run_squad.py
+++ b/examples/run_squad.py
@@ -16,7 +16,8 @@
 """ Finetuning the library models for question-answering on SQuAD (DistilBERT, Bert, XLM, XLNet)."""
 
 from __future__ import absolute_import, division, print_function
-from transformers.data.processors.squad import SquadV1Processor, SquadV2Processor
+from transformers.data.processors.squad import SquadV1Processor, SquadV2Processor, SquadResult
+from transformers.data.metrics.squad_metrics import compute_predictions, compute_predictions_extended, squad_evaluate
 
 import argparse
 import logging
@@ -230,9 +231,11 @@ def evaluate(args, model, tokenizer, prefix=""):
         model.eval()
         batch = tuple(t.to(args.device) for t in batch)
         with torch.no_grad():
-            inputs = {'input_ids':      batch[0],
-                      'attention_mask': batch[1]
-                      }
+            inputs = {
+                'input_ids':      batch[0],
+                'attention_mask': batch[1]
+            }
+            
             if args.model_type != 'distilbert':
                 inputs['token_type_ids'] = None if args.model_type == 'xlm' else batch[2]  # XLM don't use segment_ids
             example_indices = batch[3]
@@ -244,18 +247,8 @@ def evaluate(args, model, tokenizer, prefix=""):
         for i, example_index in enumerate(example_indices):
             eval_feature = features[example_index.item()]
             unique_id = int(eval_feature.unique_id)
-            if args.model_type in ['xlnet', 'xlm']:
-                # XLNet uses a more complex post-processing procedure
-                result = RawResultExtended(unique_id            = unique_id,
-                                           start_top_log_probs  = to_list(outputs[0][i]),
-                                           start_top_index      = to_list(outputs[1][i]),
-                                           end_top_log_probs    = to_list(outputs[2][i]),
-                                           end_top_index        = to_list(outputs[3][i]),
-                                           cls_logits           = to_list(outputs[4][i]))
-            else:
-                result = RawResult(unique_id    = unique_id,
-                                   start_logits = to_list(outputs[0][i]),
-                                   end_logits   = to_list(outputs[1][i]))
+
+            result = SquadResult([to_list(output[i]) for output in outputs] + [unique_id])
             all_results.append(result)
 
     evalTime = timeit.default_timer() - start_time
@@ -271,22 +264,18 @@ def evaluate(args, model, tokenizer, prefix=""):
 
     if args.model_type in ['xlnet', 'xlm']:
         # XLNet uses a more complex post-processing procedure
-        write_predictions_extended(examples, features, all_results, args.n_best_size,
+        predictions = compute_predictions_extended(examples, features, all_results, args.n_best_size,
                         args.max_answer_length, output_prediction_file,
                         output_nbest_file, output_null_log_odds_file, args.predict_file,
                         model.config.start_n_top, model.config.end_n_top,
                         args.version_2_with_negative, tokenizer, args.verbose_logging)
     else:
-        write_predictions(examples, features, all_results, args.n_best_size,
+        predictions = compute_predictions(examples, features, all_results, args.n_best_size,
                         args.max_answer_length, args.do_lower_case, output_prediction_file,
                         output_nbest_file, output_null_log_odds_file, args.verbose_logging,
                         args.version_2_with_negative, args.null_score_diff_threshold)
 
-    # Evaluate with the official SQuAD script
-    evaluate_options = EVAL_OPTS(data_file=args.predict_file,
-                                 pred_file=output_prediction_file,
-                                 na_prob_file=output_null_log_odds_file)
-    results = evaluate_on_squad(evaluate_options)
+    results = squad_evaluate(examples, predictions)
     return results
 
 def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False):
@@ -306,8 +295,12 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal
         logger.info("Creating features from dataset file at %s", input_file)
 
         processor = SquadV2Processor()
-        examples = processor.get_dev_examples("examples/squad") if evaluate else processor.get_train_examples("examples/squad")
-        features = squad_convert_examples_to_features(
+        examples = processor.get_dev_examples("examples/squad", only_first=100) if evaluate else processor.get_train_examples("examples/squad")
+        # import tensorflow_datasets as tfds
+        # tfds_examples = tfds.load("squad")
+        # examples = SquadV1Processor().get_examples_from_dataset(tfds_examples["validation"])
+
+        features = squad_convert_examples_to_features( 
             examples=examples,
             tokenizer=tokenizer,
             max_seq_length=args.max_seq_length,
diff --git a/transformers/data/metrics/squad_metrics.py b/transformers/data/metrics/squad_metrics.py
index d4c5a8ec5b..83647a20d0 100644
--- a/transformers/data/metrics/squad_metrics.py
+++ b/transformers/data/metrics/squad_metrics.py
@@ -1,15 +1,323 @@
+""" Very heavily inspired by the official evaluation script for SQuAD version 2.0 which was
+modified by XLNet authors to update `find_best_threshold` scripts for SQuAD V2.0
+
+In addition to basic functionality, we also compute additional statistics and
+plot precision-recall curves if an additional na_prob.json file is provided.
+This file is expected to map question ID's to the model's predicted probability
+that a question is unanswerable.
+"""
+
+
 import json
 import logging
 import math
 import collections
 from io import open
 from tqdm import tqdm
+import string
+import re
 
 from transformers.tokenization_bert import BasicTokenizer, whitespace_tokenize
 
 logger = logging.getLogger(__name__)
 
 
+def normalize_answer(s):
+    """Lower text and remove punctuation, articles and extra whitespace."""
+    def remove_articles(text):
+        regex = re.compile(r'\b(a|an|the)\b', re.UNICODE)
+        return re.sub(regex, ' ', text)
+
+    def white_space_fix(text):
+        return ' '.join(text.split())
+
+    def remove_punc(text):
+        exclude = set(string.punctuation)
+        return ''.join(ch for ch in text if ch not in exclude)
+
+    def lower(text):
+        return text.lower()
+    return white_space_fix(remove_articles(remove_punc(lower(s))))
+
+
+def get_tokens(s):
+    if not s:
+        return []
+    return normalize_answer(s).split()
+
+
+def compute_exact(a_gold, a_pred):
+    return int(normalize_answer(a_gold) == normalize_answer(a_pred))
+
+
+def compute_f1(a_gold, a_pred):
+    gold_toks = get_tokens(a_gold)
+    pred_toks = get_tokens(a_pred)
+    common = collections.Counter(gold_toks) & collections.Counter(pred_toks)
+    num_same = sum(common.values())
+    if len(gold_toks) == 0 or len(pred_toks) == 0:
+        # If either is no-answer, then F1 is 1 if they agree, 0 otherwise
+        return int(gold_toks == pred_toks)
+    if num_same == 0:
+        return 0
+    precision = 1.0 * num_same / len(pred_toks)
+    recall = 1.0 * num_same / len(gold_toks)
+    f1 = (2 * precision * recall) / (precision + recall)
+    return f1
+
+
+def get_raw_scores(examples, preds):
+    """
+    Computes the exact and f1 scores from the examples and the model predictions
+    """
+    exact_scores = {}
+    f1_scores = {}
+
+    for example in examples:
+        qas_id = example.qas_id
+        gold_answers = [answer['text'] for answer in example.answers if normalize_answer(answer['text'])]
+
+        if not gold_answers:
+            # For unanswerable questions, only correct answer is empty string
+            gold_answers = ['']
+
+        if qas_id not in preds:
+            print('Missing prediction for %s' % qas_id)
+            continue
+
+        prediction = preds[qas_id]
+        exact_scores[qas_id] = max(compute_exact(a, prediction) for a in gold_answers)
+        f1_scores[qas_id] = max(compute_f1(a, prediction) for a in gold_answers)
+
+    return exact_scores, f1_scores
+
+
+def apply_no_ans_threshold(scores, na_probs, qid_to_has_ans, na_prob_thresh):
+    new_scores = {}
+    for qid, s in scores.items():
+        pred_na = na_probs[qid] > na_prob_thresh
+        if pred_na:
+            new_scores[qid] = float(not qid_to_has_ans[qid])
+        else:
+            new_scores[qid] = s
+    return new_scores
+
+
+def make_eval_dict(exact_scores, f1_scores, qid_list=None):
+    if not qid_list:
+        total = len(exact_scores)
+        return collections.OrderedDict([
+            ('exact', 100.0 * sum(exact_scores.values()) / total),
+            ('f1', 100.0 * sum(f1_scores.values()) / total),
+            ('total', total),
+        ])
+    else:
+        total = len(qid_list)
+        return collections.OrderedDict([
+            ('exact', 100.0 * sum(exact_scores[k] for k in qid_list) / total),
+            ('f1', 100.0 * sum(f1_scores[k] for k in qid_list) / total),
+            ('total', total),
+        ])
+
+
+def merge_eval(main_eval, new_eval, prefix):
+    for k in new_eval:
+        main_eval['%s_%s' % (prefix, k)] = new_eval[k]
+
+
+def find_best_thresh(preds, scores, na_probs, qid_to_has_ans):
+    num_no_ans = sum(1 for k in qid_to_has_ans if not qid_to_has_ans[k])
+    cur_score = num_no_ans
+    best_score = cur_score
+    best_thresh = 0.0
+    qid_list = sorted(na_probs, key=lambda k: na_probs[k])
+    for _, qid in enumerate(qid_list):
+        if qid not in scores:
+            continue
+        if qid_to_has_ans[qid]:
+            diff = scores[qid]
+        else:
+            if preds[qid]:
+                diff = -1
+            else:
+                diff = 0
+        cur_score += diff
+        if cur_score > best_score:
+            best_score = cur_score
+            best_thresh = na_probs[qid]
+    return 100.0 * best_score / len(scores), best_thresh
+
+
+def find_all_best_thresh(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans):
+    best_exact, exact_thresh = find_best_thresh(preds, exact_raw, na_probs, qid_to_has_ans)
+    best_f1, f1_thresh = find_best_thresh(preds, f1_raw, na_probs, qid_to_has_ans)
+
+    main_eval['best_exact'] = best_exact
+    main_eval['best_exact_thresh'] = exact_thresh
+    main_eval['best_f1'] = best_f1
+    main_eval['best_f1_thresh'] = f1_thresh
+
+
+def squad_evaluate(examples, preds, no_answer_probs=None, no_answer_probability_threshold=1.0):
+    qas_id_to_has_answer = {example.qas_id: bool(example.answers) for example in examples}
+    has_answer_qids = [qas_id for qas_id, has_answer in qas_id_to_has_answer.items() if has_answer]
+    no_answer_qids = [qas_id for qas_id, has_answer in qas_id_to_has_answer.items() if not has_answer]
+
+    if no_answer_probs is None:
+        no_answer_probs = {k: 0.0 for k in preds}
+
+    exact, f1 = get_raw_scores(examples, preds)
+
+    exact_threshold = apply_no_ans_threshold(exact, no_answer_probs, qas_id_to_has_answer, no_answer_probability_threshold)
+    f1_threshold = apply_no_ans_threshold(f1, no_answer_probs, qas_id_to_has_answer, no_answer_probability_threshold)
+
+    evaluation = make_eval_dict(exact_threshold, f1_threshold)
+
+    if has_answer_qids:
+        has_ans_eval = make_eval_dict(exact_threshold, f1_threshold, qid_list=has_answer_qids)
+        merge_eval(evaluation, has_ans_eval, 'HasAns')
+
+    if no_answer_qids:
+        no_ans_eval = make_eval_dict(exact_threshold, f1_threshold, qid_list=no_answer_qids)
+        merge_eval(evaluation, no_ans_eval, 'NoAns')
+
+    if no_answer_probs:
+        find_all_best_thresh(evaluation, preds, exact, f1, no_answer_probs, qas_id_to_has_answer)
+
+    return evaluation
+
+
+def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False):
+    """Project the tokenized prediction back to the original text."""
+
+    # When we created the data, we kept track of the alignment between original
+    # (whitespace tokenized) tokens and our WordPiece tokenized tokens. So
+    # now `orig_text` contains the span of our original text corresponding to the
+    # span that we predicted.
+    #
+    # However, `orig_text` may contain extra characters that we don't want in
+    # our prediction.
+    #
+    # For example, let's say:
+    #   pred_text = steve smith
+    #   orig_text = Steve Smith's
+    #
+    # We don't want to return `orig_text` because it contains the extra "'s".
+    #
+    # We don't want to return `pred_text` because it's already been normalized
+    # (the SQuAD eval script also does punctuation stripping/lower casing but
+    # our tokenizer does additional normalization like stripping accent
+    # characters).
+    #
+    # What we really want to return is "Steve Smith".
+    #
+    # Therefore, we have to apply a semi-complicated alignment heuristic between
+    # `pred_text` and `orig_text` to get a character-to-character alignment. This
+    # can fail in certain cases in which case we just return `orig_text`.
+
+    def _strip_spaces(text):
+        ns_chars = []
+        ns_to_s_map = collections.OrderedDict()
+        for (i, c) in enumerate(text):
+            if c == " ":
+                continue
+            ns_to_s_map[len(ns_chars)] = i
+            ns_chars.append(c)
+        ns_text = "".join(ns_chars)
+        return (ns_text, ns_to_s_map)
+
+    # We first tokenize `orig_text`, strip whitespace from the result
+    # and `pred_text`, and check if they are the same length. If they are
+    # NOT the same length, the heuristic has failed. If they are the same
+    # length, we assume the characters are one-to-one aligned.
+    tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
+
+    tok_text = " ".join(tokenizer.tokenize(orig_text))
+
+    start_position = tok_text.find(pred_text)
+    if start_position == -1:
+        if verbose_logging:
+            logger.info(
+                "Unable to find text: '%s' in '%s'" % (pred_text, orig_text))
+        return orig_text
+    end_position = start_position + len(pred_text) - 1
+
+    (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
+    (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)
+
+    if len(orig_ns_text) != len(tok_ns_text):
+        if verbose_logging:
+            logger.info("Length not equal after stripping spaces: '%s' vs '%s'",
+                        orig_ns_text, tok_ns_text)
+        return orig_text
+
+    # We then project the characters in `pred_text` back to `orig_text` using
+    # the character-to-character alignment.
+    tok_s_to_ns_map = {}
+    for (i, tok_index) in tok_ns_to_s_map.items():
+        tok_s_to_ns_map[tok_index] = i
+
+    orig_start_position = None
+    if start_position in tok_s_to_ns_map:
+        ns_start_position = tok_s_to_ns_map[start_position]
+        if ns_start_position in orig_ns_to_s_map:
+            orig_start_position = orig_ns_to_s_map[ns_start_position]
+
+    if orig_start_position is None:
+        if verbose_logging:
+            logger.info("Couldn't map start position")
+        return orig_text
+
+    orig_end_position = None
+    if end_position in tok_s_to_ns_map:
+        ns_end_position = tok_s_to_ns_map[end_position]
+        if ns_end_position in orig_ns_to_s_map:
+            orig_end_position = orig_ns_to_s_map[ns_end_position]
+
+    if orig_end_position is None:
+        if verbose_logging:
+            logger.info("Couldn't map end position")
+        return orig_text
+
+    output_text = orig_text[orig_start_position:(orig_end_position + 1)]
+    return output_text
+
+
+def _get_best_indexes(logits, n_best_size):
+    """Get the n-best logits from a list."""
+    index_and_score = sorted(enumerate(logits), key=lambda x: x[1], reverse=True)
+
+    best_indexes = []
+    for i in range(len(index_and_score)):
+        if i >= n_best_size:
+            break
+        best_indexes.append(index_and_score[i][0])
+    return best_indexes
+
+
+def _compute_softmax(scores):
+    """Compute softmax probability over raw logits."""
+    if not scores:
+        return []
+
+    max_score = None
+    for score in scores:
+        if max_score is None or score > max_score:
+            max_score = score
+
+    exp_scores = []
+    total_sum = 0.0
+    for score in scores:
+        x = math.exp(score - max_score)
+        exp_scores.append(x)
+        total_sum += x
+
+    probs = []
+    for score in exp_scores:
+        probs.append(score / total_sum)
+    return probs
+
+
 def compute_predictions(all_examples, all_features, all_results, n_best_size,
                       max_answer_length, do_lower_case, output_prediction_file,
                       output_nbest_file, output_null_log_odds_file, verbose_logging,
@@ -204,132 +512,192 @@ def compute_predictions(all_examples, all_features, all_results, n_best_size,
     return all_predictions
 
 
-def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False):
-    """Project the tokenized prediction back to the original text."""
+def compute_predictions_extended(all_examples, all_features, all_results, n_best_size,
+                                max_answer_length, output_prediction_file,
+                                output_nbest_file,
+                                output_null_log_odds_file, orig_data_file,
+                                start_n_top, end_n_top, version_2_with_negative,
+                                tokenizer, verbose_logging):
+    """ XLNet write prediction logic (more complex than Bert's).
+        Write final predictions to the json file and log-odds of null if needed.
 
-    # When we created the data, we kept track of the alignment between original
-    # (whitespace tokenized) tokens and our WordPiece tokenized tokens. So
-    # now `orig_text` contains the span of our original text corresponding to the
-    # span that we predicted.
-    #
-    # However, `orig_text` may contain extra characters that we don't want in
-    # our prediction.
-    #
-    # For example, let's say:
-    #   pred_text = steve smith
-    #   orig_text = Steve Smith's
-    #
-    # We don't want to return `orig_text` because it contains the extra "'s".
-    #
-    # We don't want to return `pred_text` because it's already been normalized
-    # (the SQuAD eval script also does punctuation stripping/lower casing but
-    # our tokenizer does additional normalization like stripping accent
-    # characters).
-    #
-    # What we really want to return is "Steve Smith".
-    #
-    # Therefore, we have to apply a semi-complicated alignment heuristic between
-    # `pred_text` and `orig_text` to get a character-to-character alignment. This
-    # can fail in certain cases in which case we just return `orig_text`.
+        Requires utils_squad_evaluate.py
+    """
+    _PrelimPrediction = collections.namedtuple(  # pylint: disable=invalid-name
+        "PrelimPrediction",
+        ["feature_index", "start_index", "end_index",
+        "start_log_prob", "end_log_prob"])
 
-    def _strip_spaces(text):
-        ns_chars = []
-        ns_to_s_map = collections.OrderedDict()
-        for (i, c) in enumerate(text):
-            if c == " ":
+    _NbestPrediction = collections.namedtuple(  # pylint: disable=invalid-name
+        "NbestPrediction", ["text", "start_log_prob", "end_log_prob"])
+
+    logger.info("Writing predictions to: %s", output_prediction_file)
+    # logger.info("Writing nbest to: %s" % (output_nbest_file))
+
+    example_index_to_features = collections.defaultdict(list)
+    for feature in all_features:
+        example_index_to_features[feature.example_index].append(feature)
+
+    unique_id_to_result = {}
+    for result in all_results:
+        unique_id_to_result[result.unique_id] = result
+
+    all_predictions = collections.OrderedDict()
+    all_nbest_json = collections.OrderedDict()
+    scores_diff_json = collections.OrderedDict()
+
+    for (example_index, example) in enumerate(all_examples):
+        features = example_index_to_features[example_index]
+
+        prelim_predictions = []
+        # keep track of the minimum score of null start+end of position 0
+        score_null = 1000000  # large and positive
+
+        for (feature_index, feature) in enumerate(features):
+            result = unique_id_to_result[feature.unique_id]
+
+            cur_null_score = result.cls_logits
+
+            # if we could have irrelevant answers, get the min score of irrelevant
+            score_null = min(score_null, cur_null_score)
+
+            for i in range(start_n_top):
+                for j in range(end_n_top):
+                    start_log_prob = result.start_top_log_probs[i]
+                    start_index = result.start_top_index[i]
+
+                    j_index = i * end_n_top + j
+
+                    end_log_prob = result.end_top_log_probs[j_index]
+                    end_index = result.end_top_index[j_index]
+
+                    # We could hypothetically create invalid predictions, e.g., predict
+                    # that the start of the span is in the question. We throw out all
+                    # invalid predictions.
+                    if start_index >= feature.paragraph_len - 1:
+                        continue
+                    if end_index >= feature.paragraph_len - 1:
+                        continue
+
+                    if not feature.token_is_max_context.get(start_index, False):
+                        continue
+                    if end_index < start_index:
+                        continue
+                    length = end_index - start_index + 1
+                    if length > max_answer_length:
+                        continue
+
+                    prelim_predictions.append(
+                        _PrelimPrediction(
+                            feature_index=feature_index,
+                            start_index=start_index,
+                            end_index=end_index,
+                            start_log_prob=start_log_prob,
+                            end_log_prob=end_log_prob))
+
+        prelim_predictions = sorted(
+            prelim_predictions,
+            key=lambda x: (x.start_log_prob + x.end_log_prob),
+            reverse=True)
+
+        seen_predictions = {}
+        nbest = []
+        for pred in prelim_predictions:
+            if len(nbest) >= n_best_size:
+                break
+            feature = features[pred.feature_index]
+
+            # XLNet un-tokenizer
+            # Let's keep it simple for now and see if we need all this later.
+            # 
+            # tok_start_to_orig_index = feature.tok_start_to_orig_index
+            # tok_end_to_orig_index = feature.tok_end_to_orig_index
+            # start_orig_pos = tok_start_to_orig_index[pred.start_index]
+            # end_orig_pos = tok_end_to_orig_index[pred.end_index]
+            # paragraph_text = example.paragraph_text
+            # final_text = paragraph_text[start_orig_pos: end_orig_pos + 1].strip()
+
+            # Previously used Bert untokenizer
+            tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1)]
+            orig_doc_start = feature.token_to_orig_map[pred.start_index]
+            orig_doc_end = feature.token_to_orig_map[pred.end_index]
+            orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end + 1)]
+            tok_text = tokenizer.convert_tokens_to_string(tok_tokens)
+
+            # Clean whitespace
+            tok_text = tok_text.strip()
+            tok_text = " ".join(tok_text.split())
+            orig_text = " ".join(orig_tokens)
+
+            final_text = get_final_text(tok_text, orig_text, tokenizer.do_lower_case,
+                                        verbose_logging)
+
+            if final_text in seen_predictions:
                 continue
-            ns_to_s_map[len(ns_chars)] = i
-            ns_chars.append(c)
-        ns_text = "".join(ns_chars)
-        return (ns_text, ns_to_s_map)
 
-    # We first tokenize `orig_text`, strip whitespace from the result
-    # and `pred_text`, and check if they are the same length. If they are
-    # NOT the same length, the heuristic has failed. If they are the same
-    # length, we assume the characters are one-to-one aligned.
-    tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
+            seen_predictions[final_text] = True
 
-    tok_text = " ".join(tokenizer.tokenize(orig_text))
+            nbest.append(
+                _NbestPrediction(
+                    text=final_text,
+                    start_log_prob=pred.start_log_prob,
+                    end_log_prob=pred.end_log_prob))
 
-    start_position = tok_text.find(pred_text)
-    if start_position == -1:
-        if verbose_logging:
-            logger.info(
-                "Unable to find text: '%s' in '%s'" % (pred_text, orig_text))
-        return orig_text
-    end_position = start_position + len(pred_text) - 1
+        # In very rare edge cases we could have no valid predictions. So we
+        # just create a nonce prediction in this case to avoid failure.
+        if not nbest:
+            nbest.append(
+                _NbestPrediction(text="", start_log_prob=-1e6,
+                end_log_prob=-1e6))
 
-    (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
-    (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)
+        total_scores = []
+        best_non_null_entry = None
+        for entry in nbest:
+            total_scores.append(entry.start_log_prob + entry.end_log_prob)
+            if not best_non_null_entry:
+                best_non_null_entry = entry
 
-    if len(orig_ns_text) != len(tok_ns_text):
-        if verbose_logging:
-            logger.info("Length not equal after stripping spaces: '%s' vs '%s'",
-                        orig_ns_text, tok_ns_text)
-        return orig_text
+        probs = _compute_softmax(total_scores)
 
-    # We then project the characters in `pred_text` back to `orig_text` using
-    # the character-to-character alignment.
-    tok_s_to_ns_map = {}
-    for (i, tok_index) in tok_ns_to_s_map.items():
-        tok_s_to_ns_map[tok_index] = i
+        nbest_json = []
+        for (i, entry) in enumerate(nbest):
+            output = collections.OrderedDict()
+            output["text"] = entry.text
+            output["probability"] = probs[i]
+            output["start_log_prob"] = entry.start_log_prob
+            output["end_log_prob"] = entry.end_log_prob
+            nbest_json.append(output)
 
-    orig_start_position = None
-    if start_position in tok_s_to_ns_map:
-        ns_start_position = tok_s_to_ns_map[start_position]
-        if ns_start_position in orig_ns_to_s_map:
-            orig_start_position = orig_ns_to_s_map[ns_start_position]
+        assert len(nbest_json) >= 1
+        assert best_non_null_entry is not None
 
-    if orig_start_position is None:
-        if verbose_logging:
-            logger.info("Couldn't map start position")
-        return orig_text
+        score_diff = score_null
+        scores_diff_json[example.qas_id] = score_diff
+        # note(zhiliny): always predict best_non_null_entry
+        # and the evaluation script will search for the best threshold
+        all_predictions[example.qas_id] = best_non_null_entry.text
 
-    orig_end_position = None
-    if end_position in tok_s_to_ns_map:
-        ns_end_position = tok_s_to_ns_map[end_position]
-        if ns_end_position in orig_ns_to_s_map:
-            orig_end_position = orig_ns_to_s_map[ns_end_position]
+        all_nbest_json[example.qas_id] = nbest_json
 
-    if orig_end_position is None:
-        if verbose_logging:
-            logger.info("Couldn't map end position")
-        return orig_text
+    with open(output_prediction_file, "w") as writer:
+        writer.write(json.dumps(all_predictions, indent=4) + "\n")
 
-    output_text = orig_text[orig_start_position:(orig_end_position + 1)]
-    return output_text
+    with open(output_nbest_file, "w") as writer:
+        writer.write(json.dumps(all_nbest_json, indent=4) + "\n")
 
+    if version_2_with_negative:
+        with open(output_null_log_odds_file, "w") as writer:
+            writer.write(json.dumps(scores_diff_json, indent=4) + "\n")
 
-def _get_best_indexes(logits, n_best_size):
-    """Get the n-best logits from a list."""
-    index_and_score = sorted(enumerate(logits), key=lambda x: x[1], reverse=True)
+    with open(orig_data_file, "r", encoding='utf-8') as reader:
+        orig_data = json.load(reader)["data"]
 
-    best_indexes = []
-    for i in range(len(index_and_score)):
-        if i >= n_best_size:
-            break
-        best_indexes.append(index_and_score[i][0])
-    return best_indexes
+    qid_to_has_ans = make_qid_to_has_ans(orig_data)
+    has_ans_qids = [k for k, v in qid_to_has_ans.items() if v]
+    no_ans_qids = [k for k, v in qid_to_has_ans.items() if not v]
+    exact_raw, f1_raw = get_raw_scores(orig_data, all_predictions)
+    out_eval = {}
 
+    find_all_best_thresh_v2(out_eval, all_predictions, exact_raw, f1_raw, scores_diff_json, qid_to_has_ans)
 
-def _compute_softmax(scores):
-    """Compute softmax probability over raw logits."""
-    if not scores:
-        return []
-
-    max_score = None
-    for score in scores:
-        if max_score is None or score > max_score:
-            max_score = score
-
-    exp_scores = []
-    total_sum = 0.0
-    for score in scores:
-        x = math.exp(score - max_score)
-        exp_scores.append(x)
-        total_sum += x
-
-    probs = []
-    for score in exp_scores:
-        probs.append(score / total_sum)
-    return probs
+    return out_eval
diff --git a/transformers/data/processors/squad.py b/transformers/data/processors/squad.py
index afbe4270f5..70dc9faf54 100644
--- a/transformers/data/processors/squad.py
+++ b/transformers/data/processors/squad.py
@@ -306,13 +306,13 @@ class SquadProcessor(DataProcessor):
                     else:
                         is_impossible = False
 
-                    if not is_impossible and is_training:
-                        if (len(qa["answers"]) != 1):
-                            raise ValueError(
-                                "For training, each question should have exactly 1 answer.")
-                        answer = qa["answers"][0]
-                        answer_text = answer['text']
-                        start_position_character = answer['answer_start']
+                    if not is_impossible:
+                        if is_training:
+                            answer = qa["answers"][0]
+                            answer_text = answer['text']
+                            start_position_character = answer['answer_start']
+                        else:
+                            answers = qa["answers"]
 
                     example = SquadExample(
                         qas_id=qas_id,
@@ -321,7 +321,8 @@ class SquadProcessor(DataProcessor):
                         answer_text=answer_text,
                         start_position_character=start_position_character,
                         title=title,
-                        is_impossible=is_impossible
+                        is_impossible=is_impossible,
+                        answers=answers
                     )
 
                     examples.append(example)
@@ -352,6 +353,7 @@ class SquadExample(object):
                  answer_text,
                  start_position_character,
                  title,
+                 answers=None,
                  is_impossible=False):
         self.qas_id = qas_id
         self.question_text = question_text
@@ -359,6 +361,7 @@ class SquadExample(object):
         self.answer_text = answer_text
         self.title = title
         self.is_impossible = is_impossible 
+        self.answers = answers
 
         self.start_position, self.end_position = 0, 0
 

From ecb923da9cea390742a1262327a139852c5493e9 Mon Sep 17 00:00:00 2001
From: Julien Plu <julien.plu@schibsted.com>
Date: Wed, 4 Dec 2019 09:43:15 +0100
Subject: [PATCH 16/91] Create a NER example similar to the PyTorch one. It
 takes the same options, and can be run the same way.

---
 examples/run_tf_ner.py                 | 612 +++++++++++++++++++++++++
 transformers/__init__.py               |   3 +
 transformers/modeling_tf_distilbert.py |  47 ++
 transformers/optimization_tf.py        | 254 ++++++++++
 4 files changed, 916 insertions(+)
 create mode 100644 examples/run_tf_ner.py
 create mode 100644 transformers/optimization_tf.py

diff --git a/examples/run_tf_ner.py b/examples/run_tf_ner.py
new file mode 100644
index 0000000000..ef1fcf6aa4
--- /dev/null
+++ b/examples/run_tf_ner.py
@@ -0,0 +1,612 @@
+# coding=utf-8
+import datetime
+import os
+import math
+import glob
+import re
+import tensorflow as tf
+import collections
+import numpy as np
+from seqeval import metrics
+import _pickle as pickle
+from absl import logging
+from transformers import TF2_WEIGHTS_NAME, BertConfig, BertTokenizer, TFBertForTokenClassification
+from transformers import RobertaConfig, RobertaTokenizer, TFRobertaForTokenClassification
+from transformers import DistilBertConfig, DistilBertTokenizer, TFDistilBertForTokenClassification
+from transformers import create_optimizer, GradientAccumulator
+from utils_ner import convert_examples_to_features, get_labels, read_examples_from_file
+from fastprogress import master_bar, progress_bar
+from absl import flags
+from absl import app
+
+
+ALL_MODELS = sum(
+    (tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, RobertaConfig, DistilBertConfig)),
+    ())
+
+MODEL_CLASSES = {
+    "bert": (BertConfig, TFBertForTokenClassification, BertTokenizer),
+    "roberta": (RobertaConfig, TFRobertaForTokenClassification, RobertaTokenizer),
+    "distilbert": (DistilBertConfig, TFDistilBertForTokenClassification, DistilBertTokenizer)
+}
+
+
+flags.DEFINE_string(
+    "data_dir", None,
+    "The input data dir. Should contain the .conll files (or other data files) "
+    "for the task.")
+
+flags.DEFINE_string(
+    "model_type", None,
+    "Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
+
+flags.DEFINE_string(
+    "model_name_or_path", None,
+    "Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS))
+
+flags.DEFINE_string(
+    "output_dir", None,
+    "The output directory where the model checkpoints will be written.")
+
+flags.DEFINE_string(
+    "labels", "",
+    "Path to a file containing all labels. If not specified, CoNLL-2003 labels are used.")
+
+flags.DEFINE_string(
+    "config_name", "",
+    "Pretrained config name or path if not the same as model_name")
+
+flags.DEFINE_string(
+    "tokenizer_name", "",
+    "Pretrained tokenizer name or path if not the same as model_name")
+
+flags.DEFINE_string(
+    "cache_dir", "",
+    "Where do you want to store the pre-trained models downloaded from s3")
+
+flags.DEFINE_integer(
+    "max_seq_length", 128,
+    "The maximum total input sentence length after tokenization. "
+    "Sequences longer than this will be truncated, sequences shorter "
+    "will be padded.")
+
+flags.DEFINE_string(
+    "tpu", None,
+    "The Cloud TPU to use for training. This should be either the name "
+    "used when creating the Cloud TPU, or a grpc://ip.address.of.tpu:8470 "
+    "url.")
+
+flags.DEFINE_integer(
+    "num_tpu_cores", 8,
+    "Total number of TPU cores to use.")
+
+flags.DEFINE_boolean(
+    "do_train", False,
+    "Whether to run training.")
+
+flags.DEFINE_boolean(
+    "do_eval", False,
+    "Whether to run eval on the dev set.")
+
+flags.DEFINE_boolean(
+    "do_predict", False,
+    "Whether to run predictions on the test set.")
+
+flags.DEFINE_boolean(
+    "evaluate_during_training", False,
+    "Whether to run evaluation during training at each logging step.")
+
+flags.DEFINE_boolean(
+    "do_lower_case", False,
+    "Set this flag if you are using an uncased model.")
+
+flags.DEFINE_integer(
+    "per_device_train_batch_size", 8,
+    "Batch size per GPU/CPU/TPU for training.")
+
+flags.DEFINE_integer(
+    "per_device_eval_batch_size", 8,
+    "Batch size per GPU/CPU/TPU for evaluation.")
+
+flags.DEFINE_integer(
+    "gradient_accumulation_steps", 1,
+    "Number of updates steps to accumulate before performing a backward/update pass.")
+
+flags.DEFINE_float(
+    "learning_rate", 5e-5,
+    "The initial learning rate for Adam.")
+
+flags.DEFINE_float(
+    "weight_decay", 0.0,
+    "Weight decay if we apply some.")
+
+flags.DEFINE_float(
+    "adam_epsilon", 1e-8,
+    "Epsilon for Adam optimizer.")
+
+flags.DEFINE_float(
+    "max_grad_norm", 1.0,
+    "Max gradient norm.")
+
+flags.DEFINE_integer(
+    "num_train_epochs", 3,
+    "Total number of training epochs to perform.")
+
+flags.DEFINE_integer(
+    "max_steps", -1,
+    "If > 0: set total number of training steps to perform. Override num_train_epochs.")
+
+flags.DEFINE_integer(
+    "warmup_steps", 0,
+    "Linear warmup over warmup_steps.")
+
+flags.DEFINE_integer(
+    "logging_steps", 50,
+    "Log every X updates steps.")
+
+flags.DEFINE_integer(
+    "save_steps", 50,
+    "Save checkpoint every X updates steps.")
+
+flags.DEFINE_boolean(
+    "eval_all_checkpoints", False,
+    "Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number")
+
+flags.DEFINE_boolean(
+    "no_cuda", False,
+    "Avoid using CUDA when available")
+
+flags.DEFINE_boolean(
+    "overwrite_output_dir", False,
+    "Overwrite the content of the output directory")
+
+flags.DEFINE_boolean(
+    "overwrite_cache", False,
+    "Overwrite the cached training and evaluation sets")
+
+flags.DEFINE_integer(
+    "seed", 42,
+    "random seed for initialization")
+
+flags.DEFINE_boolean(
+    "fp16", False,
+    "Whether to use 16-bit (mixed) precision instead of 32-bit")
+
+flags.DEFINE_string(
+    "gpus", "0",
+    "Comma separated list of gpus devices. If only one, switch to single "
+    "gpu strategy, if None takes all the gpus available.")
+
+
+def train(args, strategy, train_dataset, tokenizer, model, num_train_examples, labels, train_batch_size, pad_token_label_id):
+    if args['max_steps'] > 0:
+        num_train_steps = args['max_steps'] * args['gradient_accumulation_steps']
+        args['num_train_epochs'] = 1
+    else:
+        num_train_steps = math.ceil(num_train_examples / train_batch_size) // args['gradient_accumulation_steps'] * args['num_train_epochs']
+
+    writer = tf.summary.create_file_writer("/tmp/mylogs")
+
+    with strategy.scope():
+        loss_fct = tf.keras.losses.SparseCategoricalCrossentropy(reduction=tf.keras.losses.Reduction.NONE)
+        optimizer = create_optimizer(args['learning_rate'], num_train_steps, args['warmup_steps'])
+
+        if args['fp16']:
+            optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(optimizer, 'dynamic')
+
+        loss_metric = tf.keras.metrics.Mean(name='loss', dtype=tf.float32)
+        gradient_accumulator = GradientAccumulator()
+        
+    logging.info("***** Running training *****")
+    logging.info("  Num examples = %d", num_train_examples)
+    logging.info("  Num Epochs = %d", args['num_train_epochs'])
+    logging.info("  Instantaneous batch size per device = %d", args['per_device_train_batch_size'])
+    logging.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
+                train_batch_size * args['gradient_accumulation_steps'])
+    logging.info("  Gradient Accumulation steps = %d", args['gradient_accumulation_steps'])
+    logging.info("  Total training steps = %d", num_train_steps)
+
+    model.summary()
+
+    @tf.function
+    def apply_gradients():
+        grads_and_vars = []
+
+        for gradient, variable in zip(gradient_accumulator.gradients, model.trainable_variables):
+            if gradient is not None:
+                scaled_gradient = gradient / (args['n_device'] * args['gradient_accumulation_steps'])
+                grads_and_vars.append((scaled_gradient, variable))
+            else:
+                grads_and_vars.append((gradient, variable))
+
+        optimizer.apply_gradients(grads_and_vars, args['max_grad_norm'])
+        gradient_accumulator.reset()
+
+    @tf.function
+    def train_step(train_features, train_labels):
+        def step_fn(train_features, train_labels):
+            inputs = {'attention_mask': train_features['input_mask'], 'training': True}
+
+            if args['model_type'] != "distilbert":
+                inputs["token_type_ids"] = train_features['segment_ids'] if args['model_type'] in ["bert", "xlnet"] else None
+
+            with tf.GradientTape() as tape:
+                logits = model(train_features['input_ids'], **inputs)[0]
+                logits = tf.reshape(logits, (-1, len(labels) + 1))
+                active_loss = tf.reshape(train_features['input_mask'], (-1,))
+                active_logits = tf.boolean_mask(logits, active_loss)
+                train_labels = tf.reshape(train_labels, (-1,))
+                active_labels = tf.boolean_mask(train_labels, active_loss)
+                cross_entropy = loss_fct(active_labels, active_logits)
+                loss = tf.reduce_sum(cross_entropy) * (1.0 / train_batch_size)
+                grads = tape.gradient(loss, model.trainable_variables)
+
+                gradient_accumulator(grads)
+
+            return cross_entropy
+
+        per_example_losses = strategy.experimental_run_v2(step_fn, args=(train_features, train_labels))
+        mean_loss = strategy.reduce(tf.distribute.ReduceOp.MEAN, per_example_losses, axis=0)
+
+        return mean_loss
+
+    current_time = datetime.datetime.now()
+    train_iterator = master_bar(range(args['num_train_epochs']))
+    global_step = 0
+    logging_loss = 0.0
+
+    for epoch in train_iterator:
+        epoch_iterator = progress_bar(train_dataset, total=num_train_steps, parent=train_iterator, display=args['n_device'] > 1)
+        step = 1
+
+        with strategy.scope():
+            for train_features, train_labels in epoch_iterator:
+                loss = train_step(train_features, train_labels)
+
+                if step % args['gradient_accumulation_steps'] == 0:
+                    strategy.experimental_run_v2(apply_gradients)
+
+                    loss_metric(loss)
+
+                    global_step += 1
+
+                    if args['logging_steps'] > 0 and global_step % args['logging_steps'] == 0:
+                        # Log metrics
+                        if args['n_device'] == 1 and args['evaluate_during_training']:  # Only evaluate when single GPU otherwise metrics may not average well
+                            y_true, y_pred, eval_loss = evaluate(args, strategy, model, tokenizer, labels, pad_token_label_id, mode="dev")
+                            report = metrics.classification_report(y_true, y_pred, digits=4)
+                            
+                            logging.info("Eval at step " + str(global_step) + "\n" + report)
+                            logging.info("eval_loss: " + str(eval_loss))
+                            
+                            precision = metrics.precision_score(y_true, y_pred)
+                            recall = metrics.recall_score(y_true, y_pred)
+                            f1 = metrics.f1_score(y_true, y_pred)
+
+                            with writer.as_default():
+                                tf.summary.scalar("eval_loss", eval_loss, global_step)
+                                tf.summary.scalar("precision", precision, global_step)
+                                tf.summary.scalar("recall", recall, global_step)
+                                tf.summary.scalar("f1", f1, global_step)
+                        
+                        lr = optimizer.learning_rate
+                        learning_rate = lr(step)
+
+                        with writer.as_default():
+                            tf.summary.scalar("lr", learning_rate, global_step)
+                            tf.summary.scalar("loss", (loss_metric.result() - logging_loss) / args['logging_steps'], global_step)
+                        
+                        logging_loss = loss_metric.result()
+
+                    with writer.as_default():
+                        tf.summary.scalar("loss", loss_metric.result(), step=step)
+
+                    if args['save_steps'] > 0 and global_step % args['save_steps'] == 0:
+                        # Save model checkpoint
+                        output_dir = os.path.join(args['output_dir'], "checkpoint-{}".format(global_step))
+
+                        if not os.path.exists(output_dir):
+                            os.makedirs(output_dir)
+                        
+                        model.save_pretrained(output_dir)
+                        logging.info("Saving model checkpoint to %s", output_dir)
+                
+                train_iterator.child.comment = f'loss : {loss_metric.result()}'
+                step += 1
+
+        train_iterator.write(f'loss epoch {epoch + 1}: {loss_metric.result()}')
+
+        loss_metric.reset_states()
+
+    logging.info("  Training took time = {}".format(datetime.datetime.now() - current_time))
+
+
+def evaluate(args, strategy, model, tokenizer, labels, pad_token_label_id, mode):
+    eval_batch_size = args['per_device_eval_batch_size'] * args['n_device']
+    eval_dataset, size = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, eval_batch_size, mode=mode)
+    eval_dataset = strategy.experimental_distribute_dataset(eval_dataset)
+    preds = None
+    num_eval_steps = math.ceil(size / eval_batch_size)
+    master = master_bar(range(1))
+    eval_iterator = progress_bar(eval_dataset, total=num_eval_steps, parent=master, display=args['n_device'] > 1)
+    loss_fct = tf.keras.losses.SparseCategoricalCrossentropy(reduction=tf.keras.losses.Reduction.NONE)
+    loss = 0.0
+
+    logging.info("***** Running evaluation *****")
+    logging.info("  Num examples = %d", size)
+    logging.info("  Batch size = %d", eval_batch_size)
+
+    for eval_features, eval_labels in eval_iterator:
+        inputs = {'attention_mask': eval_features['input_mask'], 'training': False}
+
+        if args['model_type'] != "distilbert":
+            inputs["token_type_ids"] = eval_features['segment_ids'] if args['model_type'] in ["bert", "xlnet"] else None
+
+        with strategy.scope():
+            logits = model(eval_features['input_ids'], **inputs)[0]
+            tmp_logits = tf.reshape(logits, (-1, len(labels) + 1))
+            active_loss = tf.reshape(eval_features['input_mask'], (-1,))
+            active_logits = tf.boolean_mask(tmp_logits, active_loss)
+            tmp_eval_labels = tf.reshape(eval_labels, (-1,))
+            active_labels = tf.boolean_mask(tmp_eval_labels, active_loss)
+            cross_entropy = loss_fct(active_labels, active_logits)
+            loss += tf.reduce_sum(cross_entropy) * (1.0 / eval_batch_size)
+
+        if preds is None:
+            preds = logits.numpy()
+            label_ids = eval_labels.numpy()
+        else:
+            preds = np.append(preds, logits.numpy(), axis=0)
+            label_ids = np.append(label_ids, eval_labels.numpy(), axis=0)
+
+    preds = np.argmax(preds, axis=2)
+    y_pred = [[] for _ in range(label_ids.shape[0])]
+    y_true = [[] for _ in range(label_ids.shape[0])]
+    loss = loss / num_eval_steps
+
+    for i in range(label_ids.shape[0]):
+        for j in range(label_ids.shape[1]):
+            if label_ids[i, j] != pad_token_label_id:
+                y_pred[i].append(labels[preds[i, j] - 1])
+                y_true[i].append(labels[label_ids[i, j] - 1])
+
+    return y_true, y_pred, loss.numpy()
+
+
+def load_cache(cached_file, max_seq_length):
+    name_to_features = {
+        "input_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
+        "input_mask": tf.io.FixedLenFeature([max_seq_length], tf.int64),
+        "segment_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
+        "label_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
+    }
+
+    def _decode_record(record):
+        example = tf.io.parse_single_example(record, name_to_features)
+        features = {}
+        features['input_ids'] = example['input_ids']
+        features['input_mask'] = example['input_mask']
+        features['segment_ids'] = example['segment_ids']
+
+        return features, example['label_ids']
+
+    d = tf.data.TFRecordDataset(cached_file)
+    d = d.map(_decode_record, num_parallel_calls=4)
+    count = d.reduce(0, lambda x, _: x + 1)
+
+    return d, count.numpy()
+
+
+def save_cache(features, cached_features_file):
+    writer = tf.io.TFRecordWriter(cached_features_file)
+
+    for (ex_index, feature) in enumerate(features):
+        if ex_index % 5000 == 0:
+            logging.info("Writing example %d of %d" % (ex_index, len(features)))
+
+        def create_int_feature(values):
+            f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
+            return f
+
+        record_feature = collections.OrderedDict()
+        record_feature["input_ids"] = create_int_feature(feature.input_ids)
+        record_feature["input_mask"] = create_int_feature(feature.input_mask)
+        record_feature["segment_ids"] = create_int_feature(feature.segment_ids)
+        record_feature["label_ids"] = create_int_feature(feature.label_ids)
+
+        tf_example = tf.train.Example(features=tf.train.Features(feature=record_feature))
+
+        writer.write(tf_example.SerializeToString())
+
+    writer.close()
+
+
+def load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, batch_size, mode):
+    drop_remainder = True if args['tpu'] or mode == 'train' else False
+
+    # Load data features from cache or dataset file
+    cached_features_file = os.path.join(args['data_dir'], "cached_{}_{}_{}.tf_record".format(mode,
+        list(filter(None, args['model_name_or_path'].split("/"))).pop(),
+        str(args['max_seq_length'])))
+    if os.path.exists(cached_features_file) and not args['overwrite_cache']:
+        logging.info("Loading features from cached file %s", cached_features_file)
+        dataset, size = load_cache(cached_features_file, args['max_seq_length'])
+    else:
+        logging.info("Creating features from dataset file at %s", args['data_dir'])
+        examples = read_examples_from_file(args['data_dir'], mode)
+        features = convert_examples_to_features(examples, labels, args['max_seq_length'], tokenizer,
+                                                cls_token_at_end=bool(args['model_type'] in ["xlnet"]),
+                                                # xlnet has a cls token at the end
+                                                cls_token=tokenizer.cls_token,
+                                                cls_token_segment_id=2 if args['model_type'] in ["xlnet"] else 0,
+                                                sep_token=tokenizer.sep_token,
+                                                sep_token_extra=bool(args['model_type'] in ["roberta"]),
+                                                # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
+                                                pad_on_left=bool(args['model_type'] in ["xlnet"]),
+                                                # pad on the left for xlnet
+                                                pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
+                                                pad_token_segment_id=4 if args['model_type'] in ["xlnet"] else 0,
+                                                pad_token_label_id=pad_token_label_id
+                                                )
+        logging.info("Saving features into cached file %s", cached_features_file)
+        save_cache(features, cached_features_file)
+        dataset, size = load_cache(cached_features_file, args['max_seq_length'])
+
+    if mode == 'train':
+        dataset = dataset.repeat()
+        dataset = dataset.shuffle(buffer_size=8192, seed=args['seed'])
+
+    dataset = dataset.batch(batch_size, drop_remainder)
+    dataset = dataset.prefetch(buffer_size=batch_size)
+
+    return dataset, size
+
+
+def main(_):
+    logging.set_verbosity(logging.INFO)
+    args = flags.FLAGS.flag_values_dict()
+
+    if os.path.exists(args['output_dir']) and os.listdir(
+            args['output_dir']) and args['do_train'] and not args['overwrite_output_dir']:
+        raise ValueError(
+            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
+                args['output_dir']))
+
+    if args['fp16']:
+        tf.config.optimizer.set_experimental_options({"auto_mixed_precision": True})
+
+    if args['tpu']:
+        resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu=args['tpu'])
+        tf.config.experimental_connect_to_cluster(resolver)
+        tf.tpu.experimental.initialize_tpu_system(resolver)
+        strategy = tf.distribute.experimental.TPUStrategy(resolver)
+        args['n_device'] = args['num_tpu_cores']
+    elif len(args['gpus'].split(',')) > 1:
+        args['n_device'] = len([f"/gpu:{gpu}" for gpu in args['gpus'].split(',')])
+        strategy = tf.distribute.MirroredStrategy(devices=[f"/gpu:{gpu}" for gpu in args['gpus'].split(',')])
+    elif args['no_cuda']:
+        args['n_device'] = 1
+        strategy = tf.distribute.OneDeviceStrategy(device="/cpu:0")
+    else:
+        args['n_device'] = len(args['gpus'].split(','))
+        strategy = tf.distribute.OneDeviceStrategy(device="/gpu:" + args['gpus'].split(',')[0])
+
+    logging.warning("n_device: %s, distributed training: %s, 16-bits training: %s",
+                   args['n_device'], bool(args['n_device'] > 1), args['fp16'])
+
+    labels = get_labels(args['labels'])
+    num_labels = len(labels) + 1
+    pad_token_label_id = 0
+    config_class, model_class, tokenizer_class = MODEL_CLASSES[args['model_type']]
+    config = config_class.from_pretrained(args['config_name'] if args['config_name'] else args['model_name_or_path'],
+                                          num_labels=num_labels,
+                                          cache_dir=args['cache_dir'] if args['cache_dir'] else None)
+
+    logging.info("Training/evaluation parameters %s", args)
+
+    # Training
+    if args['do_train']:
+        tokenizer = tokenizer_class.from_pretrained(args['tokenizer_name'] if args['tokenizer_name'] else args['model_name_or_path'],
+                                                    do_lower_case=args['do_lower_case'],
+                                                    cache_dir=args['cache_dir'] if args['cache_dir'] else None)
+
+        with strategy.scope():
+            model = model_class.from_pretrained(args['model_name_or_path'],
+                                                from_pt=bool(".bin" in args['model_name_or_path']),
+                                                config=config,
+                                                cache_dir=args['cache_dir'] if args['cache_dir'] else None)
+            model.layers[-1].activation = tf.keras.activations.softmax
+
+        train_batch_size = args['per_device_train_batch_size'] * args['n_device']
+        train_dataset, num_train_examples = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, train_batch_size, mode="train")
+        train_dataset = strategy.experimental_distribute_dataset(train_dataset)
+        train(args, strategy, train_dataset, tokenizer, model, num_train_examples, labels, train_batch_size, pad_token_label_id)
+
+        if not os.path.exists(args['output_dir']):
+            os.makedirs(args['output_dir'])
+
+        logging.info("Saving model to %s", args['output_dir'])
+
+        model.save_pretrained(args['output_dir'])
+        tokenizer.save_pretrained(args['output_dir'])
+
+    # Evaluation
+    if args['do_eval']:
+        tokenizer = tokenizer_class.from_pretrained(args['output_dir'], do_lower_case=args['do_lower_case'])
+        checkpoints = []
+        results = []
+
+        if args['eval_all_checkpoints']:
+            checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args['output_dir'] + "/**/" + TF2_WEIGHTS_NAME, recursive=True), key=lambda f: int(''.join(filter(str.isdigit, f)) or -1)))
+        
+        logging.info("Evaluate the following checkpoints: %s", checkpoints)
+        
+        for checkpoint in checkpoints:
+            global_step = checkpoint.split("-")[-1] if re.match(".*checkpoint-[0-9]", checkpoint) else "final"
+
+            with strategy.scope():
+                model = model_class.from_pretrained(checkpoint)
+
+            y_true, y_pred, eval_loss = evaluate(args, strategy, model, tokenizer, labels, pad_token_label_id, mode="dev")
+            report = metrics.classification_report(y_true, y_pred, digits=4)
+
+            if global_step:
+                results.append({global_step + "_report": report, global_step + "_loss": eval_loss})
+
+        output_eval_file = os.path.join(args['output_dir'], "eval_results.txt")
+        
+        with tf.io.gfile.GFile(output_eval_file, "w") as writer:
+            for res in results:
+                for key, val in res.items():
+                    if "loss" in key:
+                        logging.info(key + " = " + str(val))
+                        writer.write(key + " = " + str(val))
+                        writer.write("\n")
+                    else:
+                        logging.info(key)
+                        logging.info("\n" + report)
+                        writer.write(key + "\n")
+                        writer.write(report)
+                        writer.write("\n")
+
+    if args['do_predict']:
+        tokenizer = tokenizer_class.from_pretrained(args['output_dir'], do_lower_case=args['do_lower_case'])
+        model = model_class.from_pretrained(args['output_dir'])
+        eval_batch_size = args['per_gpu_eval_batch_size'] * args['n_device']
+        predict_dataset, _ = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, eval_batch_size, mode="test")
+        y_true, y_pred, pred_loss = evaluate(args, strategy, model, tokenizer, labels, pad_token_label_id, mode="test")
+        output_test_results_file = os.path.join(args.output_dir, "test_results.txt")
+        output_test_predictions_file = os.path.join(args['output_dir'], "test_predictions.txt")
+        report = metrics.classification_report(y_true, y_pred, digits=4)
+
+        with tf.io.gfile.GFile(output_test_results_file, "w") as writer:
+            report = metrics.classification_report(y_true, y_pred, digits=4)
+            
+            logging.info("\n" + report)
+            
+            writer.write(report)
+            writer.write("\n\nloss = " + str(pred_loss))
+
+        with tf.io.gfile.GFile(output_test_predictions_file, "w") as writer:
+            with tf.io.gfile.GFile(os.path.join(args['data_dir'], "test.txt"), "r") as f:
+                example_id = 0
+
+                for line in f:
+                    if line.startswith("-DOCSTART-") or line == "" or line == "\n":
+                        writer.write(line)
+
+                        if not y_pred[example_id]:
+                            example_id += 1
+                    elif y_pred[example_id]:
+                        output_line = line.split()[0] + " " + y_pred[example_id].pop(0) + "\n"
+                        writer.write(output_line)
+                    else:
+                        logging.warning("Maximum sequence length exceeded: No prediction for '%s'.", line.split()[0])
+
+
+if __name__ == "__main__":
+    flags.mark_flag_as_required("data_dir")
+    flags.mark_flag_as_required("output_dir")
+    flags.mark_flag_as_required("model_name_or_path")
+    flags.mark_flag_as_required("model_type")
+    app.run(main)
diff --git a/transformers/__init__.py b/transformers/__init__.py
index 970bdf0cf1..2f74b7e79c 100644
--- a/transformers/__init__.py
+++ b/transformers/__init__.py
@@ -162,6 +162,7 @@ if is_tf_available():
     from .modeling_tf_distilbert import (TFDistilBertPreTrainedModel, TFDistilBertMainLayer,
                                          TFDistilBertModel, TFDistilBertForMaskedLM,
                                          TFDistilBertForSequenceClassification,
+					 TFDistilBertForTokenClassification
                                          TFDistilBertForQuestionAnswering,
                                          TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
 
@@ -172,6 +173,8 @@ if is_tf_available():
     from .modeling_tf_albert import (TFAlbertPreTrainedModel, TFAlbertModel, TFAlbertForMaskedLM,
                                      TFAlbertForSequenceClassification,
                                     TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
+    # Optimization
+    from .optimization_tf import (WarmUp, create_optimizer, AdamWeightDecay, GradientAccumulator)
 
 # TF 2.0 <=> PyTorch conversion utilities
 from .modeling_tf_pytorch_utils import (convert_tf_weight_name_to_pt_weight_name,
diff --git a/transformers/modeling_tf_distilbert.py b/transformers/modeling_tf_distilbert.py
index b3d4889475..8e1aef7462 100644
--- a/transformers/modeling_tf_distilbert.py
+++ b/transformers/modeling_tf_distilbert.py
@@ -703,6 +703,53 @@ class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel):
         return outputs  # logits, (hidden_states), (attentions)
 
 
+@add_start_docstrings("""DistilBert Model with a token classification head on top (a linear layer on top of
+    the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
+    DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING)
+class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel):
+    r"""
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **scores**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length, config.num_labels)``
+            Classification scores (before SoftMax).
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``Numpy array`` or ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``Numpy array`` or ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+    Examples::
+        import tensorflow as tf
+        from transformers import DistilBertTokenizer, TFDistilBertForTokenClassification
+        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
+        model = TFDistilBertForTokenClassification.from_pretrained('distilbert-base-uncased')
+        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
+        outputs = model(input_ids)
+        scores = outputs[0]
+    """
+    def __init__(self, config, *inputs, **kwargs):
+        super(TFDistilBertForTokenClassification, self).__init__(config, *inputs, **kwargs)
+        self.num_labels = config.num_labels
+
+        self.distilbert = TFDistilBertMainLayer(config, name='distilbert')
+        self.dropout = tf.keras.layers.Dropout(config.dropout)
+        self.classifier = tf.keras.layers.Dense(config.num_labels,
+                                                kernel_initializer=get_initializer(config.initializer_range),
+                                                name='classifier')
+
+    def call(self, inputs, **kwargs):
+        outputs = self.distilbert(inputs, **kwargs)
+
+        sequence_output = outputs[0]
+
+        sequence_output = self.dropout(sequence_output, training=kwargs.get('training', False))
+        logits = self.classifier(sequence_output)
+
+        outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here
+
+        return outputs  # scores, (hidden_states), (attentions)
+
+
 @add_start_docstrings("""DistilBert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
                          the hidden-states output to compute `span start logits` and `span end logits`). """,
                       DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING)
diff --git a/transformers/optimization_tf.py b/transformers/optimization_tf.py
new file mode 100644
index 0000000000..c5fa248083
--- /dev/null
+++ b/transformers/optimization_tf.py
@@ -0,0 +1,254 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Functions and classes related to optimization (weight updates)."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import re
+
+import tensorflow as tf
+
+
+class WarmUp(tf.keras.optimizers.schedules.LearningRateSchedule):
+  """Applies a warmup schedule on a given learning rate decay schedule."""
+
+  def __init__(
+      self,
+      initial_learning_rate,
+      decay_schedule_fn,
+      warmup_steps,
+      power=1.0,
+      name=None):
+    super(WarmUp, self).__init__()
+    self.initial_learning_rate = initial_learning_rate
+    self.warmup_steps = warmup_steps
+    self.power = power
+    self.decay_schedule_fn = decay_schedule_fn
+    self.name = name
+
+  def __call__(self, step):
+    with tf.name_scope(self.name or 'WarmUp') as name:
+      # Implements polynomial warmup. i.e., if global_step < warmup_steps, the
+      # learning rate will be `global_step/num_warmup_steps * init_lr`.
+      global_step_float = tf.cast(step, tf.float32)
+      warmup_steps_float = tf.cast(self.warmup_steps, tf.float32)
+      warmup_percent_done = global_step_float / warmup_steps_float
+      warmup_learning_rate = (
+          self.initial_learning_rate *
+          tf.math.pow(warmup_percent_done, self.power))
+      return tf.cond(global_step_float < warmup_steps_float,
+                     lambda: warmup_learning_rate,
+                     lambda: self.decay_schedule_fn(step),
+                     name=name)
+
+  def get_config(self):
+    return {
+        'initial_learning_rate': self.initial_learning_rate,
+        'decay_schedule_fn': self.decay_schedule_fn,
+        'warmup_steps': self.warmup_steps,
+        'power': self.power,
+        'name': self.name
+    }
+
+
+def create_optimizer(init_lr, num_train_steps, num_warmup_steps):
+  """Creates an optimizer with learning rate schedule."""
+  # Implements linear decay of the learning rate.
+  learning_rate_fn = tf.keras.optimizers.schedules.PolynomialDecay(
+      initial_learning_rate=init_lr,
+      decay_steps=num_train_steps,
+      end_learning_rate=0.0)
+  if num_warmup_steps:
+    learning_rate_fn = WarmUp(initial_learning_rate=init_lr,
+                              decay_schedule_fn=learning_rate_fn,
+                              warmup_steps=num_warmup_steps)
+  optimizer = AdamWeightDecay(
+      learning_rate=learning_rate_fn,
+      weight_decay_rate=0.01,
+      beta_1=0.9,
+      beta_2=0.999,
+      epsilon=1e-6,
+      exclude_from_weight_decay=['layer_norm', 'bias'])
+  return optimizer
+
+
+class AdamWeightDecay(tf.keras.optimizers.Adam):
+  """Adam enables L2 weight decay and clip_by_global_norm on gradients.
+
+  Just adding the square of the weights to the loss function is *not* the
+  correct way of using L2 regularization/weight decay with Adam, since that will
+  interact with the m and v parameters in strange ways.
+
+  Instead we want to decay the weights in a manner that doesn't interact with
+  the m/v parameters. This is equivalent to adding the square of the weights to
+  the loss with plain (non-momentum) SGD.
+  """
+
+  def __init__(self,
+               learning_rate=0.001,
+               beta_1=0.9,
+               beta_2=0.999,
+               epsilon=1e-7,
+               amsgrad=False,
+               weight_decay_rate=0.0,
+               include_in_weight_decay=None,
+               exclude_from_weight_decay=None,
+               name='AdamWeightDecay',
+               **kwargs):
+    super(AdamWeightDecay, self).__init__(
+        learning_rate, beta_1, beta_2, epsilon, amsgrad, name, **kwargs)
+    self.weight_decay_rate = weight_decay_rate
+    self._include_in_weight_decay = include_in_weight_decay
+    self._exclude_from_weight_decay = exclude_from_weight_decay
+
+  @classmethod
+  def from_config(cls, config):
+    """Creates an optimizer from its config with WarmUp custom object."""
+    custom_objects = {'WarmUp': WarmUp}
+    return super(AdamWeightDecay, cls).from_config(
+        config, custom_objects=custom_objects)
+
+  def _prepare_local(self, var_device, var_dtype, apply_state):
+    super(AdamWeightDecay, self)._prepare_local(var_device, var_dtype,
+                                                apply_state)
+    apply_state['weight_decay_rate'] = tf.constant(
+        self.weight_decay_rate, name='adam_weight_decay_rate')
+
+  def _decay_weights_op(self, var, learning_rate, apply_state):
+    do_decay = self._do_use_weight_decay(var.name)
+    if do_decay:
+      return var.assign_sub(
+          learning_rate * var *
+          apply_state['weight_decay_rate'],
+          use_locking=self._use_locking)
+    return tf.no_op()
+
+  def apply_gradients(self, grads_and_vars, clip_norm, name=None):
+    grads, tvars = list(zip(*grads_and_vars))
+    (grads, _) = tf.clip_by_global_norm(grads, clip_norm=clip_norm)
+    return super(AdamWeightDecay, self).apply_gradients(zip(grads, tvars))
+
+  def _get_lr(self, var_device, var_dtype, apply_state):
+    """Retrieves the learning rate with the given state."""
+    if apply_state is None:
+      return self._decayed_lr_t[var_dtype], {}
+
+    apply_state = apply_state or {}
+    coefficients = apply_state.get((var_device, var_dtype))
+    if coefficients is None:
+      coefficients = self._fallback_apply_state(var_device, var_dtype)
+      apply_state[(var_device, var_dtype)] = coefficients
+
+    return coefficients['lr_t'], dict(apply_state=apply_state)
+
+  def _resource_apply_dense(self, grad, var, apply_state=None):
+    lr_t, kwargs = self._get_lr(var.device, var.dtype.base_dtype, apply_state)
+    decay = self._decay_weights_op(var, lr_t, apply_state)
+    with tf.control_dependencies([decay]):
+      return super(AdamWeightDecay, self)._resource_apply_dense(
+          grad, var, **kwargs)
+
+  def _resource_apply_sparse(self, grad, var, indices, apply_state=None):
+    lr_t, kwargs = self._get_lr(var.device, var.dtype.base_dtype, apply_state)
+    decay = self._decay_weights_op(var, lr_t, apply_state)
+    with tf.control_dependencies([decay]):
+      return super(AdamWeightDecay, self)._resource_apply_sparse(
+          grad, var, indices, **kwargs)
+
+  def get_config(self):
+    config = super(AdamWeightDecay, self).get_config()
+    config.update({
+        'weight_decay_rate': self.weight_decay_rate,
+    })
+    return config
+
+  def _do_use_weight_decay(self, param_name):
+    """Whether to use L2 weight decay for `param_name`."""
+    if self.weight_decay_rate == 0:
+      return False
+
+    if self._include_in_weight_decay:
+      for r in self._include_in_weight_decay:
+        if re.search(r, param_name) is not None:
+          return True
+
+    if self._exclude_from_weight_decay:
+      for r in self._exclude_from_weight_decay:
+        if re.search(r, param_name) is not None:
+          return False
+    return True
+
+
+## Inspired from https://github.com/OpenNMT/OpenNMT-tf/blob/master/opennmt/optimizers/utils.py
+class GradientAccumulator(object):
+    """Distribution strategies-aware gradient accumulation utility."""
+
+    def __init__(self):
+        """Initializes the accumulator."""
+        self._gradients = []
+        self._accum_steps = tf.Variable(
+            initial_value=0,
+            dtype=tf.int64,
+            trainable=False,
+            aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA)
+
+    @property
+    def step(self):
+        """Number of accumulated steps."""
+        return self._accum_steps.value()
+
+    @property
+    def gradients(self):
+        """The accumulated gradients."""
+        return list(gradient.value() if gradient is not None else gradient for gradient in self._get_replica_gradients())
+
+    def __call__(self, gradients):
+        """Accumulates :obj:`gradients`."""
+        if not self._gradients:
+            self._gradients.extend([tf.Variable(tf.zeros_like(gradient), trainable=False) if gradient is not None else gradient for gradient in gradients])
+
+        if len(gradients) != len(self._gradients):
+            raise ValueError("Expected %s gradients, but got %d" % (len(self._gradients), len(gradients)))
+
+        for accum_gradient, gradient in zip(self._get_replica_gradients(), gradients):
+            if accum_gradient is not None:
+                accum_gradient.assign_add(gradient)
+
+        self._accum_steps.assign_add(1)
+
+    def reset(self):
+        """Resets the accumulated gradients."""
+        if self._gradients:
+            self._accum_steps.assign(0)
+
+        for gradient in self._get_replica_gradients():
+            if gradient is not None:
+                gradient.assign(tf.zeros_like(gradient))
+
+    def _get_replica_gradients(self):
+        if tf.distribute.has_strategy():
+            # In a replica context, we want to accumulate gradients on each replica
+            # without synchronization, so we directly assign the value of the
+            # current replica.
+            replica_context = tf.distribute.get_replica_context()
+
+            if replica_context is None or tf.distribute.get_strategy().num_replicas_in_sync == 1:
+                return self._gradients
+
+            return (gradient.device_map.select_for_current_replica(gradient.values, replica_context) for gradient in self._gradients)
+        else:
+            return self._gradients

From 254ebb979c09d2e8f7efeb11d46bd1196f856699 Mon Sep 17 00:00:00 2001
From: Julien Plu <julien.plu@schibsted.com>
Date: Wed, 4 Dec 2019 10:00:25 +0100
Subject: [PATCH 17/91] Bugfix on init file. Missing comma.

---
 transformers/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/transformers/__init__.py b/transformers/__init__.py
index 2f74b7e79c..e4f5984c70 100644
--- a/transformers/__init__.py
+++ b/transformers/__init__.py
@@ -162,7 +162,7 @@ if is_tf_available():
     from .modeling_tf_distilbert import (TFDistilBertPreTrainedModel, TFDistilBertMainLayer,
                                          TFDistilBertModel, TFDistilBertForMaskedLM,
                                          TFDistilBertForSequenceClassification,
-					 TFDistilBertForTokenClassification
+					 TFDistilBertForTokenClassification,
                                          TFDistilBertForQuestionAnswering,
                                          TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
 

From 9ddc3f1a1227fc9cbe4e5a5c20b21546e438dfb1 Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Wed, 4 Dec 2019 10:37:00 -0500
Subject: [PATCH 18/91] Naming update + XLNet/XLM evaluation

---
 examples/run_squad.py                      |  6 +-
 transformers/data/metrics/squad_metrics.py | 97 ++++++++++++++++++----
 2 files changed, 85 insertions(+), 18 deletions(-)

diff --git a/examples/run_squad.py b/examples/run_squad.py
index b7952487dc..a9ef5c6ba2 100644
--- a/examples/run_squad.py
+++ b/examples/run_squad.py
@@ -17,7 +17,7 @@
 
 from __future__ import absolute_import, division, print_function
 from transformers.data.processors.squad import SquadV1Processor, SquadV2Processor, SquadResult
-from transformers.data.metrics.squad_metrics import compute_predictions, compute_predictions_extended, squad_evaluate
+from transformers.data.metrics.squad_metrics import compute_predictions_logits, compute_predictions_log_probs, squad_evaluate
 
 import argparse
 import logging
@@ -264,13 +264,13 @@ def evaluate(args, model, tokenizer, prefix=""):
 
     if args.model_type in ['xlnet', 'xlm']:
         # XLNet uses a more complex post-processing procedure
-        predictions = compute_predictions_extended(examples, features, all_results, args.n_best_size,
+        predictions = compute_predictions_log_probs(examples, features, all_results, args.n_best_size,
                         args.max_answer_length, output_prediction_file,
                         output_nbest_file, output_null_log_odds_file, args.predict_file,
                         model.config.start_n_top, model.config.end_n_top,
                         args.version_2_with_negative, tokenizer, args.verbose_logging)
     else:
-        predictions = compute_predictions(examples, features, all_results, args.n_best_size,
+        predictions = compute_predictions_logits(examples, features, all_results, args.n_best_size,
                         args.max_answer_length, args.do_lower_case, output_prediction_file,
                         output_nbest_file, output_null_log_odds_file, args.verbose_logging,
                         args.version_2_with_negative, args.null_score_diff_threshold)
diff --git a/transformers/data/metrics/squad_metrics.py b/transformers/data/metrics/squad_metrics.py
index 83647a20d0..1f120d354a 100644
--- a/transformers/data/metrics/squad_metrics.py
+++ b/transformers/data/metrics/squad_metrics.py
@@ -125,6 +125,53 @@ def merge_eval(main_eval, new_eval, prefix):
         main_eval['%s_%s' % (prefix, k)] = new_eval[k]
 
 
+def find_best_thresh_v2(preds, scores, na_probs, qid_to_has_ans):
+    num_no_ans = sum(1 for k in qid_to_has_ans if not qid_to_has_ans[k])
+    cur_score = num_no_ans
+    best_score = cur_score
+    best_thresh = 0.0
+    qid_list = sorted(na_probs, key=lambda k: na_probs[k])
+    for i, qid in enumerate(qid_list):
+        if qid not in scores:
+            continue
+        if qid_to_has_ans[qid]:
+            diff = scores[qid]
+        else:
+            if preds[qid]:
+                diff = -1
+            else:
+                diff = 0
+        cur_score += diff
+        if cur_score > best_score:
+            best_score = cur_score
+            best_thresh = na_probs[qid]
+
+    has_ans_score, has_ans_cnt = 0, 0
+    for qid in qid_list:
+        if not qid_to_has_ans[qid]:
+            continue
+        has_ans_cnt += 1
+
+        if qid not in scores:
+            continue
+        has_ans_score += scores[qid]
+
+    return 100.0 * best_score / len(scores), best_thresh, 1.0 * has_ans_score / has_ans_cnt
+
+
+def find_all_best_thresh_v2(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans):
+    best_exact, exact_thresh, has_ans_exact = find_best_thresh_v2(
+        preds, exact_raw, na_probs, qid_to_has_ans)
+    best_f1, f1_thresh, has_ans_f1 = find_best_thresh_v2(
+        preds, f1_raw, na_probs, qid_to_has_ans)
+    main_eval['best_exact'] = best_exact
+    main_eval['best_exact_thresh'] = exact_thresh
+    main_eval['best_f1'] = best_f1
+    main_eval['best_f1_thresh'] = f1_thresh
+    main_eval['has_ans_exact'] = has_ans_exact
+    main_eval['has_ans_f1'] = has_ans_f1
+
+
 def find_best_thresh(preds, scores, na_probs, qid_to_has_ans):
     num_no_ans = sum(1 for k in qid_to_has_ans if not qid_to_has_ans[k])
     cur_score = num_no_ans
@@ -318,10 +365,20 @@ def _compute_softmax(scores):
     return probs
 
 
-def compute_predictions(all_examples, all_features, all_results, n_best_size,
-                      max_answer_length, do_lower_case, output_prediction_file,
-                      output_nbest_file, output_null_log_odds_file, verbose_logging,
-                      version_2_with_negative, null_score_diff_threshold):
+def compute_predictions_logits(
+    all_examples,
+    all_features,
+    all_results,
+    n_best_size,
+    max_answer_length,
+    do_lower_case,
+    output_prediction_file,
+    output_nbest_file,
+    output_null_log_odds_file,
+    verbose_logging,
+    version_2_with_negative,
+    null_score_diff_threshold
+):
     """Write final predictions to the json file and log-odds of null if needed."""
     logger.info("Writing predictions to: %s" % (output_prediction_file))
     logger.info("Writing nbest to: %s" % (output_nbest_file))
@@ -450,12 +507,12 @@ def compute_predictions(all_examples, all_features, all_results, n_best_size,
                         text="",
                         start_logit=null_start_logit,
                         end_logit=null_end_logit))
-                
+
             # In very rare edge cases we could only have single null prediction.
             # So we just create a nonce prediction in this case to avoid failure.
-            if len(nbest)==1:
+            if len(nbest) == 1:
                 nbest.insert(0,
-                    _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0))
+                             _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0))
 
         # In very rare edge cases we could have no valid predictions. So we
         # just create a nonce prediction in this case to avoid failure.
@@ -512,12 +569,22 @@ def compute_predictions(all_examples, all_features, all_results, n_best_size,
     return all_predictions
 
 
-def compute_predictions_extended(all_examples, all_features, all_results, n_best_size,
-                                max_answer_length, output_prediction_file,
-                                output_nbest_file,
-                                output_null_log_odds_file, orig_data_file,
-                                start_n_top, end_n_top, version_2_with_negative,
-                                tokenizer, verbose_logging):
+def compute_predictions_log_probs(
+    all_examples,
+    all_features,
+    all_results,
+    n_best_size,
+    max_answer_length,
+    output_prediction_file,
+    output_nbest_file,
+    output_null_log_odds_file,
+    orig_data_file,
+    start_n_top,
+    end_n_top,
+    version_2_with_negative,
+    tokenizer,
+    verbose_logging
+):
     """ XLNet write prediction logic (more complex than Bert's).
         Write final predictions to the json file and log-odds of null if needed.
 
@@ -526,7 +593,7 @@ def compute_predictions_extended(all_examples, all_features, all_results, n_best
     _PrelimPrediction = collections.namedtuple(  # pylint: disable=invalid-name
         "PrelimPrediction",
         ["feature_index", "start_index", "end_index",
-        "start_log_prob", "end_log_prob"])
+         "start_log_prob", "end_log_prob"])
 
     _NbestPrediction = collections.namedtuple(  # pylint: disable=invalid-name
         "NbestPrediction", ["text", "start_log_prob", "end_log_prob"])
@@ -609,7 +676,7 @@ def compute_predictions_extended(all_examples, all_features, all_results, n_best
 
             # XLNet un-tokenizer
             # Let's keep it simple for now and see if we need all this later.
-            # 
+            #
             # tok_start_to_orig_index = feature.tok_start_to_orig_index
             # tok_end_to_orig_index = feature.tok_end_to_orig_index
             # start_orig_pos = tok_start_to_orig_index[pred.start_index]

From ff98b041da4b992a87d8b6258b30e47310ec8430 Mon Sep 17 00:00:00 2001
From: Julien Plu <julien.plu@schibsted.com>
Date: Wed, 4 Dec 2019 16:53:06 +0100
Subject: [PATCH 19/91] Fix whitespace issue

---
 transformers/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/transformers/__init__.py b/transformers/__init__.py
index e4f5984c70..6d18f11722 100644
--- a/transformers/__init__.py
+++ b/transformers/__init__.py
@@ -162,7 +162,7 @@ if is_tf_available():
     from .modeling_tf_distilbert import (TFDistilBertPreTrainedModel, TFDistilBertMainLayer,
                                          TFDistilBertModel, TFDistilBertForMaskedLM,
                                          TFDistilBertForSequenceClassification,
-					 TFDistilBertForTokenClassification,
+                                         TFDistilBertForTokenClassification,
                                          TFDistilBertForQuestionAnswering,
                                          TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
 

From bf119c0568dfc1ea5ce0a34359e33ca002266e96 Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Wed, 4 Dec 2019 11:34:59 -0500
Subject: [PATCH 20/91] TFDS dataset can now be evaluated

---
 transformers/data/processors/squad.py | 34 ++++++++++++++++++++-------
 1 file changed, 25 insertions(+), 9 deletions(-)

diff --git a/transformers/data/processors/squad.py b/transformers/data/processors/squad.py
index 70dc9faf54..2e50ac8a8c 100644
--- a/transformers/data/processors/squad.py
+++ b/transformers/data/processors/squad.py
@@ -245,22 +245,37 @@ class SquadProcessor(DataProcessor):
     train_file = None
     dev_file = None
 
-    def get_example_from_tensor_dict(self, tensor_dict):
+    def get_example_from_tensor_dict(self, tensor_dict, evaluate=False):
+
+        if not evaluate:
+            answer = tensor_dict['answers']['text'][0].numpy().decode('utf-8')
+            answer_start = tensor_dict['answers']['answer_start'][0].numpy()
+            answers = None
+        else:
+            answers = [{
+                "answer_start": start.numpy(), 
+                "text": text.numpy().decode('utf-8')
+            } for start, text in zip(tensor_dict['answers']["answer_start"], tensor_dict['answers']["text"])]
+
+            answer = None
+            answer_start = None
+
         return SquadExample(
-            tensor_dict['id'].numpy().decode("utf-8"),
-            tensor_dict['question'].numpy().decode('utf-8'),
-            tensor_dict['context'].numpy().decode('utf-8'),
-            tensor_dict['answers']['text'][0].numpy().decode('utf-8'),
-            tensor_dict['answers']['answer_start'][0].numpy(),
-            tensor_dict['title'].numpy().decode('utf-8')
+            qas_id=tensor_dict['id'].numpy().decode("utf-8"),
+            question_text=tensor_dict['question'].numpy().decode('utf-8'),
+            context_text=tensor_dict['context'].numpy().decode('utf-8'),
+            answer_text=answer,
+            start_position_character=answer_start,
+            title=tensor_dict['title'].numpy().decode('utf-8'),
+            answers=answers
         )
 
-    def get_examples_from_dataset(self, dataset):
+    def get_examples_from_dataset(self, dataset, evaluate=False):
         """See base class."""
 
         examples = []
         for tensor_dict in tqdm(dataset):
-            examples.append(self.get_example_from_tensor_dict(tensor_dict)) 
+            examples.append(self.get_example_from_tensor_dict(tensor_dict, evaluate=evaluate)) 
 
         return examples
 
@@ -300,6 +315,7 @@ class SquadProcessor(DataProcessor):
                     question_text = qa["question"]
                     start_position_character = None
                     answer_text = None
+                    answers = None
                     
                     if "is_impossible" in qa:
                         is_impossible = qa["is_impossible"]

From cca75e788485e8a2a1c44a445c6aba0fb2dfaf56 Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Wed, 4 Dec 2019 15:42:29 -0500
Subject: [PATCH 21/91] Kill the demon spawn

---
 examples/run_squad.py                 | 23 +++++++-
 transformers/data/processors/squad.py | 75 +++++----------------------
 2 files changed, 34 insertions(+), 64 deletions(-)

diff --git a/examples/run_squad.py b/examples/run_squad.py
index a9ef5c6ba2..2f86322196 100644
--- a/examples/run_squad.py
+++ b/examples/run_squad.py
@@ -248,7 +248,28 @@ def evaluate(args, model, tokenizer, prefix=""):
             eval_feature = features[example_index.item()]
             unique_id = int(eval_feature.unique_id)
 
-            result = SquadResult([to_list(output[i]) for output in outputs] + [unique_id])
+            output = [to_list(output[i]) for output in outputs]
+
+            if len(output) >= 5:  # five model outputs: start/end scores, their top indices, and cls logits
+                start_logits = output[0]
+                start_top_index = output[1]
+                end_logits = output[2]
+                end_top_index = output[3]
+                cls_logits = output[4]
+
+                result = SquadResult(
+                    unique_id, start_logits, end_logits, 
+                    start_top_index=start_top_index, 
+                    end_top_index=end_top_index, 
+                    cls_logits=cls_logits
+                )
+
+            else:
+                start_logits, end_logits = output
+                result = SquadResult(
+                    unique_id, start_logits, end_logits
+                )
+
             all_results.append(result)
 
     evalTime = timeit.default_timer() - start_time
diff --git a/transformers/data/processors/squad.py b/transformers/data/processors/squad.py
index 2e50ac8a8c..9306189eb4 100644
--- a/transformers/data/processors/squad.py
+++ b/transformers/data/processors/squad.py
@@ -446,72 +446,21 @@ class SquadFeatures(object):
         self.end_position = end_position
 
 
-
 class SquadResult(object):
     """
     Constructs a SquadResult which can be used to evaluate a model's output on the SQuAD dataset.
 
     Args:
-        result: The result output by a model on a SQuAD inference. These results may be complex (5 values) as the ones output by
-            XLNet or XLM or may be simple like the other models (2 values). They may be passed as a list or as a dict, with the 
-            following accepted formats:
-
-            `dict` output by a simple model:
-                {
-                    "start_logits": int,
-                    "end_logits": int,
-                    "unique_id": string
-                }
-            `list` output by a simple model:
-                [start_logits, end_logits, unique_id]
-
-            `dict` output by a complex model:
-                {
-                    "start_top_log_probs": float,
-                    "start_top_index": int,
-                    "end_top_log_probs": float,
-                    "end_top_index": int,
-                    "cls_logits": int,
-                    "unique_id": string
-                }
-            `list` output by a complex model:
-                [start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits, unique_id]
-
-            See `run_squad.py` for an example.
+        unique_id: The unique identifier corresponding to that example.
+        start_logits: The logits corresponding to the start of the answer
+        end_logits: The logits corresponding to the end of the answer
     """
-    def __init__(self, result):
-        if isinstance(result, dict):
-            if "start_logits" in result and "end_logits" in result:
-                self.start_logits = result["start_logits"]
-                self.end_logits = result["end_logits"]
-
-            elif "start_top_log_probs" in result and "start_top_index" in result:
-                self.start_top_log_probs  = result["start_top_log_probs"]
-                self.start_top_index      = result["start_top_index"]
-                self.end_top_log_probs    = result["end_top_log_probs"]
-                self.end_top_index        = result["end_top_index"]
-                self.cls_logits           = result["cls_logits"]
-
-            else:
-                raise ValueError("SquadResult instantiated with wrong values.")
-
-            self.unique_id = result["unique_id"]
-        elif isinstance(result, list):            
-            if len(result) == 3:
-                self.start_logits = result[0]
-                self.end_logits = result[1]
-
-            elif len(result) == 6:
-                self.start_top_log_probs  = result[0]
-                self.start_top_index      = result[1]
-                self.end_top_log_probs    = result[2]
-                self.end_top_index        = result[3]
-                self.cls_logits           = result[4]
-
-            else:
-                raise ValueError("SquadResult instantiated with wrong values.")
-
-            self.unique_id = result[-1]
-
-        else:
-            raise ValueError("SquadResult instantiated with wrong values. Should be a dictionary or a list.")
+    def __init__(self, unique_id, start_logits, end_logits, start_top_index=None, end_top_index=None, cls_logits=None):
+        # Expose both names: ``*_logits`` as documented, ``*_top_log_probs`` for results that also carry top indices.
+        self.start_logits = self.start_top_log_probs = start_logits
+        self.end_logits = self.end_top_log_probs = end_logits
+        self.unique_id = unique_id
+        if start_top_index:
+            self.start_top_index = start_top_index
+            self.end_top_index = end_top_index
+            self.cls_logits = cls_logits
\ No newline at end of file

From a7ca6d738b7801c680bd25d9e910f962d3f8bf2d Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Wed, 4 Dec 2019 15:43:34 -0500
Subject: [PATCH 22/91] Padding side is  tokenizer-dependant

---
 transformers/data/processors/squad.py         | 11 ++--
 .../tests/tokenization_tests_commons.py       | 21 +++++--
 transformers/tokenization_utils.py            | 60 ++++++++++++-------
 transformers/tokenization_xlnet.py            |  1 +
 4 files changed, 58 insertions(+), 35 deletions(-)

diff --git a/transformers/data/processors/squad.py b/transformers/data/processors/squad.py
index 9306189eb4..6599c54330 100644
--- a/transformers/data/processors/squad.py
+++ b/transformers/data/processors/squad.py
@@ -73,8 +73,7 @@ def _is_whitespace(c):
     return False
 
 def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
-                                       doc_stride, max_query_length, is_training,
-                                       sequence_a_is_doc=False):
+                                       doc_stride, max_query_length, is_training):
     """Loads a data file into a list of `InputBatch`s."""
 
     # Defining helper methods    
@@ -127,13 +126,13 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
         while len(spans) * doc_stride < len(all_doc_tokens):
             
             encoded_dict = tokenizer.encode_plus(
-                truncated_query if not sequence_a_is_doc else span_doc_tokens, 
-                span_doc_tokens if not sequence_a_is_doc else truncated_query, 
+                truncated_query if tokenizer.padding_side == "right" else span_doc_tokens, 
+                span_doc_tokens if tokenizer.padding_side == "right" else truncated_query, 
                 max_length=max_seq_length, 
                 return_overflowing_tokens=True, 
-                padding_strategy='right',
+                pad_to_max_length=True,
                 stride=max_seq_length - doc_stride - len(truncated_query) - sequence_pair_added_tokens,
-                truncation_strategy='only_second' if not sequence_a_is_doc else 'only_first'
+                truncation_strategy='only_second' if tokenizer.padding_side == "right" else 'only_first'
             )
 
             paragraph_len = min(len(all_doc_tokens) - len(spans) * doc_stride, max_seq_length - len(truncated_query) - sequence_pair_added_tokens)
diff --git a/transformers/tests/tokenization_tests_commons.py b/transformers/tests/tokenization_tests_commons.py
index 40d68d0ab2..6592005c67 100644
--- a/transformers/tests/tokenization_tests_commons.py
+++ b/transformers/tests/tokenization_tests_commons.py
@@ -344,17 +344,19 @@ class CommonTestCases:
             padding_idx = tokenizer.pad_token_id
 
             # RIGHT PADDING - Check that it correctly pads when a maximum length is specified along with the padding flag set to True
+            tokenizer.padding_side = "right"
             encoded_sequence = tokenizer.encode(sequence)
             sequence_length = len(encoded_sequence)
-            padded_sequence = tokenizer.encode(sequence, max_length=sequence_length + padding_size, padding_strategy='right')
+            padded_sequence = tokenizer.encode(sequence, max_length=sequence_length + padding_size, pad_to_max_length=True)
             padded_sequence_length = len(padded_sequence)
             assert sequence_length + padding_size == padded_sequence_length
             assert encoded_sequence + [padding_idx] * padding_size == padded_sequence
 
             # LEFT PADDING - Check that it correctly pads when a maximum length is specified along with the padding flag set to True
+            tokenizer.padding_side = "left"
             encoded_sequence = tokenizer.encode(sequence)
             sequence_length = len(encoded_sequence)
-            padded_sequence = tokenizer.encode(sequence, max_length=sequence_length + padding_size, padding_strategy='left')
+            padded_sequence = tokenizer.encode(sequence, max_length=sequence_length + padding_size, pad_to_max_length=True)
             padded_sequence_length = len(padded_sequence)
             assert sequence_length + padding_size == padded_sequence_length
             assert [padding_idx] * padding_size + encoded_sequence == padded_sequence
@@ -362,10 +364,15 @@ class CommonTestCases:
             # RIGHT & LEFT PADDING - Check that nothing is done when a maximum length is not specified
             encoded_sequence = tokenizer.encode(sequence)
             sequence_length = len(encoded_sequence)
-            padded_sequence_right = tokenizer.encode(sequence, padding_strategy='right')
+
+            tokenizer.padding_side = "right"
+            padded_sequence_right = tokenizer.encode(sequence, pad_to_max_length=True)
             padded_sequence_right_length = len(padded_sequence_right)
-            padded_sequence_left = tokenizer.encode(sequence, padding_strategy='left')
+
+            tokenizer.padding_side = "left"
+            padded_sequence_left = tokenizer.encode(sequence, pad_to_max_length=True)
             padded_sequence_left_length = len(padded_sequence_left)
+
             assert sequence_length == padded_sequence_right_length
             assert encoded_sequence == padded_sequence_right
             assert sequence_length == padded_sequence_left_length
@@ -387,7 +394,8 @@ class CommonTestCases:
             sequence_length = len(input_ids)
 
             # Test right padding
-            padded_sequence = tokenizer.encode_plus(sequence, max_length=sequence_length + padding_size, padding_strategy='right', return_special_tokens_mask=True)
+            tokenizer.padding_side = "right"
+            padded_sequence = tokenizer.encode_plus(sequence, max_length=sequence_length + padding_size, pad_to_max_length=True, return_special_tokens_mask=True)
             padded_input_ids = padded_sequence['input_ids']
             padded_token_type_ids = padded_sequence['token_type_ids']
             padded_attention_mask = padded_sequence['attention_mask']
@@ -401,7 +409,8 @@ class CommonTestCases:
             assert special_tokens_mask + [1] * padding_size == padded_special_tokens_mask 
 
             # Test left padding
-            padded_sequence = tokenizer.encode_plus(sequence, max_length=sequence_length + padding_size, padding_strategy='left', return_special_tokens_mask=True)
+            tokenizer.padding_side = "left"
+            padded_sequence = tokenizer.encode_plus(sequence, max_length=sequence_length + padding_size, pad_to_max_length=True, return_special_tokens_mask=True)
             padded_input_ids = padded_sequence['input_ids']
             padded_token_type_ids = padded_sequence['token_type_ids']
             padded_attention_mask = padded_sequence['attention_mask']
diff --git a/transformers/tokenization_utils.py b/transformers/tokenization_utils.py
index dbbabd0e1a..41a611ea49 100644
--- a/transformers/tokenization_utils.py
+++ b/transformers/tokenization_utils.py
@@ -77,6 +77,8 @@ class PreTrainedTokenizer(object):
                                  "pad_token", "cls_token", "mask_token",
                                  "additional_special_tokens"]
 
+    padding_side = "right"
+
     @property
     def bos_token(self):
         """ Beginning of sentence token (string). Log an error if used while not having been set. """
@@ -223,6 +225,9 @@ class PreTrainedTokenizer(object):
 
         self.max_len = max_len if max_len is not None else int(1e12)
 
+        # Padding side is right by default and overridden in subclasses. If specified in the kwargs, it is changed.
+        self.padding_side = kwargs.pop('padding_side', self.padding_side)
+        
         # Added tokens
         self.added_tokens_encoder = {}
         self.added_tokens_decoder = {}
@@ -702,7 +707,7 @@ class PreTrainedTokenizer(object):
                max_length=None,
                stride=0,
                truncation_strategy='longest_first',
-               padding_strategy=None,
+               pad_to_max_length=False,
                return_tensors=None,
                **kwargs):
         """
@@ -729,12 +734,12 @@ class PreTrainedTokenizer(object):
                 - 'only_first': Only truncate the first sequence
                 - 'only_second': Only truncate the second sequence
                 - 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length)
-            padding_strategy: if set to a strategy, the returned sequences will be padded according to the model's 
-                padding index, up to their max length. If no max length is specified, no padding is done.
-                The strategies are handled by the following strings:
+            pad_to_max_length: if set to True, the returned sequences will be padded according to the model's padding side and
+                padding index, up to their max length. If no max length is specified, the padding is done up to the model's max length.
+                The tokenizer padding sides are handled by the following strings:
                 - 'left': pads on the left of the sequences
                 - 'right': pads on the right of the sequences   
-                Defaults to None: no padding.
+                Defaults to False: no padding.
             return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant
                 or PyTorch torch.Tensor instead of a list of python integers.
             **kwargs: passed to the `self.tokenize()` method
@@ -745,7 +750,7 @@ class PreTrainedTokenizer(object):
                                           add_special_tokens=add_special_tokens,
                                           stride=stride,
                                           truncation_strategy=truncation_strategy,
-                                          padding_strategy=padding_strategy,
+                                          pad_to_max_length=pad_to_max_length,
                                           return_tensors=return_tensors,
                                           **kwargs)
 
@@ -758,7 +763,7 @@ class PreTrainedTokenizer(object):
                     max_length=None,
                     stride=0,
                     truncation_strategy='longest_first',
-                    padding_strategy=None,
+                    pad_to_max_length=False,
                     return_tensors=None,
                     return_token_type_ids=True,
                     return_attention_mask=True,
@@ -788,12 +793,12 @@ class PreTrainedTokenizer(object):
                 - 'only_first': Only truncate the first sequence
                 - 'only_second': Only truncate the second sequence
                 - 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length)
-            padding_strategy: if set to a strategy, the returned sequences will be padded according to the model's 
-                padding index, up to their max length. If no max length is specified, no padding is done.
-                The strategies are handled by the following strings:
+            pad_to_max_length: if set to True, the returned sequences will be padded according to the model's padding side and
+                padding index, up to their max length. If no max length is specified, the padding is done up to the model's max length.
+                The tokenizer padding sides are handled by the following strings:
                 - 'left': pads on the left of the sequences
                 - 'right': pads on the right of the sequences   
-                Defaults to None: no padding.
+                Defaults to False: no padding.
             return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant
                 or PyTorch torch.Tensor instead of a list of python integers.
             return_token_type_ids: (optional) Set to False to avoid returning token_type_ids (default True).
@@ -841,7 +846,7 @@ class PreTrainedTokenizer(object):
         return self.prepare_for_model(first_ids,
                                       pair_ids=second_ids,
                                       max_length=max_length,
-                                      padding_strategy=padding_strategy,
+                                      pad_to_max_length=pad_to_max_length,
                                       add_special_tokens=add_special_tokens,
                                       stride=stride,
                                       truncation_strategy=truncation_strategy,
@@ -853,7 +858,7 @@ class PreTrainedTokenizer(object):
 
     def prepare_for_model(self, ids, pair_ids=None, max_length=None, add_special_tokens=True, stride=0,
                           truncation_strategy='longest_first',
-                          padding_strategy=None,
+                          pad_to_max_length=False,
                           return_tensors=None,
                           return_token_type_ids=True,
                           return_attention_mask=True,
@@ -881,12 +886,12 @@ class PreTrainedTokenizer(object):
                 - 'only_first': Only truncate the first sequence
                 - 'only_second': Only truncate the second sequence
                 - 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length)
-            padding_strategy: if set to a strategy, the returned sequences will be padded according to the model's 
-                padding index, up to their max length. If no max length is specified, no padding is done.
-                The strategies are handled by the following strings:
+            pad_to_max_length: if set to True, the returned sequences will be padded according to the model's padding side and
+                padding index, up to their max length. If no max length is specified, the padding is done up to the model's max length.
+                The tokenizer padding sides are handled by the following strings:
                 - 'left': pads on the left of the sequences
-                - 'right': pads on the right of the sequences            
-                Defaults to None: no padding.
+                - 'right': pads on the right of the sequences   
+                Defaults to False: no padding.
             return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant
                 or PyTorch torch.Tensor instead of a list of python integers.
             return_token_type_ids: (optional) Set to False to avoid returning token_type_ids (default True).
@@ -955,10 +960,19 @@ class PreTrainedTokenizer(object):
                            "for this model ({} > {}). Running this sequence through the model will result in "
                            "indexing errors".format(len(ids), self.max_len))
                            
-        if padding_strategy is not None and max_length and len(encoded_inputs["input_ids"]) < max_length:
-            difference = max_length - len(encoded_inputs["input_ids"])
+        needs_to_be_padded = pad_to_max_length and (
+            max_length and len(encoded_inputs["input_ids"]) < max_length
+            or 
+            max_length is None and len(encoded_inputs["input_ids"]) < self.max_len and self.max_len <= 10000
+        )
 
-            if padding_strategy == 'right':
+        if pad_to_max_length and max_length is None and self.max_len > 10000:
+            logger.warning("Sequence can't be padded as no maximum length is specified and the model maximum length is too high.")
+
+        if needs_to_be_padded:
+            difference = (max_length if max_length is not None else self.max_len) - len(encoded_inputs["input_ids"])
+
+            if self.padding_side == 'right':
                 if return_attention_mask:
                     encoded_inputs["attention_mask"] = [1] * len(encoded_inputs["input_ids"]) + [0] * difference
                 if return_token_type_ids:
@@ -967,7 +981,7 @@ class PreTrainedTokenizer(object):
                     encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
                 encoded_inputs["input_ids"] = encoded_inputs["input_ids"] + [self.pad_token_id] * difference
 
-            elif padding_strategy == 'left':
+            elif self.padding_side == 'left':
                 if return_attention_mask:
                     encoded_inputs["attention_mask"] =  [0] * difference + [1] * len(encoded_inputs["input_ids"])
                 if return_token_type_ids:
@@ -977,7 +991,7 @@ class PreTrainedTokenizer(object):
                 encoded_inputs["input_ids"] = [self.pad_token_id] * difference + encoded_inputs["input_ids"]
 
             else:
-                raise ValueError("Invalid padding strategy:" + str(padding_strategy))
+                raise ValueError("Invalid padding strategy:" + str(self.padding_side))
             
         elif return_attention_mask:
             encoded_inputs["attention_mask"] = [1] * len(encoded_inputs["input_ids"])
diff --git a/transformers/tokenization_xlnet.py b/transformers/tokenization_xlnet.py
index 3ea71f4438..1c43c0943a 100644
--- a/transformers/tokenization_xlnet.py
+++ b/transformers/tokenization_xlnet.py
@@ -60,6 +60,7 @@ class XLNetTokenizer(PreTrainedTokenizer):
     vocab_files_names = VOCAB_FILES_NAMES
     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+    padding_side = "left"
 
     def __init__(self, vocab_file,
                  do_lower_case=False, remove_space=True, keep_accents=False,

From f7e4a7cdfa6bcf6ec7c33fd1d40d307278b1c13a Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Wed, 4 Dec 2019 16:24:15 -0500
Subject: [PATCH 23/91] Cleanup

---
 examples/run_squad.py                         |  32 ++--
 examples/test_examples.py                     |   3 +-
 .../{dev-v2.0-small.json => dev-v2.0.json}    |   0
 examples/tests_samples/SQUAD/train-v2.0.json  | 140 ++++++++++++++++++
 transformers/data/metrics/squad_metrics.py    |   4 +-
 transformers/data/processors/squad.py         |  36 ++++-
 6 files changed, 191 insertions(+), 24 deletions(-)
 rename examples/tests_samples/SQUAD/{dev-v2.0-small.json => dev-v2.0.json} (100%)
 create mode 100644 examples/tests_samples/SQUAD/train-v2.0.json

diff --git a/examples/run_squad.py b/examples/run_squad.py
index 2f86322196..3f1b6a798f 100644
--- a/examples/run_squad.py
+++ b/examples/run_squad.py
@@ -304,8 +304,8 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal
         torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache
 
     # Load data features from cache or dataset file
-    input_file = args.predict_file if evaluate else args.train_file
-    cached_features_file = os.path.join(os.path.dirname(input_file), 'cached_{}_{}_{}'.format(
+    input_dir = args.data_dir if args.data_dir else "."
+    cached_features_file = os.path.join(input_dir, 'cached_{}_{}_{}'.format(
         'dev' if evaluate else 'train',
         list(filter(None, args.model_name_or_path.split('/'))).pop(),
         str(args.max_seq_length)))
@@ -313,13 +313,22 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal
         logger.info("Loading features from cached file %s", cached_features_file)
         features = torch.load(cached_features_file)
     else:
-        logger.info("Creating features from dataset file at %s", input_file)
+        logger.info("Creating features from dataset file at %s", input_dir)
 
-        processor = SquadV2Processor()
-        examples = processor.get_dev_examples("examples/squad", only_first=100) if evaluate else processor.get_train_examples("examples/squad")
-        # import tensorflow_datasets as tfds
-        # tfds_examples = tfds.load("squad")
-        # examples = SquadV1Processor().get_examples_from_dataset(tfds_examples["validation"])
+        if not args.data_dir:
+            try:
+                import tensorflow_datasets as tfds
+            except ImportError:
+                raise ImportError("If not data_dir is specified, tensorflow_datasets needs to be installed.")
+
+            if args.version_2_with_negative:
+                logger.warn("tensorflow_datasets does not handle version 2 of SQuAD.")
+
+            tfds_examples = tfds.load("squad")
+            examples = SquadV1Processor().get_examples_from_dataset(tfds_examples, evaluate=evaluate)
+        else:
+            processor = SquadV2Processor() if args.version_2_with_negative else SquadV1Processor()
+            examples = processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
 
         features = squad_convert_examples_to_features( 
             examples=examples,
@@ -328,7 +337,6 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal
             doc_stride=args.doc_stride,
             max_query_length=args.max_query_length,
             is_training=not evaluate,
-            sequence_a_is_doc=True if args.model_type in ['xlnet'] else False
         )
 
 
@@ -365,10 +373,6 @@ def main():
     parser = argparse.ArgumentParser()
 
     ## Required parameters
-    parser.add_argument("--train_file", default=None, type=str, required=True,
-                        help="SQuAD json for training. E.g., train-v1.1.json")
-    parser.add_argument("--predict_file", default=None, type=str, required=True,
-                        help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json")
     parser.add_argument("--model_type", default=None, type=str, required=True,
                         help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
     parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
@@ -377,6 +381,8 @@ def main():
                         help="The output directory where the model checkpoints and predictions will be written.")
 
     ## Other parameters
+    parser.add_argument("--data_dir", default=None, type=str,
+                        help="The input data dir. Should contain the .json files for the task. If not specified, will run with tensorflow_datasets.")
     parser.add_argument("--config_name", default="", type=str,
                         help="Pretrained config name or path if not the same as model_name")
     parser.add_argument("--tokenizer_name", default="", type=str,
diff --git a/examples/test_examples.py b/examples/test_examples.py
index b04d722b7b..632d2f728e 100644
--- a/examples/test_examples.py
+++ b/examples/test_examples.py
@@ -72,8 +72,7 @@ class ExamplesTests(unittest.TestCase):
         logger.addHandler(stream_handler)
 
         testargs = ["run_squad.py",
-                    "--train_file=./examples/tests_samples/SQUAD/dev-v2.0-small.json",
-                    "--predict_file=./examples/tests_samples/SQUAD/dev-v2.0-small.json",
+                    "--data_dir=./examples/tests_samples/SQUAD",
                     "--model_name=bert-base-uncased",
                     "--output_dir=./examples/tests_samples/temp_dir",
                     "--max_steps=10",
diff --git a/examples/tests_samples/SQUAD/dev-v2.0-small.json b/examples/tests_samples/SQUAD/dev-v2.0.json
similarity index 100%
rename from examples/tests_samples/SQUAD/dev-v2.0-small.json
rename to examples/tests_samples/SQUAD/dev-v2.0.json
diff --git a/examples/tests_samples/SQUAD/train-v2.0.json b/examples/tests_samples/SQUAD/train-v2.0.json
new file mode 100644
index 0000000000..834d9ee660
--- /dev/null
+++ b/examples/tests_samples/SQUAD/train-v2.0.json
@@ -0,0 +1,140 @@
+{
+    "version": "v2.0",
+    "data": [{
+        "title": "Normans",
+        "paragraphs": [{
+            "qas": [{
+                "question": "In what country is Normandy located?",
+                "id": "56ddde6b9a695914005b9628",
+                "answers": [{
+                    "text": "France",
+                    "answer_start": 159
+                }],
+                "is_impossible": false
+            }, {
+                "question": "When were the Normans in Normandy?",
+                "id": "56ddde6b9a695914005b9629",
+                "answers": [{
+                    "text": "10th and 11th centuries",
+                    "answer_start": 94
+                }],
+                "is_impossible": false
+            }, {
+                "question": "From which countries did the Norse originate?",
+                "id": "56ddde6b9a695914005b962a",
+                "answers": [{
+                    "text": "Denmark, Iceland and Norway",
+                    "answer_start": 256
+                }],
+                "is_impossible": false
+            }, {
+                "plausible_answers": [{
+                    "text": "Rollo",
+                    "answer_start": 308
+                }],
+                "question": "Who did King Charles III swear fealty to?",
+                "id": "5ad39d53604f3c001a3fe8d3",
+                "answers": [],
+                "is_impossible": true
+            }, {
+                "plausible_answers": [{
+                    "text": "10th century",
+                    "answer_start": 671
+                }],
+                "question": "When did the Frankish identity emerge?",
+                "id": "5ad39d53604f3c001a3fe8d4",
+                "answers": [],
+                "is_impossible": true
+            }],
+            "context": "The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse (\"Norman\" comes from \"Norseman\") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries."
+        }, {
+            "qas": [{
+                "question": "Who was the duke in the battle of Hastings?",
+                "id": "56dddf4066d3e219004dad5f",
+                "answers": [{
+                    "text": "William the Conqueror",
+                    "answer_start": 1022
+                }],
+                "is_impossible": false
+            }, {
+                "plausible_answers": [{
+                    "text": "Antioch",
+                    "answer_start": 1295
+                }],
+                "question": "What principality did William the conquerer found?",
+                "id": "5ad3a266604f3c001a3fea2b",
+                "answers": [],
+                "is_impossible": true
+            }],
+            "context": "The Norman dynasty had a major political, cultural and military impact on medieval Europe and even the Near East. The Normans were famed for their martial spirit and eventually for their Christian piety, becoming exponents of the Catholic orthodoxy into which they assimilated. They adopted the Gallo-Romance language of the Frankish land they settled, their dialect becoming known as Norman, Normaund or Norman French, an important literary language. The Duchy of Normandy, which they formed by treaty with the French crown, was a great fief of medieval France, and under Richard I of Normandy was forged into a cohesive and formidable principality in feudal tenure. The Normans are noted both for their culture, such as their unique Romanesque architecture and musical traditions, and for their significant military accomplishments and innovations. Norman adventurers founded the Kingdom of Sicily under Roger II after conquering southern Italy on the Saracens and Byzantines, and an expedition on behalf of their duke, William the Conqueror, led to the Norman conquest of England at the Battle of Hastings in 1066. Norman cultural and military influence spread from these new European centres to the Crusader states of the Near East, where their prince Bohemond I founded the Principality of Antioch in the Levant, to Scotland and Wales in Great Britain, to Ireland, and to the coasts of north Africa and the Canary Islands."
+        }]
+    }, {
+        "title": "Computational_complexity_theory",
+        "paragraphs": [{
+            "qas": [{
+                "question": "What branch of theoretical computer science deals with broadly classifying computational problems by difficulty and class of relationship?",
+                "id": "56e16182e3433e1400422e28",
+                "answers": [{
+                    "text": "Computational complexity theory",
+                    "answer_start": 0
+                }],
+                "is_impossible": false
+            }, {
+                "plausible_answers": [{
+                    "text": "algorithm",
+                    "answer_start": 472
+                }],
+                "question": "What is a manual application of mathematical steps?",
+                "id": "5ad5316b5b96ef001a10ab76",
+                "answers": [],
+                "is_impossible": true
+            }],
+            "context": "Computational complexity theory is a branch of the theory of computation in theoretical computer science that focuses on classifying computational problems according to their inherent difficulty, and relating those classes to each other. A computational problem is understood to be a task that is in principle amenable to being solved by a computer, which is equivalent to stating that the problem may be solved by mechanical application of mathematical steps, such as an algorithm."
+        }, {
+            "qas": [{
+                "question": "What measure of a computational problem broadly defines the inherent difficulty of the solution?",
+                "id": "56e16839cd28a01900c67887",
+                "answers": [{
+                    "text": "if its solution requires significant resources",
+                    "answer_start": 46
+                }],
+                "is_impossible": false
+            }, {
+                "question": "What method is used to intuitively assess or quantify the amount of resources required to solve a computational problem?",
+                "id": "56e16839cd28a01900c67888",
+                "answers": [{
+                    "text": "mathematical models of computation",
+                    "answer_start": 176
+                }],
+                "is_impossible": false
+            }, {
+                "question": "What are two basic primary resources used to guage complexity?",
+                "id": "56e16839cd28a01900c67889",
+                "answers": [{
+                    "text": "time and storage",
+                    "answer_start": 305
+                }],
+                "is_impossible": false
+            }, {
+                "plausible_answers": [{
+                    "text": "the number of gates in a circuit",
+                    "answer_start": 436
+                }],
+                "question": "What unit is measured to determine circuit simplicity?",
+                "id": "5ad532575b96ef001a10ab7f",
+                "answers": [],
+                "is_impossible": true
+            }, {
+                "plausible_answers": [{
+                    "text": "the number of processors",
+                    "answer_start": 502
+                }],
+                "question": "What number is used in perpendicular computing?",
+                "id": "5ad532575b96ef001a10ab80",
+                "answers": [],
+                "is_impossible": true
+            }],
+            "context": "A problem is regarded as inherently difficult if its solution requires significant resources, whatever the algorithm used. The theory formalizes this intuition, by introducing mathematical models of computation to study these problems and quantifying the amount of resources needed to solve them, such as time and storage. Other complexity measures are also used, such as the amount of communication (used in communication complexity), the number of gates in a circuit (used in circuit complexity) and the number of processors (used in parallel computing). One of the roles of computational complexity theory is to determine the practical limits on what computers can and cannot do."
+        }]
+    }]
+}
\ No newline at end of file
diff --git a/transformers/data/metrics/squad_metrics.py b/transformers/data/metrics/squad_metrics.py
index 1f120d354a..f8449df045 100644
--- a/transformers/data/metrics/squad_metrics.py
+++ b/transformers/data/metrics/squad_metrics.py
@@ -630,12 +630,12 @@ def compute_predictions_log_probs(
 
             for i in range(start_n_top):
                 for j in range(end_n_top):
-                    start_log_prob = result.start_top_log_probs[i]
+                    start_log_prob = result.start_logits[i]
                     start_index = result.start_top_index[i]
 
                     j_index = i * end_n_top + j
 
-                    end_log_prob = result.end_top_log_probs[j_index]
+                    end_log_prob = result.end_logits[j_index]
                     end_index = result.end_top_index[j_index]
 
                     # We could hypothetically create invalid predictions, e.g., predict
diff --git a/transformers/data/processors/squad.py b/transformers/data/processors/squad.py
index 6599c54330..dd2d9d25c0 100644
--- a/transformers/data/processors/squad.py
+++ b/transformers/data/processors/squad.py
@@ -146,7 +146,7 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
 
             token_to_orig_map = {}
             for i in range(paragraph_len):
-                index = len(truncated_query) + sequence_added_tokens + i if not sequence_a_is_doc else i 
+                index = len(truncated_query) + sequence_added_tokens + i if tokenizer.padding_side == "right" else i 
                 token_to_orig_map[index] = tok_to_orig_index[len(spans) * doc_stride + i]
 
             encoded_dict["paragraph_len"] = paragraph_len
@@ -166,7 +166,7 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
         for doc_span_index in range(len(spans)):
             for j in range(spans[doc_span_index]["paragraph_len"]):
                 is_max_context = _new_check_is_max_context(spans, doc_span_index, doc_span_index * doc_stride + j)
-                index = j if sequence_a_is_doc else spans[doc_span_index]["truncated_query_with_special_tokens_length"] + j
+                index = j if tokenizer.padding_side == "left" else spans[doc_span_index]["truncated_query_with_special_tokens_length"] + j
                 spans[doc_span_index]["token_is_max_context"][index] = is_max_context
 
         for span in spans:
@@ -179,7 +179,7 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
 
             p_mask = np.minimum(p_mask, 1)
 
-            if not sequence_a_is_doc:
+            if tokenizer.padding_side == "right":
                 # Limit positive values to one
                 p_mask = 1 - p_mask
 
@@ -207,7 +207,7 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
                     end_position = cls_index
                     span_is_impossible = True
                 else:
-                    if sequence_a_is_doc:
+                    if tokenizer.padding_side == "left":
                         doc_offset = 0
                     else:
                         doc_offset = len(truncated_query) + sequence_added_tokens
@@ -270,7 +270,29 @@ class SquadProcessor(DataProcessor):
         )
 
     def get_examples_from_dataset(self, dataset, evaluate=False):
-        """See base class."""
+        """
+        Creates a list of :class:`~transformers.data.processors.squad.SquadExample` using a TFDS dataset.
+
+        Args:
+            dataset: The tfds dataset loaded from `tensorflow_datasets.load("squad")`
+            evaluate: boolean specifying if in evaluation mode or in training mode
+
+        Returns:
+            List of SquadExample
+
+        Examples::
+
+            import tensorflow_datasets as tfds
+            dataset = tfds.load("squad")
+
+            training_examples = SquadV1Processor().get_examples_from_dataset(dataset, evaluate=False)
+            evaluation_examples = SquadV1Processor().get_examples_from_dataset(dataset, evaluate=True)
+        """
+
+        if evaluate:
+            dataset = dataset["validation"]
+        else:
+            dataset = dataset["train"]
 
         examples = []
         for tensor_dict in tqdm(dataset):
@@ -455,8 +477,8 @@ class SquadResult(object):
         end_logits: The logits corresponding to the end of the answer
     """
     def __init__(self, unique_id, start_logits, end_logits, start_top_index=None, end_top_index=None, cls_logits=None):
-        self.start_top_log_probs = start_logits
-        self.end_top_log_probs = end_logits
+        self.start_logits = start_logits
+        self.end_logits = end_logits
         self.unique_id = unique_id
         
         if start_top_index:

From 33508ae310f101a2534d3e97ea23fda93e25ef38 Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Wed, 4 Dec 2019 16:26:45 -0500
Subject: [PATCH 24/91] Remove `only_first`

---
 transformers/data/processors/squad.py | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/transformers/data/processors/squad.py b/transformers/data/processors/squad.py
index dd2d9d25c0..09a79db471 100644
--- a/transformers/data/processors/squad.py
+++ b/transformers/data/processors/squad.py
@@ -300,29 +300,29 @@ class SquadProcessor(DataProcessor):
 
         return examples
 
-    def get_train_examples(self, data_dir, only_first=None):
+    def get_train_examples(self, data_dir):
         """See base class."""
         if self.train_file is None:
             raise ValueError("SquadProcessor should be instantiated via SquadV1Processor or SquadV2Processor")
 
         with open(os.path.join(data_dir, self.train_file), "r", encoding='utf-8') as reader:
             input_data = json.load(reader)["data"]
-        return self._create_examples(input_data, "train", only_first)
+        return self._create_examples(input_data, "train")
 
-    def get_dev_examples(self, data_dir, only_first=None):
+    def get_dev_examples(self, data_dir):
         """See base class."""
         if self.dev_file is None:
             raise ValueError("SquadProcessor should be instantiated via SquadV1Processor or SquadV2Processor")
         
         with open(os.path.join(data_dir, self.dev_file), "r", encoding='utf-8') as reader:
             input_data = json.load(reader)["data"]
-        return self._create_examples(input_data, "dev", only_first)
+        return self._create_examples(input_data, "dev")
 
     def get_labels(self):
         """See base class."""
         return ["0", "1"]
 
-    def _create_examples(self, input_data, set_type, only_first=None):
+    def _create_examples(self, input_data, set_type):
         """Creates examples for the training and dev sets."""
         
         is_training = set_type == "train"
@@ -363,9 +363,6 @@ class SquadProcessor(DataProcessor):
                     )
 
                     examples.append(example)
-
-                    if only_first is not None and len(examples) > only_first:
-                        return examples
         return examples
 
 class SquadV1Processor(SquadProcessor):

From 7a03519975e4f0b6698bf1221c2263ed0f8d795c Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Wed, 4 Dec 2019 17:24:35 -0500
Subject: [PATCH 25/91] Documentation

---
 docs/source/main_classes/processors.rst |  79 +++++++++++++++++-
 transformers/data/processors/squad.py   | 104 ++++++++++++++++++++----
 2 files changed, 164 insertions(+), 19 deletions(-)

diff --git a/docs/source/main_classes/processors.rst b/docs/source/main_classes/processors.rst
index a85c126956..ce0eeb553a 100644
--- a/docs/source/main_classes/processors.rst
+++ b/docs/source/main_classes/processors.rst
@@ -55,4 +55,81 @@ Example usage
 ^^^^^^^^^^^^^^^^^^^^^^^^^
 
 An example using these processors is given in the
-`run_glue.py <https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_glue.py>`__ script.
\ No newline at end of file
+`run_glue.py <https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_glue.py>`__ script.
+
+
+
+SQuAD
+~~~~~~~~~~~~~~~~~~~~~
+
+`The Stanford Question Answering Dataset (SQuAD) <https://rajpurkar.github.io/SQuAD-explorer/>`__ is a benchmark that evaluates
+the performance of models on question answering. Two versions are available, v1.1 and v2.0. The first version (v1.1) was released together with the paper
+`SQuAD: 100,000+ Questions for Machine Comprehension of Text <https://arxiv.org/abs/1606.05250>`__. The second version (v2.0) was released alongside 
+the paper `Know What You Don't Know: Unanswerable Questions for SQuAD <https://arxiv.org/abs/1806.03822>`__.
+
+This library hosts a processor for each of the two versions:
+
+Processors
+^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Those processors are:
+    - :class:`~transformers.data.processors.squad.SquadV1Processor`
+    - :class:`~transformers.data.processors.squad.SquadV2Processor`
+
+They both inherit from the abstract class :class:`~transformers.data.processors.squad.SquadProcessor`
+
+.. autoclass:: transformers.data.processors.squad.SquadProcessor
+    :members:
+
+Additionally, the following method can be used to convert SQuAD examples into :class:`~transformers.data.processors.squad.SquadFeatures`
+that can be used as model inputs.
+
+.. autofunction:: transformers.data.processors.squad.squad_convert_examples_to_features
+
+These processors as well as the aforementioned method can be used with files containing the data as well as with the `tensorflow_datasets` package.
+Examples are given below.
+
+Example usage
+^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Here is an example using the processors as well as the conversion method using data files:
+
+Example::
+
+    # Loading a V2 processor
+    processor = SquadV2Processor()
+    examples = processor.get_dev_examples(squad_v2_data_dir)
+
+    # Loading a V1 processor
+    processor = SquadV1Processor()
+    examples = processor.get_dev_examples(squad_v1_data_dir)
+
+    features = squad_convert_examples_to_features( 
+        examples=examples,
+        tokenizer=tokenizer,
+        max_seq_length=max_seq_length,
+        doc_stride=args.doc_stride,
+        max_query_length=max_query_length,
+        is_training=not evaluate,
+    )
+
+Using `tensorflow_datasets` is as easy as using a data file:
+
+Example::
+
+    # tensorflow_datasets only handles SQuAD v1.1.
+    tfds_examples = tfds.load("squad")
+    examples = SquadV1Processor().get_examples_from_dataset(tfds_examples, evaluate=evaluate)
+
+    features = squad_convert_examples_to_features( 
+        examples=examples,
+        tokenizer=tokenizer,
+        max_seq_length=max_seq_length,
+        doc_stride=args.doc_stride,
+        max_query_length=max_query_length,
+        is_training=not evaluate,
+    )
+
+
+Another example using these processors is given in the
+`run_squad.py <https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_squad.py>`__ script.
\ No newline at end of file
diff --git a/transformers/data/processors/squad.py b/transformers/data/processors/squad.py
index 09a79db471..b17e626c98 100644
--- a/transformers/data/processors/squad.py
+++ b/transformers/data/processors/squad.py
@@ -74,7 +74,35 @@ def _is_whitespace(c):
 
 def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
                                        doc_stride, max_query_length, is_training):
-    """Loads a data file into a list of `InputBatch`s."""
+    """
+    Converts a list of examples into a list of features that can be directly given as input to a model.
+    It is model-dependent and takes advantage of many of the tokenizer's features to create the model's inputs.
+
+    Args:
+        examples: list of :class:`~transformers.data.processors.squad.SquadExample`
+        tokenizer: an instance of a child of :class:`~transformers.PreTrainedTokenizer`
+        max_seq_length: The maximum sequence length of the inputs.
+        doc_stride: The stride used when the context is too large and is split across several features.
+        max_query_length: The maximum length of the query.
+        is_training: whether to create features for model evaluation or model training.
+
+    Returns:
+        list of :class:`~transformers.data.processors.squad.SquadFeatures`
+
+    Example::
+
+        processor = SquadV2Processor()
+        examples = processor.get_dev_examples(data_dir)
+
+        features = squad_convert_examples_to_features( 
+            examples=examples,
+            tokenizer=tokenizer,
+            max_seq_length=args.max_seq_length,
+            doc_stride=args.doc_stride,
+            max_query_length=args.max_query_length,
+            is_training=not evaluate,
+        )
+    """
 
     # Defining helper methods    
     unique_id = 1000000000
@@ -240,12 +268,14 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
 
 
 class SquadProcessor(DataProcessor):
-    """Processor for the SQuAD data set."""
+    """
+    Processor for the SQuAD data set.
+    Overridden by SquadV1Processor and SquadV2Processor, used by the version 1.1 and version 2.0 of SQuAD, respectively.
+    """
     train_file = None
     dev_file = None
 
-    def get_example_from_tensor_dict(self, tensor_dict, evaluate=False):
-
+    def _get_example_from_tensor_dict(self, tensor_dict, evaluate=False):
         if not evaluate:
             answer = tensor_dict['answers']['text'][0].numpy().decode('utf-8')
             answer_start = tensor_dict['answers']['answer_start'][0].numpy()
@@ -296,35 +326,44 @@ class SquadProcessor(DataProcessor):
 
         examples = []
         for tensor_dict in tqdm(dataset):
-            examples.append(self.get_example_from_tensor_dict(tensor_dict, evaluate=evaluate)) 
+            examples.append(self._get_example_from_tensor_dict(tensor_dict, evaluate=evaluate)) 
 
         return examples
 
-    def get_train_examples(self, data_dir):
-        """See base class."""
+    def get_train_examples(self, data_dir, filename=None):
+        """
+        Returns the training examples from the data directory.
+
+        Args:
+            data_dir: Directory containing the data files used for training and evaluating.
+            filename: None by default, specify this if the training file has a different name than the original one
+                which is `train-v1.1.json` and `train-v2.0.json` for squad versions 1.1 and 2.0 respectively.
+
+        """
         if self.train_file is None:
             raise ValueError("SquadProcessor should be instantiated via SquadV1Processor or SquadV2Processor")
 
-        with open(os.path.join(data_dir, self.train_file), "r", encoding='utf-8') as reader:
+        with open(os.path.join(data_dir, self.train_file if filename is None else filename), "r", encoding='utf-8') as reader:
             input_data = json.load(reader)["data"]
         return self._create_examples(input_data, "train")
 
-    def get_dev_examples(self, data_dir):
-        """See base class."""
+    def get_dev_examples(self, data_dir, filename=None):
+        """
+        Returns the evaluation examples from the data directory.
+
+        Args:
+            data_dir: Directory containing the data files used for training and evaluating.
+            filename: None by default, specify this if the evaluation file has a different name than the original one
+                which is `dev-v1.1.json` and `dev-v2.0.json` for squad versions 1.1 and 2.0 respectively.
+        """
         if self.dev_file is None:
             raise ValueError("SquadProcessor should be instantiated via SquadV1Processor or SquadV2Processor")
         
-        with open(os.path.join(data_dir, self.dev_file), "r", encoding='utf-8') as reader:
+        with open(os.path.join(data_dir, self.dev_file if filename is None else filename), "r", encoding='utf-8') as reader:
             input_data = json.load(reader)["data"]
         return self._create_examples(input_data, "dev")
 
-    def get_labels(self):
-        """See base class."""
-        return ["0", "1"]
-
     def _create_examples(self, input_data, set_type):
-        """Creates examples for the training and dev sets."""
-        
         is_training = set_type == "train"
         examples = []
         for entry in tqdm(input_data):
@@ -378,6 +417,16 @@ class SquadV2Processor(SquadProcessor):
 class SquadExample(object):
     """
     A single training/test example for the Squad dataset, as loaded from disk.
+
+    Args:
+        qas_id: The example's unique identifier
+        question_text: The question string
+        context_text: The context string
+        answer_text: The answer string
+        start_position_character: The character position of the start of the answer
+        title: The title of the example
+        answers: None by default, this is used during evaluation. Holds answers as well as their start positions.
+        is_impossible: False by default, set to True if the example has no possible answer.
     """
 
     def __init__(self,
@@ -427,7 +476,26 @@ class SquadExample(object):
 class SquadFeatures(object):
     """
     Single squad example features to be fed to a model.
-    Those features are model-specific.
+    Those features are model-specific and can be crafted from :class:`~transformers.data.processors.squad.SquadExample`
+    using the :func:`~transformers.data.processors.squad.squad_convert_examples_to_features` function.
+
+    Args:
+        input_ids: Indices of input sequence tokens in the vocabulary.
+        attention_mask: Mask to avoid performing attention on padding token indices.
+        token_type_ids: Segment token indices to indicate first and second portions of the inputs.
+        cls_index: the index of the CLS token.
+        p_mask: Mask identifying tokens that can be answers vs. tokens that cannot.
+            Mask with 1 for tokens that cannot be in the answer and 0 for tokens that can be in an answer
+        example_index: the index of the example
+        unique_id: The unique Feature identifier
+        paragraph_len: The length of the context
+        token_is_max_context: List of booleans identifying which tokens have their maximum context in this feature object.
+            If a token does not have their maximum context in this feature object, it means that another feature object
+            has more information related to that token and should be prioritized over this feature for that token.
+        tokens: list of tokens corresponding to the input ids
+        token_to_orig_map: mapping between the tokens and the original text, needed in order to identify the answer.
+        start_position: start of the answer token index 
+        end_position: end of the answer token index 
     """
 
     def __init__(self,

From ce158a076f7089bf11d44e1581f5bcab4dcc5396 Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Wed, 4 Dec 2019 17:55:52 -0500
Subject: [PATCH 26/91] Return dataset (pytorch)

---
 transformers/data/processors/squad.py | 41 ++++++++++++++++++++++++---
 1 file changed, 37 insertions(+), 4 deletions(-)

diff --git a/transformers/data/processors/squad.py b/transformers/data/processors/squad.py
index b17e626c98..338bae0c51 100644
--- a/transformers/data/processors/squad.py
+++ b/transformers/data/processors/squad.py
@@ -7,7 +7,11 @@ import numpy as np
 
 from ...tokenization_bert import BasicTokenizer, whitespace_tokenize
 from .utils import DataProcessor, InputExample, InputFeatures
-from ...file_utils import is_tf_available
+from ...file_utils import is_tf_available, is_torch_available
+
+if is_torch_available:
+    import torch
+    from torch.utils.data import TensorDataset
 
 if is_tf_available():
     import tensorflow as tf
@@ -73,7 +77,8 @@ def _is_whitespace(c):
     return False
 
 def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
-                                       doc_stride, max_query_length, is_training):
+                                       doc_stride, max_query_length, is_training, 
+                                       return_dataset=False):
     """
     Converts a list of examples into a list of features that can be directly given as input to a model.
     It is model-dependant and takes advantage of many of the tokenizer's features to create the model's inputs.
@@ -84,7 +89,10 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
         max_seq_length: The maximum sequence length of the inputs.
         doc_stride: The stride used when the context is too large and is split across several features.
         max_query_length: The maximum length of the query.
-        is_training: wheter to create features for model evaluation or model training.
+        is_training: whether to create features for model evaluation or model training.
+        return_dataset: Default False. Either 'pt' or 'tf'.
+            if 'pt': returns the features together with a torch.utils.data.TensorDataset,
+            if 'tf': returns a tf.data.Dataset
 
     Returns:
         list of :class:`~transformers.data.processors.squad.SquadFeatures`
@@ -264,6 +272,31 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
 
             unique_id += 1
 
+    if return_dataset == 'pt':
+        if not is_torch_available():
+            raise ImportError("Pytorch must be installed to return a pytorch dataset.")
+
+        # Convert to Tensors and build dataset
+        all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
+        all_input_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
+        all_segment_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
+        all_cls_index = torch.tensor([f.cls_index for f in features], dtype=torch.long)
+        all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float)
+
+        if not is_training:
+            all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
+            dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
+                                    all_example_index, all_cls_index, all_p_mask)
+        else:
+            all_start_positions = torch.tensor([f.start_position for f in features], dtype=torch.long)
+            all_end_positions = torch.tensor([f.end_position for f in features], dtype=torch.long)
+            dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
+                                    all_start_positions, all_end_positions,
+                                    all_cls_index, all_p_mask)
+
+        return features, dataset
+        
+
     return features
 
 
@@ -359,7 +392,7 @@ class SquadProcessor(DataProcessor):
         if self.dev_file is None:
             raise ValueError("SquadProcessor should be instantiated via SquadV1Processor or SquadV2Processor")
         
-        with open(os.path.join(data_dir, self.dev_file if filename is not None else filename), "r", encoding='utf-8') as reader:
+        with open(os.path.join(data_dir, self.dev_file if filename is None else filename), "r", encoding='utf-8') as reader:
             input_data = json.load(reader)["data"]
         return self._create_examples(input_data, "dev")
 

From 9200a759d782a87530765fb32f52b6248c7f4d03 Mon Sep 17 00:00:00 2001
From: Julien Plu <julien.plu@schibsted.com>
Date: Thu, 5 Dec 2019 12:56:43 +0100
Subject: [PATCH 27/91] Add few tests on the TF optimization file with some
 info in the documentation. Complete the README.

---
 .../main_classes/optimizer_schedules.rst      | 24 +++++
 examples/README.md                            | 77 +++++++++++++++-
 examples/run_tf_ner.py                        |  7 +-
 transformers/tests/optimization_tf_test.py    | 89 +++++++++++++++++++
 4 files changed, 191 insertions(+), 6 deletions(-)
 create mode 100644 transformers/tests/optimization_tf_test.py

diff --git a/docs/source/main_classes/optimizer_schedules.rst b/docs/source/main_classes/optimizer_schedules.rst
index b30a2e0e2e..22ed1b28fb 100644
--- a/docs/source/main_classes/optimizer_schedules.rst
+++ b/docs/source/main_classes/optimizer_schedules.rst
@@ -5,6 +5,7 @@ The ``.optimization`` module provides:
 
 - an optimizer with weight decay fixed that can be used to fine-tuned models, and
 - several schedules in the form of schedule objects that inherit from ``_LRSchedule``:
+- a gradient accumulation class to accumulate the gradients of multiple batches
 
 ``AdamW``
 ~~~~~~~~~~~~~~~~
@@ -12,6 +13,15 @@ The ``.optimization`` module provides:
 .. autoclass:: transformers.AdamW
     :members:
 
+``AdamWeightDecay``
+~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.AdamWeightDecay
+    :members:
+
+.. autofunction:: transformers.create_optimizer
+    :members:
+
 Schedules
 ----------------------------------------------------
 
@@ -49,3 +59,17 @@ Learning Rate Schedules
 .. image:: /imgs/warmup_linear_schedule.png
     :target: /imgs/warmup_linear_schedule.png
     :alt:
+
+``Warmup``
+~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.Warmup
+    :members:
+
+Gradient Strategies
+----------------------------------------------------
+
+``GradientAccumulator``
+~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.GradientAccumulator
diff --git a/examples/README.md b/examples/README.md
index 960b218f11..2dd6653916 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -465,7 +465,8 @@ Training with the previously defined hyper-parameters yields the following resul
 
 ## Named Entity Recognition
 
-Based on the script [`run_ner.py`](https://github.com/huggingface/transformers/blob/master/examples/run_ner.py).
+Based on the scripts [`run_ner.py`](https://github.com/huggingface/transformers/blob/master/examples/run_ner.py) for Pytorch and
+[`run_tf_ner.py`](https://github.com/huggingface/transformers/blob/master/examples/run_tf_ner.py) for Tensorflow 2.
 This example fine-tune Bert Multilingual on GermEval 2014 (German NER).
 Details and results for the fine-tuning provided by @stefan-it.
 
@@ -510,7 +511,7 @@ The GermEval 2014 dataset has much more labels than CoNLL-2002/2003 datasets, so
 cat train.txt dev.txt test.txt | cut -d " " -f 2 | grep -v "^$"| sort | uniq > labels.txt
 ```
 
-### Training
+### Prepare the run
 
 Additional environment variables must be set:
 
@@ -522,6 +523,8 @@ export SAVE_STEPS=750
 export SEED=1
 ```
 
+### Run the Pytorch version
+
 To start training, just run:
 
 ```bash
@@ -542,7 +545,7 @@ python3 run_ner.py --data_dir ./ \
 
 If your GPU supports half-precision training, just add the `--fp16` flag. After training, the model will be both evaluated on development and test datasets.
 
-### Evaluation
+#### Evaluation
 
 Evaluation on development dataset outputs the following for our example:
 
@@ -564,7 +567,7 @@ On the test dataset the following results could be achieved:
 10/04/2019 00:42:42 - INFO - __main__ -     recall = 0.8624150210424085
 ```
 
-### Comparing BERT (large, cased), RoBERTa (large, cased) and DistilBERT (base, uncased)
+#### Comparing BERT (large, cased), RoBERTa (large, cased) and DistilBERT (base, uncased)
 
 Here is a small comparison between BERT (large, cased), RoBERTa (large, cased) and DistilBERT (base, uncased) with the same hyperparameters as specified in the [example documentation](https://huggingface.co/transformers/examples.html#named-entity-recognition) (one run):
 
@@ -574,6 +577,72 @@ Here is a small comparison between BERT (large, cased), RoBERTa (large, cased) a
 | `roberta-large`                  | 95.96 | 91.87
 | `distilbert-base-uncased` | 94.34 | 90.32
 
+### Run the Tensorflow 2 version
+
+To start training, just run:
+
+```bash
+python3 run_tf_ner.py --data_dir ./ \
+--model_type bert \
+--labels ./labels.txt \
+--model_name_or_path $BERT_MODEL \
+--output_dir $OUTPUT_DIR \
+--max_seq_length  $MAX_LENGTH \
+--num_train_epochs $NUM_EPOCHS \
+--per_device_train_batch_size $BATCH_SIZE \
+--save_steps $SAVE_STEPS \
+--seed $SEED \
+--do_train \
+--do_eval \
+--do_predict
+```
+
+As with the Pytorch version, if your GPU supports half-precision training, just add the `--fp16` flag. After training, the model will be both evaluated on development and test datasets.
+
+#### Evaluation
+
+Evaluation on development dataset outputs the following for our example:
+```bash
+           precision    recall  f1-score   support
+
+ LOCderiv     0.7619    0.6154    0.6809        52
+  PERpart     0.8724    0.8997    0.8858      4057
+  OTHpart     0.9360    0.9466    0.9413       711
+  ORGpart     0.7015    0.6989    0.7002       269
+  LOCpart     0.7668    0.8488    0.8057       496
+      LOC     0.8745    0.9191    0.8963       235
+ ORGderiv     0.7723    0.8571    0.8125        91
+ OTHderiv     0.4800    0.6667    0.5581        18
+      OTH     0.5789    0.6875    0.6286        16
+ PERderiv     0.5385    0.3889    0.4516        18
+      PER     0.5000    0.5000    0.5000         2
+      ORG     0.0000    0.0000    0.0000         3
+
+micro avg     0.8574    0.8862    0.8715      5968
+macro avg     0.8575    0.8862    0.8713      5968
+```
+
+On the test dataset the following results could be achieved:
+```bash
+           precision    recall  f1-score   support
+
+  PERpart     0.8847    0.8944    0.8896      9397
+  OTHpart     0.9376    0.9353    0.9365      1639
+  ORGpart     0.7307    0.7044    0.7173       697
+      LOC     0.9133    0.9394    0.9262       561
+  LOCpart     0.8058    0.8157    0.8107      1150
+      ORG     0.0000    0.0000    0.0000         8
+ OTHderiv     0.5882    0.4762    0.5263        42
+ PERderiv     0.6571    0.5227    0.5823        44
+      OTH     0.4906    0.6667    0.5652        39
+ ORGderiv     0.7016    0.7791    0.7383       172
+ LOCderiv     0.8256    0.6514    0.7282       109
+      PER     0.0000    0.0000    0.0000        11
+
+micro avg     0.8722    0.8774    0.8748     13869
+macro avg     0.8712    0.8774    0.8740     13869
+```
+
 ## Abstractive summarization
 
 Based on the script
diff --git a/examples/run_tf_ner.py b/examples/run_tf_ner.py
index ef1fcf6aa4..eb284f4c2a 100644
--- a/examples/run_tf_ner.py
+++ b/examples/run_tf_ner.py
@@ -540,6 +540,9 @@ def main(_):
             checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args['output_dir'] + "/**/" + TF2_WEIGHTS_NAME, recursive=True), key=lambda f: int(''.join(filter(str.isdigit, f)) or -1)))
         
         logging.info("Evaluate the following checkpoints: %s", checkpoints)
+
+        if len(checkpoints) == 0:
+            checkpoints.append(args['output_dir'])
         
         for checkpoint in checkpoints:
             global_step = checkpoint.split("-")[-1] if re.match(".*checkpoint-[0-9]", checkpoint) else "final"
@@ -572,10 +575,10 @@ def main(_):
     if args['do_predict']:
         tokenizer = tokenizer_class.from_pretrained(args['output_dir'], do_lower_case=args['do_lower_case'])
         model = model_class.from_pretrained(args['output_dir'])
-        eval_batch_size = args['per_gpu_eval_batch_size'] * args['n_device']
+        eval_batch_size = args['per_device_eval_batch_size'] * args['n_device']
         predict_dataset, _ = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, eval_batch_size, mode="test")
         y_true, y_pred, pred_loss = evaluate(args, strategy, model, tokenizer, labels, pad_token_label_id, mode="test")
-        output_test_results_file = os.path.join(args.output_dir, "test_results.txt")
+        output_test_results_file = os.path.join(args['output_dir'], "test_results.txt")
         output_test_predictions_file = os.path.join(args['output_dir'], "test_predictions.txt")
         report = metrics.classification_report(y_true, y_pred, digits=4)
 
diff --git a/transformers/tests/optimization_tf_test.py b/transformers/tests/optimization_tf_test.py
new file mode 100644
index 0000000000..ac5109cb56
--- /dev/null
+++ b/transformers/tests/optimization_tf_test.py
@@ -0,0 +1,89 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import unittest
+import pytest
+
+from transformers import is_tf_available
+
+if is_tf_available():
+    import tensorflow as tf
+    from tensorflow.python.eager import context
+    from tensorflow.python.framework import ops
+    from transformers import (create_optimizer, GradientAccumulator)
+else:
+    pytestmark = pytest.mark.skip("Require TensorFlow")
+
+class OptimizationFTest(unittest.TestCase):
+    def assertListAlmostEqual(self, list1, list2, tol):
+        self.assertEqual(len(list1), len(list2))
+        for a, b in zip(list1, list2):
+            self.assertAlmostEqual(a, b, delta=tol)
+    
+    def testGradientAccumulator(self):
+        accumulator = GradientAccumulator()
+        accumulator([tf.constant([1.0, 2.0])])
+        accumulator([tf.constant([-2.0, 1.0])])
+        accumulator([tf.constant([-1.0, 2.0])])
+        with self.assertRaises(ValueError):
+            accumulator([tf.constant([1.0, 1.0]), tf.constant([2.0, 2.0])])
+        self.assertEqual(accumulator.step, 3)
+        self.assertEqual(len(accumulator.gradients), 1)
+        self.assertListAlmostEqual(accumulator.gradients[0].numpy().tolist(), [-2.0, 5.0], tol=1e-2)
+        accumulator.reset()
+        self.assertEqual(accumulator.step, 0)
+        self.assertListAlmostEqual(accumulator.gradients[0].numpy().tolist(), [0.0, 0.0], tol=1e-2)
+
+    def testGradientAccumulatorDistributionStrategy(self):
+        context._context = None
+        ops.enable_eager_execution_internal()
+        physical_devices = tf.config.experimental.list_physical_devices("CPU")
+        tf.config.experimental.set_virtual_device_configuration(
+            physical_devices[0],
+            [tf.config.experimental.VirtualDeviceConfiguration(),
+            tf.config.experimental.VirtualDeviceConfiguration()])
+
+        devices = tf.config.experimental.list_logical_devices(device_type="CPU")
+        strategy = tf.distribute.MirroredStrategy(devices=[device.name for device in devices])
+
+        with strategy.scope():
+            accumulator = GradientAccumulator()
+            variable = tf.Variable([4.0, 3.0])
+            optimizer = create_optimizer(5e-5, 10, 5)
+            gradient_placeholder = tf.Variable([0.0, 0.0], trainable=False)
+
+        def accumulate_on_replica(gradient):
+            accumulator([gradient])
+
+        def apply_on_replica():
+            optimizer.apply_gradients(list(zip(accumulator.gradients, [variable])), 1.0)
+
+        @tf.function
+        def accumulate(grad1, grad2):
+            with strategy.scope():
+                gradient_placeholder.values[0].assign(grad1)
+                gradient_placeholder.values[1].assign(grad2)
+                strategy.experimental_run_v2(accumulate_on_replica, args=(gradient_placeholder,))
+
+        @tf.function
+        def apply_grad():
+            with strategy.scope():
+                strategy.experimental_run_v2(apply_on_replica)
+
+        accumulate([1.0, 2.0], [-1.0, 1.0])
+        accumulate([3.0, -1.0], [-1.0, -1.0])
+        accumulate([-2.0, 2.0], [3.0, -2.0])
+        self.assertEqual(accumulator.step, 3)
+        self.assertListAlmostEqual(accumulator._gradients[0].values[0].value().numpy().tolist(), [2.0, 3.0], tol=1e-2)
+        self.assertListAlmostEqual(accumulator._gradients[0].values[1].value().numpy().tolist(), [1.0, -2.0], tol=1e-2)
+        apply_grad()
+        self.assertListAlmostEqual(variable.value().numpy().tolist(), [4.0, 3.0], tol=1e-2)
+        accumulator.reset()
+        self.assertEqual(accumulator.step, 0)
+        self.assertListAlmostEqual(accumulator._gradients[0].values[0].value().numpy().tolist(), [0.0, 0.0], tol=1e-2)
+        self.assertListAlmostEqual(accumulator._gradients[0].values[1].value().numpy().tolist(), [0.0, 0.0], tol=1e-2)
+
+
+if __name__ == "__main__":
+    unittest.main()
\ No newline at end of file

From 18fb93530ba0c1f6a45240270b24dc5c5da340ae Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Thu, 5 Dec 2019 14:36:34 +0100
Subject: [PATCH 28/91] fixing #2042 - Nicer error message

---
 transformers/modeling_bert.py | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/transformers/modeling_bert.py b/transformers/modeling_bert.py
index 5f92fb96a3..1ee3e3f097 100644
--- a/transformers/modeling_bert.py
+++ b/transformers/modeling_bert.py
@@ -667,11 +667,10 @@ class BertModel(BertPreTrainedModel):
         # ourselves in which case we just need to make it broadcastable to all heads.
         if attention_mask.dim() == 3:
             extended_attention_mask = attention_mask[:, None, :, :]
-
-        # Provided a padding mask of dimensions [batch_size, seq_length]
-        # - if the model is a decoder, apply a causal mask in addition to the padding mask
-        # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length]
-        if attention_mask.dim() == 2:
+        elif attention_mask.dim() == 2:
+            # Provided a padding mask of dimensions [batch_size, seq_length]
+            # - if the model is a decoder, apply a causal mask in addition to the padding mask
+            # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length]
             if self.config.is_decoder:
                 batch_size, seq_length = input_shape
                 seq_ids = torch.arange(seq_length, device=device)
@@ -679,6 +678,8 @@ class BertModel(BertPreTrainedModel):
                 extended_attention_mask = causal_mask[:, None, :, :] * attention_mask[:, None, None, :]
             else:
                 extended_attention_mask = attention_mask[:, None, None, :]
+        else:
+            raise ValueError("Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format(input_shape, attention_mask.shape))
 
         # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
         # masked positions, this operation will create a tensor which is 0.0 for
@@ -696,8 +697,11 @@ class BertModel(BertPreTrainedModel):
 
             if encoder_attention_mask.dim() == 3:
                 encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :]
-            if encoder_attention_mask.dim() == 2:
+            elif encoder_attention_mask.dim() == 2:
                 encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :]
+            else:
+                raise ValueError("Wrong shape for input_ids (shape {}) or encoder_attention_mask (shape {})".format(input_shape,
+                                                                                                                    encoder_attention_mask.shape))
 
             encoder_extended_attention_mask = encoder_extended_attention_mask.to(dtype=next(self.parameters()).dtype)  # fp16 compatibility
             encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -10000.0

From f8fb4335c9cd79789ed6119e729348e0a1b51e2b Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Thu, 5 Dec 2019 15:19:32 +0100
Subject: [PATCH 29/91] clean up a little bit PT <=> TF conversion

---
 transformers/convert_pytorch_checkpoint_to_tf2.py | 9 +++++----
 transformers/modeling_utils.py                    | 3 ++-
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/transformers/convert_pytorch_checkpoint_to_tf2.py b/transformers/convert_pytorch_checkpoint_to_tf2.py
index d1776e9c14..d20eafe2e9 100644
--- a/transformers/convert_pytorch_checkpoint_to_tf2.py
+++ b/transformers/convert_pytorch_checkpoint_to_tf2.py
@@ -119,10 +119,11 @@ def convert_pt_checkpoint_to_tf(model_type, pytorch_checkpoint_path, config_file
         tf_inputs = tf.constant(inputs_list)
         tfo = tf_model(tf_inputs, training=False)  # build the network
 
-        pt_model = pt_model_class.from_pretrained(None,
-                                                  config=config,
-                                                  state_dict=torch.load(pytorch_checkpoint_path,
-                                                                        map_location='cpu'))
+        pt_model = pt_model_class(config)
+        pt_model.load_state_dict(torch.load(pytorch_checkpoint_path, map_location='cpu'),
+                                 strict-False)
+        pt_model.eval()
+
         pt_inputs = torch.tensor(inputs_list)
         with torch.no_grad():
             pto = pt_model(pt_inputs)
diff --git a/transformers/modeling_utils.py b/transformers/modeling_utils.py
index 398172a88c..3ac568771e 100644
--- a/transformers/modeling_utils.py
+++ b/transformers/modeling_utils.py
@@ -318,7 +318,8 @@ class PreTrainedModel(nn.Module):
             model = BertModel.from_pretrained('./tf_model/my_tf_checkpoint.ckpt.index', from_tf=True, config=config)
 
         """
-        if "albert" in pretrained_model_name_or_path and "v2" in pretrained_model_name_or_path:
+        if pretrained_model_name_or_path is not None and (
+                "albert" in pretrained_model_name_or_path and "v2" in pretrained_model_name_or_path):
             logger.warning("There is currently an upstream reproducibility issue with ALBERT v2 models. Please see " +
                            "https://github.com/google-research/google-research/issues/119 for more information.")
 

From 9ecd83dace3961eaa161405814b00ea595c86451 Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Thu, 5 Dec 2019 14:44:57 -0500
Subject: [PATCH 30/91] Patch evaluation for impossible values + cleanup

---
 docs/source/main_classes/processors.rst |  4 ++--
 examples/run_squad.py                   | 25 +++++--------------------
 transformers/data/processors/squad.py   |  6 +++---
 transformers/tokenization_utils.py      |  2 +-
 4 files changed, 11 insertions(+), 26 deletions(-)

diff --git a/docs/source/main_classes/processors.rst b/docs/source/main_classes/processors.rst
index ce0eeb553a..e98910ae1b 100644
--- a/docs/source/main_classes/processors.rst
+++ b/docs/source/main_classes/processors.rst
@@ -55,7 +55,7 @@ Example usage
 ^^^^^^^^^^^^^^^^^^^^^^^^^
 
 An example using these processors is given in the
-`run_glue.py <https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_glue.py>`__ script.
+`run_glue.py <https://github.com/huggingface/transformers/blob/master/examples/run_glue.py>`__ script.
 
 
 
@@ -132,4 +132,4 @@ Example::
 
 
 Another example using these processors is given in the
-`run_squad.py <https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_squad.py>`__ script.
\ No newline at end of file
+`run_squad.py <https://github.com/huggingface/transformers/blob/master/examples/run_squad.py>`__ script.
\ No newline at end of file
diff --git a/examples/run_squad.py b/examples/run_squad.py
index 3f1b6a798f..5caff9ae4f 100644
--- a/examples/run_squad.py
+++ b/examples/run_squad.py
@@ -311,7 +311,8 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal
         str(args.max_seq_length)))
     if os.path.exists(cached_features_file) and not args.overwrite_cache and not output_examples:
         logger.info("Loading features from cached file %s", cached_features_file)
-        features = torch.load(cached_features_file)
+        features_and_dataset = torch.load(cached_features_file)
+        features, dataset = features_and_dataset["features"], features_and_dataset["dataset"]
     else:
         logger.info("Creating features from dataset file at %s", input_dir)
 
@@ -330,40 +331,24 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal
             processor = SquadV2Processor() if args.version_2_with_negative else SquadV1Processor()
             examples = processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
 
-        features = squad_convert_examples_to_features( 
+        features, dataset = squad_convert_examples_to_features( 
             examples=examples,
             tokenizer=tokenizer,
             max_seq_length=args.max_seq_length,
             doc_stride=args.doc_stride,
             max_query_length=args.max_query_length,
             is_training=not evaluate,
+            return_dataset='pt'
         )
 
 
         if args.local_rank in [-1, 0]:
             logger.info("Saving features into cached file %s", cached_features_file)
-            torch.save(features, cached_features_file)
+            torch.save({"features": features, "dataset": dataset}, cached_features_file)
 
     if args.local_rank == 0 and not evaluate:
         torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache
 
-    # Convert to Tensors and build dataset
-    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
-    all_input_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
-    all_segment_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
-    all_cls_index = torch.tensor([f.cls_index for f in features], dtype=torch.long)
-    all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float)
-    if evaluate:
-        all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
-        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
-                                all_example_index, all_cls_index, all_p_mask)
-    else:
-        all_start_positions = torch.tensor([f.start_position for f in features], dtype=torch.long)
-        all_end_positions = torch.tensor([f.end_position for f in features], dtype=torch.long)
-        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
-                                all_start_positions, all_end_positions,
-                                all_cls_index, all_p_mask)
-
     if output_examples:
         return dataset, examples, features
     return dataset
diff --git a/transformers/data/processors/squad.py b/transformers/data/processors/squad.py
index 338bae0c51..bb56aa792f 100644
--- a/transformers/data/processors/squad.py
+++ b/transformers/data/processors/squad.py
@@ -312,7 +312,7 @@ class SquadProcessor(DataProcessor):
         if not evaluate:
             answer = tensor_dict['answers']['text'][0].numpy().decode('utf-8')
             answer_start = tensor_dict['answers']['answer_start'][0].numpy()
-            answers = None
+            answers = []
         else:
             answers = [{
                 "answer_start": start.numpy(), 
@@ -408,7 +408,7 @@ class SquadProcessor(DataProcessor):
                     question_text = qa["question"]
                     start_position_character = None
                     answer_text = None
-                    answers = None
+                    answers = []
                     
                     if "is_impossible" in qa:
                         is_impossible = qa["is_impossible"]
@@ -469,7 +469,7 @@ class SquadExample(object):
                  answer_text,
                  start_position_character,
                  title,
-                 answers=None,
+                 answers=[],
                  is_impossible=False):
         self.qas_id = qas_id
         self.question_text = question_text
diff --git a/transformers/tokenization_utils.py b/transformers/tokenization_utils.py
index 41a611ea49..5ec173bbf6 100644
--- a/transformers/tokenization_utils.py
+++ b/transformers/tokenization_utils.py
@@ -194,7 +194,7 @@ class PreTrainedTokenizer(object):
 
     @property
     def pad_token_type_id(self):
-        """ Id of the padding token in the vocabulary. Log an error if used while not having been set. """
+        """ Id of the padding token type in the vocabulary."""
         return self._pad_token_type_id
 
     @property

From e9217da5ff711cf84d150b35d3f8a5c17f1641f7 Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Thu, 5 Dec 2019 16:01:51 -0500
Subject: [PATCH 31/91] Cleanup

Improve global visibility on the run_squad script, remove unused files and fixes related to XLNet.
---
 examples/run_squad.py                      |   69 +-
 examples/utils_squad.py                    | 1017 --------------------
 examples/utils_squad_evaluate.py           |  330 -------
 transformers/data/metrics/squad_metrics.py |   14 +-
 transformers/data/processors/squad.py      |    2 +-
 5 files changed, 45 insertions(+), 1387 deletions(-)
 delete mode 100644 examples/utils_squad.py
 delete mode 100644 examples/utils_squad_evaluate.py

diff --git a/examples/run_squad.py b/examples/run_squad.py
index 5caff9ae4f..6d32211c0c 100644
--- a/examples/run_squad.py
+++ b/examples/run_squad.py
@@ -27,8 +27,7 @@ import glob
 import timeit
 import numpy as np
 import torch
-from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
-                              TensorDataset)
+from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, TensorDataset)
 from torch.utils.data.distributed import DistributedSampler
 
 try:
@@ -48,14 +47,6 @@ from transformers import (WEIGHTS_NAME, BertConfig,
 
 from transformers import AdamW, get_linear_schedule_with_warmup, squad_convert_examples_to_features
 
-from utils_squad import (convert_examples_to_features as old_convert, read_squad_examples as old_read, RawResult, write_predictions,
-                         RawResultExtended, write_predictions_extended)
-
-# The follwing import is the official SQuAD evaluation script (2.0).
-# You can remove it from the dependencies if you are using this script outside of the library
-# We've added it here for automated tests (see examples/test_examples.py file)
-from utils_squad_evaluate import EVAL_OPTS, main as evaluate_on_squad
-
 logger = logging.getLogger(__name__)
 
 ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) \
@@ -98,14 +89,16 @@ def train(args, train_dataset, model, tokenizer):
     optimizer_grouped_parameters = [
         {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
         {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
-        ]
+    ]
     optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
     scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)
+
     if args.fp16:
         try:
             from apex import amp
         except ImportError:
             raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
+        
         model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
 
     # multi-gpu training (should be after apex fp16 initialization)
@@ -133,20 +126,26 @@ def train(args, train_dataset, model, tokenizer):
     model.zero_grad()
     train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
     set_seed(args)  # Added here for reproductibility (even between python 2 and 3)
+    
     for _ in train_iterator:
         epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
         for step, batch in enumerate(epoch_iterator):
             model.train()
             batch = tuple(t.to(args.device) for t in batch)
-            inputs = {'input_ids':       batch[0],
-                      'attention_mask':  batch[1],
-                      'start_positions': batch[3],
-                      'end_positions':   batch[4]}
+
+            inputs = {
+                'input_ids':       batch[0],
+                'attention_mask':  batch[1],
+                'start_positions': batch[3],
+                'end_positions':   batch[4]
+            }
+
             if args.model_type != 'distilbert':
                 inputs['token_type_ids'] = None if args.model_type == 'xlm' else batch[2]
+
             if args.model_type in ['xlnet', 'xlm']:
-                inputs.update({'cls_index': batch[5],
-                               'p_mask':       batch[6]})
+                inputs.update({'cls_index': batch[5], 'p_mask': batch[6]})
+
             outputs = model(**inputs)
             loss = outputs[0]  # model outputs are always tuple in transformers (see doc)
 
@@ -173,8 +172,8 @@ def train(args, train_dataset, model, tokenizer):
                 model.zero_grad()
                 global_step += 1
 
+                # Log metrics
                 if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
-                    # Log metrics
                     if args.local_rank == -1 and args.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
                         results = evaluate(args, model, tokenizer)
                         for key, value in results.items():
@@ -183,8 +182,8 @@ def train(args, train_dataset, model, tokenizer):
                     tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step)
                     logging_loss = tr_loss
 
+                # Save model checkpoint
                 if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
-                    # Save model checkpoint
                     output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
                     if not os.path.exists(output_dir):
                         os.makedirs(output_dir)
@@ -213,6 +212,7 @@ def evaluate(args, model, tokenizer, prefix=""):
         os.makedirs(args.output_dir)
 
     args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
+
     # Note that DistributedSampler samples randomly
     eval_sampler = SequentialSampler(dataset) if args.local_rank == -1 else DistributedSampler(dataset)
     eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
@@ -225,11 +225,14 @@ def evaluate(args, model, tokenizer, prefix=""):
     logger.info("***** Running evaluation {} *****".format(prefix))
     logger.info("  Num examples = %d", len(dataset))
     logger.info("  Batch size = %d", args.eval_batch_size)
+
     all_results = []
     start_time = timeit.default_timer()
+
     for batch in tqdm(eval_dataloader, desc="Evaluating"):
         model.eval()
         batch = tuple(t.to(args.device) for t in batch)
+
         with torch.no_grad():
             inputs = {
                 'input_ids':      batch[0],
@@ -238,10 +241,13 @@ def evaluate(args, model, tokenizer, prefix=""):
             
             if args.model_type != 'distilbert':
                 inputs['token_type_ids'] = None if args.model_type == 'xlm' else batch[2]  # XLM don't use segment_ids
+
             example_indices = batch[3]
+            
+            # XLNet and XLM use more arguments for their predictions
             if args.model_type in ['xlnet', 'xlm']:
-                inputs.update({'cls_index': batch[4],
-                               'p_mask':    batch[5]})
+                inputs.update({'cls_index': batch[4], 'p_mask': batch[5]})
+
             outputs = model(**inputs)
 
         for i, example_index in enumerate(example_indices):
@@ -250,11 +256,13 @@ def evaluate(args, model, tokenizer, prefix=""):
 
             output = [to_list(output[i]) for output in outputs]
 
+            # Some models (XLNet, XLM) use 5 arguments for their predictions, while the other "simpler"
+            # models only use two.
             if len(output) >= 5:
                 start_logits = output[0]
                 start_top_index = output[1]
                 end_logits = output[2]
-                end_top_index = output[3],
+                end_top_index = output[3]
                 cls_logits = output[4]
 
                 result = SquadResult(
@@ -278,16 +286,17 @@ def evaluate(args, model, tokenizer, prefix=""):
     # Compute predictions
     output_prediction_file = os.path.join(args.output_dir, "predictions_{}.json".format(prefix))
     output_nbest_file = os.path.join(args.output_dir, "nbest_predictions_{}.json".format(prefix))
+
     if args.version_2_with_negative:
         output_null_log_odds_file = os.path.join(args.output_dir, "null_odds_{}.json".format(prefix))
     else:
         output_null_log_odds_file = None
 
+    # XLNet and XLM use a more complex post-processing procedure
     if args.model_type in ['xlnet', 'xlm']:
-        # XLNet uses a more complex post-processing procedure
         predictions = compute_predictions_log_probs(examples, features, all_results, args.n_best_size,
                         args.max_answer_length, output_prediction_file,
-                        output_nbest_file, output_null_log_odds_file, args.predict_file,
+                        output_nbest_file, output_null_log_odds_file,
                         model.config.start_n_top, model.config.end_n_top,
                         args.version_2_with_negative, tokenizer, args.verbose_logging)
     else:
@@ -296,6 +305,7 @@ def evaluate(args, model, tokenizer, prefix=""):
                         output_nbest_file, output_null_log_odds_file, args.verbose_logging,
                         args.version_2_with_negative, args.null_score_diff_threshold)
 
+    # Compute the F1 and exact scores.
     results = squad_evaluate(examples, predictions)
     return results
 
@@ -308,7 +318,10 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal
     cached_features_file = os.path.join(input_dir, 'cached_{}_{}_{}'.format(
         'dev' if evaluate else 'train',
         list(filter(None, args.model_name_or_path.split('/'))).pop(),
-        str(args.max_seq_length)))
+        str(args.max_seq_length))
+    )
+
+    # Init features and dataset from cache if it exists
     if os.path.exists(cached_features_file) and not args.overwrite_cache and not output_examples:
         logger.info("Loading features from cached file %s", cached_features_file)
         features_and_dataset = torch.load(cached_features_file)
@@ -341,7 +354,6 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal
             return_dataset='pt'
         )
 
-
         if args.local_rank in [-1, 0]:
             logger.info("Saving features into cached file %s", cached_features_file)
             torch.save({"features": features, "dataset": dataset}, cached_features_file)
@@ -452,6 +464,11 @@ def main():
     parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
     args = parser.parse_args()
 
+    args.predict_file = os.path.join(args.output_dir, 'predictions_{}_{}.txt'.format(
+        list(filter(None, args.model_name_or_path.split('/'))).pop(),
+        str(args.max_seq_length))
+    )
+
     if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
         raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))
 
diff --git a/examples/utils_squad.py b/examples/utils_squad.py
deleted file mode 100644
index 4f1c581588..0000000000
--- a/examples/utils_squad.py
+++ /dev/null
@@ -1,1017 +0,0 @@
-
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" Load SQuAD dataset. """
-
-from __future__ import absolute_import, division, print_function
-
-import json
-import logging
-import math
-import collections
-from io import open
-from tqdm import tqdm
-
-from transformers.tokenization_bert import BasicTokenizer, whitespace_tokenize
-
-# Required by XLNet evaluation method to compute optimal threshold (see write_predictions_extended() method)
-from utils_squad_evaluate import find_all_best_thresh_v2, make_qid_to_has_ans, get_raw_scores
-
-logger = logging.getLogger(__name__)
-
-
-class SquadExample(object):
-    """
-    A single training/test example for the Squad dataset.
-    For examples without an answer, the start and end position are -1.
-    """
-
-    def __init__(self,
-                 qas_id,
-                 question_text,
-                 doc_tokens,
-                 orig_answer_text=None,
-                 start_position=None,
-                 end_position=None,
-                 is_impossible=None):
-        self.qas_id = qas_id
-        self.question_text = question_text
-        self.doc_tokens = doc_tokens
-        self.orig_answer_text = orig_answer_text
-        self.start_position = start_position
-        self.end_position = end_position
-        self.is_impossible = is_impossible
-
-    def __str__(self):
-        return self.__repr__()
-
-    def __repr__(self):
-        s = ""
-        s += "qas_id: %s" % (self.qas_id)
-        s += ", question_text: %s" % (
-            self.question_text)
-        s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens))
-        if self.start_position:
-            s += ", start_position: %d" % (self.start_position)
-        if self.end_position:
-            s += ", end_position: %d" % (self.end_position)
-        if self.is_impossible:
-            s += ", is_impossible: %r" % (self.is_impossible)
-        return s
-
-
-class InputFeatures(object):
-    """A single set of features of data."""
-
-    def __init__(self,
-                 unique_id,
-                 example_index,
-                 doc_span_index,
-                 tokens,
-                 token_to_orig_map,
-                 token_is_max_context,
-                 input_ids,
-                 input_mask,
-                 segment_ids,
-                 cls_index,
-                 p_mask,
-                 paragraph_len,
-                 start_position=None,
-                 end_position=None,
-                 is_impossible=None):
-        self.unique_id = unique_id
-        self.example_index = example_index
-        self.doc_span_index = doc_span_index
-        self.tokens = tokens
-        self.token_to_orig_map = token_to_orig_map
-        self.token_is_max_context = token_is_max_context
-        self.input_ids = input_ids
-        self.input_mask = input_mask
-        self.segment_ids = segment_ids
-        self.cls_index = cls_index
-        self.p_mask = p_mask
-        self.paragraph_len = paragraph_len
-        self.start_position = start_position
-        self.end_position = end_position
-        self.is_impossible = is_impossible
-
-
-def read_squad_examples(input_file, is_training, version_2_with_negative):
-    """Read a SQuAD json file into a list of SquadExample."""
-    with open(input_file, "r", encoding='utf-8') as reader:
-        input_data = json.load(reader)["data"]
-
-    def is_whitespace(c):
-        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
-            return True
-        return False
-
-    examples = []
-    for entry in input_data:
-        for paragraph in entry["paragraphs"]:
-            paragraph_text = paragraph["context"]
-            doc_tokens = []
-            char_to_word_offset = []
-            prev_is_whitespace = True
-            for c in paragraph_text:
-                if is_whitespace(c):
-                    prev_is_whitespace = True
-                else:
-                    if prev_is_whitespace:
-                        doc_tokens.append(c)
-                    else:
-                        doc_tokens[-1] += c
-                    prev_is_whitespace = False
-                char_to_word_offset.append(len(doc_tokens) - 1)
-
-            for qa in paragraph["qas"]:
-                qas_id = qa["id"]
-                question_text = qa["question"]
-                start_position = None
-                end_position = None
-                orig_answer_text = None
-                is_impossible = False
-                if is_training:
-                    if version_2_with_negative:
-                        is_impossible = qa["is_impossible"]
-                    if (len(qa["answers"]) != 1) and (not is_impossible):
-                        raise ValueError(
-                            "For training, each question should have exactly 1 answer.")
-                    if not is_impossible:
-                        answer = qa["answers"][0]
-                        orig_answer_text = answer["text"]
-                        answer_offset = answer["answer_start"]
-                        answer_length = len(orig_answer_text)
-                        start_position = char_to_word_offset[answer_offset]
-                        end_position = char_to_word_offset[answer_offset + answer_length - 1]
-                        # Only add answers where the text can be exactly recovered from the
-                        # document. If this CAN'T happen it's likely due to weird Unicode
-                        # stuff so we will just skip the example.
-                        #
-                        # Note that this means for training mode, every example is NOT
-                        # guaranteed to be preserved.
-                        actual_text = " ".join(doc_tokens[start_position:(end_position + 1)])
-                        cleaned_answer_text = " ".join(
-                            whitespace_tokenize(orig_answer_text))
-                        if actual_text.find(cleaned_answer_text) == -1:
-                            logger.warning("Could not find answer: '%s' vs. '%s'",
-                                           actual_text, cleaned_answer_text)
-                            continue
-                    else:
-                        start_position = -1
-                        end_position = -1
-                        orig_answer_text = ""
-
-                example = SquadExample(
-                    qas_id=qas_id,
-                    question_text=question_text,
-                    doc_tokens=doc_tokens,
-                    orig_answer_text=orig_answer_text,
-                    start_position=start_position,
-                    end_position=end_position,
-                    is_impossible=is_impossible)
-                examples.append(example)
-    return examples
-
-
-def convert_examples_to_features(examples, tokenizer, max_seq_length,
-                                 doc_stride, max_query_length, is_training,
-                                 cls_token_at_end=False,
-                                 cls_token='[CLS]', sep_token='[SEP]', pad_token=0,
-                                 sequence_a_segment_id=0, sequence_b_segment_id=1,
-                                 cls_token_segment_id=0, pad_token_segment_id=0,
-                                 mask_padding_with_zero=True,
-                                 sequence_a_is_doc=False):
-    """Loads a data file into a list of `InputBatch`s."""
-
-    unique_id = 1000000000
-    # cnt_pos, cnt_neg = 0, 0
-    # max_N, max_M = 1024, 1024
-    # f = np.zeros((max_N, max_M), dtype=np.float32)
-
-    features = []
-    for (example_index, example) in enumerate(tqdm(examples)):
-
-        # if example_index % 100 == 0:
-        #     logger.info('Converting %s/%s pos %s neg %s', example_index, len(examples), cnt_pos, cnt_neg)
-
-        query_tokens = tokenizer.tokenize(example.question_text)
-
-        if len(query_tokens) > max_query_length:
-            query_tokens = query_tokens[0:max_query_length]
-
-        tok_to_orig_index = []
-        orig_to_tok_index = []
-        all_doc_tokens = []
-        for (i, token) in enumerate(example.doc_tokens):
-            orig_to_tok_index.append(len(all_doc_tokens))
-            sub_tokens = tokenizer.tokenize(token)
-            for sub_token in sub_tokens:
-                tok_to_orig_index.append(i)
-                all_doc_tokens.append(sub_token)
-
-        tok_start_position = None
-        tok_end_position = None
-        if is_training and example.is_impossible:
-            tok_start_position = -1
-            tok_end_position = -1
-        if is_training and not example.is_impossible:
-            tok_start_position = orig_to_tok_index[example.start_position]
-            if example.end_position < len(example.doc_tokens) - 1:
-                tok_end_position = orig_to_tok_index[example.end_position + 1] - 1
-            else:
-                tok_end_position = len(all_doc_tokens) - 1
-            (tok_start_position, tok_end_position) = _improve_answer_span(
-                all_doc_tokens, tok_start_position, tok_end_position, tokenizer,
-                example.orig_answer_text)
-
-        # The -3 accounts for [CLS], [SEP] and [SEP]
-        max_tokens_for_doc = max_seq_length - len(query_tokens) - 3
-        assert max_tokens_for_doc > 0
-
-        # We can have documents that are longer than the maximum sequence length.
-        # To deal with this we do a sliding window approach, where we take chunks
-        # of the up to our max length with a stride of `doc_stride`.
-        _DocSpan = collections.namedtuple(  # pylint: disable=invalid-name
-            "DocSpan", ["start", "length"])
-        doc_spans = []
-        start_offset = 0
-        while start_offset < len(all_doc_tokens):
-            length = len(all_doc_tokens) - start_offset
-            if length > max_tokens_for_doc:
-                length = max_tokens_for_doc
-            doc_spans.append(_DocSpan(start=start_offset, length=length))
-            if start_offset + length == len(all_doc_tokens):
-                break
-            start_offset += min(length, doc_stride)
-
-        for (doc_span_index, doc_span) in enumerate(doc_spans):
-            tokens = []
-            token_to_orig_map = {}
-            token_is_max_context = {}
-            segment_ids = []
-
-            # p_mask: mask with 1 for token than cannot be in the answer (0 for token which can be in an answer)
-            # Original TF implem also keep the classification token (set to 0) (not sure why...)
-            p_mask = []
-
-            # CLS token at the beginning
-            if not cls_token_at_end:
-                tokens.append(cls_token)
-                segment_ids.append(cls_token_segment_id)
-                p_mask.append(0)
-                cls_index = 0
-
-            # XLNet: P SEP Q SEP CLS
-            # Others: CLS Q SEP P SEP
-            if not sequence_a_is_doc:
-                # Query
-                tokens += query_tokens
-                segment_ids += [sequence_a_segment_id] * len(query_tokens)
-                p_mask += [1] * len(query_tokens)
-
-                # SEP token
-                tokens.append(sep_token)
-                segment_ids.append(sequence_a_segment_id)
-                p_mask.append(1)
-
-            # Paragraph
-            for i in range(doc_span.length):
-                split_token_index = doc_span.start + i
-                token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index]
-
-                is_max_context = _check_is_max_context(doc_spans, doc_span_index,
-                                                       split_token_index)
-                token_is_max_context[len(tokens)] = is_max_context
-                tokens.append(all_doc_tokens[split_token_index])
-                if not sequence_a_is_doc:
-                    segment_ids.append(sequence_b_segment_id)
-                else:
-                    segment_ids.append(sequence_a_segment_id)
-                p_mask.append(0)
-            paragraph_len = doc_span.length
-
-            if sequence_a_is_doc:
-                # SEP token
-                tokens.append(sep_token)
-                segment_ids.append(sequence_a_segment_id)
-                p_mask.append(1)
-
-                tokens += query_tokens
-                segment_ids += [sequence_b_segment_id] * len(query_tokens)
-                p_mask += [1] * len(query_tokens)
-
-            # SEP token
-            tokens.append(sep_token)
-            segment_ids.append(sequence_b_segment_id)
-            p_mask.append(1)
-
-            # CLS token at the end
-            if cls_token_at_end:
-                tokens.append(cls_token)
-                segment_ids.append(cls_token_segment_id)
-                p_mask.append(0)
-                cls_index = len(tokens) - 1  # Index of classification token
-
-            input_ids = tokenizer.convert_tokens_to_ids(tokens)
-
-            # The mask has 1 for real tokens and 0 for padding tokens. Only real
-            # tokens are attended to.
-            input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)
-
-            # Zero-pad up to the sequence length.
-            while len(input_ids) < max_seq_length:
-                input_ids.append(pad_token)
-                input_mask.append(0 if mask_padding_with_zero else 1)
-                segment_ids.append(pad_token_segment_id)
-                p_mask.append(1)
-
-            assert len(input_ids) == max_seq_length
-            assert len(input_mask) == max_seq_length
-            assert len(segment_ids) == max_seq_length
-
-            span_is_impossible = example.is_impossible
-            start_position = None
-            end_position = None
-            if is_training and not span_is_impossible:
-                # For training, if our document chunk does not contain an annotation
-                # we throw it out, since there is nothing to predict.
-                doc_start = doc_span.start
-                doc_end = doc_span.start + doc_span.length - 1
-                out_of_span = False
-                if not (tok_start_position >= doc_start and
-                        tok_end_position <= doc_end):
-                    out_of_span = True
-                if out_of_span:
-                    start_position = 0
-                    end_position = 0
-                    span_is_impossible = True
-                else:
-                    if sequence_a_is_doc:
-                        doc_offset = 0
-                    else:
-                        doc_offset = len(query_tokens) + 2
-                    start_position = tok_start_position - doc_start + doc_offset
-                    end_position = tok_end_position - doc_start + doc_offset
-
-            if is_training and span_is_impossible:
-                start_position = cls_index
-                end_position = cls_index
-
-            if example_index < 20:
-                logger.info("*** Example ***")
-                logger.info("unique_id: %s" % (unique_id))
-                logger.info("example_index: %s" % (example_index))
-                logger.info("doc_span_index: %s" % (doc_span_index))
-                logger.info("tokens: %s" % " ".join(tokens))
-                logger.info("token_to_orig_map: %s" % " ".join([
-                    "%d:%d" % (x, y) for (x, y) in token_to_orig_map.items()]))
-                logger.info("token_is_max_context: %s" % " ".join([
-                    "%d:%s" % (x, y) for (x, y) in token_is_max_context.items()
-                ]))
-                logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
-                logger.info(
-                    "input_mask: %s" % " ".join([str(x) for x in input_mask]))
-                logger.info(
-                    "segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
-                if is_training and span_is_impossible:
-                    logger.info("impossible example")
-                if is_training and not span_is_impossible:
-                    answer_text = " ".join(tokens[start_position:(end_position + 1)])
-                    logger.info("start_position: %d" % (start_position))
-                    logger.info("end_position: %d" % (end_position))
-                    logger.info(
-                        "answer: %s" % (answer_text))
-
-            features.append(
-                InputFeatures(
-                    unique_id=unique_id,
-                    example_index=example_index,
-                    doc_span_index=doc_span_index,
-                    tokens=tokens,
-                    token_to_orig_map=token_to_orig_map,
-                    token_is_max_context=token_is_max_context,
-                    input_ids=input_ids,
-                    input_mask=input_mask,
-                    segment_ids=segment_ids,
-                    cls_index=cls_index,
-                    p_mask=p_mask,
-                    paragraph_len=paragraph_len,
-                    start_position=start_position,
-                    end_position=end_position,
-                    is_impossible=span_is_impossible))
-            unique_id += 1
-
-    return features
-
-
-def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer,
-                         orig_answer_text):
-    """Returns tokenized answer spans that better match the annotated answer."""
-
-    # The SQuAD annotations are character based. We first project them to
-    # whitespace-tokenized words. But then after WordPiece tokenization, we can
-    # often find a "better match". For example:
-    #
-    #   Question: What year was John Smith born?
-    #   Context: The leader was John Smith (1895-1943).
-    #   Answer: 1895
-    #
-    # The original whitespace-tokenized answer will be "(1895-1943).". However
-    # after tokenization, our tokens will be "( 1895 - 1943 ) .". So we can match
-    # the exact answer, 1895.
-    #
-    # However, this is not always possible. Consider the following:
-    #
-    #   Question: What country is the top exporter of electornics?
-    #   Context: The Japanese electronics industry is the lagest in the world.
-    #   Answer: Japan
-    #
-    # In this case, the annotator chose "Japan" as a character sub-span of
-    # the word "Japanese". Since our WordPiece tokenizer does not split
-    # "Japanese", we just use "Japanese" as the annotation. This is fairly rare
-    # in SQuAD, but does happen.
-    tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text))
-
-    for new_start in range(input_start, input_end + 1):
-        for new_end in range(input_end, new_start - 1, -1):
-            text_span = " ".join(doc_tokens[new_start:(new_end + 1)])
-            if text_span == tok_answer_text:
-                return (new_start, new_end)
-
-    return (input_start, input_end)
-
-
-def _check_is_max_context(doc_spans, cur_span_index, position):
-    """Check if this is the 'max context' doc span for the token."""
-
-    # Because of the sliding window approach taken to scoring documents, a single
-    # token can appear in multiple documents. E.g.
-    #  Doc: the man went to the store and bought a gallon of milk
-    #  Span A: the man went to the
-    #  Span B: to the store and bought
-    #  Span C: and bought a gallon of
-    #  ...
-    #
-    # Now the word 'bought' will have two scores from spans B and C. We only
-    # want to consider the score with "maximum context", which we define as
-    # the *minimum* of its left and right context (the *sum* of left and
-    # right context will always be the same, of course).
-    #
-    # In the example the maximum context for 'bought' would be span C since
-    # it has 1 left context and 3 right context, while span B has 4 left context
-    # and 0 right context.
-    best_score = None
-    best_span_index = None
-    for (span_index, doc_span) in enumerate(doc_spans):
-        end = doc_span.start + doc_span.length - 1
-        if position < doc_span.start:
-            continue
-        if position > end:
-            continue
-        num_left_context = position - doc_span.start
-        num_right_context = end - position
-        score = min(num_left_context, num_right_context) + 0.01 * doc_span.length
-        if best_score is None or score > best_score:
-            best_score = score
-            best_span_index = span_index
-
-    return cur_span_index == best_span_index
-
-
-RawResult = collections.namedtuple("RawResult",
-                                   ["unique_id", "start_logits", "end_logits"])
-
-def write_predictions(all_examples, all_features, all_results, n_best_size,
-                      max_answer_length, do_lower_case, output_prediction_file,
-                      output_nbest_file, output_null_log_odds_file, verbose_logging,
-                      version_2_with_negative, null_score_diff_threshold):
-    """Write final predictions to the json file and log-odds of null if needed."""
-    logger.info("Writing predictions to: %s" % (output_prediction_file))
-    logger.info("Writing nbest to: %s" % (output_nbest_file))
-
-    example_index_to_features = collections.defaultdict(list)
-    for feature in all_features:
-        example_index_to_features[feature.example_index].append(feature)
-
-    unique_id_to_result = {}
-    for result in all_results:
-        unique_id_to_result[result.unique_id] = result
-
-    _PrelimPrediction = collections.namedtuple(  # pylint: disable=invalid-name
-        "PrelimPrediction",
-        ["feature_index", "start_index", "end_index", "start_logit", "end_logit"])
-
-    all_predictions = collections.OrderedDict()
-    all_nbest_json = collections.OrderedDict()
-    scores_diff_json = collections.OrderedDict()
-
-    for (example_index, example) in enumerate(all_examples):
-        features = example_index_to_features[example_index]
-
-        prelim_predictions = []
-        # keep track of the minimum score of null start+end of position 0
-        score_null = 1000000  # large and positive
-        min_null_feature_index = 0  # the paragraph slice with min null score
-        null_start_logit = 0  # the start logit at the slice with min null score
-        null_end_logit = 0  # the end logit at the slice with min null score
-        for (feature_index, feature) in enumerate(features):
-            result = unique_id_to_result[feature.unique_id]
-            start_indexes = _get_best_indexes(result.start_logits, n_best_size)
-            end_indexes = _get_best_indexes(result.end_logits, n_best_size)
-            # if we could have irrelevant answers, get the min score of irrelevant
-            if version_2_with_negative:
-                feature_null_score = result.start_logits[0] + result.end_logits[0]
-                if feature_null_score < score_null:
-                    score_null = feature_null_score
-                    min_null_feature_index = feature_index
-                    null_start_logit = result.start_logits[0]
-                    null_end_logit = result.end_logits[0]
-            for start_index in start_indexes:
-                for end_index in end_indexes:
-                    # We could hypothetically create invalid predictions, e.g., predict
-                    # that the start of the span is in the question. We throw out all
-                    # invalid predictions.
-                    if start_index >= len(feature.tokens):
-                        continue
-                    if end_index >= len(feature.tokens):
-                        continue
-                    if start_index not in feature.token_to_orig_map:
-                        continue
-                    if end_index not in feature.token_to_orig_map:
-                        continue
-                    if not feature.token_is_max_context.get(start_index, False):
-                        continue
-                    if end_index < start_index:
-                        continue
-                    length = end_index - start_index + 1
-                    if length > max_answer_length:
-                        continue
-                    prelim_predictions.append(
-                        _PrelimPrediction(
-                            feature_index=feature_index,
-                            start_index=start_index,
-                            end_index=end_index,
-                            start_logit=result.start_logits[start_index],
-                            end_logit=result.end_logits[end_index]))
-        if version_2_with_negative:
-            prelim_predictions.append(
-                _PrelimPrediction(
-                    feature_index=min_null_feature_index,
-                    start_index=0,
-                    end_index=0,
-                    start_logit=null_start_logit,
-                    end_logit=null_end_logit))
-        prelim_predictions = sorted(
-            prelim_predictions,
-            key=lambda x: (x.start_logit + x.end_logit),
-            reverse=True)
-
-        _NbestPrediction = collections.namedtuple(  # pylint: disable=invalid-name
-            "NbestPrediction", ["text", "start_logit", "end_logit"])
-
-        seen_predictions = {}
-        nbest = []
-        for pred in prelim_predictions:
-            if len(nbest) >= n_best_size:
-                break
-            feature = features[pred.feature_index]
-            if pred.start_index > 0:  # this is a non-null prediction
-                tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1)]
-                orig_doc_start = feature.token_to_orig_map[pred.start_index]
-                orig_doc_end = feature.token_to_orig_map[pred.end_index]
-                orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end + 1)]
-                tok_text = " ".join(tok_tokens)
-
-                # De-tokenize WordPieces that have been split off.
-                tok_text = tok_text.replace(" ##", "")
-                tok_text = tok_text.replace("##", "")
-
-                # Clean whitespace
-                tok_text = tok_text.strip()
-                tok_text = " ".join(tok_text.split())
-                orig_text = " ".join(orig_tokens)
-
-                final_text = get_final_text(tok_text, orig_text, do_lower_case, verbose_logging)
-                if final_text in seen_predictions:
-                    continue
-
-                seen_predictions[final_text] = True
-            else:
-                final_text = ""
-                seen_predictions[final_text] = True
-
-            nbest.append(
-                _NbestPrediction(
-                    text=final_text,
-                    start_logit=pred.start_logit,
-                    end_logit=pred.end_logit))
-        # if we didn't include the empty option in the n-best, include it
-        if version_2_with_negative:
-            if "" not in seen_predictions:
-                nbest.append(
-                    _NbestPrediction(
-                        text="",
-                        start_logit=null_start_logit,
-                        end_logit=null_end_logit))
-                
-            # In very rare edge cases we could only have single null prediction.
-            # So we just create a nonce prediction in this case to avoid failure.
-            if len(nbest)==1:
-                nbest.insert(0,
-                    _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0))
-
-        # In very rare edge cases we could have no valid predictions. So we
-        # just create a nonce prediction in this case to avoid failure.
-        if not nbest:
-            nbest.append(
-                _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0))
-
-        assert len(nbest) >= 1
-
-        total_scores = []
-        best_non_null_entry = None
-        for entry in nbest:
-            total_scores.append(entry.start_logit + entry.end_logit)
-            if not best_non_null_entry:
-                if entry.text:
-                    best_non_null_entry = entry
-
-        probs = _compute_softmax(total_scores)
-
-        nbest_json = []
-        for (i, entry) in enumerate(nbest):
-            output = collections.OrderedDict()
-            output["text"] = entry.text
-            output["probability"] = probs[i]
-            output["start_logit"] = entry.start_logit
-            output["end_logit"] = entry.end_logit
-            nbest_json.append(output)
-
-        assert len(nbest_json) >= 1
-
-        if not version_2_with_negative:
-            all_predictions[example.qas_id] = nbest_json[0]["text"]
-        else:
-            # predict "" iff the null score - the score of best non-null > threshold
-            score_diff = score_null - best_non_null_entry.start_logit - (
-                best_non_null_entry.end_logit)
-            scores_diff_json[example.qas_id] = score_diff
-            if score_diff > null_score_diff_threshold:
-                all_predictions[example.qas_id] = ""
-            else:
-                all_predictions[example.qas_id] = best_non_null_entry.text
-        all_nbest_json[example.qas_id] = nbest_json
-
-    with open(output_prediction_file, "w") as writer:
-        writer.write(json.dumps(all_predictions, indent=4) + "\n")
-
-    with open(output_nbest_file, "w") as writer:
-        writer.write(json.dumps(all_nbest_json, indent=4) + "\n")
-
-    if version_2_with_negative:
-        with open(output_null_log_odds_file, "w") as writer:
-            writer.write(json.dumps(scores_diff_json, indent=4) + "\n")
-
-    return all_predictions
-
-
-# For XLNet (and XLM which uses the same head)
-RawResultExtended = collections.namedtuple("RawResultExtended",
-    ["unique_id", "start_top_log_probs", "start_top_index",
-     "end_top_log_probs", "end_top_index", "cls_logits"])
-
-
-def write_predictions_extended(all_examples, all_features, all_results, n_best_size,
-                                max_answer_length, output_prediction_file,
-                                output_nbest_file,
-                                output_null_log_odds_file, orig_data_file,
-                                start_n_top, end_n_top, version_2_with_negative,
-                                tokenizer, verbose_logging):
-    """ XLNet write prediction logic (more complex than Bert's).
-        Write final predictions to the json file and log-odds of null if needed.
-
-        Requires utils_squad_evaluate.py
-    """
-    _PrelimPrediction = collections.namedtuple(  # pylint: disable=invalid-name
-        "PrelimPrediction",
-        ["feature_index", "start_index", "end_index",
-        "start_log_prob", "end_log_prob"])
-
-    _NbestPrediction = collections.namedtuple(  # pylint: disable=invalid-name
-        "NbestPrediction", ["text", "start_log_prob", "end_log_prob"])
-
-    logger.info("Writing predictions to: %s", output_prediction_file)
-    # logger.info("Writing nbest to: %s" % (output_nbest_file))
-
-    example_index_to_features = collections.defaultdict(list)
-    for feature in all_features:
-        example_index_to_features[feature.example_index].append(feature)
-
-    unique_id_to_result = {}
-    for result in all_results:
-        unique_id_to_result[result.unique_id] = result
-
-    all_predictions = collections.OrderedDict()
-    all_nbest_json = collections.OrderedDict()
-    scores_diff_json = collections.OrderedDict()
-
-    for (example_index, example) in enumerate(all_examples):
-        features = example_index_to_features[example_index]
-
-        prelim_predictions = []
-        # keep track of the minimum score of null start+end of position 0
-        score_null = 1000000  # large and positive
-
-        for (feature_index, feature) in enumerate(features):
-            result = unique_id_to_result[feature.unique_id]
-
-            cur_null_score = result.cls_logits
-
-            # if we could have irrelevant answers, get the min score of irrelevant
-            score_null = min(score_null, cur_null_score)
-
-            for i in range(start_n_top):
-                for j in range(end_n_top):
-                    start_log_prob = result.start_top_log_probs[i]
-                    start_index = result.start_top_index[i]
-
-                    j_index = i * end_n_top + j
-
-                    end_log_prob = result.end_top_log_probs[j_index]
-                    end_index = result.end_top_index[j_index]
-
-                    # We could hypothetically create invalid predictions, e.g., predict
-                    # that the start of the span is in the question. We throw out all
-                    # invalid predictions.
-                    if start_index >= feature.paragraph_len - 1:
-                        continue
-                    if end_index >= feature.paragraph_len - 1:
-                        continue
-
-                    if not feature.token_is_max_context.get(start_index, False):
-                        continue
-                    if end_index < start_index:
-                        continue
-                    length = end_index - start_index + 1
-                    if length > max_answer_length:
-                        continue
-
-                    prelim_predictions.append(
-                        _PrelimPrediction(
-                            feature_index=feature_index,
-                            start_index=start_index,
-                            end_index=end_index,
-                            start_log_prob=start_log_prob,
-                            end_log_prob=end_log_prob))
-
-        prelim_predictions = sorted(
-            prelim_predictions,
-            key=lambda x: (x.start_log_prob + x.end_log_prob),
-            reverse=True)
-
-        seen_predictions = {}
-        nbest = []
-        for pred in prelim_predictions:
-            if len(nbest) >= n_best_size:
-                break
-            feature = features[pred.feature_index]
-
-            # XLNet un-tokenizer
-            # Let's keep it simple for now and see if we need all this later.
-            # 
-            # tok_start_to_orig_index = feature.tok_start_to_orig_index
-            # tok_end_to_orig_index = feature.tok_end_to_orig_index
-            # start_orig_pos = tok_start_to_orig_index[pred.start_index]
-            # end_orig_pos = tok_end_to_orig_index[pred.end_index]
-            # paragraph_text = example.paragraph_text
-            # final_text = paragraph_text[start_orig_pos: end_orig_pos + 1].strip()
-
-            # Previously used Bert untokenizer
-            tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1)]
-            orig_doc_start = feature.token_to_orig_map[pred.start_index]
-            orig_doc_end = feature.token_to_orig_map[pred.end_index]
-            orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end + 1)]
-            tok_text = tokenizer.convert_tokens_to_string(tok_tokens)
-
-            # Clean whitespace
-            tok_text = tok_text.strip()
-            tok_text = " ".join(tok_text.split())
-            orig_text = " ".join(orig_tokens)
-
-            final_text = get_final_text(tok_text, orig_text, tokenizer.do_lower_case,
-                                        verbose_logging)
-
-            if final_text in seen_predictions:
-                continue
-
-            seen_predictions[final_text] = True
-
-            nbest.append(
-                _NbestPrediction(
-                    text=final_text,
-                    start_log_prob=pred.start_log_prob,
-                    end_log_prob=pred.end_log_prob))
-
-        # In very rare edge cases we could have no valid predictions. So we
-        # just create a nonce prediction in this case to avoid failure.
-        if not nbest:
-            nbest.append(
-                _NbestPrediction(text="", start_log_prob=-1e6,
-                end_log_prob=-1e6))
-
-        total_scores = []
-        best_non_null_entry = None
-        for entry in nbest:
-            total_scores.append(entry.start_log_prob + entry.end_log_prob)
-            if not best_non_null_entry:
-                best_non_null_entry = entry
-
-        probs = _compute_softmax(total_scores)
-
-        nbest_json = []
-        for (i, entry) in enumerate(nbest):
-            output = collections.OrderedDict()
-            output["text"] = entry.text
-            output["probability"] = probs[i]
-            output["start_log_prob"] = entry.start_log_prob
-            output["end_log_prob"] = entry.end_log_prob
-            nbest_json.append(output)
-
-        assert len(nbest_json) >= 1
-        assert best_non_null_entry is not None
-
-        score_diff = score_null
-        scores_diff_json[example.qas_id] = score_diff
-        # note(zhiliny): always predict best_non_null_entry
-        # and the evaluation script will search for the best threshold
-        all_predictions[example.qas_id] = best_non_null_entry.text
-
-        all_nbest_json[example.qas_id] = nbest_json
-
-    with open(output_prediction_file, "w") as writer:
-        writer.write(json.dumps(all_predictions, indent=4) + "\n")
-
-    with open(output_nbest_file, "w") as writer:
-        writer.write(json.dumps(all_nbest_json, indent=4) + "\n")
-
-    if version_2_with_negative:
-        with open(output_null_log_odds_file, "w") as writer:
-            writer.write(json.dumps(scores_diff_json, indent=4) + "\n")
-
-    with open(orig_data_file, "r", encoding='utf-8') as reader:
-        orig_data = json.load(reader)["data"]
-
-    qid_to_has_ans = make_qid_to_has_ans(orig_data)
-    has_ans_qids = [k for k, v in qid_to_has_ans.items() if v]
-    no_ans_qids = [k for k, v in qid_to_has_ans.items() if not v]
-    exact_raw, f1_raw = get_raw_scores(orig_data, all_predictions)
-    out_eval = {}
-
-    find_all_best_thresh_v2(out_eval, all_predictions, exact_raw, f1_raw, scores_diff_json, qid_to_has_ans)
-
-    return out_eval
-
-
-def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False):
-    """Project the tokenized prediction back to the original text."""
-
-    # When we created the data, we kept track of the alignment between original
-    # (whitespace tokenized) tokens and our WordPiece tokenized tokens. So
-    # now `orig_text` contains the span of our original text corresponding to the
-    # span that we predicted.
-    #
-    # However, `orig_text` may contain extra characters that we don't want in
-    # our prediction.
-    #
-    # For example, let's say:
-    #   pred_text = steve smith
-    #   orig_text = Steve Smith's
-    #
-    # We don't want to return `orig_text` because it contains the extra "'s".
-    #
-    # We don't want to return `pred_text` because it's already been normalized
-    # (the SQuAD eval script also does punctuation stripping/lower casing but
-    # our tokenizer does additional normalization like stripping accent
-    # characters).
-    #
-    # What we really want to return is "Steve Smith".
-    #
-    # Therefore, we have to apply a semi-complicated alignment heuristic between
-    # `pred_text` and `orig_text` to get a character-to-character alignment. This
-    # can fail in certain cases in which case we just return `orig_text`.
-
-    def _strip_spaces(text):
-        ns_chars = []
-        ns_to_s_map = collections.OrderedDict()
-        for (i, c) in enumerate(text):
-            if c == " ":
-                continue
-            ns_to_s_map[len(ns_chars)] = i
-            ns_chars.append(c)
-        ns_text = "".join(ns_chars)
-        return (ns_text, ns_to_s_map)
-
-    # We first tokenize `orig_text`, strip whitespace from the result
-    # and `pred_text`, and check if they are the same length. If they are
-    # NOT the same length, the heuristic has failed. If they are the same
-    # length, we assume the characters are one-to-one aligned.
-    tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
-
-    tok_text = " ".join(tokenizer.tokenize(orig_text))
-
-    start_position = tok_text.find(pred_text)
-    if start_position == -1:
-        if verbose_logging:
-            logger.info(
-                "Unable to find text: '%s' in '%s'" % (pred_text, orig_text))
-        return orig_text
-    end_position = start_position + len(pred_text) - 1
-
-    (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
-    (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)
-
-    if len(orig_ns_text) != len(tok_ns_text):
-        if verbose_logging:
-            logger.info("Length not equal after stripping spaces: '%s' vs '%s'",
-                        orig_ns_text, tok_ns_text)
-        return orig_text
-
-    # We then project the characters in `pred_text` back to `orig_text` using
-    # the character-to-character alignment.
-    tok_s_to_ns_map = {}
-    for (i, tok_index) in tok_ns_to_s_map.items():
-        tok_s_to_ns_map[tok_index] = i
-
-    orig_start_position = None
-    if start_position in tok_s_to_ns_map:
-        ns_start_position = tok_s_to_ns_map[start_position]
-        if ns_start_position in orig_ns_to_s_map:
-            orig_start_position = orig_ns_to_s_map[ns_start_position]
-
-    if orig_start_position is None:
-        if verbose_logging:
-            logger.info("Couldn't map start position")
-        return orig_text
-
-    orig_end_position = None
-    if end_position in tok_s_to_ns_map:
-        ns_end_position = tok_s_to_ns_map[end_position]
-        if ns_end_position in orig_ns_to_s_map:
-            orig_end_position = orig_ns_to_s_map[ns_end_position]
-
-    if orig_end_position is None:
-        if verbose_logging:
-            logger.info("Couldn't map end position")
-        return orig_text
-
-    output_text = orig_text[orig_start_position:(orig_end_position + 1)]
-    return output_text
-
-
-def _get_best_indexes(logits, n_best_size):
-    """Get the n-best logits from a list."""
-    index_and_score = sorted(enumerate(logits), key=lambda x: x[1], reverse=True)
-
-    best_indexes = []
-    for i in range(len(index_and_score)):
-        if i >= n_best_size:
-            break
-        best_indexes.append(index_and_score[i][0])
-    return best_indexes
-
-
-def _compute_softmax(scores):
-    """Compute softmax probability over raw logits."""
-    if not scores:
-        return []
-
-    max_score = None
-    for score in scores:
-        if max_score is None or score > max_score:
-            max_score = score
-
-    exp_scores = []
-    total_sum = 0.0
-    for score in scores:
-        x = math.exp(score - max_score)
-        exp_scores.append(x)
-        total_sum += x
-
-    probs = []
-    for score in exp_scores:
-        probs.append(score / total_sum)
-    return probs
diff --git a/examples/utils_squad_evaluate.py b/examples/utils_squad_evaluate.py
deleted file mode 100644
index ed162e6fe6..0000000000
--- a/examples/utils_squad_evaluate.py
+++ /dev/null
@@ -1,330 +0,0 @@
-""" Official evaluation script for SQuAD version 2.0.
-    Modified by XLNet authors to update `find_best_threshold` scripts for SQuAD V2.0
-
-In addition to basic functionality, we also compute additional statistics and
-plot precision-recall curves if an additional na_prob.json file is provided.
-This file is expected to map question ID's to the model's predicted probability
-that a question is unanswerable.
-"""
-import argparse
-import collections
-import json
-import numpy as np
-import os
-import re
-import string
-import sys
-
-class EVAL_OPTS():
-  def __init__(self, data_file, pred_file, out_file="",
-               na_prob_file="na_prob.json", na_prob_thresh=1.0,
-               out_image_dir=None, verbose=False):
-    self.data_file = data_file
-    self.pred_file = pred_file
-    self.out_file = out_file
-    self.na_prob_file = na_prob_file
-    self.na_prob_thresh = na_prob_thresh
-    self.out_image_dir = out_image_dir
-    self.verbose = verbose
-
-OPTS = None
-
-def parse_args():
-  parser = argparse.ArgumentParser('Official evaluation script for SQuAD version 2.0.')
-  parser.add_argument('data_file', metavar='data.json', help='Input data JSON file.')
-  parser.add_argument('pred_file', metavar='pred.json', help='Model predictions.')
-  parser.add_argument('--out-file', '-o', metavar='eval.json',
-                      help='Write accuracy metrics to file (default is stdout).')
-  parser.add_argument('--na-prob-file', '-n', metavar='na_prob.json',
-                      help='Model estimates of probability of no answer.')
-  parser.add_argument('--na-prob-thresh', '-t', type=float, default=1.0,
-                      help='Predict "" if no-answer probability exceeds this (default = 1.0).')
-  parser.add_argument('--out-image-dir', '-p', metavar='out_images', default=None,
-                      help='Save precision-recall curves to directory.')
-  parser.add_argument('--verbose', '-v', action='store_true')
-  if len(sys.argv) == 1:
-    parser.print_help()
-    sys.exit(1)
-  return parser.parse_args()
-
-def make_qid_to_has_ans(dataset):
-  qid_to_has_ans = {}
-  for article in dataset:
-    for p in article['paragraphs']:
-      for qa in p['qas']:
-        qid_to_has_ans[qa['id']] = bool(qa['answers'])
-  return qid_to_has_ans
-
-def normalize_answer(s):
-  """Lower text and remove punctuation, articles and extra whitespace."""
-  def remove_articles(text):
-    regex = re.compile(r'\b(a|an|the)\b', re.UNICODE)
-    return re.sub(regex, ' ', text)
-  def white_space_fix(text):
-    return ' '.join(text.split())
-  def remove_punc(text):
-    exclude = set(string.punctuation)
-    return ''.join(ch for ch in text if ch not in exclude)
-  def lower(text):
-    return text.lower()
-  return white_space_fix(remove_articles(remove_punc(lower(s))))
-
-def get_tokens(s):
-  if not s: return []
-  return normalize_answer(s).split()
-
-def compute_exact(a_gold, a_pred):
-  return int(normalize_answer(a_gold) == normalize_answer(a_pred))
-
-def compute_f1(a_gold, a_pred):
-  gold_toks = get_tokens(a_gold)
-  pred_toks = get_tokens(a_pred)
-  common = collections.Counter(gold_toks) & collections.Counter(pred_toks)
-  num_same = sum(common.values())
-  if len(gold_toks) == 0 or len(pred_toks) == 0:
-    # If either is no-answer, then F1 is 1 if they agree, 0 otherwise
-    return int(gold_toks == pred_toks)
-  if num_same == 0:
-    return 0
-  precision = 1.0 * num_same / len(pred_toks)
-  recall = 1.0 * num_same / len(gold_toks)
-  f1 = (2 * precision * recall) / (precision + recall)
-  return f1
-
-def get_raw_scores(dataset, preds):
-  exact_scores = {}
-  f1_scores = {}
-  for article in dataset:
-    for p in article['paragraphs']:
-      for qa in p['qas']:
-        qid = qa['id']
-        gold_answers = [a['text'] for a in qa['answers']
-                        if normalize_answer(a['text'])]
-        if not gold_answers:
-          # For unanswerable questions, only correct answer is empty string
-          gold_answers = ['']
-        if qid not in preds:
-          print('Missing prediction for %s' % qid)
-          continue
-        a_pred = preds[qid]
-        # Take max over all gold answers
-        exact_scores[qid] = max(compute_exact(a, a_pred) for a in gold_answers)
-        f1_scores[qid] = max(compute_f1(a, a_pred) for a in gold_answers)
-  return exact_scores, f1_scores
-
-def apply_no_ans_threshold(scores, na_probs, qid_to_has_ans, na_prob_thresh):
-  new_scores = {}
-  for qid, s in scores.items():
-    pred_na = na_probs[qid] > na_prob_thresh
-    if pred_na:
-      new_scores[qid] = float(not qid_to_has_ans[qid])
-    else:
-      new_scores[qid] = s
-  return new_scores
-
-def make_eval_dict(exact_scores, f1_scores, qid_list=None):
-  if not qid_list:
-    total = len(exact_scores)
-    return collections.OrderedDict([
-        ('exact', 100.0 * sum(exact_scores.values()) / total),
-        ('f1', 100.0 * sum(f1_scores.values()) / total),
-        ('total', total),
-    ])
-  else:
-    total = len(qid_list)
-    return collections.OrderedDict([
-        ('exact', 100.0 * sum(exact_scores[k] for k in qid_list) / total),
-        ('f1', 100.0 * sum(f1_scores[k] for k in qid_list) / total),
-        ('total', total),
-    ])
-
-def merge_eval(main_eval, new_eval, prefix):
-  for k in new_eval:
-    main_eval['%s_%s' % (prefix, k)] = new_eval[k]
-
-def plot_pr_curve(precisions, recalls, out_image, title):
-  plt.step(recalls, precisions, color='b', alpha=0.2, where='post')
-  plt.fill_between(recalls, precisions, step='post', alpha=0.2, color='b')
-  plt.xlabel('Recall')
-  plt.ylabel('Precision')
-  plt.xlim([0.0, 1.05])
-  plt.ylim([0.0, 1.05])
-  plt.title(title)
-  plt.savefig(out_image)
-  plt.clf()
-
-def make_precision_recall_eval(scores, na_probs, num_true_pos, qid_to_has_ans,
-                               out_image=None, title=None):
-  qid_list = sorted(na_probs, key=lambda k: na_probs[k])
-  true_pos = 0.0
-  cur_p = 1.0
-  cur_r = 0.0
-  precisions = [1.0]
-  recalls = [0.0]
-  avg_prec = 0.0
-  for i, qid in enumerate(qid_list):
-    if qid_to_has_ans[qid]:
-      true_pos += scores[qid]
-    cur_p = true_pos / float(i+1)
-    cur_r = true_pos / float(num_true_pos)
-    if i == len(qid_list) - 1 or na_probs[qid] != na_probs[qid_list[i+1]]:
-      # i.e., if we can put a threshold after this point
-      avg_prec += cur_p * (cur_r - recalls[-1])
-      precisions.append(cur_p)
-      recalls.append(cur_r)
-  if out_image:
-    plot_pr_curve(precisions, recalls, out_image, title)
-  return {'ap': 100.0 * avg_prec}
-
-def run_precision_recall_analysis(main_eval, exact_raw, f1_raw, na_probs, 
-                                  qid_to_has_ans, out_image_dir):
-  if out_image_dir and not os.path.exists(out_image_dir):
-    os.makedirs(out_image_dir)
-  num_true_pos = sum(1 for v in qid_to_has_ans.values() if v)
-  if num_true_pos == 0:
-    return
-  pr_exact = make_precision_recall_eval(
-      exact_raw, na_probs, num_true_pos, qid_to_has_ans,
-      out_image=os.path.join(out_image_dir, 'pr_exact.png'),
-      title='Precision-Recall curve for Exact Match score')
-  pr_f1 = make_precision_recall_eval(
-      f1_raw, na_probs, num_true_pos, qid_to_has_ans,
-      out_image=os.path.join(out_image_dir, 'pr_f1.png'),
-      title='Precision-Recall curve for F1 score')
-  oracle_scores = {k: float(v) for k, v in qid_to_has_ans.items()}
-  pr_oracle = make_precision_recall_eval(
-      oracle_scores, na_probs, num_true_pos, qid_to_has_ans,
-      out_image=os.path.join(out_image_dir, 'pr_oracle.png'),
-      title='Oracle Precision-Recall curve (binary task of HasAns vs. NoAns)')
-  merge_eval(main_eval, pr_exact, 'pr_exact')
-  merge_eval(main_eval, pr_f1, 'pr_f1')
-  merge_eval(main_eval, pr_oracle, 'pr_oracle')
-
-def histogram_na_prob(na_probs, qid_list, image_dir, name):
-  if not qid_list:
-    return
-  x = [na_probs[k] for k in qid_list]
-  weights = np.ones_like(x) / float(len(x))
-  plt.hist(x, weights=weights, bins=20, range=(0.0, 1.0))
-  plt.xlabel('Model probability of no-answer')
-  plt.ylabel('Proportion of dataset')
-  plt.title('Histogram of no-answer probability: %s' % name)
-  plt.savefig(os.path.join(image_dir, 'na_prob_hist_%s.png' % name))
-  plt.clf()
-
-def find_best_thresh(preds, scores, na_probs, qid_to_has_ans):
-  num_no_ans = sum(1 for k in qid_to_has_ans if not qid_to_has_ans[k])
-  cur_score = num_no_ans
-  best_score = cur_score
-  best_thresh = 0.0
-  qid_list = sorted(na_probs, key=lambda k: na_probs[k])
-  for i, qid in enumerate(qid_list):
-    if qid not in scores: continue
-    if qid_to_has_ans[qid]:
-      diff = scores[qid]
-    else:
-      if preds[qid]:
-        diff = -1
-      else:
-        diff = 0
-    cur_score += diff
-    if cur_score > best_score:
-      best_score = cur_score
-      best_thresh = na_probs[qid]
-  return 100.0 * best_score / len(scores), best_thresh
-
-def find_best_thresh_v2(preds, scores, na_probs, qid_to_has_ans):
-  num_no_ans = sum(1 for k in qid_to_has_ans if not qid_to_has_ans[k])
-  cur_score = num_no_ans
-  best_score = cur_score
-  best_thresh = 0.0
-  qid_list = sorted(na_probs, key=lambda k: na_probs[k])
-  for i, qid in enumerate(qid_list):
-    if qid not in scores: continue
-    if qid_to_has_ans[qid]:
-      diff = scores[qid]
-    else:
-      if preds[qid]:
-        diff = -1
-      else:
-        diff = 0
-    cur_score += diff
-    if cur_score > best_score:
-      best_score = cur_score
-      best_thresh = na_probs[qid]
-
-  has_ans_score, has_ans_cnt = 0, 0
-  for qid in qid_list:
-    if not qid_to_has_ans[qid]: continue
-    has_ans_cnt += 1
-
-    if qid not in scores: continue
-    has_ans_score += scores[qid]
-
-  return 100.0 * best_score / len(scores), best_thresh, 1.0 * has_ans_score / has_ans_cnt
-
-def find_all_best_thresh(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans):
-  best_exact, exact_thresh = find_best_thresh(preds, exact_raw, na_probs, qid_to_has_ans)
-  best_f1, f1_thresh = find_best_thresh(preds, f1_raw, na_probs, qid_to_has_ans)
-  main_eval['best_exact'] = best_exact
-  main_eval['best_exact_thresh'] = exact_thresh
-  main_eval['best_f1'] = best_f1
-  main_eval['best_f1_thresh'] = f1_thresh
-
-def find_all_best_thresh_v2(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans):
-  best_exact, exact_thresh, has_ans_exact = find_best_thresh_v2(preds, exact_raw, na_probs, qid_to_has_ans)
-  best_f1, f1_thresh, has_ans_f1 = find_best_thresh_v2(preds, f1_raw, na_probs, qid_to_has_ans)
-  main_eval['best_exact'] = best_exact
-  main_eval['best_exact_thresh'] = exact_thresh
-  main_eval['best_f1'] = best_f1
-  main_eval['best_f1_thresh'] = f1_thresh
-  main_eval['has_ans_exact'] = has_ans_exact
-  main_eval['has_ans_f1'] = has_ans_f1
-
-def main(OPTS):
-  with open(OPTS.data_file) as f:
-    dataset_json = json.load(f)
-    dataset = dataset_json['data']
-  with open(OPTS.pred_file) as f:
-    preds = json.load(f)
-  if OPTS.na_prob_file:
-    with open(OPTS.na_prob_file) as f:
-      na_probs = json.load(f)
-  else:
-    na_probs = {k: 0.0 for k in preds}
-  qid_to_has_ans = make_qid_to_has_ans(dataset)  # maps qid to True/False
-  has_ans_qids = [k for k, v in qid_to_has_ans.items() if v]
-  no_ans_qids = [k for k, v in qid_to_has_ans.items() if not v]
-  exact_raw, f1_raw = get_raw_scores(dataset, preds)
-  exact_thresh = apply_no_ans_threshold(exact_raw, na_probs, qid_to_has_ans,
-                                        OPTS.na_prob_thresh)
-  f1_thresh = apply_no_ans_threshold(f1_raw, na_probs, qid_to_has_ans,
-                                     OPTS.na_prob_thresh)
-  out_eval = make_eval_dict(exact_thresh, f1_thresh)
-  if has_ans_qids:
-    has_ans_eval = make_eval_dict(exact_thresh, f1_thresh, qid_list=has_ans_qids)
-    merge_eval(out_eval, has_ans_eval, 'HasAns')
-  if no_ans_qids:
-    no_ans_eval = make_eval_dict(exact_thresh, f1_thresh, qid_list=no_ans_qids)
-    merge_eval(out_eval, no_ans_eval, 'NoAns')
-  if OPTS.na_prob_file:
-    find_all_best_thresh(out_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans)
-  if OPTS.na_prob_file and OPTS.out_image_dir:
-    run_precision_recall_analysis(out_eval, exact_raw, f1_raw, na_probs, 
-                                  qid_to_has_ans, OPTS.out_image_dir)
-    histogram_na_prob(na_probs, has_ans_qids, OPTS.out_image_dir, 'hasAns')
-    histogram_na_prob(na_probs, no_ans_qids, OPTS.out_image_dir, 'noAns')
-  if OPTS.out_file:
-    with open(OPTS.out_file, 'w') as f:
-      json.dump(out_eval, f)
-  else:
-    print(json.dumps(out_eval, indent=2))
-  return out_eval
-
-if __name__ == '__main__':
-  OPTS = parse_args()
-  if OPTS.out_image_dir:
-    import matplotlib
-    matplotlib.use('Agg')
-    import matplotlib.pyplot as plt 
-  main(OPTS)
diff --git a/transformers/data/metrics/squad_metrics.py b/transformers/data/metrics/squad_metrics.py
index f8449df045..0755c0ab7a 100644
--- a/transformers/data/metrics/squad_metrics.py
+++ b/transformers/data/metrics/squad_metrics.py
@@ -578,7 +578,6 @@ def compute_predictions_log_probs(
     output_prediction_file,
     output_nbest_file,
     output_null_log_odds_file,
-    orig_data_file,
     start_n_top,
     end_n_top,
     version_2_with_negative,
@@ -756,15 +755,4 @@ def compute_predictions_log_probs(
         with open(output_null_log_odds_file, "w") as writer:
             writer.write(json.dumps(scores_diff_json, indent=4) + "\n")
 
-    with open(orig_data_file, "r", encoding='utf-8') as reader:
-        orig_data = json.load(reader)["data"]
-
-    qid_to_has_ans = make_qid_to_has_ans(orig_data)
-    has_ans_qids = [k for k, v in qid_to_has_ans.items() if v]
-    no_ans_qids = [k for k, v in qid_to_has_ans.items() if not v]
-    exact_raw, f1_raw = get_raw_scores(orig_data, all_predictions)
-    out_eval = {}
-
-    find_all_best_thresh_v2(out_eval, all_predictions, exact_raw, f1_raw, scores_diff_json, qid_to_has_ans)
-
-    return out_eval
+    return all_predictions
diff --git a/transformers/data/processors/squad.py b/transformers/data/processors/squad.py
index bb56aa792f..3d7f832540 100644
--- a/transformers/data/processors/squad.py
+++ b/transformers/data/processors/squad.py
@@ -9,7 +9,7 @@ from ...tokenization_bert import BasicTokenizer, whitespace_tokenize
 from .utils import DataProcessor, InputExample, InputFeatures
 from ...file_utils import is_tf_available, is_torch_available
 
-if is_torch_available:
+if is_torch_available():
     import torch
     from torch.utils.data import TensorDataset
 

From d0383e4daf44557e56cd4cbc5dc95b1d35457768 Mon Sep 17 00:00:00 2001
From: patrickvonplaten <patrick.v.platen@gmail.com>
Date: Fri, 6 Dec 2019 01:24:22 +0100
Subject: [PATCH 32/91] corrected documentation for past tensor shape for ctrl
 and gpt2 model

---
 transformers/modeling_ctrl.py    | 4 ++--
 transformers/modeling_gpt2.py    | 6 +++---
 transformers/modeling_tf_ctrl.py | 4 ++--
 transformers/modeling_tf_gpt2.py | 6 +++---
 4 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/transformers/modeling_ctrl.py b/transformers/modeling_ctrl.py
index 3a252941ac..97bcb14434 100644
--- a/transformers/modeling_ctrl.py
+++ b/transformers/modeling_ctrl.py
@@ -252,7 +252,7 @@ class CTRLModel(CTRLPreTrainedModel):
         **last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``
             Sequence of hidden-states at the last layer of the model.
         **past**:
-            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(2, batch_size, num_heads, sequence_length, embed_size_per_head)``:
             that contains pre-computed hidden-states (key and values in the attention blocks).
             Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model 
             should not be passed as input ids as they have already been computed.
@@ -438,7 +438,7 @@ class CTRLLMHeadModel(CTRLPreTrainedModel):
         **prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
             Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
         **past**:
-            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(2, batch_size, num_heads, sequence_length, embed_size_per_head)``:
             that contains pre-computed hidden-states (key and values in the attention blocks).
             Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model 
             should not be passed as input ids as they have already been computed.
diff --git a/transformers/modeling_gpt2.py b/transformers/modeling_gpt2.py
index 35bc5c8d6e..96fd1c0607 100644
--- a/transformers/modeling_gpt2.py
+++ b/transformers/modeling_gpt2.py
@@ -329,7 +329,7 @@ class GPT2Model(GPT2PreTrainedModel):
         **last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``
             Sequence of hidden-states at the last layer of the model.
         **past**:
-            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(2, batch_size, num_heads, sequence_length, embed_size_per_head)``:
             that contains pre-computed hidden-states (key and values in the attention blocks).
             Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model 
             should not be passed as input ids as they have already been computed.
@@ -503,7 +503,7 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
         **prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
             Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
         **past**:
-            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(2, batch_size, num_heads, sequence_length, embed_size_per_head)``:
             that contains pre-computed hidden-states (key and values in the attention blocks).
             Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model 
             should not be passed as input ids as they have already been computed.
@@ -596,7 +596,7 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
         **mc_prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, num_choices)``
             Prediction scores of the multiplechoice classification head (scores for each choice before SoftMax).
         **past**:
-            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(2, batch_size, num_heads, sequence_length, embed_size_per_head)``:
             that contains pre-computed hidden-states (key and values in the attention blocks).
             Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model 
             should not be passed as input ids as they have already been computed.
diff --git a/transformers/modeling_tf_ctrl.py b/transformers/modeling_tf_ctrl.py
index 6d0d6a57ad..29ee5113a4 100644
--- a/transformers/modeling_tf_ctrl.py
+++ b/transformers/modeling_tf_ctrl.py
@@ -400,7 +400,7 @@ class TFCTRLModel(TFCTRLPreTrainedModel):
         **last_hidden_state**: ``tf.Tensor`` of shape ``(batch_size, sequence_length, hidden_size)``
             Sequence of hidden-states at the last layer of the model.
         **past**:
-            list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            list of ``tf.Tensor`` (one for each layer) of shape ``(2, batch_size, num_heads, sequence_length, embed_size_per_head)``:
             that contains pre-computed hidden-states (key and values in the attention blocks).
             Can be used (see `past` input) to speed up sequential decoding.
         **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
@@ -462,7 +462,7 @@ class TFCTRLLMHeadModel(TFCTRLPreTrainedModel):
         **prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
             Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
         **past**:
-            list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            list of ``tf.Tensor`` (one for each layer) of shape ``(2, batch_size, num_heads, sequence_length, embed_size_per_head)``:
             that contains pre-computed hidden-states (key and values in the attention blocks).
             Can be used (see `past` input) to speed up sequential decoding.
         **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
diff --git a/transformers/modeling_tf_gpt2.py b/transformers/modeling_tf_gpt2.py
index aebe790114..c738e5e8e3 100644
--- a/transformers/modeling_tf_gpt2.py
+++ b/transformers/modeling_tf_gpt2.py
@@ -436,7 +436,7 @@ class TFGPT2Model(TFGPT2PreTrainedModel):
         **last_hidden_state**: ``tf.Tensor`` of shape ``(batch_size, sequence_length, hidden_size)``
             Sequence of hidden-states at the last layer of the model.
         **past**:
-            list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            list of ``tf.Tensor`` (one for each layer) of shape ``(2, batch_size, num_heads, sequence_length, embed_size_per_head)``:
             that contains pre-computed hidden-states (key and values in the attention blocks).
             Can be used (see `past` input) to speed up sequential decoding.
         **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
@@ -476,7 +476,7 @@ class TFGPT2LMHeadModel(TFGPT2PreTrainedModel):
         **prediction_scores**: `tf.Tensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
             Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
         **past**:
-            list of `tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            list of ``tf.Tensor`` (one for each layer) of shape ``(2, batch_size, num_heads, sequence_length, embed_size_per_head)``:
             that contains pre-computed hidden-states (key and values in the attention blocks).
             Can be used (see `past` input) to speed up sequential decoding.
         **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
@@ -535,7 +535,7 @@ class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel):
         **mc_prediction_scores**: `tf.Tensor`` of shape ``(batch_size, num_choices)``
             Prediction scores of the multiplechoice classification head (scores for each choice before SoftMax).
         **past**:
-            list of `tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            list of ``tf.Tensor`` (one for each layer) of shape ``(2, batch_size, num_heads, sequence_length, embed_size_per_head)``:
             that contains pre-computed hidden-states (key and values in the attention blocks).
             Can be used (see `past` input) to speed up sequential decoding.
         **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)

From 1d87b37d100c69ff3b2c1a5dfd271b6cf777176e Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Fri, 6 Dec 2019 15:30:09 +0100
Subject: [PATCH 33/91] updating

---
 .../convert_pytorch_checkpoint_to_tf2.py      | 20 ++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/transformers/convert_pytorch_checkpoint_to_tf2.py b/transformers/convert_pytorch_checkpoint_to_tf2.py
index d20eafe2e9..2c419888e8 100644
--- a/transformers/convert_pytorch_checkpoint_to_tf2.py
+++ b/transformers/convert_pytorch_checkpoint_to_tf2.py
@@ -119,10 +119,10 @@ def convert_pt_checkpoint_to_tf(model_type, pytorch_checkpoint_path, config_file
         tf_inputs = tf.constant(inputs_list)
         tfo = tf_model(tf_inputs, training=False)  # build the network
 
-        pt_model = pt_model_class(config)
-        pt_model.load_state_dict(torch.load(pytorch_checkpoint_path, map_location='cpu'),
-                                 strict-False)
-        pt_model.eval()
+        state_dict = torch.load(pytorch_checkpoint_path, map_location='cpu')
+        pt_model = pt_model_class.from_pretrained(pretrained_model_name_or_path=None,
+                                                  config=config,
+                                                  state_dict=state_dict)
 
         pt_inputs = torch.tensor(inputs_list)
         with torch.no_grad():
@@ -140,7 +140,7 @@ def convert_pt_checkpoint_to_tf(model_type, pytorch_checkpoint_path, config_file
 
 
 def convert_all_pt_checkpoints_to_tf(args_model_type, tf_dump_path, model_shortcut_names_or_path=None, config_shortcut_names_or_path=None,
-                                     compare_with_pt_model=False, use_cached_models=False, only_convert_finetuned_models=False):
+                                     compare_with_pt_model=False, use_cached_models=False, remove_cached_files=False, only_convert_finetuned_models=False):
     assert os.path.isdir(args.tf_dump_path), "--tf_dump_path should be a directory"
 
     if args_model_type is None:
@@ -188,13 +188,15 @@ def convert_all_pt_checkpoints_to_tf(args_model_type, tf_dump_path, model_shortc
 
             if os.path.isfile(model_shortcut_name):
                 model_shortcut_name = 'converted_model'
+
             convert_pt_checkpoint_to_tf(model_type=model_type,
                                         pytorch_checkpoint_path=model_file,
                                         config_file=config_file,
                                         tf_dump_path=os.path.join(tf_dump_path, model_shortcut_name + '-tf_model.h5'),
                                         compare_with_pt_model=compare_with_pt_model)
-            os.remove(config_file)
-            os.remove(model_file)
+            if remove_cached_files:
+                os.remove(config_file)
+                os.remove(model_file)
 
 
 if __name__ == "__main__":
@@ -227,6 +229,9 @@ if __name__ == "__main__":
     parser.add_argument("--use_cached_models",
                         action='store_true',
                         help = "Use cached models if possible instead of updating to latest checkpoint versions.")
+    parser.add_argument("--remove_cached_files",
+                        action='store_true',
+                        help = "Remove pytorch models after conversion (save memory when converting in batches).")
     parser.add_argument("--only_convert_finetuned_models",
                         action='store_true',
                         help = "Only convert finetuned models.")
@@ -246,4 +251,5 @@ if __name__ == "__main__":
                                         config_shortcut_names_or_path=[args.config_file] if args.config_file is not None else None,
                                         compare_with_pt_model=args.compare_with_pt_model,
                                         use_cached_models=args.use_cached_models,
+                                        remove_cached_files=args.remove_cached_files,
                                         only_convert_finetuned_models=args.only_convert_finetuned_models)

From e4679cddced7d746427066a78e8079fb40e51528 Mon Sep 17 00:00:00 2001
From: Julien Chaumond <chaumond@gmail.com>
Date: Fri, 6 Dec 2019 11:56:23 -0500
Subject: [PATCH 34/91] [cli] Uploads: add progress bar (#2078)

* [cli] Uploads: add progress bar

see https://github.com/huggingface/transformers/pull/2044#discussion_r354057827 for context

* rename + documentation

* Add auto-referential comment
---
 transformers/hf_api.py | 34 +++++++++++++++++++++++++++++++++-
 1 file changed, 33 insertions(+), 1 deletion(-)

diff --git a/transformers/hf_api.py b/transformers/hf_api.py
index c21592a838..3bbb6c567a 100644
--- a/transformers/hf_api.py
+++ b/transformers/hf_api.py
@@ -16,10 +16,11 @@ from __future__ import absolute_import, division, print_function
 
 import os
 from os.path import expanduser
-import six
 
 import requests
+import six
 from requests.exceptions import HTTPError
+from tqdm import tqdm
 
 ENDPOINT = "https://huggingface.co"
 
@@ -129,10 +130,13 @@ class HfApi:
         # Even though we presign with the correct content-type,
         # the client still has to specify it when uploading the file.
         with open(filepath, "rb") as f:
+            pf = TqdmProgressFileReader(f)
+
             r = requests.put(urls.write, data=f, headers={
                 "content-type": urls.type,
             })
             r.raise_for_status()
+            pf.close()
         return urls.access
 
     def list_objs(self, token):
@@ -148,6 +152,34 @@ class HfApi:
 
 
 
+class TqdmProgressFileReader:
+    """
+    Wrap an io.BufferedReader `f` (such as the output of `open(…, "rb")`)
+    and override `f.read()` so as to display a tqdm progress bar.
+
+    see github.com/huggingface/transformers/pull/2078#discussion_r354739608
+    for implementation details.
+    """
+    def __init__(
+        self,
+        f   # type: io.BufferedReader
+    ):
+        self.f = f
+        self.total_size = os.fstat(f.fileno()).st_size # type: int
+        self.pbar = tqdm(total=self.total_size, leave=False)
+        if six.PY3:
+            # does not work unless PY3
+            # no big deal as the CLI does not currently support PY2 anyways.
+            self.read = f.read
+            f.read = self._read
+
+    def _read(self, n=-1):
+        self.pbar.update(n)
+        return self.read(n)
+
+    def close(self):
+        self.pbar.close()
+
 
 
 class HfFolder:

From 35401fe50fa3e460b2a4422630b017f106c79e03 Mon Sep 17 00:00:00 2001
From: Aymeric Augustin <aymeric.augustin@fractalideas.com>
Date: Fri, 6 Dec 2019 19:57:38 +0100
Subject: [PATCH 35/91] Remove dependency on pytest for running tests (#2055)

* Switch to plain unittest for skipping slow tests.

Add a RUN_SLOW environment variable for running them.

* Switch to plain unittest for PyTorch dependency.

* Switch to plain unittest for TensorFlow dependency.

* Avoid leaking open files in the test suite.

This prevents spurious warnings when running tests.

* Fix unicode warning on Python 2 when running tests.

The warning was:

    UnicodeWarning: Unicode equal comparison failed to convert both arguments to Unicode - interpreting them as being unequal

* Support running PyTorch tests on a GPU.

Reverts 27e015bd.

* Tests no longer require pytest.

* Make tests pass on cuda
---
 README.md                                     | 11 +++-
 docs/source/installation.md                   | 11 +++-
 setup.py                                      |  1 -
 .../tests/modeling_tf_xxx_test.py             |  7 +-
 .../tests/modeling_xxx_test.py                | 12 ++--
 transformers/modeling_openai.py               |  6 +-
 transformers/tests/conftest.py                | 31 ---------
 transformers/tests/modeling_albert_test.py    | 11 ++--
 transformers/tests/modeling_auto_test.py      | 14 ++--
 transformers/tests/modeling_bert_test.py      | 38 +++++------
 transformers/tests/modeling_common_test.py    | 43 ++++++++++---
 transformers/tests/modeling_ctrl_test.py      |  9 +--
 .../tests/modeling_distilbert_test.py         | 12 ++--
 .../tests/modeling_encoder_decoder_test.py    |  7 +-
 transformers/tests/modeling_gpt2_test.py      | 10 +--
 transformers/tests/modeling_openai_test.py    | 10 +--
 transformers/tests/modeling_roberta_test.py   | 22 ++++---
 transformers/tests/modeling_tf_albert_test.py |  7 +-
 transformers/tests/modeling_tf_auto_test.py   | 14 ++--
 transformers/tests/modeling_tf_bert_test.py   |  7 +-
 transformers/tests/modeling_tf_common_test.py |  8 +--
 transformers/tests/modeling_tf_ctrl_test.py   |  7 +-
 .../tests/modeling_tf_distilbert_test.py      |  7 +-
 transformers/tests/modeling_tf_gpt2_test.py   |  7 +-
 .../tests/modeling_tf_openai_gpt_test.py      |  7 +-
 .../tests/modeling_tf_roberta_test.py         | 19 +++---
 .../tests/modeling_tf_transfo_xl_test.py      |  7 +-
 transformers/tests/modeling_tf_xlm_test.py    |  7 +-
 transformers/tests/modeling_tf_xlnet_test.py  | 10 +--
 .../tests/modeling_transfo_xl_test.py         | 10 +--
 transformers/tests/modeling_xlm_test.py       | 12 ++--
 transformers/tests/modeling_xlnet_test.py     | 21 ++++--
 transformers/tests/optimization_test.py       |  6 +-
 transformers/tests/tokenization_auto_test.py  |  5 +-
 transformers/tests/tokenization_bert_test.py  |  4 +-
 .../tests/tokenization_distilbert_test.py     |  4 +-
 .../tests/tokenization_roberta_test.py        |  4 +-
 .../tests/tokenization_tests_commons.py       |  6 +-
 .../tests/tokenization_transfo_xl_test.py     |  6 +-
 transformers/tests/tokenization_utils_test.py |  6 +-
 transformers/tests/tokenization_xlm_test.py   |  4 +-
 transformers/tests/tokenization_xlnet_test.py |  4 +-
 transformers/tests/utils.py                   | 64 +++++++++++++++++++
 transformers/tokenization_albert.py           |  8 +--
 transformers/tokenization_ctrl.py             |  6 +-
 transformers/tokenization_gpt2.py             | 12 ++--
 transformers/tokenization_openai.py           |  6 +-
 transformers/tokenization_utils.py            | 13 ++--
 transformers/tokenization_xlm.py              |  8 ++-
 transformers/tokenization_xlnet.py            |  4 +-
 50 files changed, 344 insertions(+), 231 deletions(-)
 delete mode 100644 transformers/tests/conftest.py
 create mode 100644 transformers/tests/utils.py

diff --git a/README.md b/README.md
index ddeabe08d6..64ec631651 100644
--- a/README.md
+++ b/README.md
@@ -101,17 +101,26 @@ pip install [--editable] .
 
 A series of tests are included for the library and the example scripts. Library tests can be found in the [tests folder](https://github.com/huggingface/transformers/tree/master/transformers/tests) and examples tests in the [examples folder](https://github.com/huggingface/transformers/tree/master/examples).
 
-These tests can be run using `pytest` (install pytest if needed with `pip install pytest`).
+These tests can be run using `unittest` or `pytest` (install pytest if needed with `pip install pytest`).
 
 Depending on which framework is installed (TensorFlow 2.0 and/or PyTorch), the irrelevant tests will be skipped. Ensure that both frameworks are installed if you want to execute all tests.
 
 You can run the tests from the root of the cloned repository with the commands:
 
+```bash
+python -m unittest discover -s transformers/tests -p "*test.py" -t .
+python -m unittest discover -s examples -p "*test.py" -t examples
+```
+
+or
+
 ```bash
 python -m pytest -sv ./transformers/tests/
 python -m pytest -sv ./examples/
 ```
 
+By default, slow tests are skipped. Set the `RUN_SLOW` environment variable to `yes` to run them.
+
 ### Do you want to run a Transformer model on a mobile device?
 
 You should check out our [`swift-coreml-transformers`](https://github.com/huggingface/swift-coreml-transformers) repo.
diff --git a/docs/source/installation.md b/docs/source/installation.md
index 11beb1ab3a..6263f7604d 100644
--- a/docs/source/installation.md
+++ b/docs/source/installation.md
@@ -24,15 +24,24 @@ pip install [--editable] .
 
 An extensive test suite is included to test the library behavior and several examples. Library tests can be found in the [tests folder](https://github.com/huggingface/transformers/tree/master/transformers/tests) and examples tests in the [examples folder](https://github.com/huggingface/transformers/tree/master/examples).
 
-Tests can be run using `pytest` (install pytest if needed with `pip install pytest`).
+Tests can be run using `unittest` or `pytest` (install pytest if needed with `pip install pytest`).
 
 Run all the tests from the root of the cloned repository with the commands:
 
+```bash
+python -m unittest discover -s transformers/tests -p "*test.py" -t .
+python -m unittest discover -s examples -p "*test.py" -t examples
+```
+
+or
+
 ``` bash
 python -m pytest -sv ./transformers/tests/
 python -m pytest -sv ./examples/
 ```
 
+By default, slow tests are skipped. Set the `RUN_SLOW` environment variable to `yes` to run them.
+
 ## OpenAI GPT original tokenization workflow
 
 If you want to reproduce the original tokenization process of the `OpenAI GPT` paper, you will need to install `ftfy` (use version 4.4.3 if you are using Python 2) and `SpaCy`:
diff --git a/setup.py b/setup.py
index 25f503f8d0..c4af32df83 100644
--- a/setup.py
+++ b/setup.py
@@ -72,7 +72,6 @@ setup(
         'transformers-cli'
     ],
     # python_requires='>=3.5.0',
-    tests_require=['pytest'],
     classifiers=[
           'Intended Audience :: Science/Research',
           'License :: OSI Approved :: Apache Software License',
diff --git a/templates/adding_a_new_model/tests/modeling_tf_xxx_test.py b/templates/adding_a_new_model/tests/modeling_tf_xxx_test.py
index 90837ca1ea..d7e576bf8b 100644
--- a/templates/adding_a_new_model/tests/modeling_tf_xxx_test.py
+++ b/templates/adding_a_new_model/tests/modeling_tf_xxx_test.py
@@ -18,11 +18,11 @@ from __future__ import print_function
 
 import unittest
 import shutil
-import pytest
 import sys
 
 from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
+from .utils import require_tf, slow
 
 from transformers import XxxConfig, is_tf_available
 
@@ -33,10 +33,9 @@ if is_tf_available():
                                                TFXxxForTokenClassification,
                                                TFXxxForQuestionAnswering,
                                                TF_XXX_PRETRAINED_MODEL_ARCHIVE_MAP)
-else:
-    pytestmark = pytest.mark.skip("Require TensorFlow")
 
 
+@require_tf
 class TFXxxModelTest(TFCommonTestCases.TFCommonModelTester):
 
     all_model_classes = (TFXxxModel, TFXxxForMaskedLM, TFXxxForQuestionAnswering,
@@ -244,7 +243,7 @@ class TFXxxModelTest(TFCommonTestCases.TFCommonModelTester):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_xxx_for_token_classification(*config_and_inputs)
 
-    @pytest.mark.slow
+    @slow
     def test_model_from_pretrained(self):
         cache_dir = "/tmp/transformers_test/"
         for model_name in ['xxx-base-uncased']:
diff --git a/templates/adding_a_new_model/tests/modeling_xxx_test.py b/templates/adding_a_new_model/tests/modeling_xxx_test.py
index 8c0cc3cf32..bfc70921cd 100644
--- a/templates/adding_a_new_model/tests/modeling_xxx_test.py
+++ b/templates/adding_a_new_model/tests/modeling_xxx_test.py
@@ -18,12 +18,12 @@ from __future__ import print_function
 
 import unittest
 import shutil
-import pytest
 
 from transformers import is_torch_available
 
 from .modeling_common_test import (CommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
+from .utils import require_torch, slow, torch_device
 
 if is_torch_available():
     from transformers import (XxxConfig, XxxModel, XxxForMaskedLM,
@@ -31,10 +31,9 @@ if is_torch_available():
                                         XxxForQuestionAnswering, XxxForSequenceClassification,
                                         XxxForTokenClassification, XxxForMultipleChoice)
     from transformers.modeling_xxx import XXX_PRETRAINED_MODEL_ARCHIVE_MAP
-else:
-    pytestmark = pytest.mark.skip("Require Torch")
 
 
+@require_torch
 class XxxModelTest(CommonTestCases.CommonModelTester):
 
     all_model_classes = (XxxModel, XxxForMaskedLM, XxxForQuestionAnswering,
@@ -131,6 +130,7 @@ class XxxModelTest(CommonTestCases.CommonModelTester):
 
         def create_and_check_xxx_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             model = XxxModel(config=config)
+            model.to(torch_device)
             model.eval()
             sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
             sequence_output, pooled_output = model(input_ids, token_type_ids=token_type_ids)
@@ -148,6 +148,7 @@ class XxxModelTest(CommonTestCases.CommonModelTester):
 
         def create_and_check_xxx_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             model = XxxForMaskedLM(config=config)
+            model.to(torch_device)
             model.eval()
             loss, prediction_scores = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels)
             result = {
@@ -162,6 +163,7 @@ class XxxModelTest(CommonTestCases.CommonModelTester):
 
         def create_and_check_xxx_for_question_answering(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             model = XxxForQuestionAnswering(config=config)
+            model.to(torch_device)
             model.eval()
             loss, start_logits, end_logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids,
                                                    start_positions=sequence_labels, end_positions=sequence_labels)
@@ -182,6 +184,7 @@ class XxxModelTest(CommonTestCases.CommonModelTester):
         def create_and_check_xxx_for_sequence_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             config.num_labels = self.num_labels
             model = XxxForSequenceClassification(config)
+            model.to(torch_device)
             model.eval()
             loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels)
             result = {
@@ -197,6 +200,7 @@ class XxxModelTest(CommonTestCases.CommonModelTester):
         def create_and_check_xxx_for_token_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             config.num_labels = self.num_labels
             model = XxxForTokenClassification(config=config)
+            model.to(torch_device)
             model.eval()
             loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
             result = {
@@ -243,7 +247,7 @@ class XxxModelTest(CommonTestCases.CommonModelTester):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_xxx_for_token_classification(*config_and_inputs)
 
-    @pytest.mark.slow
+    @slow
     def test_model_from_pretrained(self):
         cache_dir = "/tmp/transformers_test/"
         for model_name in list(XXX_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
diff --git a/transformers/modeling_openai.py b/transformers/modeling_openai.py
index e88f55c3ea..4fe7ffee8b 100644
--- a/transformers/modeling_openai.py
+++ b/transformers/modeling_openai.py
@@ -50,8 +50,10 @@ def load_tf_weights_in_openai_gpt(model, config, openai_checkpoint_folder_path):
 
     logger.info("Loading weights from {}".format(openai_checkpoint_folder_path))
 
-    names = json.load(open(openai_checkpoint_folder_path + '/parameters_names.json', "r", encoding='utf-8'))
-    shapes = json.load(open(openai_checkpoint_folder_path + '/params_shapes.json', "r", encoding='utf-8'))
+    with open(openai_checkpoint_folder_path + '/parameters_names.json', "r", encoding='utf-8') as names_handle:
+        names = json.load(names_handle)
+    with open(openai_checkpoint_folder_path + '/params_shapes.json', "r", encoding='utf-8') as shapes_handle:
+        shapes = json.load(shapes_handle)
     offsets = np.cumsum([np.prod(shape) for shape in shapes])
     init_params = [np.load(openai_checkpoint_folder_path + '/params_{}.npy'.format(n)) for n in range(10)]
     init_params = np.split(np.concatenate(init_params, 0), offsets)[:-1]
diff --git a/transformers/tests/conftest.py b/transformers/tests/conftest.py
deleted file mode 100644
index f809234cd5..0000000000
--- a/transformers/tests/conftest.py
+++ /dev/null
@@ -1,31 +0,0 @@
-# content of conftest.py
-
-import pytest
-
-
-def pytest_addoption(parser):
-    parser.addoption(
-        "--runslow", action="store_true", default=False, help="run slow tests"
-    )
-    parser.addoption(
-        "--use_cuda", action="store_true", default=False, help="run tests on gpu"
-    )
-
-
-def pytest_configure(config):
-    config.addinivalue_line("markers", "slow: mark test as slow to run")
-
-
-def pytest_collection_modifyitems(config, items):
-    if config.getoption("--runslow"):
-        # --runslow given in cli: do not skip slow tests
-        return
-    skip_slow = pytest.mark.skip(reason="need --runslow option to run")
-    for item in items:
-        if "slow" in item.keywords:
-            item.add_marker(skip_slow)
-
-@pytest.fixture
-def use_cuda(request):
-    """ Run test on gpu """
-    return request.config.getoption("--use_cuda")
diff --git a/transformers/tests/modeling_albert_test.py b/transformers/tests/modeling_albert_test.py
index 976feff9db..a14d66ae8f 100644
--- a/transformers/tests/modeling_albert_test.py
+++ b/transformers/tests/modeling_albert_test.py
@@ -18,22 +18,21 @@ from __future__ import print_function
 
 import unittest
 import shutil
-import pytest
 
 from transformers import is_torch_available
 
 from .modeling_common_test import (CommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
+from .utils import require_torch, slow, torch_device
 
 if is_torch_available():
     from transformers import (AlbertConfig, AlbertModel, AlbertForMaskedLM,
                               AlbertForSequenceClassification, AlbertForQuestionAnswering,
                               )
     from transformers.modeling_albert import ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP
-else:
-    pytestmark = pytest.mark.skip("Require Torch")
 
 
+@require_torch
 class AlbertModelTest(CommonTestCases.CommonModelTester):
 
     all_model_classes = (AlbertModel, AlbertForMaskedLM) if is_torch_available() else ()
@@ -133,6 +132,7 @@ class AlbertModelTest(CommonTestCases.CommonModelTester):
 
         def create_and_check_albert_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             model = AlbertModel(config=config)
+            model.to(torch_device)
             model.eval()
             sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
             sequence_output, pooled_output = model(input_ids, token_type_ids=token_type_ids)
@@ -150,6 +150,7 @@ class AlbertModelTest(CommonTestCases.CommonModelTester):
 
         def create_and_check_albert_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             model = AlbertForMaskedLM(config=config)
+            model.to(torch_device)
             model.eval()
             loss, prediction_scores = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels)
             result = {
@@ -163,6 +164,7 @@ class AlbertModelTest(CommonTestCases.CommonModelTester):
 
         def create_and_check_albert_for_question_answering(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             model = AlbertForQuestionAnswering(config=config)
+            model.to(torch_device)
             model.eval()
             loss, start_logits, end_logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids,
                                                    start_positions=sequence_labels, end_positions=sequence_labels)
@@ -183,6 +185,7 @@ class AlbertModelTest(CommonTestCases.CommonModelTester):
         def create_and_check_albert_for_sequence_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             config.num_labels = self.num_labels
             model = AlbertForSequenceClassification(config)
+            model.to(torch_device)
             model.eval()
             loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels)
             result = {
@@ -225,7 +228,7 @@ class AlbertModelTest(CommonTestCases.CommonModelTester):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_albert_for_sequence_classification(*config_and_inputs)
 
-    @pytest.mark.slow
+    @slow
     def test_model_from_pretrained(self):
         cache_dir = "/tmp/transformers_test/"
         for model_name in list(ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
diff --git a/transformers/tests/modeling_auto_test.py b/transformers/tests/modeling_auto_test.py
index 6d2c7ec979..9b7d920bc8 100644
--- a/transformers/tests/modeling_auto_test.py
+++ b/transformers/tests/modeling_auto_test.py
@@ -18,11 +18,12 @@ from __future__ import print_function
 
 import unittest
 import shutil
-import pytest
 import logging
 
 from transformers import is_torch_available
 
+from .utils import require_torch, slow
+
 if is_torch_available():
     from transformers import (AutoConfig, BertConfig,
                                     AutoModel, BertModel,
@@ -33,12 +34,11 @@ if is_torch_available():
 
     from .modeling_common_test import (CommonTestCases, ids_tensor)
     from .configuration_common_test import ConfigTester
-else:
-    pytestmark = pytest.mark.skip("Require Torch")
 
 
+@require_torch
 class AutoModelTest(unittest.TestCase):
-    @pytest.mark.slow
+    @slow
     def test_model_from_pretrained(self):
         logging.basicConfig(level=logging.INFO)
         for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
@@ -53,7 +53,7 @@ class AutoModelTest(unittest.TestCase):
             for value in loading_info.values():
                 self.assertEqual(len(value), 0)
 
-    @pytest.mark.slow
+    @slow
     def test_lmhead_model_from_pretrained(self):
         logging.basicConfig(level=logging.INFO)
         for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
@@ -66,7 +66,7 @@ class AutoModelTest(unittest.TestCase):
             self.assertIsNotNone(model)
             self.assertIsInstance(model, BertForMaskedLM)
 
-    @pytest.mark.slow
+    @slow
     def test_sequence_classification_model_from_pretrained(self):
         logging.basicConfig(level=logging.INFO)
         for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
@@ -79,7 +79,7 @@ class AutoModelTest(unittest.TestCase):
             self.assertIsNotNone(model)
             self.assertIsInstance(model, BertForSequenceClassification)
 
-    @pytest.mark.slow
+    @slow
     def test_question_answering_model_from_pretrained(self):
         logging.basicConfig(level=logging.INFO)
         for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
diff --git a/transformers/tests/modeling_bert_test.py b/transformers/tests/modeling_bert_test.py
index 6c93c9a187..539f66cd3f 100644
--- a/transformers/tests/modeling_bert_test.py
+++ b/transformers/tests/modeling_bert_test.py
@@ -18,12 +18,12 @@ from __future__ import print_function
 
 import unittest
 import shutil
-import pytest
 
 from transformers import is_torch_available
 
 from .modeling_common_test import (CommonTestCases, ids_tensor, floats_tensor)
 from .configuration_common_test import ConfigTester
+from .utils import require_torch, slow, torch_device
 
 if is_torch_available():
     from transformers import (BertConfig, BertModel, BertForMaskedLM,
@@ -31,11 +31,9 @@ if is_torch_available():
                               BertForQuestionAnswering, BertForSequenceClassification,
                               BertForTokenClassification, BertForMultipleChoice)
     from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP
-else:
-    pytestmark = pytest.mark.skip("Require Torch")
 
 
-@pytest.mark.usefixtures("use_cuda")
+@require_torch
 class BertModelTest(CommonTestCases.CommonModelTester):
 
     all_model_classes = (BertModel, BertForMaskedLM, BertForNextSentencePrediction,
@@ -67,7 +65,6 @@ class BertModelTest(CommonTestCases.CommonModelTester):
                      num_labels=3,
                      num_choices=4,
                      scope=None,
-                     device='cpu',
                      ):
             self.parent = parent
             self.batch_size = batch_size
@@ -91,26 +88,25 @@ class BertModelTest(CommonTestCases.CommonModelTester):
             self.num_labels = num_labels
             self.num_choices = num_choices
             self.scope = scope
-            self.device = device
 
         def prepare_config_and_inputs(self):
-            input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size).to(self.device)
+            input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
 
             input_mask = None
             if self.use_input_mask:
-                input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2).to(self.device)
+                input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
 
             token_type_ids = None
             if self.use_token_type_ids:
-                token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size).to(self.device)
+                token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
 
             sequence_labels = None
             token_labels = None
             choice_labels = None
             if self.use_labels:
-                sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size).to(self.device)
-                token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels).to(self.device)
-                choice_labels = ids_tensor([self.batch_size], self.num_choices).to(self.device)
+                sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
+                token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
+                choice_labels = ids_tensor([self.batch_size], self.num_choices)
 
             config = BertConfig(
                 vocab_size_or_config_json_file=self.vocab_size,
@@ -144,7 +140,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
 
         def create_and_check_bert_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             model = BertModel(config=config)
-            model.to(input_ids.device)
+            model.to(torch_device)
             model.eval()
             sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
             sequence_output, pooled_output = model(input_ids, token_type_ids=token_type_ids)
@@ -161,6 +157,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
 
         def create_and_check_bert_model_as_decoder(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels, encoder_hidden_states, encoder_attention_mask):
             model = BertModel(config)
+            model.to(torch_device)
             model.eval()
             sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_attention_mask)
             sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, encoder_hidden_states=encoder_hidden_states)
@@ -177,6 +174,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
 
         def create_and_check_bert_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             model = BertForMaskedLM(config=config)
+            model.to(torch_device)
             model.eval()
             loss, prediction_scores = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels)
             result = {
@@ -190,6 +188,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
 
         def create_and_check_bert_model_for_masked_lm_as_decoder(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels, encoder_hidden_states, encoder_attention_mask):
             model = BertForMaskedLM(config=config)
+            model.to(torch_device)
             model.eval()
             loss, prediction_scores = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels, encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_attention_mask)
             loss, prediction_scores = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels, encoder_hidden_states=encoder_hidden_states)
@@ -204,6 +203,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
 
         def create_and_check_bert_for_next_sequence_prediction(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             model = BertForNextSentencePrediction(config=config)
+            model.to(torch_device)
             model.eval()
             loss, seq_relationship_score = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, next_sentence_label=sequence_labels)
             result = {
@@ -217,6 +217,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
 
         def create_and_check_bert_for_pretraining(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             model = BertForPreTraining(config=config)
+            model.to(torch_device)
             model.eval()
             loss, prediction_scores, seq_relationship_score = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids,
                                                                     masked_lm_labels=token_labels, next_sentence_label=sequence_labels)
@@ -235,6 +236,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
 
         def create_and_check_bert_for_question_answering(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             model = BertForQuestionAnswering(config=config)
+            model.to(torch_device)
             model.eval()
             loss, start_logits, end_logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids,
                                                    start_positions=sequence_labels, end_positions=sequence_labels)
@@ -254,6 +256,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
         def create_and_check_bert_for_sequence_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             config.num_labels = self.num_labels
             model = BertForSequenceClassification(config)
+            model.to(torch_device)
             model.eval()
             loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels)
             result = {
@@ -268,6 +271,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
         def create_and_check_bert_for_token_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             config.num_labels = self.num_labels
             model = BertForTokenClassification(config=config)
+            model.to(torch_device)
             model.eval()
             loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
             result = {
@@ -282,6 +286,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
         def create_and_check_bert_for_multiple_choice(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             config.num_choices = self.num_choices
             model = BertForMultipleChoice(config=config)
+            model.to(torch_device)
             model.eval()
             multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
             multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
@@ -313,10 +318,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
     def test_config(self):
         self.config_tester.run_common_tests()
 
-    def test_bert_model(self, use_cuda=False):
-        # ^^ This could be a real fixture
-        if use_cuda:
-            self.model_tester.device = "cuda"
+    def test_bert_model(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_bert_model(*config_and_inputs)
 
@@ -356,7 +358,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_bert_for_token_classification(*config_and_inputs)
 
-    @pytest.mark.slow
+    @slow
     def test_model_from_pretrained(self):
         cache_dir = "/tmp/transformers_test/"
         for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
diff --git a/transformers/tests/modeling_common_test.py b/transformers/tests/modeling_common_test.py
index baf1531403..80d5d95455 100644
--- a/transformers/tests/modeling_common_test.py
+++ b/transformers/tests/modeling_common_test.py
@@ -27,10 +27,11 @@ import uuid
 
 import unittest
 import logging
-import pytest
 
 from transformers import is_torch_available
 
+from .utils import require_torch, slow, torch_device
+
 if is_torch_available():
     import torch
     import numpy as np
@@ -38,8 +39,6 @@ if is_torch_available():
     from transformers import (AdaptiveEmbedding, PretrainedConfig, PreTrainedModel,
                                     BertModel, BertConfig, BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
                                     GPT2LMHeadModel, GPT2Config, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP)
-else:
-    pytestmark = pytest.mark.skip("Require Torch")
 
 if sys.version_info[0] == 2:
     import cPickle as pickle
@@ -65,6 +64,7 @@ def _config_zero_init(config):
 
 class CommonTestCases:
 
+    @require_torch
     class CommonModelTester(unittest.TestCase):
 
         model_tester = None
@@ -79,6 +79,7 @@ class CommonTestCases:
 
             for model_class in self.all_model_classes:
                 model = model_class(config)
+                model.to(torch_device)
                 model.eval()
                 with torch.no_grad():
                     outputs = model(**inputs_dict)
@@ -86,12 +87,13 @@ class CommonTestCases:
                 with TemporaryDirectory() as tmpdirname:
                     model.save_pretrained(tmpdirname)
                     model = model_class.from_pretrained(tmpdirname)
+                    model.to(torch_device)
                     with torch.no_grad():
                         after_outputs = model(**inputs_dict)
 
                     # Make sure we don't have nans
-                    out_1 = after_outputs[0].numpy()
-                    out_2 = outputs[0].numpy()
+                    out_1 = after_outputs[0].cpu().numpy()
+                    out_2 = outputs[0].cpu().numpy()
                     out_1 = out_1[~np.isnan(out_1)]
                     out_2 = out_2[~np.isnan(out_2)]
                     max_diff = np.amax(np.abs(out_1 - out_2))
@@ -113,6 +115,7 @@ class CommonTestCases:
 
             for model_class in self.all_model_classes:
                 model = model_class(config)
+                model.to(torch_device)
                 model.eval()
                 first, second = model(inputs_dict["input_ids"])[0], model(inputs_dict["input_ids"])[0]
                 self.assertEqual(first.ne(second).sum().item(), 0)
@@ -125,6 +128,7 @@ class CommonTestCases:
                 config.output_attentions = True
                 config.output_hidden_states = False
                 model = model_class(config)
+                model.to(torch_device)
                 model.eval()
                 outputs = model(**inputs_dict)
                 attentions = outputs[-1]
@@ -142,6 +146,7 @@ class CommonTestCases:
                 config.output_attentions = True
                 config.output_hidden_states = True
                 model = model_class(config)
+                model.to(torch_device)
                 model.eval()
                 outputs = model(**inputs_dict)
                 self.assertEqual(out_len+1, len(outputs))
@@ -181,6 +186,7 @@ class CommonTestCases:
             configs_no_init.torchscript = True
             for model_class in self.all_model_classes:
                 model = model_class(config=configs_no_init)
+                model.to(torch_device)
                 model.eval()
                 inputs = inputs_dict['input_ids']  # Let's keep only input_ids
 
@@ -201,7 +207,10 @@ class CommonTestCases:
                 except ValueError:
                     self.fail("Couldn't load module.")
 
+                model.to(torch_device)
                 model.eval()
+
+                loaded_model.to(torch_device)
                 loaded_model.eval()
 
                 model_params = model.parameters()
@@ -228,11 +237,12 @@ class CommonTestCases:
             configs_no_init = _config_zero_init(config)  # To be sure we have no Nan
             for model_class in self.all_model_classes:
                 model = model_class(config=configs_no_init)
+                model.to(torch_device)
                 model.eval()
 
                 # Prepare head_mask
                 # Set require_grad after having prepared the tensor to avoid error (leaf variable has been moved into the graph interior)
-                head_mask = torch.ones(self.model_tester.num_hidden_layers, self.model_tester.num_attention_heads)
+                head_mask = torch.ones(self.model_tester.num_hidden_layers, self.model_tester.num_attention_heads, device=torch_device)
                 head_mask[0, 0] = 0
                 head_mask[-1, :-1] = 0
                 head_mask.requires_grad_(requires_grad=True)
@@ -282,6 +292,7 @@ class CommonTestCases:
                 config.output_attentions = True
                 config.output_hidden_states = False
                 model = model_class(config=config)
+                model.to(torch_device)
                 model.eval()
                 heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)),
                                 -1: [0]}
@@ -310,6 +321,7 @@ class CommonTestCases:
                 config.output_attentions = True
                 config.output_hidden_states = False
                 model = model_class(config=config)
+                model.to(torch_device)
                 model.eval()
                 heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)),
                                 -1: [0]}
@@ -319,6 +331,7 @@ class CommonTestCases:
                     os.makedirs(directory)
                 model.save_pretrained(directory)
                 model = model_class.from_pretrained(directory)
+                model.to(torch_device)
 
                 outputs = model(**inputs_dict)
                 attentions = outputs[-1]
@@ -346,6 +359,7 @@ class CommonTestCases:
                 config.pruned_heads = heads_to_prune
 
                 model = model_class(config=config)
+                model.to(torch_device)
                 model.eval()
 
                 outputs = model(**inputs_dict)
@@ -372,6 +386,7 @@ class CommonTestCases:
                 config.pruned_heads = heads_to_prune
 
                 model = model_class(config=config)
+                model.to(torch_device)
                 model.eval()
 
                 outputs = model(**inputs_dict)
@@ -388,6 +403,7 @@ class CommonTestCases:
                     os.makedirs(directory)
                 model.save_pretrained(directory)
                 model = model_class.from_pretrained(directory)
+                model.to(torch_device)
                 shutil.rmtree(directory)
 
                 outputs = model(**inputs_dict)
@@ -419,6 +435,7 @@ class CommonTestCases:
                 config.output_hidden_states = True
                 config.output_attentions = False
                 model = model_class(config)
+                model.to(torch_device)
                 model.eval()
                 outputs = model(**inputs_dict)
                 hidden_states = outputs[-1]
@@ -538,6 +555,7 @@ class CommonTestCases:
 
             for model_class in self.all_model_classes:
                 model = model_class(config)
+                model.to(torch_device)
                 model.eval()
 
                 wte = model.get_input_embeddings()
@@ -628,6 +646,7 @@ class CommonTestCases:
         def create_and_check_base_model(self, config, input_ids, token_type_ids, position_ids,
                                 mc_labels, lm_labels, mc_token_ids):
             model = self.base_model_class(config)
+            model.to(torch_device)
             model.eval()
 
             outputs = model(input_ids, position_ids, token_type_ids)
@@ -643,6 +662,7 @@ class CommonTestCases:
         def create_and_check_lm_head(self, config, input_ids, token_type_ids, position_ids,
                                         mc_labels, lm_labels, mc_token_ids):
             model = self.lm_head_model_class(config)
+            model.to(torch_device)
             model.eval()
             outputs = model(input_ids, position_ids, token_type_ids, lm_labels)
             loss, lm_logits = outputs[:2]
@@ -659,6 +679,7 @@ class CommonTestCases:
                                         mc_labels, lm_labels, mc_token_ids):
             for model_class in self.all_model_classes:
                 model = model_class(config)
+                model.to(torch_device)
                 model.eval()
                 outputs = model(input_ids)
                 presents = outputs[-1]
@@ -671,6 +692,7 @@ class CommonTestCases:
         def create_and_check_double_heads(self, config, input_ids, token_type_ids, position_ids,
                                         mc_labels, lm_labels, mc_token_ids):
             model = self.double_head_model_class(config)
+            model.to(torch_device)
             model.eval()
             outputs = model(input_ids, mc_token_ids, lm_labels=lm_labels, mc_labels=mc_labels,
                             token_type_ids=token_type_ids, position_ids=position_ids)
@@ -716,7 +738,7 @@ class CommonTestCases:
                 config_and_inputs = self.prepare_config_and_inputs()
                 self.create_and_check_presents(*config_and_inputs)
 
-        @pytest.mark.slow
+        @slow
         def run_slow_tests(self):
             self.create_and_check_model_from_pretrained()
 
@@ -770,7 +792,7 @@ def ids_tensor(shape, vocab_size, rng=None, name=None):
     for _ in range(total_dims):
         values.append(rng.randint(0, vocab_size - 1))
 
-    return torch.tensor(data=values, dtype=torch.long).view(shape).contiguous()
+    return torch.tensor(data=values, dtype=torch.long, device=torch_device).view(shape).contiguous()
 
 
 def floats_tensor(shape, scale=1.0, rng=None, name=None):
@@ -786,11 +808,12 @@ def floats_tensor(shape, scale=1.0, rng=None, name=None):
     for _ in range(total_dims):
         values.append(rng.random() * scale)
 
-    return torch.tensor(data=values, dtype=torch.float).view(shape).contiguous()
+    return torch.tensor(data=values, dtype=torch.float, device=torch_device).view(shape).contiguous()
 
 
+@require_torch
 class ModelUtilsTest(unittest.TestCase):
-    @pytest.mark.slow
+    @slow
     def test_model_from_pretrained(self):
         logging.basicConfig(level=logging.INFO)
         for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
diff --git a/transformers/tests/modeling_ctrl_test.py b/transformers/tests/modeling_ctrl_test.py
index 47ff8d8d51..8c14578a5c 100644
--- a/transformers/tests/modeling_ctrl_test.py
+++ b/transformers/tests/modeling_ctrl_test.py
@@ -16,7 +16,6 @@ from __future__ import division
 from __future__ import print_function
 
 import unittest
-import pytest
 import shutil
 import pdb
 
@@ -25,13 +24,13 @@ from transformers import is_torch_available
 if is_torch_available():
     from transformers import (CTRLConfig, CTRLModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP,
                                     CTRLLMHeadModel)
-else:
-    pytestmark = pytest.mark.skip("Require Torch")
 
 from .modeling_common_test import (CommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
+from .utils import require_torch, slow, torch_device
 
 
+@require_torch
 class CTRLModelTest(CommonTestCases.CommonModelTester):
 
     all_model_classes = (CTRLModel, CTRLLMHeadModel) if is_torch_available() else ()
@@ -140,6 +139,7 @@ class CTRLModelTest(CommonTestCases.CommonModelTester):
 
         def create_and_check_ctrl_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
             model = CTRLModel(config=config)
+            model.to(torch_device)
             model.eval()
 
             model(input_ids, token_type_ids=token_type_ids, head_mask=head_mask)
@@ -157,6 +157,7 @@ class CTRLModelTest(CommonTestCases.CommonModelTester):
 
         def create_and_check_lm_head_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
             model = CTRLLMHeadModel(config)
+            model.to(torch_device)
             model.eval()
 
             loss, lm_logits, _ = model(input_ids, token_type_ids=token_type_ids, labels=input_ids)
@@ -202,7 +203,7 @@ class CTRLModelTest(CommonTestCases.CommonModelTester):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_lm_head_model(*config_and_inputs)
 
-    @pytest.mark.slow
+    @slow
     def test_model_from_pretrained(self):
         cache_dir = "/tmp/transformers_test/"
         for model_name in list(CTRL_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
diff --git a/transformers/tests/modeling_distilbert_test.py b/transformers/tests/modeling_distilbert_test.py
index 8099c03586..4b8f64327d 100644
--- a/transformers/tests/modeling_distilbert_test.py
+++ b/transformers/tests/modeling_distilbert_test.py
@@ -17,7 +17,6 @@ from __future__ import division
 from __future__ import print_function
 
 import unittest
-import pytest
 
 from transformers import is_torch_available
 
@@ -25,13 +24,13 @@ if is_torch_available():
     from transformers import (DistilBertConfig, DistilBertModel, DistilBertForMaskedLM,
                                     DistilBertForTokenClassification,
                                     DistilBertForQuestionAnswering, DistilBertForSequenceClassification)
-else:
-    pytestmark = pytest.mark.skip("Require Torch")
 
 from .modeling_common_test import (CommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
+from .utils import require_torch, slow, torch_device
 
 
+@require_torch
 class DistilBertModelTest(CommonTestCases.CommonModelTester):
 
     all_model_classes = (DistilBertModel, DistilBertForMaskedLM, DistilBertForQuestionAnswering,
@@ -126,6 +125,7 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester):
 
         def create_and_check_distilbert_model(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels):
             model = DistilBertModel(config=config)
+            model.to(torch_device)
             model.eval()
             (sequence_output,) = model(input_ids, input_mask)
             (sequence_output,) = model(input_ids)
@@ -139,6 +139,7 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester):
 
         def create_and_check_distilbert_for_masked_lm(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels):
             model = DistilBertForMaskedLM(config=config)
+            model.to(torch_device)
             model.eval()
             loss, prediction_scores = model(input_ids, attention_mask=input_mask, masked_lm_labels=token_labels)
             result = {
@@ -152,6 +153,7 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester):
 
         def create_and_check_distilbert_for_question_answering(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels):
             model = DistilBertForQuestionAnswering(config=config)
+            model.to(torch_device)
             model.eval()
             loss, start_logits, end_logits = model(input_ids, attention_mask=input_mask, start_positions=sequence_labels, end_positions=sequence_labels)
             result = {
@@ -170,6 +172,7 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester):
         def create_and_check_distilbert_for_sequence_classification(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels):
             config.num_labels = self.num_labels
             model = DistilBertForSequenceClassification(config)
+            model.to(torch_device)
             model.eval()
             loss, logits = model(input_ids, attention_mask=input_mask, labels=sequence_labels)
             result = {
@@ -184,6 +187,7 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester):
         def create_and_check_distilbert_for_token_classification(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels):
             config.num_labels = self.num_labels
             model = DistilBertForTokenClassification(config=config)
+            model.to(torch_device)
             model.eval()
 
             loss, logits = model(input_ids, attention_mask=input_mask, labels=token_labels)
@@ -229,7 +233,7 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_distilbert_for_token_classification(*config_and_inputs)
 
-    # @pytest.mark.slow
+    # @slow
     # def test_model_from_pretrained(self):
     #     cache_dir = "/tmp/transformers_test/"
     #     for model_name in list(DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
diff --git a/transformers/tests/modeling_encoder_decoder_test.py b/transformers/tests/modeling_encoder_decoder_test.py
index a6c88ed9a9..64e86df8f5 100644
--- a/transformers/tests/modeling_encoder_decoder_test.py
+++ b/transformers/tests/modeling_encoder_decoder_test.py
@@ -15,19 +15,18 @@
 
 import logging
 import unittest
-import pytest
 
 from transformers import is_torch_available
+from .utils import require_torch, slow
 
 if is_torch_available():
     from transformers import BertModel, BertForMaskedLM, Model2Model
     from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP
-else:
-    pytestmark = pytest.mark.skip("Require Torch")
 
 
+@require_torch
 class EncoderDecoderModelTest(unittest.TestCase):
-    @pytest.mark.slow
+    @slow
     def test_model2model_from_pretrained(self):
         logging.basicConfig(level=logging.INFO)
         for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
diff --git a/transformers/tests/modeling_gpt2_test.py b/transformers/tests/modeling_gpt2_test.py
index 4263e51bc9..ecaa2a4bd0 100644
--- a/transformers/tests/modeling_gpt2_test.py
+++ b/transformers/tests/modeling_gpt2_test.py
@@ -17,7 +17,6 @@ from __future__ import division
 from __future__ import print_function
 
 import unittest
-import pytest
 import shutil
 
 from transformers import is_torch_available
@@ -25,13 +24,13 @@ from transformers import is_torch_available
 if is_torch_available():
     from transformers import (GPT2Config, GPT2Model, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP,
                                     GPT2LMHeadModel, GPT2DoubleHeadsModel)
-else:
-    pytestmark = pytest.mark.skip("Require Torch")
 
 from .modeling_common_test import (CommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
+from .utils import require_torch, slow, torch_device
 
 
+@require_torch
 class GPT2ModelTest(CommonTestCases.CommonModelTester):
 
     all_model_classes = (GPT2Model, GPT2LMHeadModel, GPT2DoubleHeadsModel) if is_torch_available() else ()
@@ -136,6 +135,7 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester):
 
         def create_and_check_gpt2_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
             model = GPT2Model(config=config)
+            model.to(torch_device)
             model.eval()
 
             model(input_ids, token_type_ids=token_type_ids, head_mask=head_mask)
@@ -153,6 +153,7 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester):
 
         def create_and_check_lm_head_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
             model = GPT2LMHeadModel(config)
+            model.to(torch_device)
             model.eval()
 
             loss, lm_logits, _ = model(input_ids, token_type_ids=token_type_ids, labels=input_ids)
@@ -171,6 +172,7 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester):
 
         def create_and_check_double_lm_head_model(self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, *args):
             model = GPT2DoubleHeadsModel(config)
+            model.to(torch_device)
             model.eval()
 
 
@@ -235,7 +237,7 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_double_lm_head_model(*config_and_inputs)
 
-    @pytest.mark.slow
+    @slow
     def test_model_from_pretrained(self):
         cache_dir = "/tmp/transformers_test/"
         for model_name in list(GPT2_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
diff --git a/transformers/tests/modeling_openai_test.py b/transformers/tests/modeling_openai_test.py
index 33218288a0..8e4d13438d 100644
--- a/transformers/tests/modeling_openai_test.py
+++ b/transformers/tests/modeling_openai_test.py
@@ -17,7 +17,6 @@ from __future__ import division
 from __future__ import print_function
 
 import unittest
-import pytest
 import shutil
 
 from transformers import is_torch_available
@@ -25,13 +24,13 @@ from transformers import is_torch_available
 if is_torch_available():
     from transformers import (OpenAIGPTConfig, OpenAIGPTModel, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP,
                                     OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel)
-else:
-    pytestmark = pytest.mark.skip("Require Torch")
 
 from .modeling_common_test import (CommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
+from .utils import require_torch, slow, torch_device
 
 
+@require_torch
 class OpenAIGPTModelTest(CommonTestCases.CommonModelTester):
 
     all_model_classes = (OpenAIGPTModel, OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel) if is_torch_available() else ()
@@ -124,6 +123,7 @@ class OpenAIGPTModelTest(CommonTestCases.CommonModelTester):
 
         def create_and_check_openai_gpt_model(self, config, input_ids, head_mask, token_type_ids, *args):
             model = OpenAIGPTModel(config=config)
+            model.to(torch_device)
             model.eval()
 
             model(input_ids, token_type_ids=token_type_ids, head_mask=head_mask)
@@ -139,6 +139,7 @@ class OpenAIGPTModelTest(CommonTestCases.CommonModelTester):
 
         def create_and_check_lm_head_model(self, config, input_ids, head_mask, token_type_ids, *args):
             model = OpenAIGPTLMHeadModel(config)
+            model.to(torch_device)
             model.eval()
 
             loss, lm_logits = model(input_ids, token_type_ids=token_type_ids, labels=input_ids)
@@ -157,6 +158,7 @@ class OpenAIGPTModelTest(CommonTestCases.CommonModelTester):
 
         def create_and_check_double_lm_head_model(self, config, input_ids, head_mask, token_type_ids, *args):
             model = OpenAIGPTDoubleHeadsModel(config)
+            model.to(torch_device)
             model.eval()
 
             loss, lm_logits, mc_logits = model(input_ids, token_type_ids=token_type_ids, lm_labels=input_ids)
@@ -203,7 +205,7 @@ class OpenAIGPTModelTest(CommonTestCases.CommonModelTester):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_double_lm_head_model(*config_and_inputs)
 
-    @pytest.mark.slow
+    @slow
     def test_model_from_pretrained(self):
         cache_dir = "/tmp/transformers_test/"
         for model_name in list(OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
diff --git a/transformers/tests/modeling_roberta_test.py b/transformers/tests/modeling_roberta_test.py
index 0620ddf630..7a3553b164 100644
--- a/transformers/tests/modeling_roberta_test.py
+++ b/transformers/tests/modeling_roberta_test.py
@@ -18,7 +18,6 @@ from __future__ import print_function
 
 import unittest
 import shutil
-import pytest
 
 from transformers import is_torch_available
 
@@ -27,13 +26,13 @@ if is_torch_available():
     from transformers import (RobertaConfig, RobertaModel, RobertaForMaskedLM,
                               RobertaForSequenceClassification, RobertaForTokenClassification)
     from transformers.modeling_roberta import ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
-else:
-    pytestmark = pytest.mark.skip("Require Torch")
 
 from .modeling_common_test import (CommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
+from .utils import require_torch, slow, torch_device
 
 
+@require_torch
 class RobertaModelTest(CommonTestCases.CommonModelTester):
 
     all_model_classes = (RobertaForMaskedLM, RobertaModel) if is_torch_available() else ()
@@ -129,6 +128,7 @@ class RobertaModelTest(CommonTestCases.CommonModelTester):
         def create_and_check_roberta_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels,
                                            token_labels, choice_labels):
             model = RobertaModel(config=config)
+            model.to(torch_device)
             model.eval()
             sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
             sequence_output, pooled_output = model(input_ids, token_type_ids=token_type_ids)
@@ -146,6 +146,7 @@ class RobertaModelTest(CommonTestCases.CommonModelTester):
         def create_and_check_roberta_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels,
                                                    token_labels, choice_labels):
             model = RobertaForMaskedLM(config=config)
+            model.to(torch_device)
             model.eval()
             loss, prediction_scores = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels)
             result = {
@@ -161,6 +162,7 @@ class RobertaModelTest(CommonTestCases.CommonModelTester):
                                                               sequence_labels, token_labels, choice_labels):
             config.num_labels = self.num_labels
             model = RobertaForTokenClassification(config=config)
+            model.to(torch_device)
             model.eval()
             loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids,
                                  labels=token_labels)
@@ -195,7 +197,7 @@ class RobertaModelTest(CommonTestCases.CommonModelTester):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_roberta_for_masked_lm(*config_and_inputs)
 
-    @pytest.mark.slow
+    @slow
     def test_model_from_pretrained(self):
         cache_dir = "/tmp/transformers_test/"
         for model_name in list(ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
@@ -207,10 +209,10 @@ class RobertaModelTest(CommonTestCases.CommonModelTester):
 
 class RobertaModelIntegrationTest(unittest.TestCase):
 
-    @pytest.mark.slow
+    @slow
     def test_inference_masked_lm(self):
         model = RobertaForMaskedLM.from_pretrained('roberta-base')
-        
+
         input_ids = torch.tensor([[    0, 31414,   232,   328,   740,  1140, 12695,    69, 46078,  1588,   2]])
         output = model(input_ids)[0]
         expected_shape = torch.Size((1, 11, 50265))
@@ -228,10 +230,10 @@ class RobertaModelIntegrationTest(unittest.TestCase):
             torch.allclose(output[:, :3, :3], expected_slice, atol=1e-3)
         )
 
-    @pytest.mark.slow
+    @slow
     def test_inference_no_head(self):
         model = RobertaModel.from_pretrained('roberta-base')
-        
+
         input_ids = torch.tensor([[    0, 31414,   232,   328,   740,  1140, 12695,    69, 46078,  1588,   2]])
         output = model(input_ids)[0]
         # compare the actual values for a slice.
@@ -244,10 +246,10 @@ class RobertaModelIntegrationTest(unittest.TestCase):
             torch.allclose(output[:, :3, :3], expected_slice, atol=1e-3)
         )
 
-    @pytest.mark.slow
+    @slow
     def test_inference_classification_head(self):
         model = RobertaForSequenceClassification.from_pretrained('roberta-large-mnli')
-        
+
         input_ids = torch.tensor([[    0, 31414,   232,   328,   740,  1140, 12695,    69, 46078,  1588,   2]])
         output = model(input_ids)[0]
         expected_shape = torch.Size((1, 3))
diff --git a/transformers/tests/modeling_tf_albert_test.py b/transformers/tests/modeling_tf_albert_test.py
index fbd519b8f6..7d3325b70b 100644
--- a/transformers/tests/modeling_tf_albert_test.py
+++ b/transformers/tests/modeling_tf_albert_test.py
@@ -18,11 +18,11 @@ from __future__ import print_function
 
 import unittest
 import shutil
-import pytest
 import sys
 
 from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
+from .utils import require_tf, slow
 
 from transformers import AlbertConfig, is_tf_available
 
@@ -31,10 +31,9 @@ if is_tf_available():
     from transformers.modeling_tf_albert import (TFAlbertModel, TFAlbertForMaskedLM,
                                                  TFAlbertForSequenceClassification,
                                                  TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
-else:
-    pytestmark = pytest.mark.skip("Require TensorFlow")
 
 
+@require_tf
 class TFAlbertModelTest(TFCommonTestCases.TFCommonModelTester):
 
     all_model_classes = (
@@ -216,7 +215,7 @@ class TFAlbertModelTest(TFCommonTestCases.TFCommonModelTester):
         self.model_tester.create_and_check_albert_for_sequence_classification(
             *config_and_inputs)
 
-    @pytest.mark.slow
+    @slow
     def test_model_from_pretrained(self):
         cache_dir = "/tmp/transformers_test/"
         # for model_name in list(TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
diff --git a/transformers/tests/modeling_tf_auto_test.py b/transformers/tests/modeling_tf_auto_test.py
index fa90906e86..7ea48015d9 100644
--- a/transformers/tests/modeling_tf_auto_test.py
+++ b/transformers/tests/modeling_tf_auto_test.py
@@ -18,11 +18,12 @@ from __future__ import print_function
 
 import unittest
 import shutil
-import pytest
 import logging
 
 from transformers import is_tf_available
 
+from .utils import require_tf, slow
+
 if is_tf_available():
     from transformers import (AutoConfig, BertConfig,
                                       TFAutoModel, TFBertModel,
@@ -33,12 +34,11 @@ if is_tf_available():
 
     from .modeling_common_test import (CommonTestCases, ids_tensor)
     from .configuration_common_test import ConfigTester
-else:
-    pytestmark = pytest.mark.skip("Require TensorFlow")
 
 
+@require_tf
 class TFAutoModelTest(unittest.TestCase):
-    @pytest.mark.slow
+    @slow
     def test_model_from_pretrained(self):
         import h5py
         self.assertTrue(h5py.version.hdf5_version.startswith("1.10"))
@@ -54,7 +54,7 @@ class TFAutoModelTest(unittest.TestCase):
             self.assertIsNotNone(model)
             self.assertIsInstance(model, TFBertModel)
 
-    @pytest.mark.slow
+    @slow
     def test_lmhead_model_from_pretrained(self):
         logging.basicConfig(level=logging.INFO)
         # for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
@@ -67,7 +67,7 @@ class TFAutoModelTest(unittest.TestCase):
             self.assertIsNotNone(model)
             self.assertIsInstance(model, TFBertForMaskedLM)
 
-    @pytest.mark.slow
+    @slow
     def test_sequence_classification_model_from_pretrained(self):
         logging.basicConfig(level=logging.INFO)
         # for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
@@ -80,7 +80,7 @@ class TFAutoModelTest(unittest.TestCase):
             self.assertIsNotNone(model)
             self.assertIsInstance(model, TFBertForSequenceClassification)
 
-    @pytest.mark.slow
+    @slow
     def test_question_answering_model_from_pretrained(self):
         logging.basicConfig(level=logging.INFO)
         # for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
diff --git a/transformers/tests/modeling_tf_bert_test.py b/transformers/tests/modeling_tf_bert_test.py
index bcee97435e..d7a86fecb9 100644
--- a/transformers/tests/modeling_tf_bert_test.py
+++ b/transformers/tests/modeling_tf_bert_test.py
@@ -18,11 +18,11 @@ from __future__ import print_function
 
 import unittest
 import shutil
-import pytest
 import sys
 
 from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
+from .utils import require_tf, slow
 
 from transformers import BertConfig, is_tf_available
 
@@ -36,10 +36,9 @@ if is_tf_available():
                                                        TFBertForTokenClassification,
                                                        TFBertForQuestionAnswering,
                                                        TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP)
-else:
-    pytestmark = pytest.mark.skip("Require TensorFlow")
 
 
+@require_tf
 class TFBertModelTest(TFCommonTestCases.TFCommonModelTester):
 
     all_model_classes = (TFBertModel, TFBertForMaskedLM, TFBertForNextSentencePrediction,
@@ -309,7 +308,7 @@ class TFBertModelTest(TFCommonTestCases.TFCommonModelTester):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_bert_for_token_classification(*config_and_inputs)
 
-    @pytest.mark.slow
+    @slow
     def test_model_from_pretrained(self):
         cache_dir = "/tmp/transformers_test/"
         # for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
diff --git a/transformers/tests/modeling_tf_common_test.py b/transformers/tests/modeling_tf_common_test.py
index 7445ce826a..439360ba35 100644
--- a/transformers/tests/modeling_tf_common_test.py
+++ b/transformers/tests/modeling_tf_common_test.py
@@ -25,18 +25,17 @@ import unittest
 import uuid
 import tempfile
 
-import pytest
 import sys
 
 from transformers import is_tf_available, is_torch_available
 
+from .utils import require_tf, slow
+
 if is_tf_available():
     import tensorflow as tf
     import numpy as np
     from transformers import TFPreTrainedModel
     # from transformers.modeling_bert import BertModel, BertConfig, BERT_PRETRAINED_MODEL_ARCHIVE_MAP
-else:
-    pytestmark = pytest.mark.skip("Require TensorFlow")
 
 if sys.version_info[0] == 2:
     import cPickle as pickle
@@ -62,6 +61,7 @@ def _config_zero_init(config):
 
 class TFCommonTestCases:
 
+    @require_tf
     class TFCommonModelTester(unittest.TestCase):
 
         model_tester = None
@@ -164,7 +164,7 @@ class TFCommonTestCases:
             for model_class in self.all_model_classes:
                 # Prepare our model
                 model = model_class(config)
-                
+
                 # Let's load it from the disk to be sure we can use pretrained weights
                 with TemporaryDirectory() as tmpdirname:
                     outputs = model(inputs_dict)  # build the model
diff --git a/transformers/tests/modeling_tf_ctrl_test.py b/transformers/tests/modeling_tf_ctrl_test.py
index a57c882169..0b421c20c9 100644
--- a/transformers/tests/modeling_tf_ctrl_test.py
+++ b/transformers/tests/modeling_tf_ctrl_test.py
@@ -18,11 +18,11 @@ from __future__ import print_function
 
 import unittest
 import shutil
-import pytest
 import sys
 
 from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
+from .utils import require_tf, slow
 
 from transformers import CTRLConfig, is_tf_available
 
@@ -30,10 +30,9 @@ if is_tf_available():
     import tensorflow as tf
     from transformers.modeling_tf_ctrl import (TFCTRLModel, TFCTRLLMHeadModel,
                                                 TF_CTRL_PRETRAINED_MODEL_ARCHIVE_MAP)
-else:
-    pytestmark = pytest.mark.skip("Require TensorFlow")
 
 
+@require_tf
 class TFCTRLModelTest(TFCommonTestCases.TFCommonModelTester):
 
     all_model_classes = (TFCTRLModel, TFCTRLLMHeadModel) if is_tf_available() else ()
@@ -188,7 +187,7 @@ class TFCTRLModelTest(TFCommonTestCases.TFCommonModelTester):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_ctrl_lm_head(*config_and_inputs)
 
-    @pytest.mark.slow
+    @slow
     def test_model_from_pretrained(self):
         cache_dir = "/tmp/transformers_test/"
         for model_name in list(TF_CTRL_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
diff --git a/transformers/tests/modeling_tf_distilbert_test.py b/transformers/tests/modeling_tf_distilbert_test.py
index e6d3795914..0ec45150ca 100644
--- a/transformers/tests/modeling_tf_distilbert_test.py
+++ b/transformers/tests/modeling_tf_distilbert_test.py
@@ -17,10 +17,10 @@ from __future__ import division
 from __future__ import print_function
 
 import unittest
-import pytest
 
 from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
+from .utils import require_tf, slow
 
 from transformers import DistilBertConfig, is_tf_available
 
@@ -30,10 +30,9 @@ if is_tf_available():
                                                              TFDistilBertForMaskedLM,
                                                              TFDistilBertForQuestionAnswering,
                                                              TFDistilBertForSequenceClassification)
-else:
-    pytestmark = pytest.mark.skip("Require TensorFlow")
 
 
+@require_tf
 class TFDistilBertModelTest(TFCommonTestCases.TFCommonModelTester):
 
     all_model_classes = (TFDistilBertModel, TFDistilBertForMaskedLM, TFDistilBertForQuestionAnswering,
@@ -210,7 +209,7 @@ class TFDistilBertModelTest(TFCommonTestCases.TFCommonModelTester):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_distilbert_for_sequence_classification(*config_and_inputs)
 
-    # @pytest.mark.slow
+    # @slow
     # def test_model_from_pretrained(self):
     #     cache_dir = "/tmp/transformers_test/"
     #     for model_name in list(DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
diff --git a/transformers/tests/modeling_tf_gpt2_test.py b/transformers/tests/modeling_tf_gpt2_test.py
index 76e9ee2298..e070b72e65 100644
--- a/transformers/tests/modeling_tf_gpt2_test.py
+++ b/transformers/tests/modeling_tf_gpt2_test.py
@@ -18,11 +18,11 @@ from __future__ import print_function
 
 import unittest
 import shutil
-import pytest
 import sys
 
 from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
+from .utils import require_tf, slow
 
 from transformers import GPT2Config, is_tf_available
 
@@ -31,10 +31,9 @@ if is_tf_available():
     from transformers.modeling_tf_gpt2 import (TFGPT2Model, TFGPT2LMHeadModel,
                                                        TFGPT2DoubleHeadsModel,
                                                        TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP)
-else:
-    pytestmark = pytest.mark.skip("Require TensorFlow")
 
 
+@require_tf
 class TFGPT2ModelTest(TFCommonTestCases.TFCommonModelTester):
 
     all_model_classes = (TFGPT2Model, TFGPT2LMHeadModel,
@@ -219,7 +218,7 @@ class TFGPT2ModelTest(TFCommonTestCases.TFCommonModelTester):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_gpt2_double_head(*config_and_inputs)
 
-    @pytest.mark.slow
+    @slow
     def test_model_from_pretrained(self):
         cache_dir = "/tmp/transformers_test/"
         for model_name in list(TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
diff --git a/transformers/tests/modeling_tf_openai_gpt_test.py b/transformers/tests/modeling_tf_openai_gpt_test.py
index d470c8862d..675e806c12 100644
--- a/transformers/tests/modeling_tf_openai_gpt_test.py
+++ b/transformers/tests/modeling_tf_openai_gpt_test.py
@@ -18,11 +18,11 @@ from __future__ import print_function
 
 import unittest
 import shutil
-import pytest
 import sys
 
 from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
+from .utils import require_tf, slow
 
 from transformers import OpenAIGPTConfig, is_tf_available
 
@@ -31,10 +31,9 @@ if is_tf_available():
     from transformers.modeling_tf_openai import (TFOpenAIGPTModel, TFOpenAIGPTLMHeadModel,
                                                          TFOpenAIGPTDoubleHeadsModel,
                                                          TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP)
-else:
-    pytestmark = pytest.mark.skip("Require TensorFlow")
 
 
+@require_tf
 class TFOpenAIGPTModelTest(TFCommonTestCases.TFCommonModelTester):
 
     all_model_classes = (TFOpenAIGPTModel, TFOpenAIGPTLMHeadModel,
@@ -218,7 +217,7 @@ class TFOpenAIGPTModelTest(TFCommonTestCases.TFCommonModelTester):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_openai_gpt_double_head(*config_and_inputs)
 
-    @pytest.mark.slow
+    @slow
     def test_model_from_pretrained(self):
         cache_dir = "/tmp/transformers_test/"
         for model_name in list(TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
diff --git a/transformers/tests/modeling_tf_roberta_test.py b/transformers/tests/modeling_tf_roberta_test.py
index edbfa4e205..42440bf1b7 100644
--- a/transformers/tests/modeling_tf_roberta_test.py
+++ b/transformers/tests/modeling_tf_roberta_test.py
@@ -18,10 +18,10 @@ from __future__ import print_function
 
 import unittest
 import shutil
-import pytest
 
 from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
+from .utils import require_tf, slow
 
 from transformers import RobertaConfig, is_tf_available
 
@@ -32,10 +32,9 @@ if is_tf_available():
                                                           TFRobertaForSequenceClassification,
                                                           TFRobertaForTokenClassification,
                                                           TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP)
-else:
-    pytestmark = pytest.mark.skip("Require TensorFlow")
 
 
+@require_tf
 class TFRobertaModelTest(TFCommonTestCases.TFCommonModelTester):
 
     all_model_classes = (TFRobertaModel,TFRobertaForMaskedLM,
@@ -191,7 +190,7 @@ class TFRobertaModelTest(TFCommonTestCases.TFCommonModelTester):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_roberta_for_masked_lm(*config_and_inputs)
 
-    @pytest.mark.slow
+    @slow
     def test_model_from_pretrained(self):
         cache_dir = "/tmp/transformers_test/"
         for model_name in list(TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
@@ -203,10 +202,10 @@ class TFRobertaModelTest(TFCommonTestCases.TFCommonModelTester):
 
 class TFRobertaModelIntegrationTest(unittest.TestCase):
 
-    @pytest.mark.slow
+    @slow
     def test_inference_masked_lm(self):
         model = TFRobertaForMaskedLM.from_pretrained('roberta-base')
-        
+
         input_ids = tf.constant([[    0, 31414,   232,   328,   740,  1140, 12695,    69, 46078,  1588,   2]])
         output = model(input_ids)[0]
         expected_shape = [1, 11, 50265]
@@ -224,10 +223,10 @@ class TFRobertaModelIntegrationTest(unittest.TestCase):
             numpy.allclose(output[:, :3, :3].numpy(), expected_slice.numpy(), atol=1e-3)
         )
 
-    @pytest.mark.slow
+    @slow
     def test_inference_no_head(self):
         model = TFRobertaModel.from_pretrained('roberta-base')
-        
+
         input_ids = tf.constant([[    0, 31414,   232,   328,   740,  1140, 12695,    69, 46078,  1588,   2]])
         output = model(input_ids)[0]
         # compare the actual values for a slice.
@@ -240,10 +239,10 @@ class TFRobertaModelIntegrationTest(unittest.TestCase):
             numpy.allclose(output[:, :3, :3].numpy(), expected_slice.numpy(), atol=1e-3)
         )
 
-    @pytest.mark.slow
+    @slow
     def test_inference_classification_head(self):
         model = TFRobertaForSequenceClassification.from_pretrained('roberta-large-mnli')
-        
+
         input_ids = tf.constant([[    0, 31414,   232,   328,   740,  1140, 12695,    69, 46078,  1588,   2]])
         output = model(input_ids)[0]
         expected_shape = [1, 3]
diff --git a/transformers/tests/modeling_tf_transfo_xl_test.py b/transformers/tests/modeling_tf_transfo_xl_test.py
index 534fe39646..03e332bdc1 100644
--- a/transformers/tests/modeling_tf_transfo_xl_test.py
+++ b/transformers/tests/modeling_tf_transfo_xl_test.py
@@ -19,10 +19,10 @@ from __future__ import print_function
 import unittest
 import random
 import shutil
-import pytest
 
 from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
+from .utils import require_tf, slow
 
 from transformers import TransfoXLConfig, is_tf_available
 
@@ -31,10 +31,9 @@ if is_tf_available():
     from transformers.modeling_tf_transfo_xl import (TFTransfoXLModel,
                                                              TFTransfoXLLMHeadModel,
                                                              TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP)
-else:
-    pytestmark = pytest.mark.skip("Require TensorFlow")
 
 
+@require_tf
 class TFTransfoXLModelTest(TFCommonTestCases.TFCommonModelTester):
 
     all_model_classes = (TFTransfoXLModel, TFTransfoXLLMHeadModel) if is_tf_available() else ()
@@ -204,7 +203,7 @@ class TFTransfoXLModelTest(TFCommonTestCases.TFCommonModelTester):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_transfo_xl_lm_head(*config_and_inputs)
 
-    @pytest.mark.slow
+    @slow
     def test_model_from_pretrained(self):
         cache_dir = "/tmp/transformers_test/"
         for model_name in list(TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
diff --git a/transformers/tests/modeling_tf_xlm_test.py b/transformers/tests/modeling_tf_xlm_test.py
index 1bd661bebf..a680b70367 100644
--- a/transformers/tests/modeling_tf_xlm_test.py
+++ b/transformers/tests/modeling_tf_xlm_test.py
@@ -18,7 +18,6 @@ from __future__ import print_function
 
 import unittest
 import shutil
-import pytest
 
 from transformers import is_tf_available
 
@@ -29,13 +28,13 @@ if is_tf_available():
                                       TFXLMForSequenceClassification,
                                       TFXLMForQuestionAnsweringSimple,
                                       TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAP)
-else:
-    pytestmark = pytest.mark.skip("Require TensorFlow")
 
 from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
+from .utils import require_tf, slow
 
 
+@require_tf
 class TFXLMModelTest(TFCommonTestCases.TFCommonModelTester):
 
     all_model_classes = (TFXLMModel, TFXLMWithLMHeadModel,
@@ -251,7 +250,7 @@ class TFXLMModelTest(TFCommonTestCases.TFCommonModelTester):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_xlm_sequence_classif(*config_and_inputs)
 
-    @pytest.mark.slow
+    @slow
     def test_model_from_pretrained(self):
         cache_dir = "/tmp/transformers_test/"
         for model_name in list(TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
diff --git a/transformers/tests/modeling_tf_xlnet_test.py b/transformers/tests/modeling_tf_xlnet_test.py
index a00a965570..94864b86f2 100644
--- a/transformers/tests/modeling_tf_xlnet_test.py
+++ b/transformers/tests/modeling_tf_xlnet_test.py
@@ -21,7 +21,6 @@ import unittest
 import json
 import random
 import shutil
-import pytest
 
 from transformers import XLNetConfig, is_tf_available
 
@@ -33,12 +32,13 @@ if is_tf_available():
                                                         TFXLNetForTokenClassification,
                                                         TFXLNetForQuestionAnsweringSimple,
                                                         TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP)
-else:
-    pytestmark = pytest.mark.skip("Require TensorFlow")
 
 from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
+from .utils import require_tf, slow
 
+
+@require_tf
 class TFXLNetModelTest(TFCommonTestCases.TFCommonModelTester):
 
     all_model_classes=(TFXLNetModel, TFXLNetLMHeadModel,
@@ -304,7 +304,7 @@ class TFXLNetModelTest(TFCommonTestCases.TFCommonModelTester):
     def test_xlnet_lm_head(self):
         self.model_tester.set_seed()
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_xlnet_lm_head(*config_and_inputs) 
+        self.model_tester.create_and_check_xlnet_lm_head(*config_and_inputs)
 
     def test_xlnet_sequence_classif(self):
         self.model_tester.set_seed()
@@ -320,7 +320,7 @@ class TFXLNetModelTest(TFCommonTestCases.TFCommonModelTester):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_xlnet_qa(*config_and_inputs)
 
-    @pytest.mark.slow
+    @slow
     def test_model_from_pretrained(self):
         cache_dir = "/tmp/transformers_test/"
         for model_name in list(TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
diff --git a/transformers/tests/modeling_transfo_xl_test.py b/transformers/tests/modeling_transfo_xl_test.py
index f7b913da5b..647dd3724d 100644
--- a/transformers/tests/modeling_transfo_xl_test.py
+++ b/transformers/tests/modeling_transfo_xl_test.py
@@ -19,7 +19,6 @@ from __future__ import print_function
 import unittest
 import random
 import shutil
-import pytest
 
 from transformers import is_torch_available
 
@@ -27,12 +26,13 @@ if is_torch_available():
     import torch
     from transformers import (TransfoXLConfig, TransfoXLModel, TransfoXLLMHeadModel)
     from transformers.modeling_transfo_xl import TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP
-else:
-    pytestmark = pytest.mark.skip("Require Torch")
 
 from .modeling_common_test import (CommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
+from .utils import require_torch, slow, torch_device
 
+
+@require_torch
 class TransfoXLModelTest(CommonTestCases.CommonModelTester):
 
     all_model_classes = (TransfoXLModel, TransfoXLLMHeadModel) if is_torch_available() else ()
@@ -111,6 +111,7 @@ class TransfoXLModelTest(CommonTestCases.CommonModelTester):
 
         def create_transfo_xl_model(self, config, input_ids_1, input_ids_2, lm_labels):
             model = TransfoXLModel(config)
+            model.to(torch_device)
             model.eval()
 
             hidden_states_1, mems_1 = model(input_ids_1)
@@ -140,6 +141,7 @@ class TransfoXLModelTest(CommonTestCases.CommonModelTester):
 
         def create_transfo_xl_lm_head(self, config, input_ids_1, input_ids_2, lm_labels):
             model = TransfoXLLMHeadModel(config)
+            model.to(torch_device)
             model.eval()
 
             lm_logits_1, mems_1 = model(input_ids_1)
@@ -204,7 +206,7 @@ class TransfoXLModelTest(CommonTestCases.CommonModelTester):
         output_result = self.model_tester.create_transfo_xl_lm_head(*config_and_inputs)
         self.model_tester.check_transfo_xl_lm_head_output(output_result)
 
-    @pytest.mark.slow
+    @slow
     def test_model_from_pretrained(self):
         cache_dir = "/tmp/transformers_test/"
         for model_name in list(TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
diff --git a/transformers/tests/modeling_xlm_test.py b/transformers/tests/modeling_xlm_test.py
index 0133febb58..f6b980767c 100644
--- a/transformers/tests/modeling_xlm_test.py
+++ b/transformers/tests/modeling_xlm_test.py
@@ -18,7 +18,6 @@ from __future__ import print_function
 
 import unittest
 import shutil
-import pytest
 
 from transformers import is_torch_available
 
@@ -26,13 +25,13 @@ if is_torch_available():
     from transformers import (XLMConfig, XLMModel, XLMWithLMHeadModel, XLMForQuestionAnswering,
                                       XLMForSequenceClassification, XLMForQuestionAnsweringSimple)
     from transformers.modeling_xlm import XLM_PRETRAINED_MODEL_ARCHIVE_MAP
-else:
-    pytestmark = pytest.mark.skip("Require Torch")
 
 from .modeling_common_test import (CommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
+from .utils import require_torch, slow, torch_device
 
 
+@require_torch
 class XLMModelTest(CommonTestCases.CommonModelTester):
 
     all_model_classes = (XLMModel, XLMWithLMHeadModel, XLMForQuestionAnswering,
@@ -148,6 +147,7 @@ class XLMModelTest(CommonTestCases.CommonModelTester):
 
         def create_and_check_xlm_model(self, config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, is_impossible_labels, input_mask):
             model = XLMModel(config=config)
+            model.to(torch_device)
             model.eval()
             outputs = model(input_ids, lengths=input_lengths, langs=token_type_ids)
             outputs = model(input_ids, langs=token_type_ids)
@@ -163,6 +163,7 @@ class XLMModelTest(CommonTestCases.CommonModelTester):
 
         def create_and_check_xlm_lm_head(self, config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, is_impossible_labels, input_mask):
             model = XLMWithLMHeadModel(config)
+            model.to(torch_device)
             model.eval()
 
             loss, logits = model(input_ids, token_type_ids=token_type_ids, labels=token_labels)
@@ -182,6 +183,7 @@ class XLMModelTest(CommonTestCases.CommonModelTester):
 
         def create_and_check_xlm_simple_qa(self, config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, is_impossible_labels, input_mask):
             model = XLMForQuestionAnsweringSimple(config)
+            model.to(torch_device)
             model.eval()
 
             outputs = model(input_ids)
@@ -206,6 +208,7 @@ class XLMModelTest(CommonTestCases.CommonModelTester):
 
         def create_and_check_xlm_qa(self, config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, is_impossible_labels, input_mask):
             model = XLMForQuestionAnswering(config)
+            model.to(torch_device)
             model.eval()
 
             outputs = model(input_ids)
@@ -260,6 +263,7 @@ class XLMModelTest(CommonTestCases.CommonModelTester):
 
         def create_and_check_xlm_sequence_classif(self, config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, is_impossible_labels, input_mask):
             model = XLMForSequenceClassification(config)
+            model.to(torch_device)
             model.eval()
 
             (logits,) = model(input_ids)
@@ -312,7 +316,7 @@ class XLMModelTest(CommonTestCases.CommonModelTester):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_xlm_sequence_classif(*config_and_inputs)
 
-    @pytest.mark.slow
+    @slow
     def test_model_from_pretrained(self):
         cache_dir = "/tmp/transformers_test/"
         for model_name in list(XLM_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
diff --git a/transformers/tests/modeling_xlnet_test.py b/transformers/tests/modeling_xlnet_test.py
index 38888d4488..56b6bb3f4d 100644
--- a/transformers/tests/modeling_xlnet_test.py
+++ b/transformers/tests/modeling_xlnet_test.py
@@ -21,7 +21,6 @@ import unittest
 import json
 import random
 import shutil
-import pytest
 
 from transformers import is_torch_available
 
@@ -31,12 +30,13 @@ if is_torch_available():
     from transformers import (XLNetConfig, XLNetModel, XLNetLMHeadModel, XLNetForSequenceClassification,
                               XLNetForTokenClassification, XLNetForQuestionAnswering)
     from transformers.modeling_xlnet import XLNET_PRETRAINED_MODEL_ARCHIVE_MAP
-else:
-    pytestmark = pytest.mark.skip("Require Torch")
 
 from .modeling_common_test import (CommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
+from .utils import require_torch, slow, torch_device
 
+
+@require_torch
 class XLNetModelTest(CommonTestCases.CommonModelTester):
 
     all_model_classes=(XLNetModel, XLNetLMHeadModel, XLNetForTokenClassification,
@@ -100,9 +100,9 @@ class XLNetModelTest(CommonTestCases.CommonModelTester):
             input_mask = ids_tensor([self.batch_size, self.seq_length], 2).float()
 
             input_ids_q = ids_tensor([self.batch_size, self.seq_length + 1], self.vocab_size)
-            perm_mask = torch.zeros(self.batch_size, self.seq_length + 1, self.seq_length + 1, dtype=torch.float)
+            perm_mask = torch.zeros(self.batch_size, self.seq_length + 1, self.seq_length + 1, dtype=torch.float, device=torch_device)
             perm_mask[:, :, -1] = 1.0  # Previous tokens don't see last token
-            target_mapping = torch.zeros(self.batch_size, 1, self.seq_length + 1, dtype=torch.float)
+            target_mapping = torch.zeros(self.batch_size, 1, self.seq_length + 1, dtype=torch.float, device=torch_device)
             target_mapping[:, 0, -1] = 1.0  # predict last token
 
             sequence_labels = None
@@ -141,6 +141,7 @@ class XLNetModelTest(CommonTestCases.CommonModelTester):
         def create_and_check_xlnet_base_model(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask,
                 target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels, token_labels):
             model = XLNetModel(config)
+            model.to(torch_device)
             model.eval()
 
             _, _ = model(input_ids_1, input_mask=input_mask)
@@ -155,6 +156,7 @@ class XLNetModelTest(CommonTestCases.CommonModelTester):
 
             config.mem_len = 0
             model = XLNetModel(config)
+            model.to(torch_device)
             model.eval()
             no_mems_outputs = model(input_ids_1)
             self.parent.assertEqual(len(no_mems_outputs), 1)
@@ -169,6 +171,7 @@ class XLNetModelTest(CommonTestCases.CommonModelTester):
         def create_and_check_xlnet_base_model_with_att_output(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask,
                     target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels, token_labels):
             model = XLNetModel(config)
+            model.to(torch_device)
             model.eval()
 
             _, _, attentions = model(input_ids_1, target_mapping=target_mapping)
@@ -181,6 +184,7 @@ class XLNetModelTest(CommonTestCases.CommonModelTester):
         def create_and_check_xlnet_lm_head(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask,
                 target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels, token_labels):
             model = XLNetLMHeadModel(config)
+            model.to(torch_device)
             model.eval()
 
             loss_1, all_logits_1, mems_1 = model(input_ids_1, token_type_ids=segment_ids, labels=lm_labels)
@@ -221,6 +225,7 @@ class XLNetModelTest(CommonTestCases.CommonModelTester):
         def create_and_check_xlnet_qa(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask,
                 target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels, token_labels):
             model = XLNetForQuestionAnswering(config)
+            model.to(torch_device)
             model.eval()
 
             outputs = model(input_ids_1)
@@ -279,6 +284,7 @@ class XLNetModelTest(CommonTestCases.CommonModelTester):
         def create_and_check_xlnet_token_classif(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask,
                 target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels, token_labels):
             model = XLNetForTokenClassification(config)
+            model.to(torch_device)
             model.eval()
 
             logits, mems_1 = model(input_ids_1)
@@ -311,6 +317,7 @@ class XLNetModelTest(CommonTestCases.CommonModelTester):
         def create_and_check_xlnet_sequence_classif(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask,
                 target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels, token_labels):
             model = XLNetForSequenceClassification(config)
+            model.to(torch_device)
             model.eval()
 
             logits, mems_1 = model(input_ids_1)
@@ -362,7 +369,7 @@ class XLNetModelTest(CommonTestCases.CommonModelTester):
     def test_xlnet_lm_head(self):
         self.model_tester.set_seed()
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_xlnet_lm_head(*config_and_inputs) 
+        self.model_tester.create_and_check_xlnet_lm_head(*config_and_inputs)
 
     def test_xlnet_sequence_classif(self):
         self.model_tester.set_seed()
@@ -379,7 +386,7 @@ class XLNetModelTest(CommonTestCases.CommonModelTester):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_xlnet_qa(*config_and_inputs)
 
-    @pytest.mark.slow
+    @slow
     def test_model_from_pretrained(self):
         cache_dir = "/tmp/transformers_test/"
         for model_name in list(XLNET_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
diff --git a/transformers/tests/optimization_test.py b/transformers/tests/optimization_test.py
index ab9afbfcf7..cc10ad5908 100644
--- a/transformers/tests/optimization_test.py
+++ b/transformers/tests/optimization_test.py
@@ -18,7 +18,6 @@ from __future__ import print_function
 
 import unittest
 import os
-import pytest
 
 from transformers import is_torch_available
 
@@ -31,10 +30,9 @@ if is_torch_available():
                               get_cosine_schedule_with_warmup,
                               get_cosine_with_hard_restarts_schedule_with_warmup,
                               get_linear_schedule_with_warmup)
-else:
-    pytestmark = pytest.mark.skip("Require Torch")
 
 from .tokenization_tests_commons import TemporaryDirectory
+from .utils import require_torch
 
 
 def unwrap_schedule(scheduler, num_steps=10):
@@ -58,6 +56,7 @@ def unwrap_and_save_reload_schedule(scheduler, num_steps=10):
                 scheduler.load_state_dict(state_dict)
     return lrs
 
+@require_torch
 class OptimizationTest(unittest.TestCase):
 
     def assertListAlmostEqual(self, list1, list2, tol):
@@ -80,6 +79,7 @@ class OptimizationTest(unittest.TestCase):
         self.assertListAlmostEqual(w.tolist(), [0.4, 0.2, -0.5], tol=1e-2)
 
 
+@require_torch
 class ScheduleInitTest(unittest.TestCase):
     m = torch.nn.Linear(50, 50) if is_torch_available() else None
     optimizer = AdamW(m.parameters(), lr=10.) if is_torch_available() else None
diff --git a/transformers/tests/tokenization_auto_test.py b/transformers/tests/tokenization_auto_test.py
index 79370811e8..18346d2768 100644
--- a/transformers/tests/tokenization_auto_test.py
+++ b/transformers/tests/tokenization_auto_test.py
@@ -18,15 +18,16 @@ from __future__ import print_function
 
 import unittest
 import shutil
-import pytest
 import logging
 
 from transformers import AutoTokenizer, BertTokenizer, AutoTokenizer, GPT2Tokenizer
 from transformers import BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP
 
+from .utils import slow
+
 
 class AutoTokenizerTest(unittest.TestCase):
-    @pytest.mark.slow
+    @slow
     def test_tokenizer_from_pretrained(self):
         logging.basicConfig(level=logging.INFO)
         for model_name in list(BERT_PRETRAINED_CONFIG_ARCHIVE_MAP.keys())[:1]:
diff --git a/transformers/tests/tokenization_bert_test.py b/transformers/tests/tokenization_bert_test.py
index 73ea38e20a..f390248956 100644
--- a/transformers/tests/tokenization_bert_test.py
+++ b/transformers/tests/tokenization_bert_test.py
@@ -16,7 +16,6 @@ from __future__ import absolute_import, division, print_function, unicode_litera
 
 import os
 import unittest
-import pytest
 from io import open
 
 from transformers.tokenization_bert import (BasicTokenizer,
@@ -26,6 +25,7 @@ from transformers.tokenization_bert import (BasicTokenizer,
                                                     _is_whitespace, VOCAB_FILES_NAMES)
 
 from .tokenization_tests_commons import CommonTestCases
+from .utils import slow
 
 class BertTokenizationTest(CommonTestCases.CommonTokenizerTester):
 
@@ -126,7 +126,7 @@ class BertTokenizationTest(CommonTestCases.CommonTokenizerTester):
         self.assertFalse(_is_punctuation(u"A"))
         self.assertFalse(_is_punctuation(u" "))
 
-    @pytest.mark.slow
+    @slow
     def test_sequence_builders(self):
         tokenizer = self.tokenizer_class.from_pretrained("bert-base-uncased")
 
diff --git a/transformers/tests/tokenization_distilbert_test.py b/transformers/tests/tokenization_distilbert_test.py
index 77a487651d..e815eca672 100644
--- a/transformers/tests/tokenization_distilbert_test.py
+++ b/transformers/tests/tokenization_distilbert_test.py
@@ -16,13 +16,13 @@ from __future__ import absolute_import, division, print_function, unicode_litera
 
 import os
 import unittest
-import pytest
 from io import open
 
 from transformers.tokenization_distilbert import (DistilBertTokenizer)
 
 from .tokenization_tests_commons import CommonTestCases
 from .tokenization_bert_test import BertTokenizationTest
+from .utils import slow
 
 class DistilBertTokenizationTest(BertTokenizationTest):
 
@@ -31,7 +31,7 @@ class DistilBertTokenizationTest(BertTokenizationTest):
     def get_tokenizer(self, **kwargs):
         return DistilBertTokenizer.from_pretrained(self.tmpdirname, **kwargs)
 
-    @pytest.mark.slow
+    @slow
     def test_sequence_builders(self):
         tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
 
diff --git a/transformers/tests/tokenization_roberta_test.py b/transformers/tests/tokenization_roberta_test.py
index a27bf7d654..8ad0b59511 100644
--- a/transformers/tests/tokenization_roberta_test.py
+++ b/transformers/tests/tokenization_roberta_test.py
@@ -17,11 +17,11 @@ from __future__ import absolute_import, division, print_function, unicode_litera
 import os
 import json
 import unittest
-import pytest
 from io import open
 
 from transformers.tokenization_roberta import RobertaTokenizer, VOCAB_FILES_NAMES
 from .tokenization_tests_commons import CommonTestCases
+from .utils import slow
 
 
 class RobertaTokenizationTest(CommonTestCases.CommonTokenizerTester):
@@ -79,7 +79,7 @@ class RobertaTokenizationTest(CommonTestCases.CommonTokenizerTester):
             [0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]
         )
 
-    @pytest.mark.slow
+    @slow
     def test_sequence_builders(self):
         tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
 
diff --git a/transformers/tests/tokenization_tests_commons.py b/transformers/tests/tokenization_tests_commons.py
index 97cd555df3..faff003f4b 100644
--- a/transformers/tests/tokenization_tests_commons.py
+++ b/transformers/tests/tokenization_tests_commons.py
@@ -102,9 +102,11 @@ class CommonTestCases:
             with TemporaryDirectory() as tmpdirname:
 
                 filename = os.path.join(tmpdirname, u"tokenizer.bin")
-                pickle.dump(tokenizer, open(filename, "wb"))
+                with open(filename, "wb") as handle:
+                    pickle.dump(tokenizer, handle)
 
-                tokenizer_new = pickle.load(open(filename, "rb"))
+                with open(filename, "rb") as handle:
+                    tokenizer_new = pickle.load(handle)
 
             subwords_loaded = tokenizer_new.tokenize(text)
 
diff --git a/transformers/tests/tokenization_transfo_xl_test.py b/transformers/tests/tokenization_transfo_xl_test.py
index 4e99484b0c..5495ebd3a6 100644
--- a/transformers/tests/tokenization_transfo_xl_test.py
+++ b/transformers/tests/tokenization_transfo_xl_test.py
@@ -16,7 +16,6 @@ from __future__ import absolute_import, division, print_function, unicode_litera
 
 import os
 import unittest
-import pytest
 from io import open
 
 from transformers import is_torch_available
@@ -24,11 +23,12 @@ from transformers import is_torch_available
 if is_torch_available():
     import torch
     from transformers.tokenization_transfo_xl import TransfoXLTokenizer, VOCAB_FILES_NAMES
-else:
-    pytestmark = pytest.mark.skip("Require Torch")  # TODO: untangle Transfo-XL tokenizer from torch.load and torch.save
 
 from .tokenization_tests_commons import CommonTestCases
+from .utils import require_torch
 
+
+@require_torch
 class TransfoXLTokenizationTest(CommonTestCases.CommonTokenizerTester):
 
     tokenizer_class = TransfoXLTokenizer if is_torch_available() else None
diff --git a/transformers/tests/tokenization_utils_test.py b/transformers/tests/tokenization_utils_test.py
index 8630191c69..ff3f80ff7d 100644
--- a/transformers/tests/tokenization_utils_test.py
+++ b/transformers/tests/tokenization_utils_test.py
@@ -18,13 +18,14 @@ from __future__ import print_function
 
 import unittest
 import six
-import pytest
 
 from transformers import PreTrainedTokenizer
 from transformers.tokenization_gpt2 import GPT2Tokenizer
 
+from .utils import slow
+
 class TokenizerUtilsTest(unittest.TestCase):
-    @pytest.mark.slow
+
     def check_tokenizer_from_pretrained(self, tokenizer_class):
         s3_models = list(tokenizer_class.max_model_input_sizes.keys())
         for model_name in s3_models[:1]:
@@ -41,6 +42,7 @@ class TokenizerUtilsTest(unittest.TestCase):
                 special_tok_id = tokenizer.convert_tokens_to_ids(special_tok)
                 self.assertIsInstance(special_tok_id, int)
 
+    @slow
     def test_pretrained_tokenizers(self):
         self.check_tokenizer_from_pretrained(GPT2Tokenizer)
 
diff --git a/transformers/tests/tokenization_xlm_test.py b/transformers/tests/tokenization_xlm_test.py
index 3ff6564e34..7582a46662 100644
--- a/transformers/tests/tokenization_xlm_test.py
+++ b/transformers/tests/tokenization_xlm_test.py
@@ -17,11 +17,11 @@ from __future__ import absolute_import, division, print_function, unicode_litera
 import os
 import unittest
 import json
-import pytest
 
 from transformers.tokenization_xlm import XLMTokenizer, VOCAB_FILES_NAMES
 
 from .tokenization_tests_commons import CommonTestCases
+from .utils import slow
 
 class XLMTokenizationTest(CommonTestCases.CommonTokenizerTester):
 
@@ -67,7 +67,7 @@ class XLMTokenizationTest(CommonTestCases.CommonTokenizerTester):
         self.assertListEqual(
             tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
 
-    @pytest.mark.slow
+    @slow
     def test_sequence_builders(self):
         tokenizer = XLMTokenizer.from_pretrained("xlm-mlm-en-2048")
 
diff --git a/transformers/tests/tokenization_xlnet_test.py b/transformers/tests/tokenization_xlnet_test.py
index 2e14ffeb82..b68495a796 100644
--- a/transformers/tests/tokenization_xlnet_test.py
+++ b/transformers/tests/tokenization_xlnet_test.py
@@ -16,11 +16,11 @@ from __future__ import absolute_import, division, print_function, unicode_litera
 
 import os
 import unittest
-import pytest
 
 from transformers.tokenization_xlnet import (XLNetTokenizer, SPIECE_UNDERLINE)
 
 from .tokenization_tests_commons import CommonTestCases
+from .utils import slow
 
 SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                     'fixtures/test_sentencepiece.model')
@@ -90,7 +90,7 @@ class XLNetTokenizationTest(CommonTestCases.CommonTokenizerTester):
                                       u'9', u'2', u'0', u'0', u'0', u',', SPIECE_UNDERLINE + u'and', SPIECE_UNDERLINE + u'this',
                                       SPIECE_UNDERLINE + u'is', SPIECE_UNDERLINE + u'f', u'al', u'se', u'.'])
 
-    @pytest.mark.slow
+    @slow
     def test_sequence_builders(self):
         tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
 
diff --git a/transformers/tests/utils.py b/transformers/tests/utils.py
new file mode 100644
index 0000000000..7a51ab612b
--- /dev/null
+++ b/transformers/tests/utils.py
@@ -0,0 +1,64 @@
+import os
+import unittest
+
+from distutils.util import strtobool
+
+from transformers.file_utils import _tf_available, _torch_available
+
+
+try:
+    run_slow = os.environ["RUN_SLOW"]
+except KeyError:
+    # RUN_SLOW isn't set, default to skipping slow tests.
+    _run_slow_tests = False
+else:
+    # RUN_SLOW is set; strtobool maps it to 1 (truthy) or 0 (falsy).
+    try:
+        _run_slow_tests = strtobool(run_slow)
+    except ValueError:
+        # strtobool accepts more spellings, but keep the message simple.
+        raise ValueError("If set, RUN_SLOW must be yes or no.")
+
+
+def slow(test_case):
+    """
+    Mark a test as slow.
+
+    Slow tests are skipped unless the RUN_SLOW environment variable is set
+    to a truthy value, in which case the test is returned unchanged.
+
+    """
+    if _run_slow_tests:
+        return test_case
+    return unittest.skip("test is slow")(test_case)
+
+
+def require_torch(test_case):
+    """
+    Decorator for tests that need PyTorch.
+
+    The decorated test is skipped when PyTorch isn't installed; otherwise
+    it is returned unchanged.
+
+    """
+    decorate = unittest.skipUnless(_torch_available, "test requires PyTorch")
+    return decorate(test_case)
+
+
+def require_tf(test_case):
+    """
+    Decorator for tests that need TensorFlow.
+
+    The decorated test is returned unchanged when TensorFlow is installed
+    and is wrapped in a unittest skip otherwise.
+
+    """
+    skip_without_tf = unittest.skip("test requires TensorFlow")
+    return test_case if _tf_available else skip_without_tf(test_case)
+
+
+# Device used by the PyTorch tests; stays None when PyTorch isn't installed.
+torch_device = None
+if _torch_available:
+    # Any non-empty USE_CUDA value selects the GPU; otherwise use the CPU.
+    torch_device = "cuda" if os.environ.get("USE_CUDA") else "cpu"
diff --git a/transformers/tokenization_albert.py b/transformers/tokenization_albert.py
index 40a4b29206..6b92d07218 100644
--- a/transformers/tokenization_albert.py
+++ b/transformers/tokenization_albert.py
@@ -141,7 +141,7 @@ class AlbertTokenizer(PreTrainedTokenizer):
             pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1)
         new_pieces = []
         for piece in pieces:
-            if len(piece) > 1 and piece[-1] == ',' and piece[-2].isdigit():
+            if len(piece) > 1 and piece[-1] == str(',') and piece[-2].isdigit():
                 cur_pieces = self.sp_model.EncodeAsPieces(
                     piece[:-1].replace(SPIECE_UNDERLINE, ''))
                 if piece[0] != SPIECE_UNDERLINE and cur_pieces[0][0] == SPIECE_UNDERLINE:
@@ -225,9 +225,9 @@ class AlbertTokenizer(PreTrainedTokenizer):
         """
         Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
         An ALBERT sequence pair mask has the following format:
-        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 
-        | first sequence    | second sequence     
-        
+        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1
+        | first sequence    | second sequence
+
         if token_ids_1 is None, only returns the first portion of the mask (0's).
         """
         sep = [self.sep_token_id]
diff --git a/transformers/tokenization_ctrl.py b/transformers/tokenization_ctrl.py
index 9454cbbaf3..219f17c404 100644
--- a/transformers/tokenization_ctrl.py
+++ b/transformers/tokenization_ctrl.py
@@ -133,9 +133,11 @@ class CTRLTokenizer(PreTrainedTokenizer):
         self.max_len_single_sentence = self.max_len # no default special tokens - you can update this value if you add special tokens
         self.max_len_sentences_pair = self.max_len # no default special tokens - you can update this value if you add special tokens
 
-        self.encoder = json.load(open(vocab_file, encoding="utf-8"))
+        with open(vocab_file, encoding="utf-8") as vocab_handle:
+            self.encoder = json.load(vocab_handle)
         self.decoder = {v:k for k,v in self.encoder.items()}
-        merges = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
+        with open(merges_file, encoding='utf-8') as merges_handle:
+            merges = merges_handle.read().split('\n')[1:-1]
         merges = [tuple(merge.split()) for merge in merges]
         self.bpe_ranks = dict(zip(merges, range(len(merges))))
         self.cache = {}
diff --git a/transformers/tokenization_gpt2.py b/transformers/tokenization_gpt2.py
index 5fda709448..68c6101860 100644
--- a/transformers/tokenization_gpt2.py
+++ b/transformers/tokenization_gpt2.py
@@ -72,7 +72,7 @@ def bytes_to_unicode():
     """
     Returns list of utf-8 byte and a mapping to unicode strings.
     We specifically avoids mapping to whitespace/control characters the bpe code barfs on.
-    
+
     The reversible bpe codes work on unicode strings.
     This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
     When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
@@ -122,13 +122,15 @@ class GPT2Tokenizer(PreTrainedTokenizer):
         self.max_len_single_sentence = self.max_len # no default special tokens - you can update this value if you add special tokens
         self.max_len_sentences_pair = self.max_len # no default special tokens - you can update this value if you add special tokens
 
-        self.encoder = json.load(open(vocab_file, encoding="utf-8"))
+        with open(vocab_file, encoding="utf-8") as vocab_handle:
+            self.encoder = json.load(vocab_handle)
         self.decoder = {v: k for k, v in self.encoder.items()}
         self.errors = errors  # how to handle errors in decoding
         self.byte_encoder = bytes_to_unicode()
         self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
-        bpe_data = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
-        bpe_merges = [tuple(merge.split()) for merge in bpe_data]
+        with open(merges_file, encoding='utf-8') as merges_handle:
+            bpe_merges = merges_handle.read().split('\n')[1:-1]
+        bpe_merges = [tuple(merge.split()) for merge in bpe_merges]
         self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
         self.cache = {}
 
@@ -234,4 +236,4 @@ class GPT2Tokenizer(PreTrainedTokenizer):
                 writer.write(' '.join(bpe_tokens) + u'\n')
                 index += 1
 
-        return vocab_file, merge_file
\ No newline at end of file
+        return vocab_file, merge_file
diff --git a/transformers/tokenization_openai.py b/transformers/tokenization_openai.py
index 0efbdb37c0..a4c64b7020 100644
--- a/transformers/tokenization_openai.py
+++ b/transformers/tokenization_openai.py
@@ -101,9 +101,11 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer):
             self.nlp = BasicTokenizer(do_lower_case=True)
             self.fix_text = None
 
-        self.encoder = json.load(open(vocab_file, encoding="utf-8"))
+        with open(vocab_file, encoding="utf-8") as vocab_handle:
+            self.encoder = json.load(vocab_handle)
         self.decoder = {v:k for k,v in self.encoder.items()}
-        merges = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
+        with open(merges_file, encoding='utf-8') as merges_handle:
+            merges = merges_handle.read().split('\n')[1:-1]
         merges = [tuple(merge.split()) for merge in merges]
         self.bpe_ranks = dict(zip(merges, range(len(merges))))
         self.cache = {}
diff --git a/transformers/tokenization_utils.py b/transformers/tokenization_utils.py
index 5d683629f0..4c6cbd8986 100644
--- a/transformers/tokenization_utils.py
+++ b/transformers/tokenization_utils.py
@@ -347,7 +347,7 @@ class PreTrainedTokenizer(object):
                     "We assumed '{}' was a path or url to a directory containing vocabulary files "
                     "named {} but couldn't find such vocabulary files at this path or url.".format(
                         pretrained_model_name_or_path, ', '.join(s3_models),
-                        pretrained_model_name_or_path, 
+                        pretrained_model_name_or_path,
                         list(cls.vocab_files_names.values())))
 
         # Get files from url, cache, or disk depending on the case
@@ -382,7 +382,8 @@ class PreTrainedTokenizer(object):
         # Did we saved some inputs and kwargs to reload ?
         tokenizer_config_file = resolved_vocab_files.pop('tokenizer_config_file', None)
         if tokenizer_config_file is not None:
-            init_kwargs = json.load(open(tokenizer_config_file, encoding="utf-8"))
+            with open(tokenizer_config_file, encoding="utf-8") as tokenizer_config_handle:
+                init_kwargs = json.load(tokenizer_config_handle)
             saved_init_inputs = init_kwargs.pop('init_inputs', ())
             if not init_inputs:
                 init_inputs = saved_init_inputs
@@ -407,7 +408,8 @@ class PreTrainedTokenizer(object):
             if args_name not in init_kwargs:
                 init_kwargs[args_name] = file_path
         if special_tokens_map_file is not None:
-            special_tokens_map = json.load(open(special_tokens_map_file, encoding="utf-8"))
+            with open(special_tokens_map_file, encoding="utf-8") as special_tokens_map_handle:
+                special_tokens_map = json.load(special_tokens_map_handle)
             for key, value in special_tokens_map.items():
                 if key not in init_kwargs:
                     init_kwargs[key] = value
@@ -421,7 +423,8 @@ class PreTrainedTokenizer(object):
 
         # Add supplementary tokens.
         if added_tokens_file is not None:
-            added_tok_encoder = json.load(open(added_tokens_file, encoding="utf-8"))
+            with open(added_tokens_file, encoding="utf-8") as added_tokens_handle:
+                added_tok_encoder = json.load(added_tokens_handle)
             added_tok_decoder = {v:k for k, v in added_tok_encoder.items()}
             tokenizer.added_tokens_encoder.update(added_tok_encoder)
             tokenizer.added_tokens_decoder.update(added_tok_decoder)
@@ -937,7 +940,7 @@ class PreTrainedTokenizer(object):
             logger.warning("Token indices sequence length is longer than the specified maximum sequence length "
                            "for this model ({} > {}). Running this sequence through the model will result in "
                            "indexing errors".format(len(ids), self.max_len))
-                           
+
         return encoded_inputs
 
     def truncate_sequences(self, ids, pair_ids=None, num_tokens_to_remove=0, truncation_strategy='longest_first', stride=0):
diff --git a/transformers/tokenization_xlm.py b/transformers/tokenization_xlm.py
index ba994dc356..6c9f8e5e5c 100644
--- a/transformers/tokenization_xlm.py
+++ b/transformers/tokenization_xlm.py
@@ -524,7 +524,7 @@ class XLMTokenizer(PreTrainedTokenizer):
 
         - argument ``special_tokens`` and function ``set_special_tokens``, can be used to add additional symbols \
         (ex: "__classify__") to a vocabulary
-        
+
         - `lang2id` attribute maps the languages supported by the model with their ids if provided (automatically set for pretrained vocabularies)
 
         - `id2lang` attributes does reverse mapping if provided (automatically set for pretrained vocabularies)
@@ -564,9 +564,11 @@ class XLMTokenizer(PreTrainedTokenizer):
         self.ja_word_tokenizer = None
         self.zh_word_tokenizer = None
 
-        self.encoder = json.load(open(vocab_file, encoding="utf-8"))
+        with open(vocab_file, encoding="utf-8") as vocab_handle:
+            self.encoder = json.load(vocab_handle)
         self.decoder = {v:k for k,v in self.encoder.items()}
-        merges = open(merges_file, encoding='utf-8').read().split('\n')[:-1]
+        with open(merges_file, encoding='utf-8') as merges_handle:
+            merges = merges_handle.read().split('\n')[:-1]
         merges = [tuple(merge.split()[:2]) for merge in merges]
         self.bpe_ranks = dict(zip(merges, range(len(merges))))
         self.cache = {}
diff --git a/transformers/tokenization_xlnet.py b/transformers/tokenization_xlnet.py
index c01fbbbeeb..8c86a5bd60 100644
--- a/transformers/tokenization_xlnet.py
+++ b/transformers/tokenization_xlnet.py
@@ -141,7 +141,7 @@ class XLNetTokenizer(PreTrainedTokenizer):
             pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1)
         new_pieces = []
         for piece in pieces:
-            if len(piece) > 1 and piece[-1] == ',' and piece[-2].isdigit():
+            if len(piece) > 1 and piece[-1] == str(',') and piece[-2].isdigit():
                 cur_pieces = self.sp_model.EncodeAsPieces(
                     piece[:-1].replace(SPIECE_UNDERLINE, ''))
                 if piece[0] != SPIECE_UNDERLINE and cur_pieces[0][0] == SPIECE_UNDERLINE:
@@ -227,7 +227,7 @@ class XLNetTokenizer(PreTrainedTokenizer):
         An XLNet sequence pair mask has the following format:
         0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 2
         | first sequence    | second sequence     | CLS segment ID
-        
+
         if token_ids_1 is None, only returns the first portion of the mask (0's).
         """
         sep = [self.sep_token_id]

From 2670b0d682746e1fe94ab9c7b4d2fd7f4af03193 Mon Sep 17 00:00:00 2001
From: Michael Watkins <me@michaelwatkins.eu>
Date: Wed, 4 Dec 2019 17:53:25 +0200
Subject: [PATCH 36/91] Fix bug which lowercases special tokens

---
 transformers/tests/tokenization_tests_commons.py |  8 +++++---
 transformers/tokenization_utils.py               | 15 +++++++++++++--
 2 files changed, 18 insertions(+), 5 deletions(-)

diff --git a/transformers/tests/tokenization_tests_commons.py b/transformers/tests/tokenization_tests_commons.py
index faff003f4b..d904f0067e 100644
--- a/transformers/tests/tokenization_tests_commons.py
+++ b/transformers/tests/tokenization_tests_commons.py
@@ -115,8 +115,10 @@ class CommonTestCases:
         def test_added_tokens_do_lower_case(self):
             tokenizer = self.get_tokenizer(do_lower_case=True)
 
-            text = "aaaaa bbbbbb low cccccccccdddddddd l"
-            text2 = "AAAAA BBBBBB low CCCCCCCCCDDDDDDDD l"
+            special_token = tokenizer.all_special_tokens[0]
+
+            text = special_token + " aaaaa bbbbbb low cccccccccdddddddd l " + special_token
+            text2 = special_token + " AAAAA BBBBBB low CCCCCCCCCDDDDDDDD l " + special_token
 
             toks0 = tokenizer.tokenize(text)  # toks before adding new_toks
 
@@ -141,7 +143,7 @@ class CommonTestCases:
 
             self.assertEqual(len(toks), len(toks2))  # Length should still be the same
             self.assertNotEqual(len(toks), len(toks0))
-            self.assertNotEqual(toks[0], toks2[0])  # But at least the first tokens should differ
+            self.assertNotEqual(toks[1], toks2[1])  # But at least the first non-special tokens should differ
 
         def test_add_tokens_tokenizer(self):
             tokenizer = self.get_tokenizer()
diff --git a/transformers/tokenization_utils.py b/transformers/tokenization_utils.py
index 4c6cbd8986..eb22c50ebd 100644
--- a/transformers/tokenization_utils.py
+++ b/transformers/tokenization_utils.py
@@ -22,6 +22,7 @@ import json
 import six
 import copy
 import itertools
+import re
 from io import open
 
 from .file_utils import cached_path, is_tf_available, is_torch_available
@@ -520,7 +521,7 @@ class PreTrainedTokenizer(object):
         to_add_tokens = []
         for token in new_tokens:
             assert isinstance(token, str) or (six.PY2 and isinstance(token, unicode))
-            if self.init_kwargs.get('do_lower_case', False):
+            if self.init_kwargs.get('do_lower_case', False) and token not in self.all_special_tokens:
                 token = token.lower()
             if token != self.unk_token and \
                     self.convert_tokens_to_ids(token) == self.convert_tokens_to_ids(self.unk_token) and \
@@ -615,8 +616,18 @@ class PreTrainedTokenizer(object):
 
             Take care of added tokens.
         """
+        def lowercase_text(t):
+            # convert non-special tokens to lowercase
+            escaped_special_toks = [re.escape(s_tok) for s_tok in self.all_special_tokens]
+            pattern = r'(^' + r'|'.join(escaped_special_toks) + r')|' + \
+                      r'(.+?)'
+            return re.sub(
+                pattern,
+                lambda m: m.groups()[0] or m.groups()[1].lower(),
+                t)
+
         if self.init_kwargs.get('do_lower_case', False):
-            text = text.lower()
+            text = lowercase_text(text)
 
         def split_on_token(tok, text):
             result = []

From 0cb163865a4c761c226b151283309eedb2b1ca4d Mon Sep 17 00:00:00 2001
From: Aymeric Augustin <aymeric.augustin@fractalideas.com>
Date: Sat, 7 Dec 2019 13:46:14 +0100
Subject: [PATCH 37/91] Remove pytest dependency. (#2093)

---
 transformers/tests/optimization_tf_test.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/transformers/tests/optimization_tf_test.py b/transformers/tests/optimization_tf_test.py
index ac5109cb56..515d12a158 100644
--- a/transformers/tests/optimization_tf_test.py
+++ b/transformers/tests/optimization_tf_test.py
@@ -3,18 +3,19 @@ from __future__ import division
 from __future__ import print_function
 
 import unittest
-import pytest
 
 from transformers import is_tf_available
 
+from .utils import require_tf
+
 if is_tf_available():
     import tensorflow as tf
     from tensorflow.python.eager import context
     from tensorflow.python.framework import ops
     from transformers import (create_optimizer, GradientAccumulator)
-else:
-    pytestmark = pytest.mark.skip("Require TensorFlow")
 
+
+@require_tf
 class OptimizationFTest(unittest.TestCase):
     def assertListAlmostEqual(self, list1, list2, tol):
         self.assertEqual(len(list1), len(list2))

From 3520be7824ad11ebc05a393fd90ecfdd4203cfdb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remilouf@gmail.com>
Date: Mon, 9 Dec 2019 11:13:09 +0100
Subject: [PATCH 38/91] create encoder attention mask from shape of hidden
 states

We currently create encoder attention masks (when they're not provided)
based on the shape of the decoder's own inputs. This is wrong: the
encoder and decoder sequences can be of different lengths. We now create
the encoder attention mask based on the batch_size and sequence_length
of the encoder hidden states.
---
 transformers/modeling_bert.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/transformers/modeling_bert.py b/transformers/modeling_bert.py
index 1ee3e3f097..8295cf4664 100644
--- a/transformers/modeling_bert.py
+++ b/transformers/modeling_bert.py
@@ -691,17 +691,19 @@ class BertModel(BertPreTrainedModel):
 
         # If a 2D ou 3D attention mask is provided for the cross-attention
         # we need to make broadcastabe to [batch_size, num_heads, seq_length, seq_length]
-        if self.config.is_decoder:
+        if self.config.is_decoder and encoder_hidden_states is not None:
+            encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
+            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
             if encoder_attention_mask is None:
-                encoder_attention_mask = torch.ones(input_shape, device=device)
+                encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
 
             if encoder_attention_mask.dim() == 3:
                 encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :]
             elif encoder_attention_mask.dim() == 2:
                 encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :]
             else:
-                raise ValueError("Wrong shape for input_ids (shape {}) or encoder_attention_mask (shape {})".format(input_shape,
-                                                                                                                    encoder_attention_mask.shape))
+                raise ValueError("Wrong shape for encoder_hidden_shape (shape {}) or encoder_attention_mask (shape {})".format(encoder_hidden_shape,
+                                                                                                                               encoder_attention_mask.shape))
 
             encoder_extended_attention_mask = encoder_extended_attention_mask.to(dtype=next(self.parameters()).dtype)  # fp16 compatibility
             encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -10000.0

From 2a4ef098d65939d436e2a5efbb518fb807b6b1b6 Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Mon, 9 Dec 2019 10:46:47 -0500
Subject: [PATCH 39/91] Add ALBERT and XLM to SQuAD script

---
 examples/run_squad.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/examples/run_squad.py b/examples/run_squad.py
index a8ac1d1b05..2df29014ef 100644
--- a/examples/run_squad.py
+++ b/examples/run_squad.py
@@ -44,7 +44,9 @@ from transformers import (WEIGHTS_NAME, BertConfig,
                                   XLNetForQuestionAnswering,
                                   XLNetTokenizer,
                                   DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer,
-                                  AlbertConfig, AlbertForQuestionAnswering, AlbertTokenizer)
+                                  AlbertConfig, AlbertForQuestionAnswering, AlbertTokenizer,
+                                  XLMConfig, XLMForQuestionAnswering, XLMTokenizer,
+                                  )
 
 from transformers import AdamW, get_linear_schedule_with_warmup, squad_convert_examples_to_features
 
@@ -58,7 +60,8 @@ MODEL_CLASSES = {
     'xlnet': (XLNetConfig, XLNetForQuestionAnswering, XLNetTokenizer),
     'xlm': (XLMConfig, XLMForQuestionAnswering, XLMTokenizer),
     'distilbert': (DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer),
-    'albert': (AlbertConfig, AlbertForQuestionAnswering, AlbertTokenizer)
+    'albert': (AlbertConfig, AlbertForQuestionAnswering, AlbertTokenizer),
+    'xlm': (XLMConfig, XLMForQuestionAnswering, XLMTokenizer)
 }
 
 def set_seed(args):

From f71b1bb05a20879953d57bf648ab7bbd2b3239bc Mon Sep 17 00:00:00 2001
From: Bilal Khan <bk@tinymanager.com>
Date: Wed, 27 Nov 2019 08:39:00 -0600
Subject: [PATCH 40/91] Save optimizer state, scheduler state and current epoch

---
 examples/run_lm_finetuning.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/examples/run_lm_finetuning.py b/examples/run_lm_finetuning.py
index a5eaf524ac..3cae206460 100644
--- a/examples/run_lm_finetuning.py
+++ b/examples/run_lm_finetuning.py
@@ -224,7 +224,7 @@ def train(args, train_dataset, model, tokenizer):
     model.zero_grad()
     train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
     set_seed(args)  # Added here for reproducibility (even between python 2 and 3)
-    for _ in train_iterator:
+    for epoch in train_iterator:
         epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
         for step, batch in enumerate(epoch_iterator):
             inputs, labels = mask_tokens(batch, tokenizer, args) if args.mlm else (batch, batch)
@@ -279,6 +279,10 @@ def train(args, train_dataset, model, tokenizer):
 
                     _rotate_checkpoints(args, checkpoint_prefix)
 
+                    torch.save(optimizer.state_dict(), os.path.join(output_dir, 'optimizer.pt'))
+                    torch.save(scheduler.state_dict(), os.path.join(output_dir, 'scheduler.pt'))
+                    torch.save(epoch, os.path.join(output_dir, 'training_state.pt'))
+
             if args.max_steps > 0 and global_step > args.max_steps:
                 epoch_iterator.close()
                 break

From a03fcf570de4a90218efd4b3de253d4648fe24b1 Mon Sep 17 00:00:00 2001
From: Bilal Khan <bk@tinymanager.com>
Date: Wed, 27 Nov 2019 18:42:07 -0600
Subject: [PATCH 41/91] Save tokenizer after each epoch to be able to resume
 training from a checkpoint

---
 examples/run_lm_finetuning.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/examples/run_lm_finetuning.py b/examples/run_lm_finetuning.py
index 3cae206460..1d93aa4381 100644
--- a/examples/run_lm_finetuning.py
+++ b/examples/run_lm_finetuning.py
@@ -274,6 +274,8 @@ def train(args, train_dataset, model, tokenizer):
                         os.makedirs(output_dir)
                     model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
                     model_to_save.save_pretrained(output_dir)
+                    tokenizer.save_pretrained(output_dir)
+
                     torch.save(args, os.path.join(output_dir, 'training_args.bin'))
                     logger.info("Saving model checkpoint to %s", output_dir)
 
@@ -282,6 +284,7 @@ def train(args, train_dataset, model, tokenizer):
                     torch.save(optimizer.state_dict(), os.path.join(output_dir, 'optimizer.pt'))
                     torch.save(scheduler.state_dict(), os.path.join(output_dir, 'scheduler.pt'))
                     torch.save(epoch, os.path.join(output_dir, 'training_state.pt'))
+                    logger.info("Saving training state to %s", output_dir)
 
             if args.max_steps > 0 and global_step > args.max_steps:
                 epoch_iterator.close()

From 0eb973b0d99e5c219af8a93b6267bda00c7161c6 Mon Sep 17 00:00:00 2001
From: Bilal Khan <bk@tinymanager.com>
Date: Wed, 27 Nov 2019 19:10:24 -0600
Subject: [PATCH 42/91] Use saved optimizer and scheduler states if available

---
 examples/run_lm_finetuning.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/examples/run_lm_finetuning.py b/examples/run_lm_finetuning.py
index 1d93aa4381..9bdbf9ca56 100644
--- a/examples/run_lm_finetuning.py
+++ b/examples/run_lm_finetuning.py
@@ -188,6 +188,13 @@ def train(args, train_dataset, model, tokenizer):
         ]
     optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
     scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)
+
+    # Check if saved optimizer or scheduler states exist
+    if os.path.isfile(os.path.join(args.model_name_or_path, 'optimizer.pt')) and os.path.isfile(os.path.join(args.model_name_or_path, 'scheduler.pt')):
+        # Load in optimizer and scheduler states
+        optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, 'optimizer.pt')))
+        scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, 'scheduler.pt')))
+
     if args.fp16:
         try:
             from apex import amp

From 2d73591a1831e80d0743b514d7f0138c4879e37b Mon Sep 17 00:00:00 2001
From: Bilal Khan <bk@tinymanager.com>
Date: Wed, 27 Nov 2019 19:13:10 -0600
Subject: [PATCH 43/91] Stop saving current epoch

---
 examples/run_lm_finetuning.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/examples/run_lm_finetuning.py b/examples/run_lm_finetuning.py
index 9bdbf9ca56..5e7683b85d 100644
--- a/examples/run_lm_finetuning.py
+++ b/examples/run_lm_finetuning.py
@@ -290,8 +290,7 @@ def train(args, train_dataset, model, tokenizer):
 
                     torch.save(optimizer.state_dict(), os.path.join(output_dir, 'optimizer.pt'))
                     torch.save(scheduler.state_dict(), os.path.join(output_dir, 'scheduler.pt'))
-                    torch.save(epoch, os.path.join(output_dir, 'training_state.pt'))
-                    logger.info("Saving training state to %s", output_dir)
+                    logger.info("Saving optimizer and scheduler states to %s", output_dir)
 
             if args.max_steps > 0 and global_step > args.max_steps:
                 epoch_iterator.close()

From 9626e0458c20b61c18c9564ecc4d1261a4a66e50 Mon Sep 17 00:00:00 2001
From: Bilal Khan <bk@tinymanager.com>
Date: Wed, 27 Nov 2019 20:00:16 -0600
Subject: [PATCH 44/91] Add functionality to continue training from last saved
 global_step

---
 examples/run_lm_finetuning.py | 22 +++++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

diff --git a/examples/run_lm_finetuning.py b/examples/run_lm_finetuning.py
index 5e7683b85d..172d4e20e2 100644
--- a/examples/run_lm_finetuning.py
+++ b/examples/run_lm_finetuning.py
@@ -223,17 +223,37 @@ def train(args, train_dataset, model, tokenizer):
     logger.info("  Total optimization steps = %d", t_total)
 
     global_step = 0
+    epochs_trained = 0
+    steps_trained_in_current_epoch = 0
+    # Check if continuing training from a checkpoint
+    if os.path.exists(args.model_name_or_path):
+        # set global_step to gobal_step of last saved checkpoint from model path
+        global_step = int(args.model_name_or_path.split('-')[-1].split('/')[0])
+        epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
+        steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)
+
+        logger.info("  Continuing training from checkpoint, will skip to saved global_step")
+        logger.info("  Continuing training from epoch %d", epochs_trained)
+        logger.info("  Continuing training from global step %d", global_step)
+        logger.info("  Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)
+
     tr_loss, logging_loss = 0.0, 0.0
 
     model_to_resize = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
     model_to_resize.resize_token_embeddings(len(tokenizer))
 
     model.zero_grad()
-    train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
+    train_iterator = trange(epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
     set_seed(args)  # Added here for reproducibility (even between python 2 and 3)
     for epoch in train_iterator:
         epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
         for step, batch in enumerate(epoch_iterator):
+            
+            # Skip past any already trained steps if resuming training
+            if steps_trained_in_current_epoch > 0:
+                steps_trained_in_current_epoch -= 1
+                continue
+
             inputs, labels = mask_tokens(batch, tokenizer, args) if args.mlm else (batch, batch)
             inputs = inputs.to(args.device)
             labels = labels.to(args.device)

From 79526f82f5d6757812f3691949cf03b864697f46 Mon Sep 17 00:00:00 2001
From: Bilal Khan <bk@tinymanager.com>
Date: Thu, 28 Nov 2019 19:20:29 -0600
Subject: [PATCH 45/91] Remove unnecessary epoch variable

---
 examples/run_lm_finetuning.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/run_lm_finetuning.py b/examples/run_lm_finetuning.py
index 172d4e20e2..c4c73e71af 100644
--- a/examples/run_lm_finetuning.py
+++ b/examples/run_lm_finetuning.py
@@ -245,7 +245,7 @@ def train(args, train_dataset, model, tokenizer):
     model.zero_grad()
     train_iterator = trange(epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
     set_seed(args)  # Added here for reproducibility (even between python 2 and 3)
-    for epoch in train_iterator:
+    for _ in train_iterator:
         epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
         for step, batch in enumerate(epoch_iterator):
             

From 5c877fe94a1cbb70132515a9da6a464bf6da49ed Mon Sep 17 00:00:00 2001
From: Pierric Cistac <pierric@huggingface.co>
Date: Mon, 9 Dec 2019 18:53:00 -0500
Subject: [PATCH 46/91] fix albert links

---
 README.md                         |  2 +-
 docs/source/index.rst             |  2 +-
 docs/source/pretrained_models.rst | 16 ++++++++--------
 3 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/README.md b/README.md
index 64ec631651..f3aa8a95ee 100644
--- a/README.md
+++ b/README.md
@@ -143,7 +143,7 @@ At some point in the future, you'll be able to seamlessly move from pre-training
 8. **[DistilBERT](https://github.com/huggingface/transformers/tree/master/examples/distillation)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/master/examples/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/master/examples/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/master/examples/distillation) and a German version of DistilBERT.
 9. **[CTRL](https://github.com/salesforce/ctrl/)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
 10. **[CamemBERT](https://camembert-model.fr)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot.
-11. **[ALBERT](https://github.com/google-research/google-research/tree/master/albert)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut.
+11. **[ALBERT](https://github.com/google-research/ALBERT)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut.
 11. Want to contribute a new model? We have added a **detailed guide and templates** to guide you in the process of adding a new model. You can find them in the [`templates`](./templates) folder of the repository. Be sure to check the [contributing guidelines](./CONTRIBUTING.md) and contact the maintainers or open an issue to collect feedbacks before starting your PR.
 
 These implementations have been tested on several datasets (see the example scripts) and should match the performances of the original implementations (e.g. ~93 F1 on SQuAD for BERT Whole-Word-Masking, ~88 F1 on RocStories for OpenAI GPT, ~18.3 perplexity on WikiText 103 for Transformer-XL, ~0.916 Peason R coefficient on STS-B for XLNet). You can find more details on the performances in the Examples section of the [documentation](https://huggingface.co/transformers/examples.html).
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 55ead33b4d..84012fc6cf 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -49,7 +49,7 @@ The library currently contains PyTorch and Tensorflow implementations, pre-train
 8. `DistilBERT <https://huggingface.co/transformers/model_doc/distilbert.html>`_ (from HuggingFace) released together with the paper `DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter <https://arxiv.org/abs/1910.01108>`_ by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into `DistilGPT2 <https://github.com/huggingface/transformers/tree/master/examples/distillation>`_.
 9. `CTRL <https://github.com/pytorch/fairseq/tree/master/examples/ctrl>`_ (from Salesforce), released together with the paper `CTRL: A Conditional Transformer Language Model for Controllable Generation <https://www.github.com/salesforce/ctrl>`_ by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
 10. `CamemBERT <https://huggingface.co/transformers/model_doc/camembert.html>`_ (from FAIR, Inria, Sorbonne Université) released together with the paper `CamemBERT: a Tasty French Language Model <https://arxiv.org/abs/1911.03894>`_ by Louis Martin, Benjamin Muller, Pedro Javier Ortiz Suarez, Yoann Dupont, Laurent Romary, Eric Villemonte de la Clergerie, Djame Seddah, and Benoît Sagot.
-11. `ALBERT <https://github.com/pytorch/fairseq/tree/master/examples/albert>`_ (from Google Research), released together with the paper a `ALBERT: A Lite BERT for Self-supervised Learning of Language Representations <https://arxiv.org/abs/1909.11942>`_ by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut.
+11. `ALBERT <https://github.com/google-research/ALBERT>`_ (from Google Research), released together with the paper a `ALBERT: A Lite BERT for Self-supervised Learning of Language Representations <https://arxiv.org/abs/1909.11942>`_ by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut.
 
 .. toctree::
     :maxdepth: 2
diff --git a/docs/source/pretrained_models.rst b/docs/source/pretrained_models.rst
index 090cb75808..dd61f11769 100644
--- a/docs/source/pretrained_models.rst
+++ b/docs/source/pretrained_models.rst
@@ -169,35 +169,35 @@ Here is the full list of the currently provided pretrained models together with
 +-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 | ALBERT            | ``albert-base-v1``                                         | | 12 repeating layers, 128 embedding, 768-hidden, 12-heads, 11M parameters                                                            |
 |                   |                                                            | | ALBERT base model                                                                                                                   |
-|                   |                                                            | (see `details <https://github.com/google-research/google-research/tree/master/albert>`__)                                             |
+|                   |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                             |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                   | ``albert-large-v1``                                        | | 24 repeating layers, 128 embedding, 1024-hidden, 16-heads, 17M parameters                                                           |
 |                   |                                                            | | ALBERT large model                                                                                                                  |
-|                   |                                                            | (see `details <https://github.com/google-research/google-research/tree/master/albert>`__)                                             |
+|                   |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                             |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                   | ``albert-xlarge-v1``                                       | | 24 repeating layers, 128 embedding, 2048-hidden, 16-heads, 58M parameters                                                           |
 |                   |                                                            | | ALBERT xlarge model                                                                                                                 |
-|                   |                                                            | (see `details <https://github.com/google-research/google-research/tree/master/albert>`__)                                             |
+|                   |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                             |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                   | ``albert-xxlarge-v1``                                      | | 12 repeating layer, 128 embedding, 4096-hidden, 64-heads, 223M parameters                                                           |
 |                   |                                                            | | ALBERT xxlarge model                                                                                                                |
-|                   |                                                            | (see `details <https://github.com/google-research/google-research/tree/master/albert>`__)                                             |
+|                   |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                             |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                   | ``albert-base-v2``                                         | | 12 repeating layers, 128 embedding, 768-hidden, 12-heads, 11M parameters                                                            |
 |                   |                                                            | | ALBERT base model with no dropout, additional training data and longer training                                                     |
-|                   |                                                            | (see `details <https://github.com/google-research/google-research/tree/master/albert>`__)                                             |
+|                   |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                             |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                   | ``albert-large-v2``                                        | | 24 repeating layers, 128 embedding, 1024-hidden, 16-heads, 17M parameters                                                           |
 |                   |                                                            | | ALBERT large model with no dropout, additional training data and longer training                                                    |
-|                   |                                                            | (see `details <https://github.com/google-research/google-research/tree/master/albert>`__)                                             |
+|                   |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                             |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                   | ``albert-xlarge-v2``                                       | | 24 repeating layers, 128 embedding, 2048-hidden, 16-heads, 58M parameters                                                           |
 |                   |                                                            | | ALBERT xlarge model with no dropout, additional training data and longer training                                                   |
-|                   |                                                            | (see `details <https://github.com/google-research/google-research/tree/master/albert>`__)                                             |
+|                   |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                             |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                   | ``albert-xxlarge-v2``                                      | | 12 repeating layer, 128 embedding, 4096-hidden, 64-heads, 223M parameters                                                           |
 |                   |                                                            | | ALBERT xxlarge model with no dropout, additional training data and longer training                                                  |
-|                   |                                                            | (see `details <https://github.com/google-research/google-research/tree/master/albert>`__)                                             |
+|                   |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                             |
 +-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 
 

From 07f4cd73f6d13e43b69a6e34a2a756a80fc7f70b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remilouf@gmail.com>
Date: Thu, 31 Oct 2019 09:48:27 +0100
Subject: [PATCH 47/91] update function to add special tokens

Since I started my PR the `add_special_tokens_single_sequence` function
has been deprecated in favor of `build_inputs_with_special_tokens`; I
replaced it with the new function.
---
 examples/utils_summarization.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/utils_summarization.py b/examples/utils_summarization.py
index 327ca8cc3e..087c88bd4e 100644
--- a/examples/utils_summarization.py
+++ b/examples/utils_summarization.py
@@ -139,11 +139,11 @@ def encode_for_summarization(story_lines, summary_lines, tokenizer):
     sentences.
     """
     story_lines_token_ids = [
-        tokenizer.add_special_tokens_single_sequence(tokenizer.encode(line))
+        tokenizer.build_inputs_with_special_tokens(tokenizer.encode(line))
         for line in story_lines
     ]
     summary_lines_token_ids = [
-        tokenizer.add_special_tokens_single_sequence(tokenizer.encode(line))
+        tokenizer.build_inputs_with_special_tokens(tokenizer.encode(line))
         for line in summary_lines
     ]
 

From 1c71ecc880ae8f04c8462e1368dc0678fdb92fc9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remilouf@gmail.com>
Date: Thu, 31 Oct 2019 10:16:08 +0100
Subject: [PATCH 48/91] load the pretrained weights for encoder-decoder

We currently save the pretrained_weights of the encoder and decoder in
two separate directories `encoder` and `decoder`. However, for the
`from_pretrained` function to operate with automodels we need to
specify the type of model in the path to the weights.

The path to the encoder/decoder weights is handled by the
`PreTrainedEncoderDecoder` class in the `save_pretrained` function. Since
there is no easy way to infer the type of model that was initialized for
the encoder and decoder we add a parameter `model_type` to the function.
This is not an ideal solution as it is error prone, and the model type
should be carried by the Model classes somehow.

This is a temporary fix that should be changed before merging.
---
 examples/run_summarization_finetuning.py | 48 ++++++++++++++----------
 transformers/modeling_encoder_decoder.py | 31 +++++++++------
 2 files changed, 49 insertions(+), 30 deletions(-)

diff --git a/examples/run_summarization_finetuning.py b/examples/run_summarization_finetuning.py
index f5604c2669..9c2c7769c9 100644
--- a/examples/run_summarization_finetuning.py
+++ b/examples/run_summarization_finetuning.py
@@ -328,6 +328,22 @@ def evaluate(args, model, tokenizer, prefix=""):
     return result
 
 
+def save_model_checkpoints(args, model, tokenizer):
+    if not os.path.exists(args.output_dir):
+        os.makedirs(args.output_dir)
+
+    logger.info("Saving model checkpoint to %s", args.output_dir)
+
+    # Save a trained model, configuration and tokenizer using `save_pretrained()`.
+    # They can then be reloaded using `from_pretrained()`
+    model_to_save = (
+        model.module if hasattr(model, "module") else model
+    )  # Take care of distributed/parallel training
+    model_to_save.save_pretrained(args.output_dir, model_type='bert')
+    tokenizer.save_pretrained(args.output_dir)
+    torch.save(args, os.path.join(args.output_dir, "training_arguments.bin"))
+
+
 def main():
     parser = argparse.ArgumentParser()
 
@@ -454,36 +470,30 @@ def main():
     # Train the model
     model.to(args.device)
     if args.do_train:
-        global_step, tr_loss = train(args, model, tokenizer)
+        try:
+            global_step, tr_loss = train(args, model, tokenizer)
+        except KeyboardInterrupt:
+            response = input("You interrupted the training. Do you want to save the model checkpoints? [Y/n]")
+            if response.lower() in ["", "y", "yes"]:
+                save_model_checkpoints(args, model, tokenizer)
+            sys.exit(0)
+
         logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
-
-        if not os.path.exists(args.output_dir):
-            os.makedirs(args.output_dir)
-
-        logger.info("Saving model checkpoint to %s", args.output_dir)
-
-        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
-        # They can then be reloaded using `from_pretrained()`
-        model_to_save = (
-            model.module if hasattr(model, "module") else model
-        )  # Take care of distributed/parallel training
-        model_to_save.save_pretrained(args.output_dir)
-        tokenizer.save_pretrained(args.output_dir)
-        torch.save(args, os.path.join(args.output_dir, "training_arguments.bin"))
+        save_model_checkpoints(args, model, tokenizer)
 
     # Evaluate the model
     results = {}
     if args.do_evaluate:
-        checkpoints = []
+        checkpoints = [args.output_dir]
         logger.info("Evaluate the following checkpoints: %s", checkpoints)
         for checkpoint in checkpoints:
-            encoder_checkpoint = os.path.join(checkpoint, "encoder")
-            decoder_checkpoint = os.path.join(checkpoint, "decoder")
+            encoder_checkpoint = os.path.join(checkpoint, "bert_encoder")
+            decoder_checkpoint = os.path.join(checkpoint, "bert_decoder")
             model = PreTrainedEncoderDecoder.from_pretrained(
                 encoder_checkpoint, decoder_checkpoint
             )
             model.to(args.device)
-            results = "placeholder"
+            print("model loaded")
 
     return results
 
diff --git a/transformers/modeling_encoder_decoder.py b/transformers/modeling_encoder_decoder.py
index a884abd0a2..73322101d3 100644
--- a/transformers/modeling_encoder_decoder.py
+++ b/transformers/modeling_encoder_decoder.py
@@ -117,8 +117,7 @@ class PreTrainedEncoderDecoder(nn.Module):
         kwargs_common = {
             argument: value
             for argument, value in kwargs.items()
-            if not argument.startswith("encoder_")
-            and not argument.startswith("decoder_")
+            if not argument.startswith("encoder_") and not argument.startswith("decoder_")
         }
         kwargs_decoder = kwargs_common.copy()
         kwargs_encoder = kwargs_common.copy()
@@ -158,14 +157,27 @@ class PreTrainedEncoderDecoder(nn.Module):
 
         return model
 
-    def save_pretrained(self, save_directory):
-        """ Save a Seq2Seq model and its configuration file in a format such
+    def save_pretrained(self, save_directory, model_type="bert"):
+        """ Save an EncoderDecoder model and its configuration file in a format such
         that it can be loaded using `:func:`~transformers.PreTrainedEncoderDecoder.from_pretrained`
 
         We save the encoder' and decoder's parameters in two separate directories.
+
+        If we want the weight loader to function we need to prepend the model
+        type to the directories' names. As far as I know there is no simple way
+        to infer the type of the model (except maybe by parsing the class'
+        names, which is not very future-proof). For now, we ask the user to
+        specify the model type explicitly when saving the weights.
         """
-        self.encoder.save_pretrained(os.path.join(save_directory, "encoder"))
-        self.decoder.save_pretrained(os.path.join(save_directory, "decoder"))
+        encoder_path = os.path.join(save_directory, "{}_encoder".format(model_type))
+        if not os.path.exists(encoder_path):
+            os.makedirs(encoder_path)
+        self.encoder.save_pretrained(encoder_path)
+
+        decoder_path = os.path.join(save_directory, "{}_decoder".format(model_type))
+        if not os.path.exists(decoder_path):
+            os.makedirs(decoder_path)
+        self.decoder.save_pretrained(decoder_path)
 
     def forward(self, encoder_input_ids, decoder_input_ids, **kwargs):
         """ The forward pass on a seq2eq depends what we are performing:
@@ -193,8 +205,7 @@ class PreTrainedEncoderDecoder(nn.Module):
         kwargs_common = {
             argument: value
             for argument, value in kwargs.items()
-            if not argument.startswith("encoder_")
-            and not argument.startswith("decoder_")
+            if not argument.startswith("encoder_") and not argument.startswith("decoder_")
         }
         kwargs_decoder = kwargs_common.copy()
         kwargs_encoder = kwargs_common.copy()
@@ -217,9 +228,7 @@ class PreTrainedEncoderDecoder(nn.Module):
         encoder_hidden_states = kwargs_encoder.pop("hidden_states", None)
         if encoder_hidden_states is None:
             encoder_outputs = self.encoder(encoder_input_ids, **kwargs_encoder)
-            encoder_hidden_states = encoder_outputs[
-                0
-            ]  # output the last layer hidden state
+            encoder_hidden_states = encoder_outputs[0]  # output the last layer hidden state
         else:
             encoder_outputs = ()
 

From 9660ba1cbdec0e419937af06bd99f06fb5ebbf91 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remilouf@gmail.com>
Date: Thu, 31 Oct 2019 17:59:16 +0100
Subject: [PATCH 49/91] Add beam search

---
 examples/run_summarization_finetuning.py | 502 -----------------------
 examples/utils_summarization.py          |  20 +-
 transformers/generate/__init__.py        |   1 +
 transformers/generate/beam_search.py     | 358 ++++++++++++++++
 transformers/modeling_beam_search.py     | 271 ------------
 transformers/tests/beam_search_tests.py  | 226 ++++++++++
 6 files changed, 594 insertions(+), 784 deletions(-)
 delete mode 100644 examples/run_summarization_finetuning.py
 create mode 100644 transformers/generate/__init__.py
 create mode 100644 transformers/generate/beam_search.py
 delete mode 100644 transformers/modeling_beam_search.py
 create mode 100644 transformers/tests/beam_search_tests.py

diff --git a/examples/run_summarization_finetuning.py b/examples/run_summarization_finetuning.py
deleted file mode 100644
index 9c2c7769c9..0000000000
--- a/examples/run_summarization_finetuning.py
+++ /dev/null
@@ -1,502 +0,0 @@
-# coding=utf-8
-# Copyright 2019 The HuggingFace Inc. team.
-# Copyright (c) 2019 The HuggingFace Inc.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" Finetuning seq2seq models for sequence generation."""
-
-import argparse
-import functools
-import logging
-import os
-import random
-import sys
-
-import numpy as np
-from tqdm import tqdm, trange
-import torch
-from torch.optim import Adam
-from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
-
-from transformers import (
-    AutoTokenizer,
-    BertForMaskedLM,
-    BertConfig,
-    PreTrainedEncoderDecoder,
-    Model2Model,
-)
-
-from utils_summarization import (
-    CNNDailyMailDataset,
-    encode_for_summarization,
-    fit_to_block_size,
-    build_lm_labels,
-    build_mask,
-    compute_token_type_ids,
-)
-
-logger = logging.getLogger(__name__)
-logging.basicConfig(stream=sys.stdout, level=logging.INFO)
-
-
-def set_seed(args):
-    random.seed(args.seed)
-    np.random.seed(args.seed)
-    torch.manual_seed(args.seed)
-
-
-# ------------
-# Load dataset
-# ------------
-
-
-def load_and_cache_examples(args, tokenizer):
-    dataset = CNNDailyMailDataset(tokenizer, data_dir=args.data_dir)
-    return dataset
-
-
-def collate(data, tokenizer, block_size):
-    """ List of tuple as an input. """
-    # remove the files with empty an story/summary, encode and fit to block
-    data = filter(lambda x: not (len(x[0]) == 0 or len(x[1]) == 0), data)
-    data = [
-        encode_for_summarization(story, summary, tokenizer) for story, summary in data
-    ]
-    data = [
-        (
-            fit_to_block_size(story, block_size, tokenizer.pad_token_id),
-            fit_to_block_size(summary, block_size, tokenizer.pad_token_id),
-        )
-        for story, summary in data
-    ]
-
-    stories = torch.tensor([story for story, summary in data])
-    summaries = torch.tensor([summary for story, summary in data])
-    encoder_token_type_ids = compute_token_type_ids(stories, tokenizer.cls_token_id)
-    encoder_mask = build_mask(stories, tokenizer.pad_token_id)
-    decoder_mask = build_mask(summaries, tokenizer.pad_token_id)
-    lm_labels = build_lm_labels(summaries, tokenizer.pad_token_id)
-
-    return (
-        stories,
-        summaries,
-        encoder_token_type_ids,
-        encoder_mask,
-        decoder_mask,
-        lm_labels,
-    )
-
-
-# ----------
-# Optimizers
-# ----------
-
-
-class BertSumOptimizer(object):
-    """ Specific optimizer for BertSum.
-
-    As described in [1], the authors fine-tune BertSum for abstractive
-    summarization using two Adam Optimizers with different warm-up steps and
-    learning rate. They also use a custom learning rate scheduler.
-
-    [1] Liu, Yang, and Mirella Lapata. "Text summarization with pretrained encoders."
-        arXiv preprint arXiv:1908.08345 (2019).
-    """
-
-    def __init__(self, model, lr, warmup_steps, beta_1=0.99, beta_2=0.999, eps=1e-8):
-        self.encoder = model.encoder
-        self.decoder = model.decoder
-        self.lr = lr
-        self.warmup_steps = warmup_steps
-
-        self.optimizers = {
-            "encoder": Adam(
-                model.encoder.parameters(),
-                lr=lr["encoder"],
-                betas=(beta_1, beta_2),
-                eps=eps,
-            ),
-            "decoder": Adam(
-                model.decoder.parameters(),
-                lr=lr["decoder"],
-                betas=(beta_1, beta_2),
-                eps=eps,
-            ),
-        }
-
-        self._step = 0
-
-    def _update_rate(self, stack):
-        return self.lr[stack] * min(
-            self._step ** (-0.5), self._step * self.warmup_steps[stack] ** (-0.5)
-        )
-
-    def zero_grad(self):
-        self.optimizer_decoder.zero_grad()
-        self.optimizer_encoder.zero_grad()
-
-    def step(self):
-        self._step += 1
-        for stack, optimizer in self.optimizers.items():
-            new_rate = self._update_rate(stack)
-            for param_group in optimizer.param_groups:
-                param_group["lr"] = new_rate
-            optimizer.step()
-
-
-# ------------
-# Train
-# ------------
-
-
-def train(args, model, tokenizer):
-    """ Fine-tune the pretrained model on the corpus. """
-    set_seed(args)
-
-    # Load the data
-    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
-    train_dataset = load_and_cache_examples(args, tokenizer)
-    train_sampler = RandomSampler(train_dataset)
-    model_collate_fn = functools.partial(collate, tokenizer=tokenizer, block_size=512)
-    train_dataloader = DataLoader(
-        train_dataset,
-        sampler=train_sampler,
-        batch_size=args.train_batch_size,
-        collate_fn=model_collate_fn,
-    )
-
-    # Training schedule
-    if args.max_steps > 0:
-        t_total = args.max_steps
-        args.num_train_epochs = t_total // (
-            len(train_dataloader) // args.gradient_accumulation_steps + 1
-        )
-    else:
-        t_total = (
-            len(train_dataloader)
-            // args.gradient_accumulation_steps
-            * args.num_train_epochs
-        )
-
-    # Prepare the optimizer
-    lr = {"encoder": 0.002, "decoder": 0.2}
-    warmup_steps = {"encoder": 20000, "decoder": 10000}
-    optimizer = BertSumOptimizer(model, lr, warmup_steps)
-
-    # Train
-    logger.info("***** Running training *****")
-    logger.info("  Num examples = %d", len(train_dataset))
-    logger.info("  Num Epochs = %d", args.num_train_epochs)
-    logger.info(
-        "  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size
-    )
-    logger.info(
-        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
-        args.train_batch_size * args.gradient_accumulation_steps
-        # * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
-    )
-    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
-    logger.info("  Total optimization steps = %d", t_total)
-
-    model.zero_grad()
-    train_iterator = trange(args.num_train_epochs, desc="Epoch", disable=True)
-
-    global_step = 0
-    tr_loss = 0.0
-    for _ in train_iterator:
-        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=True)
-        for step, batch in enumerate(epoch_iterator):
-            source, target, encoder_token_type_ids, encoder_mask, decoder_mask, lm_labels = batch
-
-            source = source.to(args.device)
-            target = target.to(args.device)
-            encoder_token_type_ids = encoder_token_type_ids.to(args.device)
-            encoder_mask = encoder_mask.to(args.device)
-            decoder_mask = decoder_mask.to(args.device)
-            lm_labels = lm_labels.to(args.device)
-
-            model.train()
-            outputs = model(
-                source,
-                target,
-                encoder_token_type_ids=encoder_token_type_ids,
-                encoder_attention_mask=encoder_mask,
-                decoder_attention_mask=decoder_mask,
-                decoder_lm_labels=lm_labels,
-            )
-
-            loss = outputs[0]
-            print(loss)
-            if args.gradient_accumulation_steps > 1:
-                loss /= args.gradient_accumulation_steps
-
-            loss.backward()
-
-            tr_loss += loss.item()
-            if (step + 1) % args.gradient_accumulation_steps == 0:
-                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
-                optimizer.step()
-                model.zero_grad()
-                global_step += 1
-
-            if args.max_steps > 0 and global_step > args.max_steps:
-                epoch_iterator.close()
-                break
-
-        if args.max_steps > 0 and global_step > args.max_steps:
-            train_iterator.close()
-            break
-
-    return global_step, tr_loss / global_step
-
-
-# ------------
-# Train
-# ------------
-
-
-def evaluate(args, model, tokenizer, prefix=""):
-    set_seed(args)
-
-    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
-    eval_dataset = load_and_cache_examples(args, tokenizer, evaluate=True)
-    eval_sampler = SequentialSampler(eval_dataset)
-    eval_dataloader = DataLoader(
-        eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size
-    )
-
-    # multi-gpu evaluate
-    if args.n_gpu > 1:
-        model = torch.nn.DataParallel(model)
-        
-    logger.info("***** Running evaluation {} *****".format(prefix))
-    logger.info("  Num examples = %d", len(eval_dataset))
-    logger.info("  Batch size = %d", args.eval_batch_size)
-    eval_loss = 0.0
-    nb_eval_steps = 0
-    model.eval()
-
-    for batch in tqdm(eval_dataloader, desc="Evaluating"):
-        source, target, encoder_token_type_ids, encoder_mask, decoder_mask, lm_labels = batch
-
-        source = source.to(args.device)
-        target = target.to(args.device)
-        encoder_token_type_ids = encoder_token_type_ids.to(args.device)
-        encoder_mask = encoder_mask.to(args.device)
-        decoder_mask = decoder_mask.to(args.device)
-        lm_labels = lm_labels.to(args.device)
-
-        with torch.no_grad():
-            outputs = model(
-                source,
-                target,
-                encoder_token_type_ids=encoder_token_type_ids,
-                encoder_attention_mask=encoder_mask,
-                decoder_attention_mask=decoder_mask,
-                decoder_lm_labels=lm_labels,
-            )
-            lm_loss = outputs[0]
-            eval_loss += lm_loss.mean().item()
-        nb_eval_steps += 1
-
-    eval_loss = eval_loss / nb_eval_steps
-    perplexity = torch.exp(torch.tensor(eval_loss))
-
-    result = {"perplexity": perplexity}
-
-    # Save the evaluation's results
-    output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
-    if not os.path.exists(args.output_dir):
-        os.makedirs(args.output_dir)
-
-    with open(output_eval_file, "w") as writer:
-        logger.info("***** Eval results {} *****".format(prefix))
-        for key in sorted(result.keys()):
-            logger.info("  %s = %s", key, str(result[key]))
-            writer.write("%s = %s\n" % (key, str(result[key])))
-
-    return result
-
-
-def save_model_checkpoints(args, model, tokenizer):
-    if not os.path.exists(args.output_dir):
-        os.makedirs(args.output_dir)
-
-    logger.info("Saving model checkpoint to %s", args.output_dir)
-
-    # Save a trained model, configuration and tokenizer using `save_pretrained()`.
-    # They can then be reloaded using `from_pretrained()`
-    model_to_save = (
-        model.module if hasattr(model, "module") else model
-    )  # Take care of distributed/parallel training
-    model_to_save.save_pretrained(args.output_dir, model_type='bert')
-    tokenizer.save_pretrained(args.output_dir)
-    torch.save(args, os.path.join(args.output_dir, "training_arguments.bin"))
-
-
-def main():
-    parser = argparse.ArgumentParser()
-
-    # Required parameters
-    parser.add_argument(
-        "--data_dir",
-        default=None,
-        type=str,
-        required=True,
-        help="The input training data file (a text file).",
-    )
-    parser.add_argument(
-        "--output_dir",
-        default=None,
-        type=str,
-        required=True,
-        help="The output directory where the model predictions and checkpoints will be written.",
-    )
-
-    # Optional parameters
-    parser.add_argument(
-        "--gradient_accumulation_steps",
-        type=int,
-        default=1,
-        help="Number of updates steps to accumulate before performing a backward/update pass.",
-    )
-    parser.add_argument(
-        "--do_evaluate",
-        type=bool,
-        default=False,
-        help="Run model evaluation on out-of-sample data.",
-    )
-    parser.add_argument("--do_train", type=bool, default=False, help="Run training.")
-    parser.add_argument(
-        "--do_overwrite_output_dir",
-        type=bool,
-        default=False,
-        help="Whether to overwrite the output dir.",
-    )
-    parser.add_argument(
-        "--model_name_or_path",
-        default="bert-base-cased",
-        type=str,
-        help="The model checkpoint to initialize the encoder and decoder's weights with.",
-    )
-    parser.add_argument(
-        "--model_type",
-        default="bert",
-        type=str,
-        help="The decoder architecture to be fine-tuned.",
-    )
-    parser.add_argument(
-        "--max_grad_norm", default=1.0, type=float, help="Max gradient norm."
-    )
-    parser.add_argument(
-        "--max_steps",
-        default=-1,
-        type=int,
-        help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
-    )
-    parser.add_argument(
-        "--to_cpu", default=False, type=bool, help="Whether to force training on CPU."
-    )
-    parser.add_argument(
-        "--num_train_epochs",
-        default=10,
-        type=int,
-        help="Total number of training epochs to perform.",
-    )
-    parser.add_argument(
-        "--per_gpu_train_batch_size",
-        default=4,
-        type=int,
-        help="Batch size per GPU/CPU for training.",
-    )
-    parser.add_argument("--seed", default=42, type=int)
-    args = parser.parse_args()
-
-    if (
-        os.path.exists(args.output_dir)
-        and os.listdir(args.output_dir)
-        and args.do_train
-        and not args.do_overwrite_output_dir
-    ):
-        raise ValueError(
-            "Output directory ({}) already exists and is not empty. Use --do_overwrite_output_dir to overwrite.".format(
-                args.output_dir
-            )
-        )
-
-    # Set up training device
-    if args.to_cpu or not torch.cuda.is_available():
-        args.device = torch.device("cpu")
-        args.n_gpu = 0
-    else:
-        args.device = torch.device("cuda")
-        args.n_gpu = torch.cuda.device_count()
-
-    # Load pretrained model and tokenizer. The decoder's weights are randomly initialized.
-    tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
-    config = BertConfig.from_pretrained(args.model_name_or_path)
-    decoder_model = BertForMaskedLM(config)
-    model = Model2Model.from_pretrained(
-        args.model_name_or_path, decoder_model=decoder_model
-    )
-
-    # Setup logging
-    logging.basicConfig(
-        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
-        datefmt="%m/%d/%Y %H:%M:%S",
-        level=logging.INFO,
-    )
-    logger.warning(
-        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
-        0,
-        args.device,
-        args.n_gpu,
-        False,
-        False,
-    )
-
-    logger.info("Training/evaluation parameters %s", args)
-
-    # Train the model
-    model.to(args.device)
-    if args.do_train:
-        try:
-            global_step, tr_loss = train(args, model, tokenizer)
-        except KeyboardInterrupt:
-            response = input("You interrupted the training. Do you want to save the model checkpoints? [Y/n]")
-            if response.lower() in ["", "y", "yes"]:
-                save_model_checkpoints(args, model, tokenizer)
-            sys.exit(0)
-
-        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
-        save_model_checkpoints(args, model, tokenizer)
-
-    # Evaluate the model
-    results = {}
-    if args.do_evaluate:
-        checkpoints = [args.output_dir]
-        logger.info("Evaluate the following checkpoints: %s", checkpoints)
-        for checkpoint in checkpoints:
-            encoder_checkpoint = os.path.join(checkpoint, "bert_encoder")
-            decoder_checkpoint = os.path.join(checkpoint, "bert_decoder")
-            model = PreTrainedEncoderDecoder.from_pretrained(
-                encoder_checkpoint, decoder_checkpoint
-            )
-            model.to(args.device)
-            print("model loaded")
-
-    return results
-
-
-if __name__ == "__main__":
-    main()
diff --git a/examples/utils_summarization.py b/examples/utils_summarization.py
index 087c88bd4e..7cbd4cd61b 100644
--- a/examples/utils_summarization.py
+++ b/examples/utils_summarization.py
@@ -25,9 +25,8 @@ class CNNDailyMailDataset(Dataset):
     [2] https://github.com/abisee/cnn-dailymail/
     """
 
-    def __init__(self, tokenizer, prefix="train", data_dir=""):
+    def __init__(self, data_dir="", prefix="train"):
         assert os.path.isdir(data_dir)
-        self.tokenizer = tokenizer
 
         # We initialize the class by listing all the files that contain
         # stories and summaries. Files are not read in memory given
@@ -104,31 +103,30 @@ def _add_missing_period(line):
 # --------------------------
 
 
-def fit_to_block_size(sequence, block_size, pad_token):
+def fit_to_block_size(sequence, block_size, pad_token_id):
     """ Adapt the source and target sequences' lengths to the block size.
-    If the sequence is shorter than the block size we pad it with -1 ids
-    which correspond to padding tokens.
+    If the sequence is shorter we append padding tokens to the right of the sequence.
     """
     if len(sequence) > block_size:
         return sequence[:block_size]
     else:
-        sequence.extend([pad_token] * (block_size - len(sequence)))
+        sequence.extend([pad_token_id] * (block_size - len(sequence)))
         return sequence
 
 
-def build_lm_labels(sequence, pad_token):
-    """ Padding token, encoded as 0, are represented by the value -1 so they
+def build_lm_labels(sequence, pad_token_id):
+    """ Padding tokens are replaced by the value -1 so they
     are not taken into account in the loss computation. """
     padded = sequence.clone()
-    padded[padded == pad_token] = -1
+    padded[padded == pad_token_id] = -1
     return padded
 
 
-def build_mask(sequence, pad_token):
+def build_mask(sequence, pad_token_id):
     """ Builds the mask. The attention mechanism will only attend to positions
     with value 1. """
     mask = torch.ones_like(sequence)
-    idx_pad_tokens = sequence == pad_token
+    idx_pad_tokens = sequence == pad_token_id
     mask[idx_pad_tokens] = 0
     return mask
 
diff --git a/transformers/generate/__init__.py b/transformers/generate/__init__.py
new file mode 100644
index 0000000000..21ac612155
--- /dev/null
+++ b/transformers/generate/__init__.py
@@ -0,0 +1 @@
+from .beam_search import BeamSearch
diff --git a/transformers/generate/beam_search.py b/transformers/generate/beam_search.py
new file mode 100644
index 0000000000..09e340a150
--- /dev/null
+++ b/transformers/generate/beam_search.py
@@ -0,0 +1,358 @@
+# coding=utf-8
+# MIT License
+
+# Copyright (c) 2017-Present OpenNMT
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy of
+# this software and associated documentation files (the "Software"), to deal in
+# the Software without restriction, including without limitation the rights to
+# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+# of the Software, and to permit persons to whom the Software is furnished to do
+# so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+"""
+Use Beam Search to generate sequences using encoder-decoder models.
+"""
+import torch
+from torch import nn
+
+
+class BeamSearch(nn.Module):
+    def __init__(
+        self,
+        model,
+        tokenizer,
+        beam_size,
+        min_length,
+        max_length,
+        batch_size=1,
+        alpha=0,
+        block_repeating_trigrams=True,
+    ):
+        r"""
+        Inputs:
+            **model**: instance of ``transformers.PreTrainedEncoderDecoder``
+                The pretrained encoder-decoder model that will be used to generate the sequences.
+            **tokenizer**: instance of ``transformers.PreTrainedTokenizer``
+                The pretrained tokenizer associated to the model used in the encoder-decoder. We only
+                support encoder-decoder that use the same tokenizer for encoder and decoder. The tokenizer
+                needs to be initialized or this function will raise an exception.
+            **batch_size**: (`optional`) int
+                Batch size of the inputs. The value is set automatically when calling `forward`.
+            **beam_size**: int
+                Number of beams that are used for each element on the batch.
+            **min_length**: int
+                Minimum number of steps performed by the beam search before terminating.
+            **max_length**: int
+                Maximum number of steps performed by the beam search. Any beam that has not finished
+                will return its current solution with the highest probability. The sequence that is
+                returned has a length of max_length-1 to account for the end token that is subsequently added.
+            **alpha**: float
+                Parameter of the length penalty. Read the documentation of the `_length_penalty` method for more details.
+            **block_repeating_trigrams**: bool
+                Whether to block sequences that have repeating 3-grams.
+        """
+        super(BeamSearch, self).__init__()
+        self.model = model
+        self.tokenizer = tokenizer
+
+        self.bos_token_id = tokenizer.bos_token_id
+        self.eos_token_id = tokenizer.eos_token_id
+        self.pad_token_id = tokenizer.pad_token_id
+
+        self.batch_size = batch_size
+        self.beam_size = beam_size
+        self.min_length = min_length
+        self.max_length = max_length
+
+        self.block_repeating_trigram = block_repeating_trigrams
+        self.apply_length_penalty = False if alpha == 0 else True
+        self.alpha = alpha
+
+        self._init_beam_state(batch_size)
+
+    def __len__(self):
+        try:
+            return self.growing_beams.size(1)
+        except NameError:
+            return 0
+
+    def _init_beam_state(self, batch_size):
+        """ (re-)Initialize the state of the beams. """
+        self.hypotheses = [[] for _ in range(batch_size)]
+        self.batch_offset = torch.arange(batch_size, dtype=torch.long)
+        self.beam_offset = torch.arange(
+            0, batch_size * self.beam_size, step=self.beam_size, dtype=torch.long
+        )
+        self.growing_beams = torch.full(
+            (batch_size * self.beam_size, 1), self.bos_token_id, dtype=torch.long
+        )
+        self.topk_log_probabilities = torch.tensor(
+            [0.0] + [float("-inf")] * (self.beam_size - 1), dtype=torch.float
+        ).repeat(batch_size)
+        self.results = {
+            "predictions": [[] for _ in range(batch_size)],
+            "scores": [[] for _ in range(batch_size)],
+        }
+        self._step = 0
+        self.is_done = False
+
+    def forward(self, encoder_input_ids, **model_kwargs):
+        """ Generate a sequence using Beam Search. """
+        # keyword arguments come in 3 flavors: encoder-specific (prefixed by
+        # `encoder_`), decoder-specific (prefixed by `decoder_`) and those
+        # that apply to the model as a whole.
+        # We let the specific kwargs override the common ones in case of conflict.
+        kwargs_common = {
+            argument: value
+            for argument, value in model_kwargs.items()
+            if not argument.startswith("encoder_") and not argument.startswith("decoder_")
+        }
+        kwargs_decoder = kwargs_common.copy()
+        kwargs_encoder = kwargs_common.copy()
+        kwargs_encoder.update(
+            {
+                argument[len("encoder_") :]: value
+                for argument, value in model_kwargs.items()
+                if argument.startswith("encoder_")
+            }
+        )
+        kwargs_decoder.update(
+            {
+                argument[len("decoder_") :]: value
+                for argument, value in model_kwargs.items()
+                if argument.startswith("decoder_")
+            }
+        )
+
+        # forward pass on the encoder
+        encoder_outputs = self.model.encoder.forward(encoder_input_ids, kwargs_encoder)
+        kwargs_decoder["encoder_hidden_states"] = tile(
+            encoder_outputs, self.beam_size, dim=0
+        )
+
+        # grow the beam by generating sequences in an autoregressive way
+        batch_size = encoder_input_ids.size(0)
+        self._init_beam_state(batch_size)
+        for step in range(self.max_length):
+            # prepare the decoder input
+            decoder_input = fit_to_block_size(
+                self.growing_beams, self.tokenizer.pad_token_id
+            )
+            kwargs_decoder["decoder_lm_labels"] = build_lm_labels(
+                decoder_input, self.tokenizer.pad_token_id
+            )
+            kwargs_decoder["decoder_attention_mask"] = build_mask(
+                decoder_input, self.tokenizer.pad_token_id
+            )
+
+            outputs = self.model.decoder(decoder_input, kwargs_decoder)
+            log_probabilities = torch.nn.functional.log_softmax(outputs[1])
+            surviving_beams_rows = self.grow(log_probabilities)
+            if self.is_done:
+                break
+
+            kwargs_decoder["encoder_hidden_states"] = kwargs_decoder[
+                "encoder_hidden_states"
+            ].index_select(0, surviving_beams_rows)
+            kwargs_decoder["encoder_attention_mask"] = kwargs_decoder[
+                "encoder_attention_mask"
+            ].index_select(0, surviving_beams_rows)
+
+        return self.results
+
+    def grow(self, log_probabilities):
+        """ Grow the beams by one step. """
+        self._step += 1
+
+        # The number of beams changes as some beams finish so we define _B
+        vocab_size = log_probabilities.size(-1)
+        _B = log_probabilities.size(0) // self.beam_size
+
+        # Multiply each beam probability with the probability of the
+        # next token (conditioned on the words in the beam).
+        log_probabilities += self.topk_log_probabilities.view(-1, 1)
+
+        self._enforce_min_length(log_probabilities)
+        if self.block_repeating_trigram:
+            self._remove_beams_with_repeating_trigrams(log_probabilities, _B)
+
+        # Find the `beam_size` (previous_beam + token) combinations with
+        # the highest score
+        topk_log_probabilities, topk_ids = torch.topk(
+            log_probabilities.view(_B, self.beam_size * vocab_size), self.beam_size, dim=1
+        )
+
+        # Apply the length penalty. The +1 accounts for the [EOS] token
+        # that will be added if the beam ends.
+        topk_scores = topk_log_probabilities
+        if self.apply_length_penalty:
+            topk_scores /= self._length_penalty()
+
+        # Retrieve the corresponding respective beam and token id
+        # topk_token_ids[i] will be added to topk_beam_ids[i]
+        topk_beam_ids = topk_ids.div(vocab_size)
+        topk_token_ids = topk_ids.fmod(vocab_size)
+
+        # Retrieve the row index of the surviving beams in the original
+        # view of the log_probabilities tensor
+        surviving_beams_per_batch = topk_beam_ids + self.beam_offset[:_B].view(-1, 1)
+        surviving_beams_rows = surviving_beams_per_batch.view(-1)
+
+        # Append the last predictions
+        self.growing_beams = torch.cat(
+            [
+                self.growing_beams.index_select(0, surviving_beams_rows),
+                topk_token_ids.view(-1, 1),
+            ],
+            1,
+        )
+
+        # Check if any of the beam searches has ended during this
+        # growth step. Also if top beam (most probable) has ended
+        # for one element of the batch.
+        is_finished = topk_token_ids.eq(self.eos_token_id)
+        self._enforce_max_length(is_finished)
+        if is_finished.any():
+            non_finished = self._cut_finished(is_finished, topk_scores)
+            self.batch_offset = self.batch_offset.index_select(0, non_finished)
+            surviving_beams_per_batch = surviving_beams_per_batch.index_select(
+                0, non_finished
+            )
+            self.topk_log_probabilities = self.topk_log_probabilities.index_select(
+                0, non_finished
+            )
+
+            surviving_beams_rows = surviving_beams_per_batch.view(-1)
+            self.growing_beams = self.growing_beams.index_select(0, surviving_beams_rows)
+
+        return surviving_beams_rows
+
+    def _cut_finished(self, is_finished, topk_scores):
+        """ Save the finished searches and cut the corresponding sequences off
+        the beams. """
+        is_top_beam_finished = is_finished[:, 0].eq(True)
+
+        # Save the finished searches
+        predictions = self.growing_beams.view(
+            -1, self.beam_size, self.growing_beams.size(1)
+        )
+        for i in range(is_finished.size(0)):
+            if is_top_beam_finished[i]:
+                is_finished[i].fill_(1)
+            finished_hyp = is_finished[i].nonzero().view(-1)
+
+            # Store the finished beams as a (score, prediction) hypothesis.
+            b = self.batch_offset[i]
+            for j in finished_hyp:
+                self.hypotheses[b].append((topk_scores[i, j], predictions[i, j, :]))
+
+            # If the batch reached the end, save the best hypotheses
+            # in terms of length-penalized score.
+            if is_top_beam_finished[i]:
+                best_score, best_prediction = max(self.hypotheses[b], key=lambda x: x[0])
+                self.results["scores"][b].append(best_score)
+                self.results["predictions"][b].append(best_prediction)
+
+        non_finished = is_top_beam_finished.eq(False).nonzero().view(-1)
+        if len(non_finished) == 0:
+            self.is_done = True
+
+        return non_finished
+
+    def _remove_beams_with_repeating_trigrams(self, log_probabilities, _B):
+        if self._step + 1 > 3:  # [BOS] does not count
+            for i in range(_B * self.beam_size):
+                tokens = self.growing_beams[i]
+                trigrams = [
+                    (tokens[j - 1], tokens[j], tokens[j + 1])
+                    for j in range(1, len(self) - 1)
+                ]
+                last_trigram = tuple(trigrams[-1])
+                if last_trigram in trigrams[:-1]:
+                    log_probabilities[i] = -1e20
+
+    def _enforce_min_length(self, log_probabilities):
+        if self._step < self.min_length:
+            log_probabilities[:, self.eos_token_id] = -1e20
+
+    def _enforce_max_length(self, is_finished):
+        # +1 because we will need to add an [EOS] token
+        if self._step + 1 == self.max_length:
+            is_finished.fill_(1)
+
+    def _length_penalty(self):
+        """ The calculation of the length penalty follows that of [1].
+
+        [1] Wu, Yonghui, et al. "Google's neural machine translation system:
+        Bridging the gap between human and machine translation." arXiv preprint
+        arXiv:1609.08144 (2016).
+        """
+        return ((5.0 + (self._step + 1)) / 6.0) ** self.alpha
+
+
+def tile(x, count, dim=0):
+    """
+    Tiles `x` along dimension `dim` `count` times.
+
+    Example:
+        >> ex = torch.tensor([[1,2],[3,4]])
+        >> tile(ex, 2, 0)
+        torch.Tensor([[1,2],[1,2],[3,4],[3,4]])
+    """
+    perm = list(range(len(x.size())))
+    if dim != 0:
+        perm[0], perm[dim] = perm[dim], perm[0]
+        x = x.permute(perm).contiguous()
+    out_size = list(x.size())
+    out_size[0] *= count
+    batch = x.size(0)
+    x = (
+        x.view(batch, -1)
+        .transpose(0, 1)
+        .repeat(count, 1)
+        .transpose(0, 1)
+        .contiguous()
+        .view(*out_size)
+    )
+    if dim != 0:
+        x = x.permute(perm).contiguous()
+    return x
+
+
+def fit_to_block_size(sequence, block_size, pad_token_id):
+    """ Adapt the source and target sequences' lengths to the block size.
+    If the sequence is shorter we append padding tokens to the right.
+    """
+    if len(sequence) > block_size:
+        return sequence[:block_size]
+    else:
+        sequence.extend([pad_token_id] * (block_size - len(sequence)))
+        return sequence
+
+
+def build_lm_labels(sequence, pad_token_id):
+    """ Padding tokens are replaced by the value -1 so they
+    are not taken into account in the loss computation. """
+    padded = sequence.clone()
+    padded[padded == pad_token_id] = -1
+    return padded
+
+
+def build_mask(sequence, pad_token_id):
+    """ Builds the mask. The attention mechanism will only attend to positions
+    with value 1. """
+    mask = torch.ones_like(sequence)
+    idx_pad_tokens = sequence == pad_token_id
+    mask[idx_pad_tokens] = 0
+    return mask
diff --git a/transformers/modeling_beam_search.py b/transformers/modeling_beam_search.py
deleted file mode 100644
index 171dcb7247..0000000000
--- a/transformers/modeling_beam_search.py
+++ /dev/null
@@ -1,271 +0,0 @@
-# coding=utf-8
-# Copyright (c) 2019 Yang Liu
-
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-
-# The above copyright notice and this permission notice shall be included in all
-# copies or substantial portions of the Software.
-
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-"""
-A general wrapper around models with LM heads to generate sequences
-using beam search.
-"""
-import torch
-from torch import nn
-
-
-class TransformerBeamSearch(nn.Module):
-    def __init__(
-        self,
-        model,
-        tokenizer,
-        batch_size,
-        beam_size,
-        min_length,
-        max_length,
-        alpha=0,
-        block_repeating_trigram=True,
-    ):
-        """
-        Attributes:
-            mask_word_id: token id that corresponds to the mask
-        """
-        super(TransformerBeamSearch, self).__init__()
-        self.model = model
-        self.tokenizer = tokenizer
-
-        self.start_token_id = tokenizer.start_token_id
-        self.end_token_id = tokenizer.end_token_id
-        self.pad_token_id = tokenizer.pad_token_id
-
-        self.beam_size = beam_size
-        self.min_length = min_length
-        self.max_length = max_length
-
-        self.block_repeating_trigram = block_repeating_trigram
-        self.apply_length_penalty = False if alpha == 0 else True
-        self.alpha = alpha
-
-        # State of the beam
-        self.hypotheses = [[] for _ in range(batch_size)]
-        self.batch_offset = torch.arange(batch_size, dtype=torch.long)
-        self.beam_offset = torch.arange(
-            0, batch_size * self.beam_size, step=self.beam_size, dtype=torch.long
-        )
-        self.growing_beam = torch.full(
-            (batch_size * self.beam_size, 1), self.start_token_id, dtype=torch.long
-        )
-        self.topk_log_probabilities = torch.tensor(
-            [0.0] + [float("-inf")] * (self.beam_size - 1), dtype=torch.float
-        ).repeat(batch_size)
-        self.results = {
-            "prediction": [[] for _ in batch_size],
-            "scores": [[] for _ in batch_size],
-        }
-        self._step = 0
-        self.is_done = False
-
-    def step(self, log_probabilities):
-        """ Grows the beam by one step. """
-        self._step += 1
-
-        # The batch size changes as some beams finish so we define _B
-        vocab_size = log_probabilities.size(-1)
-        _B = log_probabilities.size(0) // self.beam_size
-
-        # Multiply each beam probability with the probability of the
-        # next token (conditioned on the words in the beam).
-        log_probabilities += self.topk_log_probabilities.view(-1, 1)
-
-        self.enforce_min_length(log_probabilities)
-        if self.block_repeating_trigram:
-            self.remove_repeating_trigrams(log_probabilities, _B)
-
-        # Find the `beam_size` (previous_beam + token) combinations with
-        # the highest score
-        topk_log_probabilities, topk_ids = log_probabilities.topk(
-            log_probabilities.view(_B, self.beam_size * vocab_size),
-            self.beam_size,
-            dim=1,
-        )
-
-        # Apply the length penalty. The +1 accounts for the [EOS] token
-        # that will be added if the beam ends.
-        topk_scores = topk_log_probabilities / self.length_penalty()
-
-        # Retrieve the corresponding respective beam and token id
-        # topk_token_ids[i] will be added to topk_beam_ids[i]
-        topk_beam_ids = topk_ids.div(vocab_size)
-        topk_token_ids = topk_ids.fmod(vocab_size)
-
-        # Retrieve the row index of the surviving beams in the original
-        # view of the log_probabilities tensor
-        surviving_beams_rows = (topk_beam_ids + self.beam_offset[:_B].view(-1, 1)).view(
-            -1
-        )
-
-        # Append the last predictions
-        self.growing_beam = torch.cat(
-            [
-                self.growing_beam.index_select(0, surviving_beams_rows),
-                topk_token_ids.view(-1, 1),
-            ],
-            1,
-        )
-
-        # Check if any of the beam searches has ended during this
-        # growth step. Also if top beam (most probable) has ended
-        # for one element of the batch.
-        is_finished = topk_token_ids.eq(self.end_token_id)
-        self.enforce_max_length()
-        is_top_beam_finished = is_finished[:, 0].eq(1)
-
-        # Save the finished searches
-        if is_finished.any():
-            predictions = self.growing_beam.view(
-                -1, self.beam_size, self.growing_beam.size(1)
-            )
-            for i in range(is_finished.size(0)):
-                if is_top_beam_finished[i]:
-                    is_finished[i].fill_(1)
-                finished_hyp = is_finished[i].nonzero().view(-1)
-
-                # Store finished hypotheses for this batch.
-                b = self.batch_offset[i]
-                for j in finished_hyp:
-                    self.hypotheses[b].append((topk_scores[i, j], predictions[i, j, :]))
-
-                # If the batch reached the end, save the best hypotheses
-                # in terms of length-penalized score.
-                if is_top_beam_finished[i]:
-                    best_hyp = sorted(
-                        self.hypotheses[b], key=lambda x: x[0], reverse=True
-                    )
-                    best_score, best_prediction = best_hyp[0]
-                    self.results["scores"][b].append(best_score)
-                    self.results["predictions"][b].append(best_prediction)
-
-            non_finished = is_top_beam_finished.eq(0).nonzero().view(-1)
-            if len(non_finished) == 0:
-                self.is_done = True
-
-            # Remove finished batches for the next step.
-            topk_log_probabilities = topk_log_probabilities.index_select(
-                0, non_finished
-            )
-            self.batch_offset = self.batch_offset.index_select(0, non_finished)
-            self.growing_beam = predictions.index_select(0, non_finished).view(
-                -1, self.growing_beam.size(-1)
-            )
-
-            surviving_beams_rows = surviving_beams_rows.index_select(0, non_finished)
-
-        return surviving_beams_rows
-
-    def forward(self, encoder_input_ids, **kwargs):
-        # keyword arguments come in 3 flavors: encoder-specific (prefixed by
-        # `encoder_`), decoder-specific (prefixed by `decoder_`) and those
-        # that apply to the model as whole.
-        # We let the specific kwargs override the common ones in case of conflict.
-        kwargs_encoder = {
-            argument[len("encoder_"):]: value
-            for argument, value in kwargs.items()
-            if argument.startswith("encoder_")
-        }
-        kwargs_decoder = {
-            argument[len("decoder_"):]: value
-            for argument, value in kwargs.items()
-            if argument.startswith("decoder_")
-        }
-        kwargs_common = {
-            argument: value
-            for argument, value in kwargs.items()
-            if not (argument.startswith("encoder_") or argument.startswith("decoder_"))
-        }
-        kwargs_decoder = dict(kwargs_common, **kwargs_decoder)
-        kwargs_encoder = dict(kwargs_common, **kwargs_encoder)
-
-        # forward pass on the encoder
-        encoder_outputs = self.model.encoder.forward(encoder_input_ids, kwargs_encoder)
-        kwargs_decoder["encoder_hidden_states"] = tile(
-            encoder_outputs, self.beam_size, dim=0
-        )
-
-        # grow the beam by generating sequences in an autoregressive way
-        self.growing_beam = torch.full(
-            (self.batch_size * self.beam_size, 1), self.start_token_id, dtype=torch.long
-        )
-        for step in range(self.max_length):
-            decoder_input = self.growing_beam[:, -1]
-            outputs = self.model.decoder(decoder_input, kwargs_decoder)
-            log_probabilities = torch.nn.functional.log_softmax(outputs[1])
-            surviving_beams_rows = self.step(log_probabilities)
-            if self.is_done:
-                break
-
-            kwargs_decoder["encoder_hidden_states"] = kwargs_decoder[
-                "encoder_hidden_states"
-            ].index_select(0, surviving_beams_rows)
-
-        return self.results
-
-    def remove_repeating_trigrams(self, log_probabilities, _B):
-        if(self._step + 1 > 3):
-            for i in range(_B * self.beam_size):
-                tokens = [t for t in self.growing_beam[i]]
-                trigrams = [(tokens[i-1], tokens[i], tokens[i+1]) for i in range(1, len(words) - 1)]
-                last_trigram = tuple(trigrams[-1])
-                if last_trigram in trigrams[:-1]:
-                    log_probabilities[i] = -1e20
-
-    def enforce_min_length(self):
-        if self._step < self.min_length:
-            self.log_probabilities[self.end_token_id] = -1e20
-
-    def enforce_max_length(self):
-        if self._step + 1 == self.max_length:
-            self.is_finished.fill_(1)
-
-    def length_penalty(self):
-        return ((5.0 + (self._step + 1)) / 6.0) ** self.alpha
-
-
-def tile(x, count, dim=0):
-    """
-    Tiles `x` along dimension `dim` `count` times.
-
-    Example:
-        >> ex = torch.tensor([1,2],[3,4])
-        >> tile(ex, 2, 0)
-        torch.Tensor([[1,2],[1,2],[3,4],[3,4]])
-    """
-    perm = list(range(len(x.size())))
-    if dim != 0:
-        perm[0], perm[dim] = perm[dim], perm[0]
-        x = x.permute(perm).contiguous()
-    out_size = list(x.size())
-    out_size[0] *= count
-    batch = x.size(0)
-    x = (
-        x.view(batch, -1)
-        .transpose(0, 1)
-        .repeat(count, 1)
-        .transpose(0, 1)
-        .contiguous()
-        .view(*out_size)
-    )
-    if dim != 0:
-        x = x.permute(perm).contiguous()
-    return x
diff --git a/transformers/tests/beam_search_tests.py b/transformers/tests/beam_search_tests.py
new file mode 100644
index 0000000000..a92ebf3578
--- /dev/null
+++ b/transformers/tests/beam_search_tests.py
@@ -0,0 +1,226 @@
+from collections import namedtuple
+import unittest
+
+import numpy as np
+import torch
+
+from transformers.generate import BeamSearch
+from transformers import PreTrainedEncoderDecoder
+
+
+StubTokenizer = namedtuple("Tokenizer", ["bos_token_id", "eos_token_id", "pad_token_id"])
+StubTransformer = namedtuple("Transformer", ["encoder", "decoder"])
+
+
+class BeamSearchtest(unittest.TestCase):
+    def test_beam_search_encoder_decoder_integration(self):
+        """ We make sure that no internal change in the PreTrainedEncoderDecoder
+        class will break the integration with the beam search.
+        """
+
+        model = PreTrainedEncoderDecoder("encoder", "decoder")
+        tokenizer = StubTokenizer(0, 1, 2)
+        try:
+            _ = BeamSearch(
+                model=model,
+                tokenizer=tokenizer,
+                batch_size=1,
+                beam_size=1,
+                min_length=1,
+                max_length=1,
+                alpha=0,
+                block_repeating_trigrams=False,
+            )
+        except:
+            self.fail("Instantiating BeamSearch with a PreTrainedEncoderDecoder failed.")
+
+    def test_beam_search_min_length(self):
+        """ We keep predicting the end_token for the first beam and check that
+        it is not marked as finished until the beam has reached the minimum
+        length. """
+        eos_idx = 3
+        vocab_size = 10
+
+        batch_size = 3
+        beam_size = 2
+        min_length = 5
+
+        beam = BeamSearch(
+            model=StubTransformer("encoder", "decoder"),
+            tokenizer=StubTokenizer(bos_token_id=0, eos_token_id=eos_idx, pad_token_id=2),
+            batch_size=batch_size,
+            beam_size=beam_size,
+            min_length=5,
+            max_length=10,
+            alpha=0,
+            block_repeating_trigrams=False,
+        )
+
+        # To test that the minimum length is correctly enforced we constantly
+        # assign the highest probability to the [EOS] token (and assign lower
+        # probabilities to some other tokens).
+        # Since BeamSearch will reset its probability to -1e20 as long as
+        # min_length has not been reached, we need to reset the value between
+        # steps.
+        non_eos_idxs = [4, 5, 1, 8, 9]
+        score_distribution = torch.log_softmax(
+            torch.tensor([6.0, 5.0, 4.0, 3.0, 2.0, 1.0]), dim=0
+        )
+
+        log_probabilities = torch.full((batch_size * beam_size, vocab_size), float("-inf"))
+        log_probabilities[0, eos_idx] = score_distribution[0]
+        for idx, score in zip(non_eos_idxs, score_distribution[1:]):
+            log_probabilities[0, idx] = score
+
+        for step in range(1, min_length + 2):
+            log_probabilities[0, eos_idx] = score_distribution[0]
+
+            # Beam #3 and #4 teminate at the first step since the probability
+            # of the [EOS] token is -1e20 > -\infty so there are only two beams left.
+            surviving_beams_rows = beam.grow(log_probabilities)
+            if step < min_length:
+                np.testing.assert_array_equal(
+                    beam.growing_beams.numpy(),
+                    np.repeat(np.array([[0] + [4] * step]), 2, axis=0),
+                )
+            elif step == min_length:
+                np.testing.assert_array_equal(surviving_beams_rows.numpy(), np.array([]))
+                self.assertTrue(beam.is_done)
+                break
+
+            log_probabilities = log_probabilities.index_select(0, surviving_beams_rows)
+
+    def test_beam_search_max_length(self):
+        """ We keep predicting the same non-EOS token until we reach the
+        maximum permitted length """
+        batch_size = 3
+        beam_size = 2
+        max_length = 5
+        vocab_size = 10
+
+        beam = BeamSearch(
+            model=StubTransformer("encoder", "decoder"),
+            tokenizer=StubTokenizer(bos_token_id=0, eos_token_id=1, pad_token_id=2),
+            batch_size=batch_size,
+            beam_size=beam_size,
+            min_length=2,
+            max_length=max_length,
+            alpha=0,
+            block_repeating_trigrams=False,
+        )
+
+        log_probabilities = torch.full((batch_size * beam_size, vocab_size), float("-inf"))
+
+        # To test that beam search enforces the max length constraint we
+        # keep giving the highest probability to a token that is not the
+        # [EOS] token.
+        # The beam search will stop at max_length-1, assuming that one would
+        # add the [EOS] token at the end of the returned sequence.
+        token_idxs = [3, 4, 5]
+        score_distribution = torch.log_softmax(torch.tensor([10.0, 6.0, 4.0]), dim=0)
+        for idx, score in zip(token_idxs, score_distribution):
+            log_probabilities[:, idx] = score
+
+        for step in range(1, max_length + 2):
+            surviving_beams_rows = beam.grow(log_probabilities)
+            if step + 1 < max_length:
+                self.assertFalse(beam.is_done)
+            elif step + 1 == max_length:  # Now [EOS] is the most probable token
+                np.testing.assert_array_equal(surviving_beams_rows.numpy(), np.array([]))
+                self.assertTrue(beam.is_done)
+                break
+
+            log_probabilities = log_probabilities.index_select(0, surviving_beams_rows)
+
+    def test_beam_search_block_repeating_trigrams(self):
+        """ We make sure that the beams that contain repeating trigrams are removed. """
+        batch_size = 3
+        beam_size = 2
+        max_length = 10
+        vocab_size = 10
+
+        beam = BeamSearch(
+            model=StubTransformer("encoder", "decoder"),
+            tokenizer=StubTokenizer(bos_token_id=0, eos_token_id=1, pad_token_id=2),
+            batch_size=batch_size,
+            beam_size=beam_size,
+            min_length=2,
+            max_length=max_length,
+            alpha=0,
+            block_repeating_trigrams=True,
+        )
+
+        log_probabilities = torch.full((batch_size * beam_size, vocab_size), float("-inf"))
+
+        # To test that BeamSearch enforces the 3-gram constraint we give the
+        # highest probability to the same tokens in a cyclic fashion and make sure
+        # they disappear once the cycle has completed.
+        token_idxs = [3, 4, 5]
+        score_distribution = torch.log_softmax(torch.tensor([10.0, 6.0, 4.0]), dim=0)
+        for idx, score in zip(token_idxs, score_distribution):
+            log_probabilities[:, idx] = score
+
+        for step in range(1, max_length + 2):
+            # Rotate the probabilities at each step
+            for idx in token_idxs:
+                score = score_distribution[(idx + step) % 3]
+                log_probabilities[::beam_size, idx] = score
+
+            surviving_beams_rows = beam.grow(log_probabilities)
+            log_probabilities = log_probabilities.index_select(0, surviving_beams_rows)
+
+            if step < 7:
+                self.assertFalse(
+                    np.array_equal(
+                        log_probabilities.numpy()[0, :],
+                        np.array([-1e20] * vocab_size, dtype="float32"),
+                    )
+                )
+            if step == 7:
+                np.testing.assert_array_equal(
+                    log_probabilities.numpy()[0, :],
+                    np.array([-1e20] * vocab_size, dtype="float32"),
+                )
+
+    def test_beam_search_example_for_one_step(self):
+        """ We test that the predictions for one step of growth are correct. """
+        batch_size = 2
+        beam_size = 2
+        max_length = 10
+        vocab_size = 5
+
+        beam = BeamSearch(
+            model=StubTransformer("encoder", "decoder"),
+            tokenizer=StubTokenizer(bos_token_id=0, eos_token_id=1, pad_token_id=2),
+            batch_size=batch_size,
+            beam_size=beam_size,
+            min_length=2,
+            max_length=max_length,
+            alpha=0,
+            block_repeating_trigrams=False,
+        )
+
+        log_probabilities = torch.full((batch_size * beam_size, vocab_size), float("-inf"))
+        log_probabilities[0, 3:] = torch.log_softmax(torch.tensor([2.0, 1.0]), dim=0)
+        log_probabilities[2, 3:] = torch.log_softmax(torch.tensor([1.0, 2.0]), dim=0)
+
+        # First pass
+        surviving_beams_rows = beam.grow(log_probabilities)
+        np.testing.assert_array_equal(surviving_beams_rows.numpy(), np.array([0, 0, 2, 2]))
+        np.testing.assert_array_equal(
+            beam.growing_beams.numpy(), np.array([[0, 3], [0, 4], [0, 4], [0, 3]])
+        )
+        self.assertFalse(beam.is_done)
+
+        # Second pass
+        surviving_beams_rows = beam.grow(log_probabilities)
+        np.testing.assert_array_equal(surviving_beams_rows.numpy(), np.array([0, 0, 2, 2]))
+        np.testing.assert_array_equal(
+            beam.growing_beams.numpy(),
+            np.array([[0, 3, 3], [0, 3, 4], [0, 4, 4], [0, 4, 3]]),
+        )
+        self.assertFalse(beam.is_done)
+
+
+if __name__ == "__main__":
+    unittest.main()

From ba089c780b918414bd8b669e1764fed728753edf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remilouf@gmail.com>
Date: Wed, 6 Nov 2019 13:55:24 +0100
Subject: [PATCH 50/91] share pretrained embeddings

---
 examples/utils_summarization.py      | 11 +---
 requirements.txt                     |  4 +-
 transformers/generate/beam_search.py | 87 ++++++++++++++++++----------
 3 files changed, 60 insertions(+), 42 deletions(-)

diff --git a/examples/utils_summarization.py b/examples/utils_summarization.py
index 7cbd4cd61b..8e95a04e19 100644
--- a/examples/utils_summarization.py
+++ b/examples/utils_summarization.py
@@ -136,18 +136,11 @@ def encode_for_summarization(story_lines, summary_lines, tokenizer):
     as specified in [1] by using `[SEP] [CLS]` tokens to separate
     sentences.
     """
-    story_lines_token_ids = [
-        tokenizer.build_inputs_with_special_tokens(tokenizer.encode(line))
-        for line in story_lines
-    ]
-    summary_lines_token_ids = [
-        tokenizer.build_inputs_with_special_tokens(tokenizer.encode(line))
-        for line in summary_lines
-    ]
-
+    story_lines_token_ids = [tokenizer.encode(line) for line in story_lines]
     story_token_ids = [
         token for sentence in story_lines_token_ids for token in sentence
     ]
+    summary_lines_token_ids = [tokenizer.encode(line) for line in summary_lines]
     summary_token_ids = [
         token for sentence in summary_lines_token_ids for token in sentence
     ]
diff --git a/requirements.txt b/requirements.txt
index 9c43abc6d7..060aba915d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -9,4 +9,6 @@ regex
 # For XLNet
 sentencepiece
 # For XLM
-sacremoses
\ No newline at end of file
+sacremoses
+# For ROUGE
+pyrouge
diff --git a/transformers/generate/beam_search.py b/transformers/generate/beam_search.py
index 09e340a150..e1b2d23da0 100644
--- a/transformers/generate/beam_search.py
+++ b/transformers/generate/beam_search.py
@@ -26,27 +26,31 @@ Use Beam Search to generate sequences using encoder-decoder models.
 import torch
 from torch import nn
 
+import logging
+
+
+logger = logging.getLogger(__name__)
+
 
 class BeamSearch(nn.Module):
     def __init__(
         self,
         model,
-        tokenizer,
+        bos_token_id,
+        pad_token_id,
+        eos_token_id,
+        batch_size,
         beam_size,
         min_length,
         max_length,
-        batch_size=1,
         alpha=0,
         block_repeating_trigrams=True,
+        device=torch.device("cpu"),
     ):
         r"""
         Inputs:
             **model**: instance of ``transformers.PreTrainedEncoderDecoder``
                 The pretrained encoder-decoder model that will be used to generate the sequences.
-            **tokenizer**: instance of ``transformers.PreTrainedTokenizer``
-                The pretrained tokenizer associated to the model used in the encoder-decoder. We only
-                support encoder-decoder that use the same tokenizer for encoder and decoder. The tokenizer
-                needs to be initialized or this function will raise and exception.
             **batch_size**: (`optional`) int
                 Batch size of the inputs. The value is set automatically when calling `forward`.
             **beam_size**: int
@@ -64,11 +68,11 @@ class BeamSearch(nn.Module):
         """
         super(BeamSearch, self).__init__()
         self.model = model
-        self.tokenizer = tokenizer
+        self.device = device
 
-        self.bos_token_id = tokenizer.bos_token_id
-        self.eos_token_id = tokenizer.eos_token_id
-        self.pad_token_id = tokenizer.pad_token_id
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
+        self.pad_token_id = pad_token_id
 
         self.batch_size = batch_size
         self.beam_size = beam_size
@@ -90,15 +94,24 @@ class BeamSearch(nn.Module):
     def _init_beam_state(self, batch_size):
         """ (re-)Initialize the state of the beams. """
         self.hypotheses = [[] for _ in range(batch_size)]
-        self.batch_offset = torch.arange(batch_size, dtype=torch.long)
+        self.batch_offset = torch.arange(batch_size, dtype=torch.long, device=self.device)
         self.beam_offset = torch.arange(
-            0, batch_size * self.beam_size, step=self.beam_size, dtype=torch.long
+            0,
+            batch_size * self.beam_size,
+            step=self.beam_size,
+            dtype=torch.long,
+            device=self.device,
         )
         self.growing_beams = torch.full(
-            (batch_size * self.beam_size, 1), self.bos_token_id, dtype=torch.long
+            (batch_size * self.beam_size, 1),
+            self.bos_token_id,
+            dtype=torch.long,
+            device=self.device,
         )
         self.topk_log_probabilities = torch.tensor(
-            [0.0] + [float("-inf")] * (self.beam_size - 1), dtype=torch.float
+            [0.0] + [float("-inf")] * (self.beam_size - 1),
+            dtype=torch.float,
+            device=self.device,
         ).repeat(batch_size)
         self.results = {
             "predictions": [[] for _ in range(batch_size)],
@@ -136,28 +149,37 @@ class BeamSearch(nn.Module):
         )
 
         # forward pass on the encoder
-        encoder_outputs = self.model.encoder.forward(encoder_input_ids, kwargs_encoder)
+        encoder_outputs = self.model.encoder(encoder_input_ids, **kwargs_encoder)
+        encoder_hidden_states = encoder_outputs[0]
         kwargs_decoder["encoder_hidden_states"] = tile(
-            encoder_outputs, self.beam_size, dim=0
+            encoder_hidden_states, self.beam_size, dim=0
+        )
+        kwargs_decoder["encoder_attention_mask"] = tile(
+            kwargs_encoder["attention_mask"], self.beam_size, dim=0
         )
 
         # grow the beam by generating sequences in an autoregressive way
-        batch_size = encoder_input_ids.size(0)
+        batch_size, block_size = encoder_input_ids.size()
         self._init_beam_state(batch_size)
         for step in range(self.max_length):
-            # prepare the decoder input
-            decoder_input = fit_to_block_size(
-                self.growing_beams, self.tokenizer.pad_token_id
-            )
-            kwargs_decoder["decoder_lm_labels"] = build_lm_labels(
-                decoder_input, self.tokenizer.pad_token_id
-            )
-            kwargs_decoder["decoder_attention_mask"] = build_mask(
-                decoder_input, self.tokenizer.pad_token_id
+            # Add padding tokens
+            decoder_input = torch.full(
+                (self.growing_beams.size(0), block_size),
+                self.pad_token_id,
+                dtype=torch.long,
+                device=self.growing_beams.device,
             )
+            decoder_input[:, : self.growing_beams.size(1)] = self.growing_beams
 
-            outputs = self.model.decoder(decoder_input, kwargs_decoder)
-            log_probabilities = torch.nn.functional.log_softmax(outputs[1])
+            # compute decoder_attention_mask
+            decoder_mask = torch.ones_like(decoder_input)
+            idx_pad_tokens = decoder_input == self.pad_token_id
+            decoder_mask[idx_pad_tokens] = 0
+            kwargs_decoder["attention_mask"] = decoder_mask
+
+            outputs = self.model.decoder(decoder_input, **kwargs_decoder)
+            last_token_scores = outputs[0][:, -1, :].squeeze(1)
+            log_probabilities = torch.nn.functional.log_softmax(last_token_scores, dim=0)
             surviving_beams_rows = self.grow(log_probabilities)
             if self.is_done:
                 break
@@ -189,13 +211,13 @@ class BeamSearch(nn.Module):
 
         # Find the `beam_size` (previous_beam + token) combinations with
         # the highest score
-        topk_log_probabilities, topk_ids = torch.topk(
+        self.topk_log_probabilities, topk_ids = torch.topk(
             log_probabilities.view(_B, self.beam_size * vocab_size), self.beam_size, dim=1
         )
 
         # Apply the length penalty. The +1 accounts for the [EOS] token
         # that will be added if the beam ends.
-        topk_scores = topk_log_probabilities
+        topk_scores = self.topk_log_probabilities
         if self.apply_length_penalty:
             topk_scores /= self._length_penalty()
 
@@ -337,8 +359,9 @@ def fit_to_block_size(sequence, block_size, pad_token_id):
     if len(sequence) > block_size:
         return sequence[:block_size]
     else:
-        sequence.extend([pad_token_id] * (block_size - len(sequence)))
-        return sequence
+        return torch.cat(
+            (sequence, torch.tensor([pad_token_id] * (block_size - len(sequence)))), dim=0
+        )
 
 
 def build_lm_labels(sequence, pad_token_id):

From 4735c2af0715c24d47b34c167fb7d5543493b87d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remilouf@gmail.com>
Date: Fri, 8 Nov 2019 11:16:26 +0100
Subject: [PATCH 51/91] tweaks to the BeamSearch API

---
 transformers/generate/beam_search.py    | 63 ++++++++++---------------
 transformers/tests/beam_search_tests.py | 53 ++++++++++++++-------
 2 files changed, 59 insertions(+), 57 deletions(-)

diff --git a/transformers/generate/beam_search.py b/transformers/generate/beam_search.py
index e1b2d23da0..a18d20f31a 100644
--- a/transformers/generate/beam_search.py
+++ b/transformers/generate/beam_search.py
@@ -32,7 +32,7 @@ import logging
 logger = logging.getLogger(__name__)
 
 
-class BeamSearch(nn.Module):
+class BeamSearch(object):
     def __init__(
         self,
         model,
@@ -45,12 +45,17 @@ class BeamSearch(nn.Module):
         max_length,
         alpha=0,
         block_repeating_trigrams=True,
-        device=torch.device("cpu"),
     ):
         r"""
         Inputs:
             **model**: instance of ``transformers.PreTrainedEncoderDecoder``
                 The pretrained encoder-decoder model that will be used to generate the sequences.
+            **bos_token_id**: int
+                Id that is used by the tokenizer to represent the beggining of a sentence.
+            **pad_token_id**: int
+                Id that is used by the tokenizer for padding.
+            **eos_token_id**: int
+                Id that is used by the tokenizer to represent the end of a sentence.
             **batch_size**: (`optional`) int
                 Batch size of the inputs. The value is set automatically when calling `forward`.
             **beam_size**: int
@@ -68,7 +73,7 @@ class BeamSearch(nn.Module):
         """
         super(BeamSearch, self).__init__()
         self.model = model
-        self.device = device
+        self.device = next(model.parameters()).device  # only works if all parameters of the model are stored on a single GPU
 
         self.bos_token_id = bos_token_id
         self.eos_token_id = eos_token_id
@@ -86,10 +91,7 @@ class BeamSearch(nn.Module):
         self._init_beam_state(batch_size)
 
     def __len__(self):
-        try:
-            return self.growing_beams.size(1)
-        except NameError:
-            return 0
+        return self.growing_beams.size(1)
 
     def _init_beam_state(self, batch_size):
         """ (re-)Initialize the state of the beams. """
@@ -120,7 +122,7 @@ class BeamSearch(nn.Module):
         self._step = 0
         self.is_done = False
 
-    def forward(self, encoder_input_ids, **model_kwargs):
+    def __call__(self, encoder_input_ids, **model_kwargs):
         """ Generate a sequence using Beam Search. """
         # keyword arguments come in 3 flavors: encoder-specific (prefixed by
         # `encoder_`), decoder-specific (prefixed by `decoder_`) and those
@@ -158,28 +160,17 @@ class BeamSearch(nn.Module):
             kwargs_encoder["attention_mask"], self.beam_size, dim=0
         )
 
-        # grow the beam by generating sequences in an autoregressive way
+        # grow the beam iteratively
         batch_size, block_size = encoder_input_ids.size()
         self._init_beam_state(batch_size)
         for step in range(self.max_length):
-            # Add padding tokens
-            decoder_input = torch.full(
-                (self.growing_beams.size(0), block_size),
-                self.pad_token_id,
-                dtype=torch.long,
-                device=self.growing_beams.device,
-            )
-            decoder_input[:, : self.growing_beams.size(1)] = self.growing_beams
-
-            # compute decoder_attention_mask
-            decoder_mask = torch.ones_like(decoder_input)
-            idx_pad_tokens = decoder_input == self.pad_token_id
-            decoder_mask[idx_pad_tokens] = 0
-            kwargs_decoder["attention_mask"] = decoder_mask
 
+            decoder_input = fit_to_block_size(self.growing_beams, block_size, self.pad_token_id)
+            kwargs_decoder["attention_mask"] = build_mask(decoder_input)
             outputs = self.model.decoder(decoder_input, **kwargs_decoder)
-            last_token_scores = outputs[0][:, -1, :].squeeze(1)
-            log_probabilities = torch.nn.functional.log_softmax(last_token_scores, dim=0)
+
+            next_token_scores = outputs[0][:, -1, :].squeeze(1)
+            log_probabilities = torch.nn.functional.log_softmax(next_token_scores, dim=0)
             surviving_beams_rows = self.grow(log_probabilities)
             if self.is_done:
                 break
@@ -356,20 +347,14 @@ def fit_to_block_size(sequence, block_size, pad_token_id):
     """ Adapt the source and target sequences' lengths to the block size.
     If the sequence is shorter we append padding tokens to the right.
     """
-    if len(sequence) > block_size:
-        return sequence[:block_size]
-    else:
-        return torch.cat(
-            (sequence, torch.tensor([pad_token_id] * (block_size - len(sequence)))), dim=0
-        )
-
-
-def build_lm_labels(sequence, pad_token_id):
-    """ Padding token, encoded as 0, are represented by the value -1 so they
-    are not taken into account in the loss computation. """
-    padded = sequence.clone()
-    padded[padded == pad_token_id] = -1
-    return padded
+    padded_sequence = torch.full(
+        (sequence.size(0), block_size),
+        pad_token_id,
+        dtype=torch.long,
+        device=sequence.device,
+    )
+    padded_sequence[:, : sequence.size(1)] = sequence
+    return padded_sequence
 
 
 def build_mask(sequence, pad_token_id):
diff --git a/transformers/tests/beam_search_tests.py b/transformers/tests/beam_search_tests.py
index a92ebf3578..6f2a2b9c2f 100644
--- a/transformers/tests/beam_search_tests.py
+++ b/transformers/tests/beam_search_tests.py
@@ -1,15 +1,22 @@
 from collections import namedtuple
 import unittest
-
+import pytest
 import numpy as np
 import torch
+from torch import nn
 
 from transformers.generate import BeamSearch
 from transformers import PreTrainedEncoderDecoder
 
 
-StubTokenizer = namedtuple("Tokenizer", ["bos_token_id", "eos_token_id", "pad_token_id"])
-StubTransformer = namedtuple("Transformer", ["encoder", "decoder"])
+class StubTransformer(nn.Module):
+    def __init__(self):
+        self.encoder = None
+        self.decoder = None
+        self._parameters = {"dumy": torch.tensor([1])}
+
+    def forward(self):
+        pass
 
 
 class BeamSearchtest(unittest.TestCase):
@@ -18,12 +25,13 @@ class BeamSearchtest(unittest.TestCase):
         class will break the integration with the beam search.
         """
 
-        model = PreTrainedEncoderDecoder("encoder", "decoder")
-        tokenizer = StubTokenizer(0, 1, 2)
+        model = StubTransformer()
         try:
             _ = BeamSearch(
                 model=model,
-                tokenizer=tokenizer,
+                bos_token_id=0,
+                eos_token_id=1,
+                pad_token_id=2,
                 batch_size=1,
                 beam_size=1,
                 min_length=1,
@@ -46,8 +54,10 @@ class BeamSearchtest(unittest.TestCase):
         min_length = 5
 
         beam = BeamSearch(
-            model=StubTransformer("encoder", "decoder"),
-            tokenizer=StubTokenizer(bos_token_id=0, eos_token_id=eos_idx, pad_token_id=2),
+            model=StubTransformer(),
+            bos_token_id=0,
+            eos_token_id=eos_idx,
+            pad_token_id=2,
             batch_size=batch_size,
             beam_size=beam_size,
             min_length=5,
@@ -71,17 +81,17 @@ class BeamSearchtest(unittest.TestCase):
         log_probabilities[0, eos_idx] = score_distribution[0]
         for idx, score in zip(non_eos_idxs, score_distribution[1:]):
             log_probabilities[0, idx] = score
-
+        pytest.set_trace()
         for step in range(1, min_length + 2):
             log_probabilities[0, eos_idx] = score_distribution[0]
 
             # Beam #3 and #4 teminate at the first step since the probability
             # of the [EOS] token is -1e20 > -\infty so there are only two beams left.
+            # The top beam (most likely) always ends with 4 until we reach min_length.
             surviving_beams_rows = beam.grow(log_probabilities)
             if step < min_length:
                 np.testing.assert_array_equal(
-                    beam.growing_beams.numpy(),
-                    np.repeat(np.array([[0] + [4] * step]), 2, axis=0),
+                    beam.growing_beams.numpy()[0, :], np.array([0] + [4] * step)
                 )
             elif step == min_length:
                 np.testing.assert_array_equal(surviving_beams_rows.numpy(), np.array([]))
@@ -99,8 +109,10 @@ class BeamSearchtest(unittest.TestCase):
         vocab_size = 10
 
         beam = BeamSearch(
-            model=StubTransformer("encoder", "decoder"),
-            tokenizer=StubTokenizer(bos_token_id=0, eos_token_id=1, pad_token_id=2),
+            model=StubTransformer(),
+            bos_token_id=0,
+            eos_token_id=1,
+            pad_token_id=2,
             batch_size=batch_size,
             beam_size=beam_size,
             min_length=2,
@@ -140,8 +152,10 @@ class BeamSearchtest(unittest.TestCase):
         vocab_size = 10
 
         beam = BeamSearch(
-            model=StubTransformer("encoder", "decoder"),
-            tokenizer=StubTokenizer(bos_token_id=0, eos_token_id=1, pad_token_id=2),
+            model=StubTransformer(),
+            bos_token_id=0,
+            eos_token_id=1,
+            pad_token_id=2,
             batch_size=batch_size,
             beam_size=beam_size,
             min_length=2,
@@ -167,7 +181,6 @@ class BeamSearchtest(unittest.TestCase):
                 log_probabilities[::beam_size, idx] = score
 
             surviving_beams_rows = beam.grow(log_probabilities)
-            log_probabilities = log_probabilities.index_select(0, surviving_beams_rows)
 
             if step < 7:
                 self.assertFalse(
@@ -182,6 +195,8 @@ class BeamSearchtest(unittest.TestCase):
                     np.array([-1e20] * vocab_size, dtype="float32"),
                 )
 
+            log_probabilities = log_probabilities.index_select(0, surviving_beams_rows)
+
     def test_beam_search_example_for_one_step(self):
         """ We test that the predictions for one step of growth are correct. """
         batch_size = 2
@@ -190,8 +205,10 @@ class BeamSearchtest(unittest.TestCase):
         vocab_size = 5
 
         beam = BeamSearch(
-            model=StubTransformer("encoder", "decoder"),
-            tokenizer=StubTokenizer(bos_token_id=0, eos_token_id=1, pad_token_id=2),
+            model=StubTransformer(),
+            bos_token_id=0,
+            eos_token_id=1,
+            pad_token_id=2,
             batch_size=batch_size,
             beam_size=beam_size,
             min_length=2,

From 9f75565ea8243ec685c3e5dd08a63e8f78af9d0f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remilouf@gmail.com>
Date: Fri, 8 Nov 2019 15:48:31 +0100
Subject: [PATCH 52/91] setup training

---
 requirements.txt                     | 2 --
 transformers/generate/beam_search.py | 2 +-
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 060aba915d..4a3162adce 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,5 +10,3 @@ regex
 sentencepiece
 # For XLM
 sacremoses
-# For ROUGE
-pyrouge
diff --git a/transformers/generate/beam_search.py b/transformers/generate/beam_search.py
index a18d20f31a..abe3186049 100644
--- a/transformers/generate/beam_search.py
+++ b/transformers/generate/beam_search.py
@@ -166,7 +166,7 @@ class BeamSearch(object):
         for step in range(self.max_length):
 
             decoder_input = fit_to_block_size(self.growing_beams, block_size, self.pad_token_id)
-            kwargs_decoder["attention_mask"] = build_mask(decoder_input)
+            kwargs_decoder["attention_mask"] = build_mask(decoder_input, self.pad_token_id)
             outputs = self.model.decoder(decoder_input, **kwargs_decoder)
 
             next_token_scores = outputs[0][:, -1, :].squeeze(1)

From 4d1819990294f27ab1cf0113034f52cdb4136eaf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remilouf@gmail.com>
Date: Tue, 12 Nov 2019 17:59:34 +0100
Subject: [PATCH 53/91] cast bool tensor to long for pytorch < 1.3

---
 transformers/modeling_bert.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/transformers/modeling_bert.py b/transformers/modeling_bert.py
index 1ee3e3f097..0159d58aab 100644
--- a/transformers/modeling_bert.py
+++ b/transformers/modeling_bert.py
@@ -675,6 +675,7 @@ class BertModel(BertPreTrainedModel):
                 batch_size, seq_length = input_shape
                 seq_ids = torch.arange(seq_length, device=device)
                 causal_mask = seq_ids[None, None, :].repeat(batch_size, seq_length, 1) <= seq_ids[None, :, None]
+                causal_mask = causal_mask.to(torch.long)  # not converting to long will cause errors with pytorch version < 1.3
                 extended_attention_mask = causal_mask[:, None, :, :] * attention_mask[:, None, None, :]
             else:
                 extended_attention_mask = attention_mask[:, None, None, :]

From 2403a6659859ad18a9f20e1c2e84179718d8dfd4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remilouf@gmail.com>
Date: Sat, 23 Nov 2019 00:18:44 +0100
Subject: [PATCH 54/91] give transformers API to BertAbs

---
 ..._original_pytorch_checkpoint_to_pytorch.py |  161 +++
 .../summarization/configuration_bertabs.py    |  141 ++
 ...ert_bertabs_original_pytorch_checkpoint.py |  162 +++
 examples/summarization/modeling_bertabs.py    | 1250 +++++++++++++++++
 examples/summarization/run_summarization.py   |  271 ++++
 .../utils_summarization.py                    |   56 +-
 .../utils_summarization_test.py               |   17 +-
 ..._original_pytorch_checkpoint_to_pytorch.py |  158 +++
 transformers/generate/beam_search.py          |   26 +-
 9 files changed, 2188 insertions(+), 54 deletions(-)
 create mode 100644 examples/convert_bertextabs_original_pytorch_checkpoint_to_pytorch.py
 create mode 100644 examples/summarization/configuration_bertabs.py
 create mode 100644 examples/summarization/convert_bertabs_original_pytorch_checkpoint.py
 create mode 100644 examples/summarization/modeling_bertabs.py
 create mode 100644 examples/summarization/run_summarization.py
 rename examples/{ => summarization}/utils_summarization.py (77%)
 rename examples/{ => summarization}/utils_summarization_test.py (88%)
 create mode 100644 transformers/convert_bertextabs_original_pytorch_checkpoint_to_pytorch.py

diff --git a/examples/convert_bertextabs_original_pytorch_checkpoint_to_pytorch.py b/examples/convert_bertextabs_original_pytorch_checkpoint_to_pytorch.py
new file mode 100644
index 0000000000..c245d0eae5
--- /dev/null
+++ b/examples/convert_bertextabs_original_pytorch_checkpoint_to_pytorch.py
@@ -0,0 +1,161 @@
+# coding=utf-8
+# Copyright 2018 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Convert BertExtAbs's checkpoints """
+
+import argparse
+from collections import namedtuple
+import logging
+import pdb
+import torch
+
+from models.model_builder import AbsSummarizer  # The authors' implementation
+from model_bertabs import BertAbsSummarizer
+
+from transformers import BertTokenizer
+
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+SAMPLE_TEXT = 'Hello world! cécé herlolip'  # NOTE(review): not referenced anywhere else in this script
+
+
+BertAbsConfig = namedtuple(  # settings record passed to both AbsSummarizer and BertAbsSummarizer below
+    "BertAbsConfig",
+    ["temp_dir", "large", "use_bert_emb", "finetune_bert", "encoder", "share_emb", "max_pos", "enc_layers", "enc_hidden_size", "enc_heads", "enc_ff_size", "enc_dropout", "dec_layers", "dec_hidden_size", "dec_heads", "dec_ff_size", "dec_dropout"],
+)
+
+
+def convert_bertabs_checkpoints(path_to_checkpoints, dump_path):
+    """ Copy the pre-trained weights provided by the creators of BertAbs into
+    `BertAbsSummarizer`, check both models agree and save the new state dict.
+
+    Args:
+        path_to_checkpoints: path to the checkpoint released by the authors.
+        dump_path: folder that receives `bert-ext-abs.pt` (created if missing).
+
+    Raises:
+        ValueError: if the outputs of the two models are not close (atol=1e-3).
+    """
+    import os  # local import: only needed to build the output path
+
+    # Instantiate the authors' model with the pre-trained weights
+    config = BertAbsConfig(
+        temp_dir=".",
+        finetune_bert=False,
+        large=False,
+        share_emb=True,
+        use_bert_emb=False,
+        encoder="bert",
+        max_pos=512,
+        enc_layers=6,
+        enc_hidden_size=512,
+        enc_heads=8,
+        enc_ff_size=512,
+        enc_dropout=0.2,
+        dec_layers=6,
+        dec_hidden_size=768,
+        dec_heads=8,
+        dec_ff_size=2048,
+        dec_dropout=0.2,
+    )
+    checkpoints = torch.load(path_to_checkpoints, lambda storage, loc: storage)
+    original = AbsSummarizer(config, torch.device("cpu"), checkpoints)
+    original.eval()
+
+    new_model = BertAbsSummarizer(config, torch.device("cpu"))
+    new_model.eval()
+
+    # Convert the weights
+    logger.info("convert the model")
+    new_model.encoder.load_state_dict(original.bert.state_dict())
+    new_model.decoder.generator.load_state_dict(original.generator.state_dict())
+    new_model.decoder.embeddings.load_state_dict(original.decoder.embeddings.state_dict())
+    new_model.decoder.pos_emb.load_state_dict(original.decoder.pos_emb.state_dict())
+    new_model.decoder.transformer_layers.load_state_dict(original.decoder.transformer_layers.state_dict())
+    new_model.decoder.layer_norm.load_state_dict(original.decoder.layer_norm.state_dict())
+
+    # Make sure the outputs are identical
+    logger.info("Make sure that the models' outputs are identical")
+    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
+
+    # prepare the model inputs
+    encoder_input_ids = tokenizer.encode("This is sample éàalj'-.")
+    encoder_input_ids.extend([tokenizer.pad_token_id] * (512 - len(encoder_input_ids)))
+    encoder_input_ids = torch.tensor(encoder_input_ids).unsqueeze(0)
+    decoder_input_ids = tokenizer.encode("This is sample 3 éàalj'-.")
+    decoder_input_ids.extend([tokenizer.pad_token_id] * (512 - len(decoder_input_ids)))
+    decoder_input_ids = torch.tensor(decoder_input_ids).unsqueeze(0)
+
+    # failsafe: the weights reset must not have affected the loaded weights.
+    assert torch.max(torch.abs(original.generator[0].weight - new_model.decoder.generator[0].weight)) == 0
+
+    # forward pass
+    src = encoder_input_ids
+    tgt = decoder_input_ids
+    segs = token_type_ids = None
+    clss = None
+    mask_src = encoder_attention_mask = None
+    mask_tgt = decoder_attention_mask = None
+    mask_cls = None
+
+    # The original model does not apply the generator layer immediately but rather in
+    # the beam search (where it combines linear layer + softmax), so we apply it by hand
+    # here before comparing the outputs of the two full stacks.
+    output_original_model = original(src, tgt, segs, clss, mask_src, mask_tgt, mask_cls)[0]
+    output_original_model = original.generator(output_original_model)
+
+    output_converted_model = new_model(encoder_input_ids, decoder_input_ids, token_type_ids, encoder_attention_mask, decoder_attention_mask)[0]
+    output_converted_model = torch.nn.functional.log_softmax(output_converted_model, dim=-1)
+
+    maximum_absolute_difference = torch.max(torch.abs(output_converted_model - output_original_model)).item()
+    print("Maximum absolute difference between outputs: {:.2e}".format(maximum_absolute_difference))
+
+    are_identical = torch.allclose(output_converted_model, output_original_model, atol=1e-3)
+    if are_identical:
+        logger.info("all outputs are equal up to 1e-3")
+    else:
+        raise ValueError("the outputs are different. The new model is likely different from the original one.")
+
+    # The authors saved the whole model with torch.save(model), which is bound to their directory structure; we save the state_dict instead.
+    if not os.path.isdir(dump_path):
+        os.makedirs(dump_path)
+    logger.info("saving the model's state dictionary")
+    torch.save(new_model.state_dict(), os.path.join(dump_path, "bert-ext-abs.pt"))
+
+
+if __name__ == "__main__":  # command-line entry point; both arguments are mandatory (required=True)
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--bertabs_checkpoint_path",
+        default=None,
+        type=str,
+        required=True,
+        help="Path the official PyTorch dump.",
+    )
+    parser.add_argument(
+        "--pytorch_dump_folder_path",
+        default=None,
+        type=str,
+        required=True,
+        help="Path to the output PyTorch model.",
+    )
+    args = parser.parse_args()
+
+    convert_bertabs_checkpoints(
+        args.bertabs_checkpoint_path,
+        args.pytorch_dump_folder_path,
+    )
diff --git a/examples/summarization/configuration_bertabs.py b/examples/summarization/configuration_bertabs.py
new file mode 100644
index 0000000000..ff3171f9a8
--- /dev/null
+++ b/examples/summarization/configuration_bertabs.py
@@ -0,0 +1,141 @@
+# coding=utf-8
+# Copyright 2019 The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" BertAbs configuration """
+import json
+import logging
+import sys
+
+from transformers import PretrainedConfig
+
+
+logger = logging.getLogger(__name__)
+
+
+BERTABS_FINETUNED_CONFIG_MAP = {  # shortcut name -> URL of the hosted JSON config; used as BertAbsConfig.pretrained_config_archive_map
+    "bertabs-finetuned-cnndm": "https://s3.amazonaws.com/models.huggingface.co/bert/remi/bertabs-finetuned-cnndm-extractive-abstractive-summarization-config.json",
+}
+
+
+class BertAbsConfig(PretrainedConfig):
+    r""" Class to store the configuration of the BertAbs model.
+
+    Arguments:
+        vocab_size_or_config_json_file: int or string
+            Vocabulary size, or path to a JSON file that holds the configuration.
+        temp_dir: string
+            Folder handed to the BERT encoder as its download cache directory.
+        finetune_bert: bool
+            Whether to fine-tune the model or not. Will be kept for reference
+            in case we want to add the possibility to fine-tune the model.
+        large: bool
+            Whether to use bert-large as a base.
+        share_emb: bool
+            Whether the embeddings are shared between the encoder and decoder.
+        encoder: string
+            Not clear what this does. Leave to "bert" for pre-trained weights.
+        max_pos: int
+            The maximum sequence length that this model will be used with.
+        enc_layers: int
+            The number of hidden layers in the Transformer encoder.
+        enc_hidden_size: int
+            The size of the encoder's layers.
+        enc_heads: int
+            The number of attention heads for each attention layer in the encoder.
+        enc_ff_size: int
+            The size of the encoder's feed-forward layers.
+        enc_dropout: float
+            The dropout probability for all fully connected layers in the
+            embeddings, layers, pooler and also the attention probabilities in
+            the encoder.
+        dec_layers: int
+            The number of hidden layers in the decoder.
+        dec_hidden_size: int
+            The size of the decoder's layers.
+        dec_heads: int
+            The number of attention heads for each attention layer in the decoder.
+        dec_ff_size: int
+            The size of the decoder's feed-forward layers.
+        dec_dropout: float
+            The dropout probability for all fully connected layers in the
+            embeddings, layers, pooler and also the attention probabilities in
+            the decoder.
+    """
+
+    pretrained_config_archive_map = BERTABS_FINETUNED_CONFIG_MAP
+
+    def __init__(
+        self,
+        vocab_size_or_config_json_file=30522,
+        temp_dir=".",
+        finetune_bert=False,
+        large=False,
+        share_emb=True,
+        encoder="bert",
+        max_pos=512,
+        enc_layers=6,
+        enc_hidden_size=512,
+        enc_heads=8,
+        enc_ff_size=512,
+        enc_dropout=0.2,
+        dec_layers=6,
+        dec_hidden_size=768,
+        dec_heads=8,
+        dec_ff_size=2048,
+        dec_dropout=0.2,
+        **kwargs
+    ):
+        super(BertAbsConfig, self).__init__(**kwargs)
+
+        if self._input_is_path_to_json(vocab_size_or_config_json_file):
+            import io  # io.open takes `encoding` on Python 2 as well as Python 3
+            with io.open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader:
+                json_config = json.loads(reader.read())
+            for key, value in json_config.items():
+                self.__dict__[key] = value
+        elif isinstance(vocab_size_or_config_json_file, int):
+            self.temp_dir = temp_dir
+            self.finetune_bert = finetune_bert
+            self.large = large
+            self.vocab_size = vocab_size_or_config_json_file
+            self.max_pos = max_pos
+
+            self.encoder = encoder
+            self.enc_layers = enc_layers
+            self.enc_hidden_size = enc_hidden_size
+            self.enc_heads = enc_heads
+            self.enc_ff_size = enc_ff_size
+            self.enc_dropout = enc_dropout
+
+            self.share_emb = share_emb
+
+            self.dec_layers = dec_layers
+            self.dec_hidden_size = dec_hidden_size
+            self.dec_heads = dec_heads
+            self.dec_ff_size = dec_ff_size
+            self.dec_dropout = dec_dropout
+        else:
+            raise ValueError(
+                "First argument must be either a vocabulary size (int) "
+                "or the path to a pretrained model config file (str)"
+            )
+
+    def _input_is_path_to_json(self, first_argument):
+        """ Tells whether the first argument is a string, i.e. should be read as
+        the path to a JSON config file (the path itself is not checked here).
+        """
+        if sys.version_info[0] == 2:
+            return isinstance(first_argument, (str, unicode))  # noqa: F821 (Python 2 only)
+        return isinstance(first_argument, str)
diff --git a/examples/summarization/convert_bertabs_original_pytorch_checkpoint.py b/examples/summarization/convert_bertabs_original_pytorch_checkpoint.py
new file mode 100644
index 0000000000..786a29ef13
--- /dev/null
+++ b/examples/summarization/convert_bertabs_original_pytorch_checkpoint.py
@@ -0,0 +1,162 @@
+# coding=utf-8
+# Copyright 2018 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Convert BertExtAbs's checkpoints
+
+The file currently does not do much as we ended up copying the exact model
+structure, but I leave it here in case we ever want to refactor the model.
+"""
+
+import argparse
+from collections import namedtuple
+import logging
+import torch
+
+from models.model_builder import AbsSummarizer  # The authors' implementation
+from model_bertabs import BertAbsSummarizer
+
+from transformers import BertTokenizer
+
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+SAMPLE_TEXT = 'Hello world! cécé herlolip'  # NOTE(review): not referenced anywhere else in this script
+
+
+BertAbsConfig = namedtuple(  # settings record passed to both AbsSummarizer and BertAbsSummarizer below
+    "BertAbsConfig",
+    ["temp_dir", "large", "use_bert_emb", "finetune_bert", "encoder", "share_emb", "max_pos", "enc_layers", "enc_hidden_size", "enc_heads", "enc_ff_size", "enc_dropout", "dec_layers", "dec_hidden_size", "dec_heads", "dec_ff_size", "dec_dropout"],
+)
+
+
+def convert_bertabs_checkpoints(path_to_checkpoints, dump_path):
+    """ Copy the pre-trained weights provided by the creators of BertAbs into
+    `BertAbsSummarizer`, check both models agree and save the new state dict.
+
+    Args:
+        path_to_checkpoints: path to the checkpoint released by the authors.
+        dump_path: folder that receives the converted state dict (created if missing).
+    Raises:
+        ValueError: if the decoder outputs of the two models are not close (atol=1e-3).
+    """
+    import os  # local import: only needed to build the output path
+
+    # Instantiate the authors' model with the pre-trained weights
+    config = BertAbsConfig(
+        temp_dir=".",
+        finetune_bert=False,
+        large=False,
+        share_emb=True,
+        use_bert_emb=False,
+        encoder="bert",
+        max_pos=512,
+        enc_layers=6,
+        enc_hidden_size=512,
+        enc_heads=8,
+        enc_ff_size=512,
+        enc_dropout=0.2,
+        dec_layers=6,
+        dec_hidden_size=768,
+        dec_heads=8,
+        dec_ff_size=2048,
+        dec_dropout=0.2,
+    )
+    checkpoints = torch.load(path_to_checkpoints, lambda storage, loc: storage)
+    original = AbsSummarizer(config, torch.device("cpu"), checkpoints)
+    original.eval()
+
+    new_model = BertAbsSummarizer(config, torch.device("cpu"))
+    new_model.eval()
+
+    # Convert the weights
+    logger.info("convert the model")
+    new_model.bert.load_state_dict(original.bert.state_dict())
+    new_model.decoder.load_state_dict(original.decoder.state_dict())
+    new_model.generator.load_state_dict(original.generator.state_dict())
+
+    # Make sure the outputs are identical
+    logger.info("Make sure that the models' outputs are identical")
+    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
+
+    # prepare the model inputs
+    encoder_input_ids = tokenizer.encode("This is sample éàalj'-.")
+    encoder_input_ids.extend([tokenizer.pad_token_id] * (512 - len(encoder_input_ids)))
+    encoder_input_ids = torch.tensor(encoder_input_ids).unsqueeze(0)
+    decoder_input_ids = tokenizer.encode("This is sample 3 éàalj'-.")
+    decoder_input_ids.extend([tokenizer.pad_token_id] * (512 - len(decoder_input_ids)))
+    decoder_input_ids = torch.tensor(decoder_input_ids).unsqueeze(0)
+
+    # failsafe: the weights reset must not have affected the loaded weights.
+    assert torch.max(torch.abs(original.generator[0].weight - new_model.generator[0].weight)) == 0
+
+    # forward pass
+    src = encoder_input_ids
+    tgt = decoder_input_ids
+    segs = token_type_ids = None
+    clss = None
+    mask_src = encoder_attention_mask = None
+    mask_tgt = decoder_attention_mask = None
+    mask_cls = None
+
+    # The original model does not apply the generator layer immediately but rather in
+    # the beam search (where it combines linear layer + softmax), so both generators are
+    # applied by hand here and the outputs of the two stacks are reported side by side.
+    output_original_model = original(src, tgt, segs, clss, mask_src, mask_tgt, mask_cls)[0]
+    output_original_generator = original.generator(output_original_model)
+
+    output_converted_model = new_model(encoder_input_ids, decoder_input_ids, token_type_ids, encoder_attention_mask, decoder_attention_mask)[0]
+    output_converted_generator = new_model.generator(output_converted_model)
+
+    maximum_absolute_difference = torch.max(torch.abs(output_converted_model - output_original_model)).item()
+    print("Maximum absolute difference between decoder outputs: {:.2e}".format(maximum_absolute_difference))
+    maximum_absolute_difference = torch.max(torch.abs(output_converted_generator - output_original_generator)).item()
+    print("Maximum absolute difference between generator outputs: {:.2e}".format(maximum_absolute_difference))
+
+    are_identical = torch.allclose(output_converted_model, output_original_model, atol=1e-3)
+    if are_identical:
+        logger.info("all outputs are equal up to 1e-3")
+    else:
+        raise ValueError("the outputs are different. The new model is likely different from the original one.")
+
+    # The authors saved the whole model with torch.save(model), which is bound to their directory structure; we save the state_dict instead.
+    if not os.path.isdir(dump_path):
+        os.makedirs(dump_path)
+    logger.info("saving the model's state dictionary")
+    torch.save(new_model.state_dict(), os.path.join(dump_path, "bertabs-finetuned-cnndm-extractive-abstractive-summarization-pytorch_model.bin"))
+
+
+if __name__ == "__main__":  # command-line entry point; both arguments are mandatory (required=True)
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--bertabs_checkpoint_path",
+        default=None,
+        type=str,
+        required=True,
+        help="Path the official PyTorch dump.",
+    )
+    parser.add_argument(
+        "--pytorch_dump_folder_path",
+        default=None,
+        type=str,
+        required=True,
+        help="Path to the output PyTorch model.",
+    )
+    args = parser.parse_args()
+
+    convert_bertabs_checkpoints(
+        args.bertabs_checkpoint_path,
+        args.pytorch_dump_folder_path,
+    )
diff --git a/examples/summarization/modeling_bertabs.py b/examples/summarization/modeling_bertabs.py
new file mode 100644
index 0000000000..0189a2ad2b
--- /dev/null
+++ b/examples/summarization/modeling_bertabs.py
@@ -0,0 +1,1250 @@
+# MIT License
+
+# Copyright (c) 2019 Yang Liu
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+import copy
+import math
+import shutil
+import time
+import os
+
+import numpy as np
+import torch
+from torch import nn
+from torch.nn.init import xavier_uniform_
+
+from transformers import BertModel, BertConfig, PreTrainedModel
+
+from configuration_bertabs import BertAbsConfig
+
+
+MAX_SIZE = 5000  # size passed to TransformerDecoderLayer._get_attn_subsequent_mask to build its "mask" buffer
+
+BERTABS_FINETUNED_MODEL_MAP = {  # shortcut name -> URL of the hosted weights; used as pretrained_model_archive_map below
+    "bertabs-finetuned-cnndm": "https://s3.amazonaws.com/models.huggingface.co/bert/remi/bertabs-finetuned-cnndm-extractive-abstractive-summarization-pytorch_model.bin",
+}
+
+
+class BertAbsPreTrainedModel(PreTrainedModel):  # class-level settings inherited by BertAbs; presumably read by PreTrainedModel's loading code (TODO confirm)
+    config_class = BertAbsConfig
+    pretrained_model_archive_map = BERTABS_FINETUNED_MODEL_MAP
+    load_tf_weights = False
+    base_model_prefix = "bert"
+
+
+class BertAbs(BertAbsPreTrainedModel):
+    """ BertAbs summarizer: a BERT encoder followed by a Transformer decoder and
+    a generator (linear layer + log-softmax) tied to the decoder embeddings.
+    """
+
+    def __init__(self, args, checkpoint=None, bert_extractive_checkpoint=None):
+        super(BertAbs, self).__init__(args)
+        self.args = args
+        self.bert = Bert(args.large, args.temp_dir, args.finetune_bert)
+
+        # If pre-trained weights are passed for Bert, load these.
+        if bert_extractive_checkpoint:
+            self.bert.model.load_state_dict(
+                {
+                    # n[11:] drops the leading "bert.model." (11 characters) so
+                    # that the keys match the ones of the wrapped BertModel.
+                    n[11:]: p
+                    for n, p in bert_extractive_checkpoint.items()
+                    if n.startswith("bert.model")
+                },
+                strict=True,
+            )
+
+        if args.encoder == "baseline":
+            bert_config = BertConfig(
+                self.bert.model.config.vocab_size,
+                hidden_size=args.enc_hidden_size,
+                num_hidden_layers=args.enc_layers,
+                num_attention_heads=8,
+                intermediate_size=args.enc_ff_size,
+                hidden_dropout_prob=args.enc_dropout,
+                attention_probs_dropout_prob=args.enc_dropout,
+            )
+            self.bert.model = BertModel(bert_config)
+
+        self.vocab_size = self.bert.model.config.vocab_size
+
+        if args.max_pos > 512:
+            # Extend the position embeddings: copy the first 512 and repeat the last one.
+            my_pos_embeddings = nn.Embedding(
+                args.max_pos, self.bert.model.config.hidden_size
+            )
+            bert_pos_weights = self.bert.model.embeddings.position_embeddings.weight.data
+            my_pos_embeddings.weight.data[:512] = bert_pos_weights
+            my_pos_embeddings.weight.data[512:] = bert_pos_weights[-1][None, :].repeat(args.max_pos - 512, 1)
+            self.bert.model.embeddings.position_embeddings = my_pos_embeddings
+        tgt_embeddings = nn.Embedding(
+            self.vocab_size, self.bert.model.config.hidden_size, padding_idx=0
+        )
+        if self.args.share_emb:
+            tgt_embeddings.weight = copy.deepcopy(
+                self.bert.model.embeddings.word_embeddings.weight
+            )
+
+        self.decoder = TransformerDecoder(
+            self.args.dec_layers,
+            self.args.dec_hidden_size,
+            heads=self.args.dec_heads,
+            d_ff=self.args.dec_ff_size,
+            dropout=self.args.dec_dropout,
+            embeddings=tgt_embeddings,
+            vocab_size=self.vocab_size,
+        )
+
+        gen_func = nn.LogSoftmax(dim=-1)
+        # The output layer is tied to the decoder embeddings below, so it must
+        # have as many rows as the embedding matrix, i.e. `self.vocab_size`.
+        self.generator = nn.Sequential(
+            nn.Linear(args.dec_hidden_size, self.vocab_size), gen_func
+        )
+        self.generator[0].weight = self.decoder.embeddings.weight
+
+        if checkpoint is not None:
+            self.load_state_dict(checkpoint)
+
+    def init_weights(self):
+        for module in self.decoder.modules():
+            if isinstance(module, (nn.Linear, nn.Embedding)):
+                module.weight.data.normal_(mean=0.0, std=0.02)
+            elif isinstance(module, nn.LayerNorm):
+                module.bias.data.zero_()
+                module.weight.data.fill_(1.0)
+            if isinstance(module, nn.Linear) and module.bias is not None:
+                module.bias.data.zero_()
+        for p in self.generator.parameters():
+            if p.dim() > 1:
+                xavier_uniform_(p)
+            else:
+                p.data.zero_()
+
+    def maybe_tie_embeddings(self, args):
+        if args.use_bert_emb:
+            tgt_embeddings = nn.Embedding(
+                self.vocab_size, self.bert.model.config.hidden_size, padding_idx=0
+            )
+            tgt_embeddings.weight = copy.deepcopy(
+                self.bert.model.embeddings.word_embeddings.weight
+            )
+            self.decoder.embeddings = tgt_embeddings
+
+    def forward(
+        self,
+        encoder_input_ids,
+        decoder_input_ids,
+        token_type_ids,
+        encoder_attention_mask,
+        decoder_attention_mask,
+    ):
+        """ Encode the source with BERT and decode `decoder_input_ids[:, :-1]`.
+
+        `decoder_attention_mask` is accepted but not used. Returns the decoder
+        outputs, before the generator is applied.
+        """
+        # `Bert.forward` already returns the sequence of hidden states (a tensor),
+        # so it must not be indexed again: `[0]` would drop the batch dimension.
+        encoder_hidden_states = self.bert(
+            input_ids=encoder_input_ids,
+            token_type_ids=token_type_ids,
+            attention_mask=encoder_attention_mask,
+        )
+        dec_state = self.decoder.init_decoder_state(encoder_input_ids, encoder_hidden_states)
+        decoder_outputs, _ = self.decoder(decoder_input_ids[:, :-1], encoder_hidden_states, dec_state)
+        return decoder_outputs
+
+
+class Bert(nn.Module):
+    """ Thin wrapper around a pre-trained `BertModel` that is always run in
+    evaluation mode and without gradients. `finetune` is stored but not used here. """
+
+    def __init__(self, large, temp_dir, finetune=False):
+        super(Bert, self).__init__()
+        self.finetune = finetune
+
+        # Pick the checkpoint size, then load it (cached in `temp_dir`).
+        shortcut_name = "bert-large-uncased" if large else "bert-base-uncased"
+        self.model = BertModel.from_pretrained(shortcut_name, cache_dir=temp_dir)
+
+    def forward(self, input_ids, attention_mask=None, token_type_ids=None, **kwargs):
+        """ Return the first of the two outputs of the wrapped model. """
+        # The whole wrapper is switched to eval mode on every call.
+        self.eval()
+        with torch.no_grad():
+            outputs = self.model(
+                input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask, **kwargs
+            )
+        # Exactly two outputs are expected from the wrapped model.
+        sequence_output, _ = outputs
+        return sequence_output
+
+
+class TransformerDecoder(nn.Module):
+    """
+    The Transformer decoder from "Attention is All You Need".
+
+    Args:
+       num_layers (int): number of decoder layers.
+       d_model (int): size of the model
+       heads (int): number of heads
+       d_ff (int): size of the inner FF layer
+       dropout (float): dropout probability
+       embeddings: embedding module for the target tokens; its
+          `embedding_dim` and `padding_idx` attributes are read here
+       vocab_size (int): accepted but not used by this class
+    """
+
+    def __init__(self, num_layers, d_model, heads, d_ff, dropout, embeddings, vocab_size):
+        super(TransformerDecoder, self).__init__()
+
+        # Basic attributes.
+        self.decoder_type = "transformer"
+        self.num_layers = num_layers
+        self.embeddings = embeddings
+        self.pos_emb = PositionalEncoding(dropout, self.embeddings.embedding_dim)
+
+        # Build TransformerDecoder.
+        self.transformer_layers = nn.ModuleList(
+            [
+                TransformerDecoderLayer(d_model, heads, d_ff, dropout)
+                for _ in range(num_layers)
+            ]
+        )
+
+        self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
+
+    # NOTE: `attention_mask`, `memory_lengths` and `cache` are accepted but never
+    # read in the body below; the decoder padding mask is rebuilt from `input_ids`
+    # and caching goes through `state.cache` instead.
+    def forward(
+        self,
+        input_ids,
+        encoder_hidden_states=None,
+        state=None,
+        attention_mask=None,
+        memory_lengths=None,
+        step=None,
+        cache=None,
+        encoder_attention_mask=None,
+    ):
+        """
+        Run every decoder layer over `input_ids`, attending to `encoder_hidden_states`,
+        and return `(output, state)`. `state` is required: `state.src` is read at once.
+        """
+        # Name conversion
+        tgt = input_ids
+        memory_bank = encoder_hidden_states
+        memory_mask = encoder_attention_mask
+
+        # Source token ids stored on the decoder state
+        src_words = state.src
+        src_batch, src_len = src_words.size()
+
+        padding_idx = self.embeddings.padding_idx
+
+        # Decoder padding mask
+        tgt_words = tgt
+        tgt_batch, tgt_len = tgt_words.size()
+        tgt_pad_mask = (
+            tgt_words.data.eq(padding_idx).unsqueeze(1).expand(tgt_batch, tgt_len, tgt_len)
+        )
+
+        # Encoder padding mask
+        if memory_mask is not None:
+            src_len = memory_mask.size(-1)
+            src_pad_mask = memory_mask.expand(src_batch, tgt_len, src_len)
+        else:
+            src_pad_mask = (
+                src_words.data.eq(padding_idx)
+                .unsqueeze(1)
+                .expand(src_batch, tgt_len, src_len)
+            )
+
+        # Pass through the embeddings
+        emb = self.embeddings(input_ids)
+        output = self.pos_emb(emb, step)
+        assert emb.dim() == 3  # batch x len x embedding_dim
+
+        if state.cache is None:  # no cache: collect each layer's `all_input` to store on the state
+            saved_inputs = []
+
+        for i in range(self.num_layers):
+            prev_layer_input = None
+            if state.cache is None:
+                if state.previous_input is not None:
+                    prev_layer_input = state.previous_layer_inputs[i]
+
+            output, all_input = self.transformer_layers[i](
+                output,
+                memory_bank,
+                src_pad_mask,
+                tgt_pad_mask,
+                previous_input=prev_layer_input,
+                layer_cache=state.cache["layer_{}".format(i)]
+                if state.cache is not None
+                else None,
+                step=step,
+            )
+            if state.cache is None:
+                saved_inputs.append(all_input)
+
+        if state.cache is None:
+            saved_inputs = torch.stack(saved_inputs)
+
+        output = self.layer_norm(output)
+
+        if state.cache is None:
+            state = state.update_state(tgt, saved_inputs)
+
+        # Decoders in transformers return a tuple. Beam search will fail
+        # if we don't follow this convention.
+        return output, state
+
+    def init_decoder_state(self, src, memory_bank, with_cache=False):
+        """ Build a TransformerDecoderState for `src`; its cache is initialised only if `with_cache`. """
+        state = TransformerDecoderState(src)
+        if with_cache:
+            state._init_cache(memory_bank, self.num_layers)
+        return state
+
+
+class PositionalEncoding(nn.Module):
+    """ Sinusoidal position encodings added to scaled embeddings, then dropout. """
+
+    def __init__(self, dropout, dim, max_len=5000):
+        super(PositionalEncoding, self).__init__()
+        self.dim = dim
+        self.dropout = nn.Dropout(p=dropout)
+
+        # Even columns hold sines, odd columns cosines; one row per position.
+        positions = torch.arange(0, max_len).unsqueeze(1).float()
+        frequencies = torch.exp(torch.arange(0, dim, 2, dtype=torch.float) * -(math.log(10000.0) / dim))
+        angles = positions * frequencies
+        table = torch.zeros(max_len, dim)
+        table[:, 0::2] = torch.sin(angles)
+        table[:, 1::2] = torch.cos(angles)
+        self.register_buffer("pe", table.unsqueeze(0))
+
+    def forward(self, emb, step=None):
+        """ Scale `emb` by sqrt(dim) and add the encodings (only position `step` if truthy). """
+        scaled = emb * math.sqrt(self.dim)
+        # NOTE: step=0 is falsy and therefore takes the slice branch.
+        encodings = self.pe[:, step][:, None, :] if step else self.pe[:, : emb.size(1)]
+        return self.dropout(scaled + encodings)
+
+    def get_emb(self, emb):
+        """ Encodings for the first `emb.size(1)` positions, without adding them. """
+        return self.pe[:, : emb.size(1)]
+
+
+class TransformerDecoderLayer(nn.Module):
+    """
+    Decoder layer: self-attention, attention over the memory bank, feed-forward.
+
+    Args:
+      d_model (int): the dimension of keys/values/queries in MultiHeadedAttention,
+                       also the input size of the PositionwiseFeedForward.
+      heads (int): the number of heads for MultiHeadedAttention.
+      d_ff (int): the hidden size of the PositionwiseFeedForward.
+      dropout (float): dropout probability(0-1.0).
+    """
+
+    def __init__(self, d_model, heads, d_ff, dropout):
+        super(TransformerDecoderLayer, self).__init__()
+
+        self.self_attn = MultiHeadedAttention(heads, d_model, dropout=dropout)
+
+        self.context_attn = MultiHeadedAttention(heads, d_model, dropout=dropout)
+        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
+        self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-6)
+        self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-6)
+        self.drop = nn.Dropout(dropout)
+        mask = self._get_attn_subsequent_mask(MAX_SIZE)
+        # Register self.mask as a buffer in TransformerDecoderLayer, so
+        # it gets TransformerDecoderLayer's cuda behavior automatically.
+        self.register_buffer("mask", mask)
+
+    def forward(
+        self,
+        inputs,
+        memory_bank,
+        src_pad_mask,
+        tgt_pad_mask,
+        previous_input=None,
+        layer_cache=None,
+        step=None,
+    ):
+        """
+        Args:
+            inputs (`FloatTensor`): `[batch_size x 1 x model_dim]`
+            memory_bank (`FloatTensor`): `[batch_size x src_len x model_dim]`
+            src_pad_mask (`LongTensor`): `[batch_size x 1 x src_len]`
+            tgt_pad_mask (`LongTensor`): `[batch_size x 1 x 1]`
+            previous_input: earlier normalized inputs, prepended to the current one
+            layer_cache (dict): key/value cache passed on to both attentions
+            step: accepted but not used by this layer
+
+        Returns:
+            (`FloatTensor`, `FloatTensor`):
+            * output `[batch_size x 1 x model_dim]`
+            * all_input `[batch_size x current_step x model_dim]`
+        """
+        dec_mask = torch.gt(
+            tgt_pad_mask + self.mask[:, : tgt_pad_mask.size(1), : tgt_pad_mask.size(1)], 0
+        )
+        input_norm = self.layer_norm_1(inputs)
+        all_input = input_norm
+        if previous_input is not None:
+            all_input = torch.cat((previous_input, input_norm), dim=1)
+            dec_mask = None  # no self-attention mask when earlier inputs are prepended
+
+        query = self.self_attn(
+            all_input,
+            all_input,
+            input_norm,  # only the current (normalized) input acts as the query
+            mask=dec_mask,
+            layer_cache=layer_cache,
+            type="self",
+        )
+
+        query = self.drop(query) + inputs  # residual connection around self-attention
+
+        query_norm = self.layer_norm_2(query)
+        mid = self.context_attn(
+            memory_bank,
+            memory_bank,
+            query_norm,
+            mask=src_pad_mask,
+            layer_cache=layer_cache,
+            type="context",
+        )
+        output = self.feed_forward(self.drop(mid) + query)  # residual around context attention
+
+        return output, all_input
+        # all_input is handed back so the caller can keep it for later steps.
+
+    def _get_attn_subsequent_mask(self, size):
+        """
+        Get an attention mask to avoid using the subsequent info.
+
+        Args:
+            size: int
+
+        Returns:
+            (`ByteTensor`):
+
+            * subsequent_mask `[1 x size x size]`, 1 strictly above the diagonal
+        """
+        attn_shape = (1, size, size)
+        subsequent_mask = np.triu(np.ones(attn_shape), k=1).astype("uint8")
+        subsequent_mask = torch.from_numpy(subsequent_mask)
+        return subsequent_mask
+
+
+class MultiHeadedAttention(nn.Module):
+    """
+    Multi-Head Attention module from "Attention is All You Need"
+    :cite:`DBLP:journals/corr/VaswaniSPUJGKP17`.
+
+    Similar to standard `dot` attention but uses
+    multiple attention distributions simultaneously
+    to select relevant items.
+
+    .. mermaid::
+
+       graph BT
+          A[key]
+          B[value]
+          C[query]
+          O[output]
+          subgraph Attn
+            D[Attn 1]
+            E[Attn 2]
+            F[Attn N]
+          end
+          A --> D
+          C --> D
+          A --> E
+          C --> E
+          A --> F
+          C --> F
+          D --> O
+          E --> O
+          F --> O
+          B --> O
+
+    Also supports a key/value cache for step-by-step decoding.
+
+    Args:
+       head_count (int): number of parallel heads
+       model_dim (int): the dimension of keys/values/queries,
+           must be divisible by head_count
+       dropout (float): dropout parameter
+       use_final_linear (bool): apply a final output projection
+    """
+
+    def __init__(self, head_count, model_dim, dropout=0.1, use_final_linear=True):
+        assert model_dim % head_count == 0
+        self.dim_per_head = model_dim // head_count
+        self.model_dim = model_dim
+
+        super(MultiHeadedAttention, self).__init__()
+        self.head_count = head_count
+
+        self.linear_keys = nn.Linear(model_dim, head_count * self.dim_per_head)
+        self.linear_values = nn.Linear(model_dim, head_count * self.dim_per_head)
+        self.linear_query = nn.Linear(model_dim, head_count * self.dim_per_head)
+        self.softmax = nn.Softmax(dim=-1)
+        self.dropout = nn.Dropout(dropout)
+        self.use_final_linear = use_final_linear
+        if self.use_final_linear:
+            self.final_linear = nn.Linear(model_dim, model_dim)
+
+    def forward(
+        self,
+        key,
+        value,
+        query,
+        mask=None,
+        layer_cache=None,
+        type=None,
+        predefined_graph_1=None,
+    ):
+        """
+        Compute the context vectors.
+
+        Args:
+           key (`FloatTensor`): set of `key_len`
+                key vectors `[batch, key_len, dim]`
+           value (`FloatTensor`): set of `key_len`
+                value vectors `[batch, key_len, dim]`
+           query (`FloatTensor`): set of `query_len`
+                 query vectors  `[batch, query_len, dim]`
+           mask: binary mask; positions where it is set are
+                 excluded from attention `[batch, query_len, key_len]`
+           layer_cache (dict): optional cache, updated in place:
+                 "self_keys"/"self_values" accumulate the projected
+                 keys/values of successive calls along dim 2;
+                 "memory_keys"/"memory_values" hold the projected
+                 memory, computed on the first "context" call
+           type (str): "self" or "context"; selects the caching
+                 scheme and is only consulted with a `layer_cache`
+           predefined_graph_1: optional weights multiplied into the
+                 attention of the last head, which is renormalized
+
+        Returns:
+           `FloatTensor`: context vectors `[batch, query_len, dim]`
+           after the final linear layer, or the per-head contexts
+           `[batch, head_count, query_len, dim_per_head]` when
+           `use_final_linear` is False.
+        """
+        # Taken before projection: `key` is rebound to a 4-D tensor below.
+        batch_size = key.size(0)
+        dim_per_head = self.dim_per_head
+        head_count = self.head_count
+
+        def shape(x):
+            """  projection """
+            return x.view(batch_size, -1, head_count, dim_per_head).transpose(1, 2)
+
+        def unshape(x):
+            """  compute context """
+            return (
+                x.transpose(1, 2)
+                .contiguous()
+                .view(batch_size, -1, head_count * dim_per_head)
+            )
+
+        # 1) Project key, value, and query.
+        if layer_cache is None:
+            # No cache: project the three inputs independently.
+            key = self.linear_keys(key)
+            value = self.linear_values(value)
+            query = self.linear_query(query)
+            key = shape(key)
+            value = shape(value)
+        elif type == "self":
+            # Keys and values come from the query (the newest input) and
+            # are appended to whatever earlier calls left in the cache.
+            query, key, value = (
+                self.linear_query(query),
+                self.linear_keys(query),
+                self.linear_values(query),
+            )
+            key = shape(key)
+            value = shape(value)
+            device = key.device
+            if layer_cache["self_keys"] is not None:
+                key = torch.cat((layer_cache["self_keys"].to(device), key), dim=2)
+            if layer_cache["self_values"] is not None:
+                value = torch.cat(
+                    (layer_cache["self_values"].to(device), value), dim=2
+                )
+            layer_cache["self_keys"] = key
+            layer_cache["self_values"] = value
+        elif type == "context":
+            # The memory is projected on the first call and read back from
+            # the cache afterwards.
+            query = self.linear_query(query)
+            if layer_cache["memory_keys"] is None:
+                key, value = self.linear_keys(key), self.linear_values(value)
+                key = shape(key)
+                value = shape(value)
+            else:
+                key = layer_cache["memory_keys"]
+                value = layer_cache["memory_values"]
+            layer_cache["memory_keys"] = key
+            layer_cache["memory_values"] = value
+
+        # Every branch leaves `query` 3-D; it is split into heads last.
+        query = shape(query)
+
+        # 2) Calculate and scale scores.
+        query = query / math.sqrt(dim_per_head)
+        scores = torch.matmul(query, key.transpose(2, 3))
+
+        if mask is not None:
+            # A very negative score makes the softmax weight vanish.
+            mask = mask.unsqueeze(1).expand_as(scores)
+            scores = scores.masked_fill(mask, -1e18)
+
+        # 3) Apply attention dropout and compute context vectors.
+        attn = self.softmax(scores)
+
+        if predefined_graph_1 is not None:
+            # Reweight the last head only, then renormalize over the keys.
+            attn_masked = attn[:, -1] * predefined_graph_1
+            attn_masked = attn_masked / (torch.sum(attn_masked, 2).unsqueeze(2) + 1e-9)
+            attn = torch.cat([attn[:, :-1], attn_masked.unsqueeze(1)], 1)
+
+        drop_attn = self.dropout(attn)
+        context = torch.matmul(drop_attn, value)
+        if not self.use_final_linear:
+            # Per-head contexts, still [batch, head_count, query_len, dim_per_head].
+            return context
+        # Merge the heads back together before the output projection.
+        return self.final_linear(unshape(context))
+
+
+class DecoderState(object):
+    """Base interface for the state a decoder carries between steps.
+
+    In the simplest case it just holds the hidden state of the model,
+    but it can also implement input feeding or non-recurrent models.
+
+    Modules need to implement this to utilize beam search decoding.
+    """
+
+    def detach(self):
+        """ Detach `hidden` and `input_feed` from the autograd graph. """
+        self.hidden = tuple(h.detach() for h in self.hidden)
+        self.input_feed = self.input_feed.detach()
+
+    def beam_update(self, idx, positions, beam_size):
+        """ Reorder beams in place: each tensor in `self._all` is viewed with
+        its second dimension split into `(beam_size, -1)`, and the slice at
+        `idx` is overwritten with its own beams selected by `positions`.
+        """
+        for tensor in self._all:
+            dims = tensor.size()
+            per_beam = dims[1] // beam_size
+            if len(dims) == 3:
+                view_shape = (dims[0], beam_size, per_beam, dims[2])
+            else:
+                view_shape = (dims[0], beam_size, per_beam, dims[2], dims[3])
+            sent_states = tensor.view(*view_shape)[:, :, idx]
+            sent_states.data.copy_(sent_states.data.index_select(1, positions))
+
+    def map_batch_fn(self, fn):
+        """ To be implemented by subclasses. """
+        raise NotImplementedError()
+
+
+class TransformerDecoderState(DecoderState):
+    """ State of the Transformer decoder: source, previous inputs and cache. """
+
+    def __init__(self, src):
+        """
+        Args:
+            src (FloatTensor): a sequence of source words tensors
+                    with optional feature tensors, of size (len x batch).
+        """
+        self.cache = None
+        self.previous_layer_inputs = None
+        self.previous_input = None
+        self.src = src
+
+    @property
+    def _all(self):
+        """
+        Tensors that `beam_update()` has to rearrange: always `src`, plus
+        the previous inputs once both of them have been recorded.
+        """
+        if self.previous_input is None or self.previous_layer_inputs is None:
+            return (self.src,)
+        return (self.previous_input, self.previous_layer_inputs, self.src)
+
+    def detach(self):
+        """ Detach the stored inputs and `src` from the autograd graph. """
+        for name in ("previous_input", "previous_layer_inputs"):
+            if getattr(self, name) is not None:
+                setattr(self, name, getattr(self, name).detach())
+        self.src = self.src.detach()
+
+    def update_state(self, new_input, previous_layer_inputs):
+        """ Return a new state for the same `src` holding the given inputs. """
+        updated = TransformerDecoderState(self.src)
+        updated.previous_input = new_input
+        updated.previous_layer_inputs = previous_layer_inputs
+        return updated
+
+    def _init_cache(self, memory_bank, num_layers):
+        """ Create an empty key/value cache for each decoder layer. """
+        slots = ("memory_keys", "memory_values", "self_keys", "self_values")
+        self.cache = {
+            "layer_{}".format(index): dict.fromkeys(slots)
+            for index in range(num_layers)
+        }
+
+    def repeat_beam_size_times(self, beam_size):
+        """ Tile `src` `beam_size` times along its second dimension. """
+        self.src = self.src.data.repeat(1, beam_size, 1)
+
+    def map_batch_fn(self, fn):
+        """ Apply `fn(tensor, 0)` to `src` and to each non-None cache tensor. """
+        def _apply(mapping, batch_dim=0):
+            for key, value in mapping.items():
+                if isinstance(value, dict):
+                    _apply(value)
+                elif value is not None:
+                    mapping[key] = fn(value, batch_dim)
+
+        self.src = fn(self.src, 0)
+        if self.cache is not None:
+            _apply(self.cache)
+
+
+def gelu(x):
+    """ Tanh approximation of the Gaussian Error Linear Unit. """
+    cubic = x + 0.044715 * torch.pow(x, 3)
+    inner = math.sqrt(2 / math.pi) * cubic
+    gate = 1 + torch.tanh(inner)
+    return 0.5 * x * gate
+
+
+class PositionwiseFeedForward(nn.Module):
+    """ Pre-norm two-layer feed-forward block with a residual connection.
+
+    Args:
+        d_model (int): size of the block's input and output.
+        d_ff (int): size of the hidden layer between the two
+            linear projections.
+        dropout (float): dropout probability in :math:`[0, 1)`.
+    """
+
+    def __init__(self, d_model, d_ff, dropout=0.1):
+        super(PositionwiseFeedForward, self).__init__()
+        self.w_1 = nn.Linear(d_model, d_ff)
+        self.w_2 = nn.Linear(d_ff, d_model)
+        self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
+        self.actv = gelu
+        self.dropout_1 = nn.Dropout(dropout)
+        self.dropout_2 = nn.Dropout(dropout)
+
+    def forward(self, x):
+        hidden = self.w_1(self.layer_norm(x))
+        hidden = self.dropout_1(self.actv(hidden))
+        return self.dropout_2(self.w_2(hidden)) + x
+
+
+#
+# TRANSLATOR
+# The following code is used to generate summaries using the
+# pre-trained weights and beam search.
+#
+
+
+def build_predictor(args, tokenizer, symbols, model, logger=None):
+    """ Create a `Translator` scored by a "wu" length-penalty `GNMTGlobalScorer`. """
+    # we should be able to refactor the global scorer a lot
+    wu_scorer = GNMTGlobalScorer(args.alpha, length_penalty="wu")
+    return Translator(
+        args, model, tokenizer, symbols, global_scorer=wu_scorer, logger=logger
+    )
+
+
+class GNMTGlobalScorer(object):
+    """
+    NMT re-ranking score from
+    "Google's Neural Machine Translation System" :cite:`wu2016google`
+
+    Args:
+       alpha (float): length parameter
+       length_penalty (str): name of the length penalty to apply
+    """
+
+    def __init__(self, alpha, length_penalty):
+        # Resolve the penalty name to its scoring function once, up front.
+        self.alpha = alpha
+        self.length_penalty = PenaltyBuilder(length_penalty).length_penalty()
+
+    def score(self, beam, logprobs):
+        """
+        Rescore `logprobs` for `beam` with the configured length penalty,
+        using `self.alpha` as its strength.
+        """
+        return self.length_penalty(beam, logprobs, self.alpha)
+
+
+class PenaltyBuilder(object):
+    """
+    Selects the length penalty function used to rescore beam search
+    hypotheses.
+
+    Args:
+        length_pen (str): name of the length penalty: "wu", "avg",
+            or anything else for no penalty
+    """
+
+    def __init__(self, length_pen):
+        self.length_pen = length_pen
+
+    def length_penalty(self):
+        """ Return the bound penalty method matching `self.length_pen`. """
+        if self.length_pen == "wu":
+            return self.length_wu
+        if self.length_pen == "avg":
+            return self.length_average
+        return self.length_none
+
+    # Below are all the different penalty terms implemented so far; each
+    # takes the beam, its log-probabilities and the penalty strength.
+
+    def length_wu(self, beam, logprobs, alpha=0.0):
+        """
+        NMT length re-ranking score from
+        "Google's Neural Machine Translation System" :cite:`wu2016google`.
+        """
+        length = len(beam.next_ys)
+        modifier = ((5 + length) ** alpha) / ((5 + 1) ** alpha)
+        return logprobs / modifier
+
+    def length_average(self, beam, logprobs, alpha=0.0):
+        """
+        Divides the log-probabilities by the number of entries in `beam.next_ys`.
+        """
+        return logprobs / len(beam.next_ys)
+
+    def length_none(self, beam, logprobs, alpha=0.0, beta=0.0):
+        """
+        Returns the scores unchanged; `alpha` and `beta` are ignored.
+        """
+        return logprobs
+
+
+class Translator(object):
+    """
+    Uses a model to translate a batch of sentences.
+
+    Args:
+       args: options namespace; `visible_gpus`, `beam_size`, `min_length`,
+          `max_length`, `block_trigram` and `temp_dir` are read
+       model: model exposing `bert`, `decoder` and `generator`
+       vocab: tokenizer; `ids_to_tokens` and
+          `convert_ids_to_tokens` are used to decode predictions
+       symbols (dict): token ids stored under "BOS" and "EOS"
+       global_scorer (:obj:`GNMTGlobalScorer`):
+         object whose `alpha` sets the length penalty
+       logger(logging.Logger): logger.
+
+    Beam search runs in `_fast_translate_batch`; `from_batch` turns
+    its output into (prediction, gold summary, source) strings.
+    """
+
+    def __init__(self, args, model, vocab, symbols, global_scorer=None, logger=None):
+        self.logger = logger
+        self.cuda = args.visible_gpus != "-1"
+
+        self.args = args
+        self.model = model
+        self.generator = self.model.generator
+        self.vocab = vocab
+        self.symbols = symbols
+        self.start_token = symbols["BOS"]
+        self.end_token = symbols["EOS"]
+
+        self.global_scorer = global_scorer
+        self.beam_size = args.beam_size
+        self.min_length = args.min_length
+        self.max_length = args.max_length
+
+    def translate(self, batch, step, attn_debug=False):
+        """ Generates summaries from one batch of data.
+        `step` and `attn_debug` are accepted but not used. """
+        self.model.eval()
+        with torch.no_grad():
+            batch_data = self.translate_batch(batch)
+            translations = self.from_batch(batch_data)
+        return translations
+
+    def translate_batch(self, batch, fast=False):
+        """
+        Translate a batch of sentences.
+
+        Thin wrapper that runs `_fast_translate_batch` with the
+        configured minimum and maximum lengths, without gradients.
+
+        Args:
+           batch (:obj:`Batch`): a batch from a dataset object
+           fast (bool): accepted but currently ignored
+
+        Returns:
+           dict: the results built by `_fast_translate_batch`
+        """
+        with torch.no_grad():
+            return self._fast_translate_batch(
+                batch, self.max_length, min_length=self.min_length
+            )
+
+    # Where the beam search lives
+    # (`translate_batch` above is only a thin wrapper around it)
+    def _fast_translate_batch(self, batch, max_length, min_length=0):
+        """ Beam Search using the encoder inputs contained in `batch`.
+        """
+
+        # `batch` carries its own `batch_size` field instead of it being
+        # inferred from the tensor shapes; `src`, `segs` and `mask_src` are
+        # read from it as well (see the `Batch` namedtuple defined in
+        # run_summarization.py).
+        beam_size = self.beam_size
+        batch_size = batch.batch_size
+        src = batch.src
+        segs = batch.segs
+        mask_src = batch.mask_src
+
+        src_features = self.model.bert(src, segs, mask_src)
+        dec_states = self.model.decoder.init_decoder_state(
+            src, src_features, with_cache=True
+        )
+        device = src_features.device
+
+        # Tile states and memory beam_size times.
+        dec_states.map_batch_fn(lambda state, dim: tile(state, beam_size, dim=dim))
+        src_features = tile(src_features, beam_size, dim=0)
+        batch_offset = torch.arange(batch_size, dtype=torch.long, device=device)
+        beam_offset = torch.arange(
+            0, batch_size * beam_size, step=beam_size, dtype=torch.long, device=device
+        )
+        alive_seq = torch.full(
+            [batch_size * beam_size, 1], self.start_token, dtype=torch.long, device=device
+        )
+
+        # Give full probability to the first beam on the first step.
+        topk_log_probs = torch.tensor(
+            [0.0] + [float("-inf")] * (beam_size - 1), device=device
+        ).repeat(batch_size)
+
+        # Structure that holds finished hypotheses.
+        hypotheses = [[] for _ in range(batch_size)]  # noqa: F812
+
+        results = {}
+        results["predictions"] = [[] for _ in range(batch_size)]  # noqa: F812
+        results["scores"] = [[] for _ in range(batch_size)]  # noqa: F812
+        results["gold_score"] = [0] * batch_size
+        results["batch"] = batch
+
+        for step in range(max_length):
+            decoder_input = alive_seq[:, -1].view(1, -1)
+
+            # Decoder forward.
+            decoder_input = decoder_input.transpose(0, 1)
+
+            dec_out, dec_states = self.model.decoder(
+                decoder_input, src_features, dec_states, step=step
+            )
+
+            # Generator forward.
+            log_probs = self.generator.forward(dec_out.transpose(0, 1).squeeze(0))
+            vocab_size = log_probs.size(-1)
+
+            if step < min_length:
+                log_probs[:, self.end_token] = -1e20  # EOS cannot win before min_length
+
+            # Multiply probs by the beam probability (a sum in log space).
+            log_probs += topk_log_probs.view(-1).unsqueeze(1)
+
+            alpha = self.global_scorer.alpha
+            length_penalty = ((5.0 + (step + 1)) / 6.0) ** alpha
+
+            # Flatten probs into a list of possibilities.
+            curr_scores = log_probs / length_penalty
+
+            if self.args.block_trigram:
+                cur_len = alive_seq.size(1)
+                if cur_len > 3:
+                    for i in range(alive_seq.size(0)):
+                        fail = False
+                        words = [int(w) for w in alive_seq[i]]
+                        words = [self.vocab.ids_to_tokens[w] for w in words]
+                        words = " ".join(words).replace(" ##", "").split()
+                        if len(words) <= 3:
+                            continue
+                        trigrams = [
+                            (words[k - 1], words[k], words[k + 1])
+                            for k in range(1, len(words) - 1)
+                        ]
+                        trigram = tuple(trigrams[-1])
+                        if trigram in trigrams[:-1]:
+                            fail = True
+                        if fail:
+                            curr_scores[i] = -10e20  # rule out beams repeating a trigram
+
+            curr_scores = curr_scores.reshape(-1, beam_size * vocab_size)
+            topk_scores, topk_ids = curr_scores.topk(beam_size, dim=-1)
+
+            # Recover log probs.
+            topk_log_probs = topk_scores * length_penalty
+
+            # Resolve beam origin and true word ids.
+            topk_beam_index = topk_ids // vocab_size  # `div` yields floats on recent PyTorch
+            topk_ids = topk_ids.fmod(vocab_size)
+
+            # Map beam_index to batch_index in the flat representation.
+            batch_index = topk_beam_index + beam_offset[
+                : topk_beam_index.size(0)
+            ].unsqueeze(1)
+            select_indices = batch_index.view(-1)
+
+            # Append last prediction.
+            alive_seq = torch.cat(
+                [alive_seq.index_select(0, select_indices), topk_ids.view(-1, 1)], -1
+            )
+
+            is_finished = topk_ids.eq(self.end_token)
+            if step + 1 == max_length:
+                is_finished.fill_(1)
+            # End condition is top beam is finished.
+            end_condition = is_finished[:, 0].eq(1)
+            # Save finished hypotheses.
+            if is_finished.any():
+                predictions = alive_seq.view(-1, beam_size, alive_seq.size(-1))
+                for i in range(is_finished.size(0)):
+                    b = batch_offset[i]
+                    if end_condition[i]:
+                        is_finished[i].fill_(1)
+                    finished_hyp = is_finished[i].nonzero().view(-1)
+                    # Store finished hypotheses for this batch.
+                    for j in finished_hyp:
+                        hypotheses[b].append((topk_scores[i, j], predictions[i, j, 1:]))
+                    # If the batch reached the end, save the n_best hypotheses.
+                    if end_condition[i]:
+                        best_hyp = sorted(hypotheses[b], key=lambda x: x[0], reverse=True)
+                        score, pred = best_hyp[0]
+
+                        results["scores"][b].append(score)
+                        results["predictions"][b].append(pred)
+                non_finished = end_condition.eq(0).nonzero().view(-1)
+                # If all sentences are translated, no need to go further.
+                if len(non_finished) == 0:
+                    break
+                # Remove finished batches for the next step.
+                topk_log_probs = topk_log_probs.index_select(0, non_finished)
+                batch_index = batch_index.index_select(0, non_finished)
+                batch_offset = batch_offset.index_select(0, non_finished)
+                alive_seq = predictions.index_select(0, non_finished).view(
+                    -1, alive_seq.size(-1)
+                )
+            # Reorder states.
+            select_indices = batch_index.view(-1)
+            src_features = src_features.index_select(0, select_indices)
+            dec_states.map_batch_fn(
+                lambda state, dim: state.index_select(dim, select_indices)
+            )
+
+        return results
+
+    def from_batch(self, translation_batch):
+        """ Decode the top prediction of every example in the batch.
+
+        Returns a list of (prediction, gold summary, source text) string
+        triples; the source is cut to its first 500 tokens.
+        """
+        batch = translation_batch["batch"]
+        assert len(translation_batch["gold_score"]) == len(translation_batch["predictions"])
+        batch_size = batch.batch_size
+        preds = translation_batch["predictions"]
+        tgt_str = batch.tgt_str
+        src = batch.src
+
+        translations = []
+        for b in range(batch_size):
+            pred_sents = self.vocab.convert_ids_to_tokens([int(n) for n in preds[b][0]])
+            pred_sents = " ".join(pred_sents).replace(" ##", "")
+            gold_sent = " ".join(tgt_str[b].split())
+            raw_src = [self.vocab.ids_to_tokens[int(t)] for t in src[b]][:500]
+            raw_src = " ".join(raw_src)
+            translation = (pred_sents, gold_sent, raw_src)
+            translations.append(translation)
+
+        return translations
+
+    def _report_rouge(self, gold_path, can_path):
+        self.logger.info("Calculating Rouge")
+        results_dict = test_rouge(self.args.temp_dir, can_path, gold_path)
+        return results_dict
+
+
+def tile(x, count, dim=0):
+    """
+    Tiles x on dimension dim count times.
+
+    Each slice along `dim` is repeated `count` times in a row, e.g.
+    rows [a, b] tiled twice along dim 0 become [a, a, b, b].
+    """
+    swap = list(range(x.dim()))
+    if dim != 0:
+        # Bring `dim` to the front; the same swap later restores the order.
+        swap[0], swap[dim] = swap[dim], swap[0]
+        x = x.permute(swap).contiguous()
+
+    leading = x.size(0)
+    tiled_shape = [leading * count] + list(x.size())[1:]
+    flat = x.view(leading, -1).transpose(0, 1)
+    flat = flat.repeat(count, 1).transpose(0, 1)
+    x = flat.contiguous().view(*tiled_shape)
+
+    if dim != 0:
+        x = x.permute(swap).contiguous()
+    return x
+
+
+#
+# All things ROUGE. Uses `pyrouge` which is a hot mess.
+#
+
+
+def test_rouge(temp_dir, cand, ref):
+    """ Compute ROUGE with pyrouge for the line-aligned files `cand` and `ref`. """
+    with open(cand, encoding="utf-8") as cand_file:
+        candidates = [line.strip() for line in cand_file]
+    with open(ref, encoding="utf-8") as ref_file:
+        references = [line.strip() for line in ref_file]
+    print(len(candidates))
+    print(len(references))
+    assert len(candidates) == len(references)
+
+    current_time = time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())
+    tmp_dir = os.path.join(temp_dir, "rouge-tmp-{}".format(current_time))
+    if not os.path.isdir(tmp_dir):
+        os.mkdir(tmp_dir)
+        os.mkdir(tmp_dir + "/candidate")
+        os.mkdir(tmp_dir + "/reference")
+    try:
+        for i, (candidate, reference) in enumerate(zip(candidates, references)):
+            if not reference:  # pairs with an empty reference are not scored
+                continue
+            with open(
+                tmp_dir + "/candidate/cand.{}.txt".format(i), "w", encoding="utf-8"
+            ) as f:
+                f.write(candidate)
+            with open(
+                tmp_dir + "/reference/ref.{}.txt".format(i), "w", encoding="utf-8"
+            ) as f:
+                f.write(reference)
+        r = pyrouge.Rouge155(temp_dir=temp_dir)
+        r.model_dir = tmp_dir + "/reference/"
+        r.system_dir = tmp_dir + "/candidate/"
+        r.model_filename_pattern = "ref.#ID#.txt"
+        r.system_filename_pattern = r"cand.(\d+).txt"
+        rouge_results = r.convert_and_evaluate()
+        print(rouge_results)
+        results_dict = r.output_to_dict(rouge_results)
+    finally:  # remove the scratch directory whether or not scoring succeeded
+        if os.path.isdir(tmp_dir):
+            shutil.rmtree(tmp_dir)
+    return results_dict
+
+
+def rouge_results_to_str(results_dict):
+    """ Format the ROUGE-1/2/L F-scores and recalls, as percentages, on two lines. """
+    metrics = ("rouge_1", "rouge_2", "rouge_l")
+    f_scores = [results_dict[name + "_f_score"] * 100 for name in metrics]
+    recalls = [results_dict[name + "_recall"] * 100 for name in metrics]
+    template = (
+        ">> ROUGE-F(1/2/3/l): {:.2f}/{:.2f}/{:.2f}\nROUGE-R(1/2/3/l): {:.2f}/{:.2f}/{:.2f}\n"
+    )
+    return template.format(*(f_scores + recalls))
+
+
+class BertSumOptimizer(object):
+    """ Specific optimizer for BertSum.
+
+    As described in [1], the authors fine-tune BertSum for abstractive
+    summarization using two Adam Optimizers with different warm-up steps and
+    learning rate. They also use a custom learning rate scheduler.
+
+    [1] Liu, Yang, and Mirella Lapata. "Text summarization with pretrained encoders."
+        arXiv preprint arXiv:1908.08345 (2019).
+    """
+
+    def __init__(self, model, lr, warmup_steps, beta_1=0.99, beta_2=0.999, eps=1e-8):
+        self.encoder = model.encoder
+        self.decoder = model.decoder
+        self.lr = lr  # base learning rates, keyed by "encoder" / "decoder"
+        self.warmup_steps = warmup_steps  # warm-up steps, keyed the same way
+        self.optimizers = {
+            "encoder": torch.optim.Adam(
+                model.encoder.parameters(),
+                lr=lr["encoder"],
+                betas=(beta_1, beta_2),
+                eps=eps,
+            ),
+            "decoder": torch.optim.Adam(
+                model.decoder.parameters(),
+                lr=lr["decoder"],
+                betas=(beta_1, beta_2),
+                eps=eps,
+            ),
+        }
+        self._step = 0
+        self.current_learning_rates = {}
+
+    def _update_rate(self, stack):
+        """ Learning rate of `stack` at the current step: warm-up, then decay. """
+        warmup = self._step * self.warmup_steps[stack] ** (-1.5)
+        return self.lr[stack] * min(self._step ** (-0.5), warmup)
+
+    def zero_grad(self):
+        """ Clear the gradients held by both optimizers. """
+        for optimizer in self.optimizers.values():
+            optimizer.zero_grad()
+
+    def step(self):
+        """ Advance the step count, then step each optimizer at its new rate. """
+        self._step += 1
+        for stack, optimizer in self.optimizers.items():
+            new_rate = self._update_rate(stack)
+            for param_group in optimizer.param_groups:
+                param_group["lr"] = new_rate
+            optimizer.step()
+            self.current_learning_rates[stack] = new_rate
diff --git a/examples/summarization/run_summarization.py b/examples/summarization/run_summarization.py
new file mode 100644
index 0000000000..e3b974acd9
--- /dev/null
+++ b/examples/summarization/run_summarization.py
@@ -0,0 +1,271 @@
+import argparse
+from collections import namedtuple
+import logging
+import os
+import sys
+
+import torch
+from torch.utils.data import DataLoader, SequentialSampler
+from tqdm import tqdm
+
+from transformers import BertTokenizer
+
+from modeling_bertabs import BertAbs, build_predictor
+
+from utils_summarization import (
+    SummarizationDataset,
+    encode_for_summarization,
+    build_mask,
+    fit_to_block_size,
+    compute_token_type_ids,
+)
+
+logger = logging.getLogger(__name__)
+logging.basicConfig(stream=sys.stdout, level=logging.INFO)
+
+
+Batch = namedtuple(
+    "Batch", ["document_names", "batch_size", "src", "segs", "mask_src", "tgt_str"]
+)
+
+
+def evaluate(args):
+    """ Summarize every document yielded by the data iterator with a fine-tuned
+    BertAbs model and write the summaries in `args.summaries_output_dir`.
+    """
+    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)
+    checkpoint = "bertabs-finetuned-{}".format(args.finetuned_model)
+    bertabs = BertAbs.from_pretrained(checkpoint)
+    bertabs.to(args.device)
+    bertabs.eval()
+
+    # Vocabulary ids of the tokens used as BOS, EOS and padding symbols.
+    vocab = tokenizer.vocab
+    symbols = {"BOS": vocab["[unused0]"], "EOS": vocab["[unused1]"], "PAD": vocab["[PAD]"]}
+
+    # These (unused) arguments are only set to stay compatible with the
+    # legacy code; they will be deleted in a next iteration.
+    args.result_path = ""
+    args.temp_dir = ""
+
+    data_iterator = build_data_iterator(args, tokenizer)
+    predictor = build_predictor(args, tokenizer, symbols, bertabs)
+
+    # Log the configuration of the run before entering the generation loop.
+    logger.info("***** Running evaluation *****")
+    logger.info("  Number examples = %d", len(data_iterator.dataset))
+    logger.info("  Batch size = %d", args.batch_size)
+    logger.info("")
+    logger.info("***** Beam Search parameters *****")
+    logger.info("  Beam size = %d", args.beam_size)
+    logger.info("  Minimum length = %d", args.min_length)
+    logger.info("  Maximum length = %d", args.max_length)
+    logger.info("  Alpha (length penalty) = %.2f", args.alpha)
+    logger.info("  Trigrams %s be blocked", ("will" if args.block_trigram else "will NOT"))
+
+    for batch in tqdm(data_iterator):
+        translations = predictor.from_batch(predictor.translate_batch(batch))
+        summaries = [format_summary(translation) for translation in translations]
+        save_summaries(summaries, args.summaries_output_dir, batch.document_names)
+
+
+def format_summary(translation):
+    """ Transforms one output of the `from_batch` function into a nicely
+    formatted summary: special tokens are removed, runs of spaces are
+    collapsed and `[unused2]` separators become sentence breaks.
+    """
+    import re  # local import: `str.replace` has no regex support
+
+    raw_summary, _, _ = translation
+    for token in ("[unused0]", "[unused3]", "[PAD]", "[unused1]"):
+        raw_summary = raw_summary.replace(token, "")
+
+    # A real regex is needed here: `.replace(r" +", " ")` would only match
+    # the literal two-character string " +" and never collapse spaces.
+    summary = re.sub(r" +", " ", raw_summary)
+    summary = summary.replace(" [unused2] ", ". ").replace("[unused2]", "")
+
+    return summary.strip()
+
+
+def save_summaries(summaries, path, original_document_name):
+    """ Write each summary in a file named after the document it summarizes,
+    with `_summary` inserted before the file extension (or appended when the
+    name has no extension).
+
+    Attributes:
+        summaries: List[string]
+            The summaries that we produced.
+        path: string
+            Path of the folder where the summaries will be written.
+        original_document_name: List[string]
+            Names of the summarized documents, aligned with `summaries`.
+    """
+    for summary, document_name in zip(summaries, original_document_name):
+        # Split on the last "." only, so that "a.b.txt" becomes "a.b_summary.txt".
+        bare_document_name, dot, extension = document_name.rpartition(".")
+        if dot:
+            name = bare_document_name + "_summary." + extension
+        else:
+            name = document_name + "_summary"
+
+        # Summaries may contain non-ASCII text: do not rely on the platform default.
+        with open(os.path.join(path, name), "w", encoding="utf-8") as output:
+            output.write(summary)
+
+
+#
+# LOAD the dataset
+#
+
+
+def build_data_iterator(args, tokenizer):
+    """ Return a DataLoader that goes through the documents sequentially. """
+    def collate_fn(data):
+        return collate(data, tokenizer, block_size=512)
+
+    dataset = load_and_cache_examples(args, tokenizer)
+    return DataLoader(
+        dataset, sampler=SequentialSampler(dataset), batch_size=args.batch_size, collate_fn=collate_fn,
+    )
+
+
+def load_and_cache_examples(args, tokenizer):
+    # `tokenizer` is not used here; the documents are tokenized in `collate`.
+    return SummarizationDataset(args.documents_dir)
+
+
+def collate(data, tokenizer, block_size):
+    """ Turn a list of `(name, story, summary)` items into a `Batch`.
+
+    The data are tokenized here, batch after batch, to avoid keeping them all
+    in memory. Items whose story is empty are dropped. The output is a
+    namedtuple to fit the original BertAbs's API.
+    """
+    data = [item for item in data if len(item[1]) != 0]  # remove empty files
+    names = [item[0] for item in data]
+
+    # Encode every document first; only the encoded story is kept.
+    encoded_stories = []
+    for _, story, summary in data:
+        encoded_story, _ = encode_for_summarization(story, summary, tokenizer)
+        encoded_stories.append(encoded_story)
+
+    # Bring every story to `block_size` so they can be stacked in one tensor.
+    pad_token_id = tokenizer.pad_token_id
+    stories = torch.tensor(
+        [fit_to_block_size(story, block_size, pad_token_id) for story in encoded_stories]
+    )
+    encoder_token_type_ids = compute_token_type_ids(stories, tokenizer.cls_token_id)
+    encoder_mask = build_mask(stories, pad_token_id)
+
+    return Batch(
+        document_names=names,
+        batch_size=len(stories),
+        src=stories,
+        segs=encoder_token_type_ids,
+        mask_src=encoder_mask,
+        tgt_str=[""] * len(stories),
+    )
+
+
+def decode_summary(summary_tokens, tokenizer):
+    """ Decode the summary and return it as a list of sentences, each one
+    terminated by a period, in a format suitable for evaluation.
+    """
+    summary = tokenizer.decode(summary_tokens.to("cpu").numpy())
+    # Drop empty pieces: a text that ends with "." would otherwise produce
+    # a spurious "." as last sentence.
+    fragments = [fragment for fragment in summary.split(".") if fragment.strip()]
+    return [fragment + "." for fragment in fragments]
+
+
+def main():
+    """ The main function defines the interface with the users: it parses the
+    command line, checks the folders and runs the evaluation.
+    """
+
+    def str_to_bool(value):
+        # `type=bool` would turn any non-empty string, even "False", into True.
+        if value.lower() in ("true", "t", "yes", "y", "1"):
+            return True
+        if value.lower() in ("false", "f", "no", "n", "0"):
+            return False
+        raise argparse.ArgumentTypeError("Expected a boolean, got %r." % value)
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--documents_dir",
+        type=str,
+        required=True,
+        help="The folder where the documents to summarize are located.",
+    )
+    parser.add_argument(
+        "--summaries_output_dir",
+        type=str,
+        required=True,
+        help="The folder in which the summaries should be written.",
+    )
+    # `evaluate` reads `args.finetuned_model` to build the checkpoint name.
+    parser.add_argument(
+        "--finetuned_model",
+        type=str,
+        required=True,
+        help="Suffix of the `bertabs-finetuned-<suffix>` checkpoint to load.",
+    )
+    # EVALUATION options
+    parser.add_argument(
+        "--visible_gpus", default=-1, type=int, help="Number of GPUs with which to do the training.",
+    )
+    parser.add_argument(
+        "--batch_size", default=4, type=int, help="Batch size per GPU/CPU for training.",
+    )
+    # BEAM SEARCH arguments
+    parser.add_argument(
+        "--min_length", default=50, type=int, help="Minimum number of tokens for the summaries.",
+    )
+    parser.add_argument(
+        "--max_length", default=200, type=int, help="Maximum number of tokens for the summaries.",
+    )
+    parser.add_argument(
+        "--beam_size", default=5, type=int, help="The number of beams to start with for each example.",
+    )
+    parser.add_argument(
+        "--alpha", default=0.95, type=float, help="The value of alpha for the length penalty in the beam search.",
+    )
+    parser.add_argument(
+        "--block_trigram",
+        default=True,
+        type=str_to_bool,
+        help="Whether to block the existence of repeating trigrams in the text generated by beam search.",
+    )
+    args = parser.parse_args()
+    args.device = torch.device("cpu") if args.visible_gpus == -1 else torch.device("cuda")
+
+    if not documents_dir_is_valid(args.documents_dir):
+        raise FileNotFoundError(
+            "We could not find the directory you specified for the documents to summarize, or it was empty. Please specify a valid path."
+        )
+    maybe_create_output_dir(args.summaries_output_dir)
+
+    evaluate(args)
+
+
+def documents_dir_is_valid(path):
+    """ Return True if `path` is an existing directory with at least one entry.
+
+    `os.path.isdir` is used rather than `os.path.exists` so that a path to a
+    regular file is reported as invalid instead of making `os.listdir` raise.
+    """
+    if not os.path.isdir(path):
+        return False
+    return len(os.listdir(path)) > 0
+
+
+def maybe_create_output_dir(path):
+    # `exist_ok` avoids the race between checking for the folder and creating it.
+    os.makedirs(path, exist_ok=True)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/utils_summarization.py b/examples/summarization/utils_summarization.py
similarity index 77%
rename from examples/utils_summarization.py
rename to examples/summarization/utils_summarization.py
index 8e95a04e19..e7401b1754 100644
--- a/examples/utils_summarization.py
+++ b/examples/summarization/utils_summarization.py
@@ -10,9 +10,14 @@ from torch.utils.data import Dataset
 # ------------
 
 
-class CNNDailyMailDataset(Dataset):
+class SummarizationDataset(Dataset):
     """ Abstracts the dataset used to train seq2seq models.
 
+    The class will process the documents that are located in the specified
+    folder. The preprocessing will work on any document that is reasonably
+    formatted. On the CNN/DailyMail dataset it will extract both the story
+    and the summary.
+
     CNN/Daily News:
 
     The CNN/Daily News raw datasets are downloaded from [1]. The stories are
@@ -25,32 +30,31 @@ class CNNDailyMailDataset(Dataset):
     [2] https://github.com/abisee/cnn-dailymail/
     """
 
-    def __init__(self, data_dir="", prefix="train"):
-        assert os.path.isdir(data_dir)
+    def __init__(self, path="", prefix="train"):
+        """ We initialize the class by listing all the documents to summarize.
+        Files are not read in memory due to the size of some datasets (like CNN/DailyMail).
+        """
+        assert os.path.isdir(path)
 
-        # We initialize the class by listing all the files that contain
-        # stories and summaries. Files are not read in memory given
-        # the size of the corpus.
-        self.stories_path = []
-        datasets = ("cnn", "dailymail")
-        for dataset in datasets:
-            path_to_stories = os.path.join(data_dir, dataset, "stories")
-            story_filenames_list = os.listdir(path_to_stories)
-            for story_filename in story_filenames_list:
-                path_to_story = os.path.join(path_to_stories, story_filename)
-                if not os.path.isfile(path_to_story):
-                    continue
-                self.stories_path.append(path_to_story)
+        self.documents = []
+        story_filenames_list = os.listdir(path)
+        for story_filename in story_filenames_list:
+            path_to_story = os.path.join(path, story_filename)
+            if not os.path.isfile(path_to_story):
+                continue
+            self.documents.append(path_to_story)
 
     def __len__(self):
-        return len(self.stories_path)
+        """ Returns the number of documents. """
+        return len(self.documents)
 
     def __getitem__(self, idx):
-        story_path = self.stories_path[idx]
-        with open(story_path, encoding="utf-8") as source:
+        document_path = self.documents[idx]
+        document_name = document_path.split("/")[-1]
+        with open(document_path, encoding="utf-8") as source:
             raw_story = source.read()
             story_lines, summary_lines = process_story(raw_story)
-        return story_lines, summary_lines
+        return document_name, story_lines, summary_lines
 
 
 def process_story(raw_story):
@@ -80,7 +84,7 @@ def process_story(raw_story):
             story_lines.append(element)
         except IndexError:
             # if "@highlight" is absent from the file we pop
-            # all elements until there is None.
+            # all elements until there is None, raising an exception.
             return story_lines, []
 
     # gather summary lines
@@ -114,14 +118,6 @@ def fit_to_block_size(sequence, block_size, pad_token_id):
         return sequence
 
 
-def build_lm_labels(sequence, pad_token_id):
-    """ Padding token are replaced by the value -1 so they
-    are not taken into account in the loss computation. """
-    padded = sequence.clone()
-    padded[padded == pad_token_id] = -1
-    return padded
-
-
 def build_mask(sequence, pad_token_id):
     """ Builds the mask. The attention mechanism will only attend to positions
     with value 1. """
@@ -165,7 +161,7 @@ def compute_token_type_ids(batch, separator_token_id):
     """
     batch_embeddings = []
     for sequence in batch:
-        sentence_num = 0
+        sentence_num = -1
         embeddings = []
         for s in sequence:
             if s == separator_token_id:
diff --git a/examples/utils_summarization_test.py b/examples/summarization/utils_summarization_test.py
similarity index 88%
rename from examples/utils_summarization_test.py
rename to examples/summarization/utils_summarization_test.py
index 1d56ff0803..8bfbf6ab23 100644
--- a/examples/utils_summarization_test.py
+++ b/examples/summarization/utils_summarization_test.py
@@ -21,7 +21,6 @@ from utils_summarization import (
     compute_token_type_ids,
     fit_to_block_size,
     build_mask,
-    build_lm_labels,
     process_story,
 )
 
@@ -88,20 +87,6 @@ class SummarizationDataProcessingTest(unittest.TestCase):
         expected_summary_lines = ["It was the best of times."]
         self.assertEqual(expected_summary_lines, summary_lines)
 
-    def test_build_lm_labels_no_padding(self):
-        sequence = torch.tensor([1, 2, 3, 4])
-        expected = sequence
-        np.testing.assert_array_equal(
-            build_lm_labels(sequence, 0).numpy(), expected.numpy()
-        )
-
-    def test_build_lm_labels(self):
-        sequence = torch.tensor([1, 2, 3, 4, 0, 0, 0])
-        expected = torch.tensor([1, 2, 3, 4, -1, -1, -1])
-        np.testing.assert_array_equal(
-            build_lm_labels(sequence, 0).numpy(), expected.numpy()
-        )
-
     def test_build_mask_no_padding(self):
         sequence = torch.tensor([1, 2, 3, 4])
         expected = torch.tensor([1, 1, 1, 1])
@@ -125,7 +110,7 @@ class SummarizationDataProcessingTest(unittest.TestCase):
             [[1, 2, 3, 4, 5, 6], [1, 2, 3, 101, 5, 6], [1, 101, 3, 4, 101, 6]]
         )
         expected = torch.tensor(
-            [[0, 0, 0, 0, 0, 0], [0, 0, 0, 1, 1, 1], [0, 1, 1, 1, 0, 0]]
+            [[1, 1, 1, 1, 1, 1], [1, 1, 1, 0, 0, 0], [1, 0, 0, 0, 1, 1]]
         )
 
         result = compute_token_type_ids(batch, separator)
diff --git a/transformers/convert_bertextabs_original_pytorch_checkpoint_to_pytorch.py b/transformers/convert_bertextabs_original_pytorch_checkpoint_to_pytorch.py
new file mode 100644
index 0000000000..4f158966e1
--- /dev/null
+++ b/transformers/convert_bertextabs_original_pytorch_checkpoint_to_pytorch.py
@@ -0,0 +1,158 @@
+# coding=utf-8
+# Copyright 2018 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Convert BertExtAbs's checkpoints """
+
+import argparse
+from collections import namedtuple
+import logging
+
+import torch
+
+from models.model_builder import AbsSummarizer  # The authors' implementation
+
+from transformers import BertConfig, Model2Model, BertModel, BertForMaskedLM
+
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+BertExtAbsConfig = namedtuple(
+    "BertExtAbsConfig",
+    ["temp_dir", "large", "finetune_bert", "encoder", "share_emb", "max_pos", "enc_layers", "enc_hidden_size", "enc_heads", "enc_ff_size", "enc_dropout", "dec_layers", "dec_hidden_size", "dec_heads", "dec_ff_size", "dec_dropout"],
+)
+
+
+def convert_bertextabs_checkpoints(path_to_checkpoints, dump_path):
+    """ Load the authors' BertExtAbs checkpoint and copy its weights into a
+    `Model2Model` whose decoder is a `BertForMaskedLM`.
+    """
+    # NOTE(review): `dump_path` is not used below and nothing is saved yet; confirm this is intended.
+    # Load checkpoints in memory
+    checkpoints = torch.load(path_to_checkpoints, lambda storage, loc: storage)
+
+    # Instantiate the authors' model with the pre-trained weights
+    config = BertExtAbsConfig(
+        temp_dir=".",
+        finetune_bert=False,
+        large=False,
+        share_emb=True,
+        encoder="bert",
+        max_pos=512,
+        enc_layers=6,
+        enc_hidden_size=512,
+        enc_heads=8,
+        enc_ff_size=512,
+        enc_dropout=0.2,
+        dec_layers=6,
+        dec_hidden_size=768,
+        dec_heads=8,
+        dec_ff_size=2048,
+        dec_dropout=0.2,
+    )
+    bertextabs = AbsSummarizer(config, torch.device("cpu"), checkpoints)
+    bertextabs.eval()
+
+    # Instantiate our version of the model
+    decoder_config = BertConfig(
+        hidden_size=config.dec_hidden_size,
+        num_hidden_layers=config.dec_layers,
+        num_attention_heads=config.dec_heads,
+        intermediate_size=config.dec_ff_size,
+        hidden_dropout_prob=config.dec_dropout,
+        attention_probs_dropout_prob=config.dec_dropout,
+        is_decoder=True,
+    )
+
+    decoder_model = BertForMaskedLM(decoder_config)
+    model = Model2Model.from_pretrained('bert-base-uncased', decoder_model=decoder_model)
+    model.eval()
+
+    # Let us now start the weight copying process
+    model.encoder.load_state_dict(bertextabs.bert.model.state_dict())
+
+    # Decoder (only the `weight` tensors are assigned below; biases are left untouched)
+
+    # Embeddings. The positional embeddings are equal to the word embedding plus a modulation
+    # that is computed at each forward pass. This may be a source of discrepancy.
+    model.decoder.bert.embeddings.word_embeddings.weight = bertextabs.decoder.embeddings.weight
+    model.decoder.bert.embeddings.position_embeddings.weight = bertextabs.decoder.embeddings.weight
+    model.decoder.bert.embeddings.token_type_embeddings.weight.data = torch.zeros_like(bertextabs.decoder.embeddings.weight)  # not defined for BertExtAbs decoder
+
+    # In the original code the LayerNorms are applied twice in the layers, at the beginning and between the
+    # attention layers.
+    model.decoder.bert.embeddings.LayerNorm.weight = bertextabs.decoder.transformer_layers[0].layer_norm_1.weight
+
+    for i in range(config.dec_layers):
+
+        # self attention
+        model.decoder.bert.encoder.layer[i].attention.self.query.weight = bertextabs.decoder.transformer_layers[i].self_attn.linear_query.weight
+        model.decoder.bert.encoder.layer[i].attention.self.key.weight = bertextabs.decoder.transformer_layers[i].self_attn.linear_keys.weight
+        model.decoder.bert.encoder.layer[i].attention.self.value.weight = bertextabs.decoder.transformer_layers[i].self_attn.linear_values.weight
+        model.decoder.bert.encoder.layer[i].attention.output.dense.weight = bertextabs.decoder.transformer_layers[i].self_attn.final_linear.weight
+        model.decoder.bert.encoder.layer[i].attention.output.LayerNorm.weight = bertextabs.decoder.transformer_layers[i].layer_norm_2.weight
+
+        # attention over the encoder's output (the authors' `context_attn`)
+        model.decoder.bert.encoder.layer[i].crossattention.self.query.weight = bertextabs.decoder.transformer_layers[i].context_attn.linear_query.weight
+        model.decoder.bert.encoder.layer[i].crossattention.self.key.weight = bertextabs.decoder.transformer_layers[i].context_attn.linear_keys.weight
+        model.decoder.bert.encoder.layer[i].crossattention.self.value.weight = bertextabs.decoder.transformer_layers[i].context_attn.linear_values.weight
+        model.decoder.bert.encoder.layer[i].crossattention.output.dense.weight = bertextabs.decoder.transformer_layers[i].context_attn.final_linear.weight
+        model.decoder.bert.encoder.layer[i].crossattention.output.LayerNorm.weight = bertextabs.decoder.transformer_layers[i].feed_forward.layer_norm.weight
+
+        # intermediate
+        model.decoder.bert.encoder.layer[i].intermediate.dense.weight = bertextabs.decoder.transformer_layers[i].feed_forward.w_1.weight
+
+        # output
+        model.decoder.bert.encoder.layer[i].output.dense.weight = bertextabs.decoder.transformer_layers[i].feed_forward.w_2.weight
+
+        # The output LayerNorm takes the first LayerNorm of the next layer or, for the last layer, the decoder's final one.
+        try:
+            model.decoder.bert.encoder.layer[i].output.LayerNorm.weight = bertextabs.decoder.transformer_layers[i + 1].layer_norm_1.weight
+        except IndexError:
+            model.decoder.bert.encoder.layer[i].output.LayerNorm.weight = bertextabs.decoder.layer_norm.weight
+
+    # LM Head (not converted: the string below is a no-op that only lists the weights left to handle)
+    """
+    model.decoder.cls.predictions.transform.dense.weight
+    model.decoder.cls.predictions.transform.dense.biais
+    model.decoder.cls.predictions.transform.LayerNorm.weight
+    model.decoder.cls.predictions.transform.LayerNorm.biais
+    model.decoder.cls.predictions.decoder.weight
+    model.decoder.cls.predictions.decoder.biais
+    model.decoder.cls.predictions.biais.data
+    """
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--bertextabs_checkpoint_path",
+        default=None,
+        type=str,
+        required=True,
+        help="Path the official PyTorch dump.",
+    )
+    parser.add_argument(
+        "--pytorch_dump_folder_path",
+        default=None,
+        type=str,
+        required=True,
+        help="Path to the output PyTorch model.",
+    )
+    args = parser.parse_args()
+
+    convert_bertextabs_checkpoints(
+        args.bertextabs_checkpoint_path,
+        args.pytorch_dump_folder_path,
+    )
diff --git a/transformers/generate/beam_search.py b/transformers/generate/beam_search.py
index abe3186049..b56ebbabb8 100644
--- a/transformers/generate/beam_search.py
+++ b/transformers/generate/beam_search.py
@@ -25,7 +25,6 @@ Use Beam Search to generate sequences using encoder-decoder models.
 """
 import torch
 from torch import nn
-
 import logging
 
 
@@ -45,6 +44,7 @@ class BeamSearch(object):
         max_length,
         alpha=0,
         block_repeating_trigrams=True,
+        device=torch.device("cpu"),
     ):
         r"""
         Inputs:
@@ -156,18 +156,24 @@ class BeamSearch(object):
         kwargs_decoder["encoder_hidden_states"] = tile(
             encoder_hidden_states, self.beam_size, dim=0
         )
-        kwargs_decoder["encoder_attention_mask"] = tile(
-            kwargs_encoder["attention_mask"], self.beam_size, dim=0
+        try:
+            kwargs_decoder["encoder_attention_mask"] = tile(
+                kwargs_encoder["attention_mask"], self.beam_size, dim=0
+            )
+        except:
+            pass
+        kwargs_decoder["state"].src = tile(
+            kwargs_decoder["state"].src, self.beam_size, dim=0
         )
 
         # grow the beam iteratively
         batch_size, block_size = encoder_input_ids.size()
         self._init_beam_state(batch_size)
         for step in range(self.max_length):
-
             decoder_input = fit_to_block_size(self.growing_beams, block_size, self.pad_token_id)
             kwargs_decoder["attention_mask"] = build_mask(decoder_input, self.pad_token_id)
-            outputs = self.model.decoder(decoder_input, **kwargs_decoder)
+
+            outputs, state = self.model.decoder(decoder_input, **kwargs_decoder)
 
             next_token_scores = outputs[0][:, -1, :].squeeze(1)
             log_probabilities = torch.nn.functional.log_softmax(next_token_scores, dim=0)
@@ -178,9 +184,13 @@ class BeamSearch(object):
             kwargs_decoder["encoder_hidden_states"] = kwargs_decoder[
                 "encoder_hidden_states"
             ].index_select(0, surviving_beams_rows)
-            kwargs_decoder["encoder_attention_mask"] = kwargs_decoder[
-                "encoder_attention_mask"
-            ].index_select(0, surviving_beams_rows)
+            try:
+                kwargs_decoder["encoder_attention_mask"] = kwargs_decoder[
+                    "encoder_attention_mask"
+                ].index_select(0, surviving_beams_rows)
+            except:
+                pass
+            kwargs_decoder["state"] = state
 
         return self.results
 

From c0443df5939d980abbe5bb28b31f08d1628469c6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remilouf@gmail.com>
Date: Thu, 5 Dec 2019 18:13:41 +0100
Subject: [PATCH 55/91] remove beam search

---
 transformers/generate/beam_search.py    | 376 ------------------------
 transformers/tests/beam_search_tests.py | 243 ---------------
 2 files changed, 619 deletions(-)
 delete mode 100644 transformers/generate/beam_search.py
 delete mode 100644 transformers/tests/beam_search_tests.py

diff --git a/transformers/generate/beam_search.py b/transformers/generate/beam_search.py
deleted file mode 100644
index b56ebbabb8..0000000000
--- a/transformers/generate/beam_search.py
+++ /dev/null
@@ -1,376 +0,0 @@
-# coding=utf-8
-# MIT License
-
-# Copyright (c) 2017-Present OpenNMT
-
-# Permission is hereby granted, free of charge, to any person obtaining a copy of
-# this software and associated documentation files (the "Software"), to deal in
-# the Software without restriction, including without limitation the rights to
-# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
-# of the Software, and to permit persons to whom the Software is furnished to do
-# so, subject to the following conditions:
-
-# The above copyright notice and this permission notice shall be included in all
-# copies or substantial portions of the Software.
-
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-"""
-Use Beam Search to generate sequences using encoder-decoder models.
-"""
-import torch
-from torch import nn
-import logging
-
-
-logger = logging.getLogger(__name__)
-
-
-class BeamSearch(object):
-    def __init__(
-        self,
-        model,
-        bos_token_id,
-        pad_token_id,
-        eos_token_id,
-        batch_size,
-        beam_size,
-        min_length,
-        max_length,
-        alpha=0,
-        block_repeating_trigrams=True,
-        device=torch.device("cpu"),
-    ):
-        r"""
-        Inputs:
-            **model**: instance of ``transformers.PreTrainedEncoderDecoder``
-                The pretrained encoder-decoder model that will be used to generate the sequences.
-            **bos_token_id**: int
-                Id that is used by the tokenizer to represent the beggining of a sentence.
-            **pad_token_id**: int
-                Id that is used by the tokenizer for padding.
-            **eos_token_id**: int
-                Id that is used by the tokenizer to represent the end of a sentence.
-            **batch_size**: (`optional`) int
-                Batch size of the inputs. The value is set automatically when calling `forward`.
-            **beam_size**: int
-                Number of beams that are used for each element on the batch.
-            **min_length**: int
-                Minimum number of steps performed by the beam search before terminating.
-            **max_length**: int
-                Maximum number of steps performed by the beam search. Any beam that has not finished
-                will return its current solution with the highest probability. The sequence that is
-                returned has a length of max_length-1 to account for the end token that is subsequently added.
-            **alpha**: float
-                Parameter of the length penalty. Read the documentation of the `_length_penalty` method for mode details.
-            **block_repeating_trigrams**: bool
-                Whether to block sequences that have repeating 3-grams.
-        """
-        super(BeamSearch, self).__init__()
-        self.model = model
-        self.device = next(model.parameters()).device  # only works if all parameters of the model are stored on a single GPU
-
-        self.bos_token_id = bos_token_id
-        self.eos_token_id = eos_token_id
-        self.pad_token_id = pad_token_id
-
-        self.batch_size = batch_size
-        self.beam_size = beam_size
-        self.min_length = min_length
-        self.max_length = max_length
-
-        self.block_repeating_trigram = block_repeating_trigrams
-        self.apply_length_penalty = False if alpha == 0 else True
-        self.alpha = alpha
-
-        self._init_beam_state(batch_size)
-
-    def __len__(self):
-        return self.growing_beams.size(1)
-
-    def _init_beam_state(self, batch_size):
-        """ (re-)Initialize the state of the beams. """
-        self.hypotheses = [[] for _ in range(batch_size)]
-        self.batch_offset = torch.arange(batch_size, dtype=torch.long, device=self.device)
-        self.beam_offset = torch.arange(
-            0,
-            batch_size * self.beam_size,
-            step=self.beam_size,
-            dtype=torch.long,
-            device=self.device,
-        )
-        self.growing_beams = torch.full(
-            (batch_size * self.beam_size, 1),
-            self.bos_token_id,
-            dtype=torch.long,
-            device=self.device,
-        )
-        self.topk_log_probabilities = torch.tensor(
-            [0.0] + [float("-inf")] * (self.beam_size - 1),
-            dtype=torch.float,
-            device=self.device,
-        ).repeat(batch_size)
-        self.results = {
-            "predictions": [[] for _ in range(batch_size)],
-            "scores": [[] for _ in range(batch_size)],
-        }
-        self._step = 0
-        self.is_done = False
-
-    def __call__(self, encoder_input_ids, **model_kwargs):
-        """ Generate a sequence using Beam Search. """
-        # keyword arguments come in 3 flavors: encoder-specific (prefixed by
-        # `encoder_`), decoder-specific (prefixed by `decoder_`) and those
-        # that apply to the model as whole.
-        # We let the specific kwargs override the common ones in case of conflict.
-        kwargs_common = {
-            argument: value
-            for argument, value in model_kwargs.items()
-            if not argument.startswith("encoder_") and not argument.startswith("decoder_")
-        }
-        kwargs_decoder = kwargs_common.copy()
-        kwargs_encoder = kwargs_common.copy()
-        kwargs_encoder.update(
-            {
-                argument[len("encoder_") :]: value
-                for argument, value in model_kwargs.items()
-                if argument.startswith("encoder_")
-            }
-        )
-        kwargs_decoder.update(
-            {
-                argument[len("decoder_") :]: value
-                for argument, value in model_kwargs.items()
-                if argument.startswith("decoder_")
-            }
-        )
-
-        # forward pass on the encoder
-        encoder_outputs = self.model.encoder(encoder_input_ids, **kwargs_encoder)
-        encoder_hidden_states = encoder_outputs[0]
-        kwargs_decoder["encoder_hidden_states"] = tile(
-            encoder_hidden_states, self.beam_size, dim=0
-        )
-        try:
-            kwargs_decoder["encoder_attention_mask"] = tile(
-                kwargs_encoder["attention_mask"], self.beam_size, dim=0
-            )
-        except:
-            pass
-        kwargs_decoder["state"].src = tile(
-            kwargs_decoder["state"].src, self.beam_size, dim=0
-        )
-
-        # grow the beam iteratively
-        batch_size, block_size = encoder_input_ids.size()
-        self._init_beam_state(batch_size)
-        for step in range(self.max_length):
-            decoder_input = fit_to_block_size(self.growing_beams, block_size, self.pad_token_id)
-            kwargs_decoder["attention_mask"] = build_mask(decoder_input, self.pad_token_id)
-
-            outputs, state = self.model.decoder(decoder_input, **kwargs_decoder)
-
-            next_token_scores = outputs[0][:, -1, :].squeeze(1)
-            log_probabilities = torch.nn.functional.log_softmax(next_token_scores, dim=0)
-            surviving_beams_rows = self.grow(log_probabilities)
-            if self.is_done:
-                break
-
-            kwargs_decoder["encoder_hidden_states"] = kwargs_decoder[
-                "encoder_hidden_states"
-            ].index_select(0, surviving_beams_rows)
-            try:
-                kwargs_decoder["encoder_attention_mask"] = kwargs_decoder[
-                    "encoder_attention_mask"
-                ].index_select(0, surviving_beams_rows)
-            except:
-                pass
-            kwargs_decoder["state"] = state
-
-        return self.results
-
-    def grow(self, log_probabilities):
-        """ Grow the beams by one step. """
-        self._step += 1
-
-        # The number of beams changes as some beams finish so we define _B
-        vocab_size = log_probabilities.size(-1)
-        _B = log_probabilities.size(0) // self.beam_size
-
-        # Multiply each beam probability with the probability of the
-        # next token (conditioned on the words in the beam).
-        log_probabilities += self.topk_log_probabilities.view(-1, 1)
-
-        self._enforce_min_length(log_probabilities)
-        if self.block_repeating_trigram:
-            self._remove_beams_with_repeating_trigrams(log_probabilities, _B)
-
-        # Find the `beam_size` (previous_beam + token) combinations with
-        # the highest score
-        self.topk_log_probabilities, topk_ids = torch.topk(
-            log_probabilities.view(_B, self.beam_size * vocab_size), self.beam_size, dim=1
-        )
-
-        # Apply the length penalty. The +1 accounts for the [EOS] token
-        # that will be added if the beam ends.
-        topk_scores = self.topk_log_probabilities
-        if self.apply_length_penalty:
-            topk_scores /= self._length_penalty()
-
-        # Retrieve the corresponding respective beam and token id
-        # topk_token_ids[i] will be added to topk_beam_ids[i]
-        topk_beam_ids = topk_ids.div(vocab_size)
-        topk_token_ids = topk_ids.fmod(vocab_size)
-
-        # Retrieve the row index of the surviving beams in the original
-        # view of the log_probabilities tensor
-        surviving_beams_per_batch = topk_beam_ids + self.beam_offset[:_B].view(-1, 1)
-        surviving_beams_rows = surviving_beams_per_batch.view(-1)
-
-        # Append the last predictions
-        self.growing_beams = torch.cat(
-            [
-                self.growing_beams.index_select(0, surviving_beams_rows),
-                topk_token_ids.view(-1, 1),
-            ],
-            1,
-        )
-
-        # Check if any of the beam searches has ended during this
-        # growth step. Also if top beam (most probable) has ended
-        # for one element of the batch.
-        is_finished = topk_token_ids.eq(self.eos_token_id)
-        self._enforce_max_length(is_finished)
-        if is_finished.any():
-            non_finished = self._cut_finished(is_finished, topk_scores)
-            self.batch_offset = self.batch_offset.index_select(0, non_finished)
-            surviving_beams_per_batch = surviving_beams_per_batch.index_select(
-                0, non_finished
-            )
-            self.topk_log_probabilities = self.topk_log_probabilities.index_select(
-                0, non_finished
-            )
-
-            surviving_beams_rows = surviving_beams_per_batch.view(-1)
-            self.growing_beams = self.growing_beams.index_select(0, surviving_beams_rows)
-
-        return surviving_beams_rows
-
-    def _cut_finished(self, is_finished, topk_scores):
-        """ Save the finished searches and cut the correponding sequences off
-        the beams. """
-        is_top_beam_finished = is_finished[:, 0].eq(True)
-
-        # Save the finished searches
-        predictions = self.growing_beams.view(
-            -1, self.beam_size, self.growing_beams.size(1)
-        )
-        for i in range(is_finished.size(0)):
-            if is_top_beam_finished[i]:
-                is_finished[i].fill_(1)
-            finished_hyp = is_finished[i].nonzero().view(-1)
-
-            # Store the finished beams as a (score, prediction) hypothesis.
-            b = self.batch_offset[i]
-            for j in finished_hyp:
-                self.hypotheses[b].append((topk_scores[i, j], predictions[i, j, :]))
-
-            # If the batch reached the end, save the best hypotheses
-            # in terms of length-penalized score.
-            if is_top_beam_finished[i]:
-                best_score, best_prediction = max(self.hypotheses[b], key=lambda x: x[0])
-                self.results["scores"][b].append(best_score)
-                self.results["predictions"][b].append(best_prediction)
-
-        non_finished = is_top_beam_finished.eq(False).nonzero().view(-1)
-        if len(non_finished) == 0:
-            self.is_done = True
-
-        return non_finished
-
-    def _remove_beams_with_repeating_trigrams(self, log_probabilities, _B):
-        if self._step + 1 > 3:  # [BOS] does not count
-            for i in range(_B * self.beam_size):
-                tokens = self.growing_beams[i]
-                trigrams = [
-                    (tokens[j - 1], tokens[j], tokens[j + 1])
-                    for j in range(1, len(self) - 1)
-                ]
-                last_trigram = tuple(trigrams[-1])
-                if last_trigram in trigrams[:-1]:
-                    log_probabilities[i] = -1e20
-
-    def _enforce_min_length(self, log_probabilities):
-        if self._step < self.min_length:
-            log_probabilities[:, self.eos_token_id] = -1e20
-
-    def _enforce_max_length(self, is_finished):
-        # +1 because we will need to add an [EOS] token
-        if self._step + 1 == self.max_length:
-            is_finished.fill_(1)
-
-    def _length_penalty(self):
-        """ The calculation of the length penalty follows that of [1].
-
-        [1] Wu, Yonghui, et al. "Google's neural machine translation system:
-        Bridging the gap between human and machine translation." arXiv preprint
-        arXiv:1609.08144 (2016).
-        """
-        return ((5.0 + (self._step + 1)) / 6.0) ** self.alpha
-
-
-def tile(x, count, dim=0):
-    """
-    Tiles `x` along dimension `dim` `count` times.
-
-    Example:
-        >> ex = torch.tensor([1,2],[3,4])
-        >> tile(ex, 2, 0)
-        torch.Tensor([[1,2],[1,2],[3,4],[3,4]])
-    """
-    perm = list(range(len(x.size())))
-    if dim != 0:
-        perm[0], perm[dim] = perm[dim], perm[0]
-        x = x.permute(perm).contiguous()
-    out_size = list(x.size())
-    out_size[0] *= count
-    batch = x.size(0)
-    x = (
-        x.view(batch, -1)
-        .transpose(0, 1)
-        .repeat(count, 1)
-        .transpose(0, 1)
-        .contiguous()
-        .view(*out_size)
-    )
-    if dim != 0:
-        x = x.permute(perm).contiguous()
-    return x
-
-
-def fit_to_block_size(sequence, block_size, pad_token_id):
-    """ Adapt the source and target sequences' lengths to the block size.
-    If the sequence is shorter we append padding tokens to the right.
-    """
-    padded_sequence = torch.full(
-        (sequence.size(0), block_size),
-        pad_token_id,
-        dtype=torch.long,
-        device=sequence.device,
-    )
-    padded_sequence[:, : sequence.size(1)] = sequence
-    return sequence
-
-
-def build_mask(sequence, pad_token_id):
-    """ Builds the mask. The attention mechanism will only attend to positions
-    with value 1. """
-    mask = torch.ones_like(sequence)
-    idx_pad_tokens = sequence == pad_token_id
-    mask[idx_pad_tokens] = 0
-    return mask
diff --git a/transformers/tests/beam_search_tests.py b/transformers/tests/beam_search_tests.py
deleted file mode 100644
index 6f2a2b9c2f..0000000000
--- a/transformers/tests/beam_search_tests.py
+++ /dev/null
@@ -1,243 +0,0 @@
-from collections import namedtuple
-import unittest
-import pytest
-import numpy as np
-import torch
-from torch import nn
-
-from transformers.generate import BeamSearch
-from transformers import PreTrainedEncoderDecoder
-
-
-class StubTransformer(nn.Module):
-    def __init__(self):
-        self.encoder = None
-        self.decoder = None
-        self._parameters = {"dumy": torch.tensor([1])}
-
-    def forward(self):
-        pass
-
-
-class BeamSearchtest(unittest.TestCase):
-    def test_beam_search_encoder_decoder_integration(self):
-        """ We make sure that no internal change in the PreTrainedEncoderDecoder
-        class will break the integration with the beam search.
-        """
-
-        model = StubTransformer()
-        try:
-            _ = BeamSearch(
-                model=model,
-                bos_token_id=0,
-                eos_token_id=1,
-                pad_token_id=2,
-                batch_size=1,
-                beam_size=1,
-                min_length=1,
-                max_length=1,
-                alpha=0,
-                block_repeating_trigrams=False,
-            )
-        except:
-            self.fail("Instantiating BeamSearch with a PreTrainedEncoderDecoder failed.")
-
-    def test_beam_search_min_length(self):
-        """ We keep predicting the end_token for the first beam and check that
-        it is not marked as finished until the beam has reached the minimum
-        length. """
-        eos_idx = 3
-        vocab_size = 10
-
-        batch_size = 3
-        beam_size = 2
-        min_length = 5
-
-        beam = BeamSearch(
-            model=StubTransformer(),
-            bos_token_id=0,
-            eos_token_id=eos_idx,
-            pad_token_id=2,
-            batch_size=batch_size,
-            beam_size=beam_size,
-            min_length=5,
-            max_length=10,
-            alpha=0,
-            block_repeating_trigrams=False,
-        )
-
-        # To test that the minimum length is correctly enforced we constantly
-        # assign the highest probability to the [EOS] token (and assign lower
-        # probabilities to some other tokens).
-        # Since BeamSearch will reset its probability to 1e-20 as long as
-        # min_length has not been reached, we need to reset the value between
-        # steps.
-        non_eos_idxs = [4, 5, 1, 8, 9]
-        score_distribution = torch.log_softmax(
-            torch.tensor([6.0, 5.0, 4.0, 3.0, 2.0, 1.0]), dim=0
-        )
-
-        log_probabilities = torch.full((batch_size * beam_size, vocab_size), float("-inf"))
-        log_probabilities[0, eos_idx] = score_distribution[0]
-        for idx, score in zip(non_eos_idxs, score_distribution[1:]):
-            log_probabilities[0, idx] = score
-        pytest.set_trace()
-        for step in range(1, min_length + 2):
-            log_probabilities[0, eos_idx] = score_distribution[0]
-
-            # Beam #3 and #4 teminate at the first step since the probability
-            # of the [EOS] token is -1e20 > -\infty so there are only two beams left.
-            # The top beam (most likely) always ends with 4 until we reach min_length.
-            surviving_beams_rows = beam.grow(log_probabilities)
-            if step < min_length:
-                np.testing.assert_array_equal(
-                    beam.growing_beams.numpy()[0, :], np.array([0] + [4] * step)
-                )
-            elif step == min_length:
-                np.testing.assert_array_equal(surviving_beams_rows.numpy(), np.array([]))
-                self.assertTrue(beam.is_done)
-                break
-
-            log_probabilities = log_probabilities.index_select(0, surviving_beams_rows)
-
-    def test_beam_search_max_length(self):
-        """ We keep predicting the same non-EOS token until we reach the
-        maximum permitted length """
-        batch_size = 3
-        beam_size = 2
-        max_length = 5
-        vocab_size = 10
-
-        beam = BeamSearch(
-            model=StubTransformer(),
-            bos_token_id=0,
-            eos_token_id=1,
-            pad_token_id=2,
-            batch_size=batch_size,
-            beam_size=beam_size,
-            min_length=2,
-            max_length=max_length,
-            alpha=0,
-            block_repeating_trigrams=False,
-        )
-
-        log_probabilities = torch.full((batch_size * beam_size, vocab_size), float("-inf"))
-
-        # To test that beam search enforces the max length constraint we
-        # keep giving the highest probability to a token that is not the
-        # [EOS] token.
-        # The beam search will stop at max_length-1, assuming that one would
-        # add the [EOS] token at the end of the returned sequence.
-        token_idxs = [3, 4, 5]
-        score_distribution = torch.log_softmax(torch.tensor([10.0, 6.0, 4.0]), dim=0)
-        for idx, score in zip(token_idxs, score_distribution):
-            log_probabilities[:, idx] = score
-
-        for step in range(1, max_length + 2):
-            surviving_beams_rows = beam.grow(log_probabilities)
-            if step + 1 < max_length:
-                self.assertFalse(beam.is_done)
-            elif step + 1 == max_length:  # Now [EOS] is the most probable token
-                np.testing.assert_array_equal(surviving_beams_rows.numpy(), np.array([]))
-                self.assertTrue(beam.is_done)
-                break
-
-            log_probabilities = log_probabilities.index_select(0, surviving_beams_rows)
-
-    def test_beam_search_block_repeating_trigrams(self):
-        """ We make sure that the beams that contain repeating trigrams are removed. """
-        batch_size = 3
-        beam_size = 2
-        max_length = 10
-        vocab_size = 10
-
-        beam = BeamSearch(
-            model=StubTransformer(),
-            bos_token_id=0,
-            eos_token_id=1,
-            pad_token_id=2,
-            batch_size=batch_size,
-            beam_size=beam_size,
-            min_length=2,
-            max_length=max_length,
-            alpha=0,
-            block_repeating_trigrams=True,
-        )
-
-        log_probabilities = torch.full((batch_size * beam_size, vocab_size), float("-inf"))
-
-        # To test that BeamSearch enforces the 3-gram constraint we give the
-        # highest probably to the same tokens in a cyclic fashion and make sure
-        # they disappear once the cycle has completed.
-        token_idxs = [3, 4, 5]
-        score_distribution = torch.log_softmax(torch.tensor([10.0, 6.0, 4.0]), dim=0)
-        for idx, score in zip(token_idxs, score_distribution):
-            log_probabilities[:, idx] = score
-
-        for step in range(1, max_length + 2):
-            # Rotate the probabilities at each step
-            for idx in token_idxs:
-                score = score_distribution[(idx + step) % 3]
-                log_probabilities[::beam_size, idx] = score
-
-            surviving_beams_rows = beam.grow(log_probabilities)
-
-            if step < 7:
-                self.assertFalse(
-                    np.array_equal(
-                        log_probabilities.numpy()[0, :],
-                        np.array([-1e20] * vocab_size, dtype="float32"),
-                    )
-                )
-            if step == 7:
-                np.testing.assert_array_equal(
-                    log_probabilities.numpy()[0, :],
-                    np.array([-1e20] * vocab_size, dtype="float32"),
-                )
-
-            log_probabilities = log_probabilities.index_select(0, surviving_beams_rows)
-
-    def test_beam_search_example_for_one_step(self):
-        """ We test that the predictions for one step of growth are correct. """
-        batch_size = 2
-        beam_size = 2
-        max_length = 10
-        vocab_size = 5
-
-        beam = BeamSearch(
-            model=StubTransformer(),
-            bos_token_id=0,
-            eos_token_id=1,
-            pad_token_id=2,
-            batch_size=batch_size,
-            beam_size=beam_size,
-            min_length=2,
-            max_length=max_length,
-            alpha=0,
-            block_repeating_trigrams=False,
-        )
-
-        log_probabilities = torch.full((batch_size * beam_size, vocab_size), float("-inf"))
-        log_probabilities[0, 3:] = torch.log_softmax(torch.tensor([2.0, 1.0]), dim=0)
-        log_probabilities[2, 3:] = torch.log_softmax(torch.tensor([1.0, 2.0]), dim=0)
-
-        # First pass
-        surviving_beams_rows = beam.grow(log_probabilities)
-        np.testing.assert_array_equal(surviving_beams_rows.numpy(), np.array([0, 0, 2, 2]))
-        np.testing.assert_array_equal(
-            beam.growing_beams.numpy(), np.array([[0, 3], [0, 4], [0, 4], [0, 3]])
-        )
-        self.assertFalse(beam.is_done)
-
-        # Second pass
-        surviving_beams_rows = beam.grow(log_probabilities)
-        np.testing.assert_array_equal(surviving_beams_rows.numpy(), np.array([0, 0, 2, 2]))
-        np.testing.assert_array_equal(
-            beam.growing_beams.numpy(),
-            np.array([[0, 3, 3], [0, 3, 4], [0, 4, 4], [0, 4, 3]]),
-        )
-        self.assertFalse(beam.is_done)
-
-
-if __name__ == "__name__":
-    unittest.main()

From 693606a75c54d9731b748797f21961d0a5322896 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remilouf@gmail.com>
Date: Thu, 5 Dec 2019 18:55:15 +0100
Subject: [PATCH 56/91] update the docs

---
 examples/README.md | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/examples/README.md b/examples/README.md
index dec5a67f7e..3d0b2ca1a9 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -24,7 +24,8 @@ pip install -r ./examples/requirements.txt
 | [Multiple Choice](#multiple-choice) | Examples running BERT/XLNet/RoBERTa on the SWAG/RACE/ARC tasks. 
 | [Named Entity Recognition](#named-entity-recognition) | Using BERT for Named Entity Recognition (NER) on the CoNLL 2003 dataset, examples with distributed training.                                                                                  |
 | [XNLI](#xnli) | Examples running BERT/XLM on the XNLI benchmark. |
-| [Abstractive summarization](#abstractive-summarization) | Fine-tuning the library models for abstractive summarization tasks on the CNN/Daily Mail dataset. |
+| [Abstractive summarization](#abstractive-summarization) | Using the BertAbs
+model finetuned on the CNN/DailyMail dataset to generate summaries. |
 
 ## TensorFlow 2.0 Bert models on GLUE
 
@@ -712,3 +713,20 @@ Training with the previously defined hyper-parameters yields the following resul
 ```bash
 acc = 0.7093812375249501
 ```
+
+### Abstractive Summarization
+
+This example provides a simple API for the [BertAbs](https://github.com/nlpyang/PreSumm) model finetuned on the CNN/DailyMail dataset. The script can be used to generate summaries from any text. 
+
+```bash
+python run_summarization.py \
+    --documents_dir 'path/to/documents' \
+    --summaries_output_dir 'path/to/summaries' \
+    --visible_gpus 0,1,2 \
+    --batch_size 4 \
+    --min_length 50 \
+    --max_length 200 \
+    --beam_size 5 \
+    --alpha 0.95 \
+    --block_trigram true
+```

From 3a9a9f78614050896356a9a30e9529c502b56d96 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remilouf@gmail.com>
Date: Thu, 5 Dec 2019 19:09:47 +0100
Subject: [PATCH 57/91] default output dir to documents dir

---
 examples/summarization/run_summarization.py   | 11 ++++++-----
 examples/summarization/utils_summarization.py |  2 ++
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/examples/summarization/run_summarization.py b/examples/summarization/run_summarization.py
index e3b974acd9..bbc79227ca 100644
--- a/examples/summarization/run_summarization.py
+++ b/examples/summarization/run_summarization.py
@@ -31,9 +31,7 @@ Batch = namedtuple(
 
 def evaluate(args):
     tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)
-    model = bertabs = BertAbs.from_pretrained(
-        "bertabs-finetuned-{}".format(args.finetuned_model)
-    )
+    model = bertabs = BertAbs.from_pretrained("bertabs-finetuned-cnndm")
     bertabs.to(args.device)
     bertabs.eval()
 
@@ -195,8 +193,8 @@ def main():
         "--summaries_output_dir",
         default=None,
         type=str,
-        required=True,
-        help="The folder in wich the summaries should be written.",
+        required=False,
+        help="The folder in wich the summaries should be written. Defaults to the folder where the documents are",
     )
     # EVALUATION options
     parser.add_argument(
@@ -242,6 +240,9 @@ def main():
     args = parser.parse_args()
     args.device = torch.device("cpu") if args.visible_gpus == -1 else torch.device("cuda")
 
+    if not args.summaries_output_dir:
+        args.summaries_output_dir = args.documents_dir
+
     if not documents_dir_is_valid(args.documents_dir):
         raise FileNotFoundError(
             "We could not find the directory you specified for the documents to summarize, or it was empty. Please specify a valid path."
diff --git a/examples/summarization/utils_summarization.py b/examples/summarization/utils_summarization.py
index e7401b1754..1d8c436ac9 100644
--- a/examples/summarization/utils_summarization.py
+++ b/examples/summarization/utils_summarization.py
@@ -39,6 +39,8 @@ class SummarizationDataset(Dataset):
         self.documents = []
         story_filenames_list = os.listdir(path)
         for story_filename in story_filenames_list:
+            if "summary" in story_filename:
+                continue
             path_to_story = os.path.join(path, story_filename)
             if not os.path.isfile(path_to_story):
                 continue

From a1994a71ee37ee8ac5bc49cce30a764392d64233 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remilouf@gmail.com>
Date: Thu, 5 Dec 2019 21:05:06 +0100
Subject: [PATCH 58/91] simplified model and configuration

---
 .../summarization/configuration_bertabs.py    | 22 ----------
 examples/summarization/modeling_bertabs.py    | 41 ++++---------------
 examples/summarization/run_summarization.py   |  6 +--
 3 files changed, 10 insertions(+), 59 deletions(-)

diff --git a/examples/summarization/configuration_bertabs.py b/examples/summarization/configuration_bertabs.py
index ff3171f9a8..5bcb65b423 100644
--- a/examples/summarization/configuration_bertabs.py
+++ b/examples/summarization/configuration_bertabs.py
@@ -33,17 +33,6 @@ class BertAbsConfig(PretrainedConfig):
     r""" Class to store the configuration of the BertAbs model.
 
     Arguments:
-        temp_dir: string
-            Unused in the current situation. Kept for compatibility but will be removed.
-        finetune_bert: bool
-            Whether to fine-tune the model or not. Will be kept for reference
-            in case we want to add the possibility to fine-tune the model.
-        large: bool
-            Whether to use bert-large as a base.
-        share_emb: book
-            Whether the embeddings are shared between the encoder and decoder.
-        encoder: string
-            Not clear what this does. Leave to "bert" for pre-trained weights.
         max_pos: int
             The maximum sequence length that this model will be used with.
         enc_layer: int
@@ -77,11 +66,6 @@ class BertAbsConfig(PretrainedConfig):
     def __init__(
         self,
         vocab_size_or_config_json_file=30522,
-        temp_dir=".",
-        finetune_bert=False,
-        large=False,
-        share_emb=True,
-        encoder="bert",
         max_pos=512,
         enc_layers=6,
         enc_hidden_size=512,
@@ -104,21 +88,15 @@ class BertAbsConfig(PretrainedConfig):
             for key, value in json_config.items():
                 self.__dict__[key] = value
         elif isinstance(vocab_size_or_config_json_file, int):
-            self.temp_dir = temp_dir
-            self.finetune_bert = finetune_bert
-            self.large = large
             self.vocab_size = vocab_size_or_config_json_file
             self.max_pos = max_pos
 
-            self.encoder = encoder
             self.enc_layers = enc_layers
             self.enc_hidden_size = enc_hidden_size
             self.enc_heads = enc_heads
             self.enc_ff_size = enc_ff_size
             self.enc_dropout = enc_dropout
 
-            self.share_emb = share_emb
-
             self.dec_layers = dec_layers
             self.dec_hidden_size = dec_hidden_size
             self.dec_heads = dec_heads
diff --git a/examples/summarization/modeling_bertabs.py b/examples/summarization/modeling_bertabs.py
index 0189a2ad2b..5e51526037 100644
--- a/examples/summarization/modeling_bertabs.py
+++ b/examples/summarization/modeling_bertabs.py
@@ -53,7 +53,7 @@ class BertAbs(BertAbsPreTrainedModel):
     def __init__(self, args, checkpoint=None, bert_extractive_checkpoint=None):
         super(BertAbs, self).__init__(args)
         self.args = args
-        self.bert = Bert(args.large, args.temp_dir, args.finetune_bert)
+        self.bert = Bert()
 
         # If pre-trained weights are passed for Bert, load these.
         load_bert_pretrained_extractive = True if bert_extractive_checkpoint else False
@@ -69,18 +69,6 @@ class BertAbs(BertAbsPreTrainedModel):
                 strict=True,
             )
 
-        if args.encoder == "baseline":
-            bert_config = BertConfig(
-                self.bert.model.config.vocab_size,
-                hidden_size=args.enc_hidden_size,
-                num_hidden_layers=args.enc_layers,
-                num_attention_heads=8,
-                intermediate_size=args.enc_ff_size,
-                hidden_dropout_prob=args.enc_dropout,
-                attention_probs_dropout_prob=args.enc_dropout,
-            )
-            self.bert.model = BertModel(bert_config)
-
         self.vocab_size = self.bert.model.config.vocab_size
 
         if args.max_pos > 512:
@@ -101,10 +89,10 @@ class BertAbs(BertAbsPreTrainedModel):
         tgt_embeddings = nn.Embedding(
             self.vocab_size, self.bert.model.config.hidden_size, padding_idx=0
         )
-        if self.args.share_emb:
-            tgt_embeddings.weight = copy.deepcopy(
-                self.bert.model.embeddings.word_embeddings.weight
-            )
+
+        tgt_embeddings.weight = copy.deepcopy(
+            self.bert.model.embeddings.word_embeddings.weight
+        )
 
         self.decoder = TransformerDecoder(
             self.args.dec_layers,
@@ -141,16 +129,6 @@ class BertAbs(BertAbsPreTrainedModel):
             else:
                 p.data.zero_()
 
-    def maybe_tie_embeddings(self, args):
-        if args.use_bert_emb:
-            tgt_embeddings = nn.Embedding(
-                self.vocab_size, self.bert.model.config.hidden_size, padding_idx=0
-            )
-            tgt_embeddings.weight = copy.deepcopy(
-                self.bert.model.embeddings.word_embeddings.weight
-            )
-            self.decoder.embeddings = tgt_embeddings
-
     def forward(
         self,
         encoder_input_ids,
@@ -178,14 +156,9 @@ class Bert(nn.Module):
     """ This class is not really necessary and should probably disappear.
     """
 
-    def __init__(self, large, temp_dir, finetune=False):
+    def __init__(self):
         super(Bert, self).__init__()
-        if large:
-            self.model = BertModel.from_pretrained("bert-large-uncased", cache_dir=temp_dir)
-        else:
-            self.model = BertModel.from_pretrained("bert-base-uncased", cache_dir=temp_dir)
-
-        self.finetune = finetune
+        self.model = BertModel.from_pretrained("bert-base-uncased")
 
     def forward(self, input_ids, attention_mask=None, token_type_ids=None, **kwargs):
         self.eval()
diff --git a/examples/summarization/run_summarization.py b/examples/summarization/run_summarization.py
index bbc79227ca..ed663e880b 100644
--- a/examples/summarization/run_summarization.py
+++ b/examples/summarization/run_summarization.py
@@ -31,9 +31,9 @@ Batch = namedtuple(
 
 def evaluate(args):
     tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)
-    model = bertabs = BertAbs.from_pretrained("bertabs-finetuned-cnndm")
-    bertabs.to(args.device)
-    bertabs.eval()
+    model = BertAbs.from_pretrained("bertabs-finetuned-cnndm")
+    model.to(args.device)
+    model.eval()
 
     symbols = {
         "BOS": tokenizer.vocab["[unused0]"],

From 5909f710285cf8164b3f51111d595ae87f847133 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remilouf@gmail.com>
Date: Thu, 5 Dec 2019 21:07:49 +0100
Subject: [PATCH 59/91] add py-rouge dependency

---
 requirements.txt | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/requirements.txt b/requirements.txt
index 4a3162adce..236ac1c430 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,3 +10,5 @@ regex
 sentencepiece
 # For XLM
 sacremoses
+# For ROUGE
+py-rouge

From 076602bdc4b186e715538f437f2bce4b1ee5020e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remilouf@gmail.com>
Date: Fri, 6 Dec 2019 10:11:44 +0100
Subject: [PATCH 60/91] prevent BERT weights from being downloaded twice

---
 examples/summarization/modeling_bertabs.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/examples/summarization/modeling_bertabs.py b/examples/summarization/modeling_bertabs.py
index 5e51526037..efca33fb56 100644
--- a/examples/summarization/modeling_bertabs.py
+++ b/examples/summarization/modeling_bertabs.py
@@ -158,7 +158,8 @@ class Bert(nn.Module):
 
     def __init__(self):
         super(Bert, self).__init__()
-        self.model = BertModel.from_pretrained("bert-base-uncased")
+        config = BertConfig.from_pretrained("bert-base-uncased")
+        self.model = BertModel(config)
 
     def forward(self, input_ids, attention_mask=None, token_type_ids=None, **kwargs):
         self.eval()

From ade3cdf5adfcff7736b326b1360fcf2b59aae47e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remilouf@gmail.com>
Date: Fri, 6 Dec 2019 11:36:44 +0100
Subject: [PATCH 61/91] integrate ROUGE

---
 examples/summarization/modeling_bertabs.py  | 65 +---------------
 examples/summarization/run_summarization.py | 85 +++++++++++++++++++--
 requirements.txt                            |  1 +
 3 files changed, 82 insertions(+), 69 deletions(-)

diff --git a/examples/summarization/modeling_bertabs.py b/examples/summarization/modeling_bertabs.py
index efca33fb56..57126a4df3 100644
--- a/examples/summarization/modeling_bertabs.py
+++ b/examples/summarization/modeling_bertabs.py
@@ -21,9 +21,6 @@
 # SOFTWARE.
 import copy
 import math
-import shutil
-import time
-import os
 
 import numpy as np
 import torch
@@ -1082,11 +1079,6 @@ class Translator(object):
 
         return translations
 
-    def _report_rouge(self, gold_path, can_path):
-        self.logger.info("Calculating Rouge")
-        results_dict = test_rouge(self.args.temp_dir, can_path, gold_path)
-        return results_dict
-
 
 def tile(x, count, dim=0):
     """
@@ -1113,63 +1105,10 @@ def tile(x, count, dim=0):
 
 
 #
-# All things ROUGE. Uses `pyrouge` which is a hot mess.
+# Optimizer for training. We keep this here in case we want to add
+# a finetuning script.
 #
 
-
-def test_rouge(temp_dir, cand, ref):
-    candidates = [line.strip() for line in open(cand, encoding="utf-8")]
-    references = [line.strip() for line in open(ref, encoding="utf-8")]
-    print(len(candidates))
-    print(len(references))
-    assert len(candidates) == len(references)
-
-    cnt = len(candidates)
-    current_time = time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())
-    tmp_dir = os.path.join(temp_dir, "rouge-tmp-{}".format(current_time))
-    if not os.path.isdir(tmp_dir):
-        os.mkdir(tmp_dir)
-        os.mkdir(tmp_dir + "/candidate")
-        os.mkdir(tmp_dir + "/reference")
-    try:
-
-        for i in range(cnt):
-            if len(references[i]) < 1:
-                continue
-            with open(
-                tmp_dir + "/candidate/cand.{}.txt".format(i), "w", encoding="utf-8"
-            ) as f:
-                f.write(candidates[i])
-            with open(
-                tmp_dir + "/reference/ref.{}.txt".format(i), "w", encoding="utf-8"
-            ) as f:
-                f.write(references[i])
-        r = pyrouge.Rouge155(temp_dir=temp_dir)
-        r.model_dir = tmp_dir + "/reference/"
-        r.system_dir = tmp_dir + "/candidate/"
-        r.model_filename_pattern = "ref.#ID#.txt"
-        r.system_filename_pattern = r"cand.(\d+).txt"
-        rouge_results = r.convert_and_evaluate()
-        print(rouge_results)
-        results_dict = r.output_to_dict(rouge_results)
-    finally:
-        pass
-        if os.path.isdir(tmp_dir):
-            shutil.rmtree(tmp_dir)
-    return results_dict
-
-
-def rouge_results_to_str(results_dict):
-    return ">> ROUGE-F(1/2/3/l): {:.2f}/{:.2f}/{:.2f}\nROUGE-R(1/2/3/l): {:.2f}/{:.2f}/{:.2f}\n".format(
-        results_dict["rouge_1_f_score"] * 100,
-        results_dict["rouge_2_f_score"] * 100,
-        results_dict["rouge_l_f_score"] * 100,
-        results_dict["rouge_1_recall"] * 100,
-        results_dict["rouge_2_recall"] * 100,
-        results_dict["rouge_l_recall"] * 100,
-    )
-
-
 class BertSumOptimizer(object):
     """ Specific optimizer for BertSum.
 
diff --git a/examples/summarization/run_summarization.py b/examples/summarization/run_summarization.py
index ed663e880b..a9d08aca82 100644
--- a/examples/summarization/run_summarization.py
+++ b/examples/summarization/run_summarization.py
@@ -41,6 +41,26 @@ def evaluate(args):
         "PAD": tokenizer.vocab["[PAD]"],
     }
 
+    if args.compute_rouge:
+        reference_summaries = []
+        generated_summaries = []
+
+        import rouge
+        import nltk
+        nltk.download('punkt')
+        rouge_evaluator = rouge.Rouge(
+            metrics=['rouge-n', 'rouge-l'],
+            max_n=2,
+            limit_length=True,
+            length_limit=args.beam_size,
+            length_limit_type='words',
+            apply_avg=True,
+            apply_best=False,
+            alpha=0.5,  # Default F1_score
+            weight_factor=1.2,
+            stemming=True,
+        )
+
     # these (unused) arguments are defined to keep the compatibility
     # with the legacy code and will be deleted in a next iteration.
     args.result_path = ""
@@ -66,6 +86,16 @@ def evaluate(args):
         summaries = [format_summary(t) for t in translations]
         save_summaries(summaries, args.summaries_output_dir, batch.document_names)
 
+        if args.compute_rouge:
+            reference_summaries += batch.tgt_str
+            generated_summaries += summaries
+
+    if args.compute_rouge:
+        scores = rouge_evaluator.get_scores(generated_summaries, reference_summaries)
+        str_scores = format_rouge_scores(scores)
+        save_rouge_scores(str_scores)
+        print(str_scores)
+
 
 def format_summary(translation):
     """ Transforms the output of the `from_batch` function
@@ -86,6 +116,41 @@ def format_summary(translation):
     return summary
 
 
+def format_rouge_scores(scores):
+    return """\n
+****** ROUGE SCORES ******
+
+** ROUGE 1
+F1        >> {:.3f}
+Precision >> {:.3f}
+Recall    >> {:.3f}
+
+** ROUGE 2
+F1        >> {:.3f}
+Precision >> {:.3f}
+Recall    >> {:.3f}
+
+** ROUGE L
+F1        >> {:.3f}
+Precision >> {:.3f}
+Recall    >> {:.3f}""".format(
+        scores['rouge-1']['f'],
+        scores['rouge-1']['p'],
+        scores['rouge-1']['r'],
+        scores['rouge-2']['f'],
+        scores['rouge-2']['p'],
+        scores['rouge-2']['r'],
+        scores['rouge-l']['f'],
+        scores['rouge-l']['p'],
+        scores['rouge-l']['r'],
+    )
+
+
+def save_rouge_scores(str_scores):
+    with open("rouge_scores.txt", "w") as output:
+        output.write(str_scores)
+
+
 def save_summaries(summaries, path, original_document_name):
     """ Write the summaries in fies that are prefixed by the original
     files' name with the `_summary` appended.
@@ -142,26 +207,27 @@ def collate(data, tokenizer, block_size):
     """
     data = [x for x in data if not len(x[1]) == 0]  # remove empty_files
     names = [name for name, _, _ in data]
+    summaries = [" ".join(summary_list) for _, _, summary_list in data]
 
     encoded_text = [
         encode_for_summarization(story, summary, tokenizer) for _, story, summary in data
     ]
-    stories = torch.tensor(
+    encoded_stories = torch.tensor(
         [
             fit_to_block_size(story, block_size, tokenizer.pad_token_id)
             for story, _ in encoded_text
         ]
     )
-    encoder_token_type_ids = compute_token_type_ids(stories, tokenizer.cls_token_id)
-    encoder_mask = build_mask(stories, tokenizer.pad_token_id)
+    encoder_token_type_ids = compute_token_type_ids(encoded_stories, tokenizer.cls_token_id)
+    encoder_mask = build_mask(encoded_stories, tokenizer.pad_token_id)
 
     batch = Batch(
         document_names=names,
-        batch_size=len(stories),
-        src=stories,
+        batch_size=len(encoded_stories),
+        src=encoded_stories,
         segs=encoder_token_type_ids,
         mask_src=encoder_mask,
-        tgt_str=[""] * len(stories),
+        tgt_str=summaries,
     )
 
     return batch
@@ -196,6 +262,13 @@ def main():
         required=False,
         help="The folder in wich the summaries should be written. Defaults to the folder where the documents are",
     )
+    parser.add_argument(
+        "--compute_rouge",
+        default=False,
+        type=bool,
+        required=False,
+        help="Compute the ROUGE metrics during evaluation. Only available for the CNN/DailyMail dataset.",
+    )
     # EVALUATION options
     parser.add_argument(
         "--visible_gpus",
diff --git a/requirements.txt b/requirements.txt
index 236ac1c430..2cbcc3809d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -11,4 +11,5 @@ sentencepiece
 # For XLM
 sacremoses
 # For ROUGE
+nltk
 py-rouge

From c0707a85d24fa5a74d85d40ed704d4c774e9a37f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remilouf@gmail.com>
Date: Fri, 6 Dec 2019 11:49:27 +0100
Subject: [PATCH 62/91] add README

---
 examples/README.md               | 17 ---------
 examples/summarization/README.md | 61 ++++++++++++++++++++++++++++++++
 2 files changed, 61 insertions(+), 17 deletions(-)
 create mode 100644 examples/summarization/README.md

diff --git a/examples/README.md b/examples/README.md
index 3d0b2ca1a9..620304ea77 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -713,20 +713,3 @@ Training with the previously defined hyper-parameters yields the following resul
 ```bash
 acc = 0.7093812375249501
 ```
-
-### Abstractive Summarization
-
-This example provides a simple API for the [BertAbs](https://github.com/nlpyang/PreSumm) model finetuned on the CNN/DailyMail dataset. The script can be used to generate summaries from any text. 
-
-```bash
-python run_summarization.py \
-    --documents_dir 'path/to/documents' \
-    --summaries_output_dir 'path/to/summaries' \
-    --visible_gpus 0,1,2 \
-    --batch_size 4 \
-    --min_length 50 \
-    --max_length 200 \
-    --beam_size 5 \
-    --alpha 0.95 \
-    --block_trigram true
-```
diff --git a/examples/summarization/README.md b/examples/summarization/README.md
new file mode 100644
index 0000000000..2b58c00693
--- /dev/null
+++ b/examples/summarization/README.md
@@ -0,0 +1,61 @@
+# Text Summarization with Pretrained Encoders
+
+This folder contains part of the code necessary to reproduce the results on abstractive summarization from the article [Text Summarization with Pretrained Encoders](https://arxiv.org/pdf/1908.08345.pdf) by [Yang Liu](https://nlp-yang.github.io/) and [Mirella Lapata](https://homepages.inf.ed.ac.uk/mlap/). It can also be used to summarize any document.
+
+The original code can be found in Yang Liu's [github repository](https://github.com/nlpyang/PreSumm).
+
+The model is loaded with the pre-trained weights for the abstractive summarization model trained on the CNN/Daily Mail dataset, first on an extractive and then on an abstractive task.
+
+## Setup
+
+```
+git clone https://github.com/huggingface/transformers && cd transformers
+pip install [--editable] .
+pip install nltk py-rouge
+cd examples/summarization
+```
+
+## Reproduce the authors' results on ROUGE
+
+To be able to reproduce the authors' results on the CNN/Daily Mail dataset you first need to download both CNN and Daily Mail datasets [from Kyunghyun Cho's website](https://cs.nyu.edu/~kcho/DMQA/) (the links next to "Stories") in the same folder. Then uncompress the archives by running:
+
+```bash
+tar -xvf cnn_stories.tgz && tar -xvf dailymail_stories.tgz
+```
+
+And move all the stories to the same folder. We will refer to the path where you uncompressed both archives as `$DATA_PATH`. Then run the following in the same folder as `run_summarization.py`:
+
+```bash
+python run_summarization.py \
+    --documents_dir $DATA_PATH \
+    --summaries_output_dir $SUMMARIES_PATH \ # optional
+    --visible_gpus 0,1,2 \
+    --batch_size 4 \
+    --min_length 50 \
+    --max_length 200 \
+    --beam_size 5 \
+    --alpha 0.95 \
+    --block_trigram true \
+    --compute_rouge true
+```
+
+The ROUGE scores will be displayed in the console at the end of evaluation and written in a `rouge_scores.txt` file.
+
+## Summarize any text
+
+Put the documents that you would like to summarize in a folder (the path to which is referred to as `$DATA_PATH` below) and run the following in the same folder as `run_summarization.py`:
+
+```bash
+python run_summarization.py \
+    --documents_dir $DATA_PATH \
+    --summaries_output_dir $SUMMARIES_PATH \ # optional
+    --visible_gpus 0,1,2 \
+    --batch_size 4 \
+    --min_length 50 \
+    --max_length 200 \
+    --beam_size 5 \
+    --alpha 0.95 \
+    --block_trigram true \
+```
+
+If you want to compute ROUGE on another dataset you will need to tweak the stories/summaries import in `utils_summarization.py`

From 2a64107e44bd2bb1caee824f121fc4fb6b7d90f7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remilouf@gmail.com>
Date: Fri, 6 Dec 2019 15:45:09 +0100
Subject: [PATCH 63/91] improve device usage

---
 examples/summarization/README.md              |  8 +++----
 ...ert_bertabs_original_pytorch_checkpoint.py |  7 +++---
 examples/summarization/modeling_bertabs.py    |  2 --
 examples/summarization/run_summarization.py   | 23 +++++++++++--------
 4 files changed, 21 insertions(+), 19 deletions(-)

diff --git a/examples/summarization/README.md b/examples/summarization/README.md
index 2b58c00693..96825cfa46 100644
--- a/examples/summarization/README.md
+++ b/examples/summarization/README.md
@@ -29,7 +29,7 @@ And move all the stories to the same folder. We will refer as `$DATA_PATH` the p
 python run_summarization.py \
     --documents_dir $DATA_PATH \
     --summaries_output_dir $SUMMARIES_PATH \ # optional
-    --visible_gpus 0,1,2 \
+    --to_cpu false \
     --batch_size 4 \
     --min_length 50 \
     --max_length 200 \
@@ -39,7 +39,7 @@ python run_summarization.py \
     --compute_rouge true
 ```
 
-The ROUGE scores will be displayed in the console at the end of evaluation and written in a `rouge_scores.txt` file.
+The script executes on GPU if one is available and if `to_cpu` is not set to `true`. Inference on multiple GPUs is not supported yet. The ROUGE scores will be displayed in the console at the end of evaluation and written in a `rouge_scores.txt` file. The script takes 30 hours to compute with a single Tesla V100 GPU and a batch size of 10 (300,000 texts to summarize).
 
 ## Summarize any text
 
@@ -49,7 +49,7 @@ Put the documents that you would like to summarize in a folder (the path to whic
 python run_summarization.py \
     --documents_dir $DATA_PATH \
     --summaries_output_dir $SUMMARIES_PATH \ # optional
-    --visible_gpus 0,1,2 \
+    --to_cpu false \
     --batch_size 4 \
     --min_length 50 \
     --max_length 200 \
@@ -58,4 +58,4 @@ python run_summarization.py \
     --block_trigram true \
 ```
 
-If you want to compute ROUGE on another dataset you will need to tweak the stories/summaries import in `utils_summarization.py`
+You may want to play around with `min_length`, `max_length` and `alpha` to suit your use case. If you want to compute ROUGE on another dataset you will need to tweak the stories/summaries import in `utils_summarization.py` and tell it where to fetch the reference summaries.
diff --git a/examples/summarization/convert_bertabs_original_pytorch_checkpoint.py b/examples/summarization/convert_bertabs_original_pytorch_checkpoint.py
index 786a29ef13..33b17bfb6f 100644
--- a/examples/summarization/convert_bertabs_original_pytorch_checkpoint.py
+++ b/examples/summarization/convert_bertabs_original_pytorch_checkpoint.py
@@ -12,10 +12,11 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" Convert BertExtAbs's checkpoints
+""" Convert BertExtAbs's checkpoints.
 
-The file currently does not do much as we ended up copying the exact model
-structure, but I leave it here in case we ever want to refactor the model.
+The script looks like it is doing something trivial but it is not. The "weights"
+proposed by the authors are actually the entire model pickled. We need to load
+the model within the original codebase to be able to only save its `state_dict`.
 """
 
 import argparse
diff --git a/examples/summarization/modeling_bertabs.py b/examples/summarization/modeling_bertabs.py
index 57126a4df3..d989e4fd7e 100644
--- a/examples/summarization/modeling_bertabs.py
+++ b/examples/summarization/modeling_bertabs.py
@@ -847,14 +847,12 @@ class Translator(object):
        global_scores (:obj:`GlobalScorer`):
          object to rescore final translations
        copy_attn (bool): use copy attention during translation
-       cuda (bool): use cuda
        beam_trace (bool): trace beam search for debugging
        logger(logging.Logger): logger.
     """
 
     def __init__(self, args, model, vocab, symbols, global_scorer=None, logger=None):
         self.logger = logger
-        self.cuda = args.visible_gpus != "-1"
 
         self.args = args
         self.model = model
diff --git a/examples/summarization/run_summarization.py b/examples/summarization/run_summarization.py
index a9d08aca82..c388569869 100644
--- a/examples/summarization/run_summarization.py
+++ b/examples/summarization/run_summarization.py
@@ -185,7 +185,7 @@ def save_summaries(summaries, path, original_document_name):
 def build_data_iterator(args, tokenizer):
     dataset = load_and_cache_examples(args, tokenizer)
     sampler = SequentialSampler(dataset)
-    collate_fn = lambda data: collate(data, tokenizer, block_size=512)
+    collate_fn = lambda data: collate(data, tokenizer, block_size=512, device=args.device)
     iterator = DataLoader(
         dataset, sampler=sampler, batch_size=args.batch_size, collate_fn=collate_fn,
     )
@@ -198,7 +198,7 @@ def load_and_cache_examples(args, tokenizer):
     return dataset
 
 
-def collate(data, tokenizer, block_size):
+def collate(data, tokenizer, block_size, device):
     """ Collate formats the data passed to the data loader.
 
     In particular we tokenize the data batch after batch to avoid keeping them
@@ -224,9 +224,9 @@ def collate(data, tokenizer, block_size):
     batch = Batch(
         document_names=names,
         batch_size=len(encoded_stories),
-        src=encoded_stories,
-        segs=encoder_token_type_ids,
-        mask_src=encoder_mask,
+        src=encoded_stories.to(device),
+        segs=encoder_token_type_ids.to(device),
+        mask_src=encoder_mask.to(device),
         tgt_str=summaries,
     )
 
@@ -271,10 +271,10 @@ def main():
     )
     # EVALUATION options
     parser.add_argument(
-        "--visible_gpus",
-        default=-1,
-        type=int,
-        help="Number of GPUs with which to do the training.",
+        "--to_cpu",
+        default=False,
+        type=bool,
+        help="Whether to force the execution on CPU.",
     )
     parser.add_argument(
         "--batch_size", default=4, type=int, help="Batch size per GPU/CPU for training.",
@@ -311,8 +311,11 @@ def main():
         help="Whether to block the existence of repeating trigrams in the text generated by beam search.",
     )
     args = parser.parse_args()
-    args.device = torch.device("cpu") if args.visible_gpus == -1 else torch.device("cuda")
 
+    # Select device (distributed not available)
+    args.device = torch.device("cuda" if torch.cuda.is_available() and not args.to_cpu else "cpu")
+
+    # Check the existence of directories
     if not args.summaries_output_dir:
         args.summaries_output_dir = args.documents_dir
 

From f7eba090077a443d4a2fd1cd341c822a8fb4dcbc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remilouf@gmail.com>
Date: Fri, 6 Dec 2019 22:01:48 +0100
Subject: [PATCH 64/91] clean for release

---
 ..._original_pytorch_checkpoint_to_pytorch.py | 161 ------------------
 examples/summarization/modeling_bertabs.py    |   2 +-
 examples/summarization/requirements.txt       |   9 +
 examples/summarization/run_summarization.py   |  60 +++----
 requirements.txt                              |   3 -
 ..._original_pytorch_checkpoint_to_pytorch.py | 158 -----------------
 transformers/generate/__init__.py             |   1 -
 transformers/modeling_encoder_decoder.py      |  31 ++--
 8 files changed, 49 insertions(+), 376 deletions(-)
 delete mode 100644 examples/convert_bertextabs_original_pytorch_checkpoint_to_pytorch.py
 create mode 100644 examples/summarization/requirements.txt
 delete mode 100644 transformers/convert_bertextabs_original_pytorch_checkpoint_to_pytorch.py
 delete mode 100644 transformers/generate/__init__.py

diff --git a/examples/convert_bertextabs_original_pytorch_checkpoint_to_pytorch.py b/examples/convert_bertextabs_original_pytorch_checkpoint_to_pytorch.py
deleted file mode 100644
index c245d0eae5..0000000000
--- a/examples/convert_bertextabs_original_pytorch_checkpoint_to_pytorch.py
+++ /dev/null
@@ -1,161 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" Convert BertExtAbs's checkpoints """
-
-import argparse
-from collections import namedtuple
-import logging
-import pdb
-import torch
-
-from models.model_builder import AbsSummarizer  # The authors' implementation
-from model_bertabs import BertAbsSummarizer
-
-from transformers import BertTokenizer
-
-
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
-
-
-SAMPLE_TEXT = 'Hello world! cécé herlolip'
-
-
-BertAbsConfig = namedtuple(
-    "BertAbsConfig",
-    ["temp_dir", "large", "use_bert_emb", "finetune_bert", "encoder", "share_emb", "max_pos", "enc_layers", "enc_hidden_size", "enc_heads", "enc_ff_size", "enc_dropout", "dec_layers", "dec_hidden_size", "dec_heads", "dec_ff_size", "dec_dropout"],
-)
-
-
-def convert_bertabs_checkpoints(path_to_checkpoints, dump_path):
-    """ Copy/paste and tweak the pre-trained weights provided by the creators
-    of BertAbs for the internal architecture.
-    """
-
-    # Instantiate the authors' model with the pre-trained weights
-    config = BertAbsConfig(
-        temp_dir=".",
-        finetune_bert=False,
-        large=False,
-        share_emb=True,
-        use_bert_emb=False,
-        encoder="bert",
-        max_pos=512,
-        enc_layers=6,
-        enc_hidden_size=512,
-        enc_heads=8,
-        enc_ff_size=512,
-        enc_dropout=0.2,
-        dec_layers=6,
-        dec_hidden_size=768,
-        dec_heads=8,
-        dec_ff_size=2048,
-        dec_dropout=0.2,
-    )
-    checkpoints = torch.load(path_to_checkpoints, lambda storage, loc: storage)
-    original = AbsSummarizer(config, torch.device("cpu"), checkpoints)
-    original.eval()
-
-    new_model = BertAbsSummarizer(config, torch.device("cpu"))
-    new_model.eval()
-
-    # -------------------
-    # Convert the weights
-    # -------------------
-
-    logging.info("convert the model")
-    new_model.encoder.load_state_dict(original.bert.state_dict())
-
-    new_model.decoder.generator.load_state_dict(original.generator.state_dict())
-    new_model.decoder.embeddings.load_state_dict(original.decoder.embeddings.state_dict())
-    new_model.decoder.pos_emb.load_state_dict(original.decoder.pos_emb.state_dict())
-    new_model.decoder.transformer_layers.load_state_dict(original.decoder.transformer_layers.state_dict())
-    new_model.decoder.layer_norm.load_state_dict(original.decoder.layer_norm.state_dict())
-
-    # ----------------------------------
-    # Make sure the outpus are identical
-    # ----------------------------------
-
-    logging.info("Make sure that the models' outputs are identical")
-    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
-
-    # prepare the model inputs
-    encoder_input_ids = tokenizer.encode("This is sample éàalj'-.")
-    encoder_input_ids.extend([tokenizer.pad_token_id] * (512 - len(encoder_input_ids)))
-    encoder_input_ids = torch.tensor(encoder_input_ids).unsqueeze(0)
-    decoder_input_ids = tokenizer.encode("This is sample 3 éàalj'-.")
-    decoder_input_ids.extend([tokenizer.pad_token_id] * (512 - len(decoder_input_ids)))
-    decoder_input_ids = torch.tensor(decoder_input_ids).unsqueeze(0)
-
-    # failsafe to make sure the weights reset does not affect the
-    # loaded weights.
-    assert torch.max(torch.abs(original.generator[0].weight - new_model.decoder.generator[0].weight)) == 0
-
-    # forward pass
-    src = encoder_input_ids
-    tgt = decoder_input_ids
-    segs = token_type_ids = None
-    clss = None
-    mask_src = encoder_attention_mask = None
-    mask_tgt = decoder_attention_mask = None
-    mask_cls = None
-    
-    # The original model does not apply the geneator layer immediatly but rather in
-    # the beam search (where it combines softmax + linear layer). Since we already
-    # apply the softmax in our generation process we only apply the linear layer here.
-    # We make sure that the outputs of the full stack are identical
-    output_original_model = original(src, tgt, segs, clss, mask_src, mask_tgt, mask_cls)[0]
-    output_original_model = original.generator(output_original_model)
-
-    output_converted_model = new_model(encoder_input_ids, decoder_input_ids, token_type_ids, encoder_attention_mask, decoder_attention_mask)[0]
-    output_converted_model = torch.nn.functional.log_softmax(output_converted_model, dim=-1)
-
-    maximum_absolute_difference = torch.max(torch.abs(output_converted_model - output_original_model)).item()
-    print("Maximum absolute difference beween weights: {:.2f}".format(maximum_absolute_difference))
-
-    are_identical = torch.allclose(output_converted_model, output_original_model, atol=1e-3)
-    if are_identical:
-        logging.info("all weights are equal up to 1e-3")
-    else:
-        raise ValueError("the weights are different. The new model is likely different from the original one.")
-
-    # The model has been saved with torch.save(model) and this is bound to the exact
-    # directory structure. We save the state_dict instead.
-    logging.info("saving the model's state dictionary")
-    torch.save(new_model.state_dict(), "bert-ext-abs.pt")
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--bertabs_checkpoint_path",
-        default=None,
-        type=str,
-        required=True,
-        help="Path the official PyTorch dump.",
-    )
-    parser.add_argument(
-        "--pytorch_dump_folder_path",
-        default=None,
-        type=str,
-        required=True,
-        help="Path to the output PyTorch model.",
-    )
-    args = parser.parse_args()
-
-    convert_bertabs_checkpoints(
-        args.bertabs_checkpoint_path,
-        args.pytorch_dump_folder_path,
-    )
diff --git a/examples/summarization/modeling_bertabs.py b/examples/summarization/modeling_bertabs.py
index d989e4fd7e..5bf1599ad2 100644
--- a/examples/summarization/modeling_bertabs.py
+++ b/examples/summarization/modeling_bertabs.py
@@ -1,6 +1,6 @@
 # MIT License
 
-# Copyright (c) 2019 Yang Liu
+# Copyright (c) 2019 Yang Liu and the HuggingFace team
 
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
diff --git a/examples/summarization/requirements.txt b/examples/summarization/requirements.txt
new file mode 100644
index 0000000000..36d75a5edc
--- /dev/null
+++ b/examples/summarization/requirements.txt
@@ -0,0 +1,9 @@
+# progress bars in model download and training scripts
+tqdm
+# Accessing files from S3 directly.
+boto3
+# Used for downloading models over HTTP
+requests
+# For ROUGE
+nltk
+py-rouge
diff --git a/examples/summarization/run_summarization.py b/examples/summarization/run_summarization.py
index c388569869..f58ce3bb43 100644
--- a/examples/summarization/run_summarization.py
+++ b/examples/summarization/run_summarization.py
@@ -1,3 +1,4 @@
+#! /usr/bin/python3
 import argparse
 from collections import namedtuple
 import logging
@@ -97,6 +98,32 @@ def evaluate(args):
         print(str_scores)
 
 
+def save_summaries(summaries, path, original_document_name):
+    """ Write the summaries in fies that are prefixed by the original
+    files' name with the `_summary` appended.
+
+    Attributes:
+        original_document_names: List[string]
+            Name of the document that was summarized.
+        path: string
+            Path were the summaries will be written
+        summaries: List[string]
+            The summaries that we produced.
+    """
+    for summary, document_name in zip(summaries, original_document_name):
+        # Prepare the summary file's name
+        if "." in document_name:
+            bare_document_name = ".".join(document_name.split(".")[:-1])
+            extension = document_name.split(".")[-1]
+            name = bare_document_name + "_summary." + extension
+        else:
+            name = document_name + "_summary"
+
+        file_path = os.path.join(path, name)
+        with open(file_path, "w") as output:
+            output.write(summary)
+
+
 def format_summary(translation):
     """ Transforms the output of the `from_batch` function
     into nicely formatted summaries.
@@ -151,32 +178,6 @@ def save_rouge_scores(str_scores):
         output.write(str_scores)
 
 
-def save_summaries(summaries, path, original_document_name):
-    """ Write the summaries in fies that are prefixed by the original
-    files' name with the `_summary` appended.
-
-    Attributes:
-        original_document_names: List[string]
-            Name of the document that was summarized.
-        path: string
-            Path were the summaries will be written
-        summaries: List[string]
-            The summaries that we produced.
-    """
-    for summary, document_name in zip(summaries, original_document_name):
-        # Prepare the summary file's name
-        if "." in document_name:
-            bare_document_name = ".".join(document_name.split(".")[:-1])
-            extension = document_name.split(".")[-1]
-            name = bare_document_name + "_summary." + extension
-        else:
-            name = document_name + "_summary"
-
-        file_path = os.path.join(path, name)
-        with open(file_path, "w") as output:
-            output.write(summary)
-
-
 #
 # LOAD the dataset
 #
@@ -323,7 +324,7 @@ def main():
         raise FileNotFoundError(
             "We could not find the directory you specified for the documents to summarize, or it was empty. Please specify a valid path."
         )
-    maybe_create_output_dir(args.summaries_output_dir)
+    os.makedirs(args.summaries_output_dir, exist_ok=True)
 
     evaluate(args)
 
@@ -339,10 +340,5 @@ def documents_dir_is_valid(path):
     return True
 
 
-def maybe_create_output_dir(path):
-    if not os.path.exists(path):
-        os.makedirs(path)
-
-
 if __name__ == "__main__":
     main()
diff --git a/requirements.txt b/requirements.txt
index 2cbcc3809d..4a3162adce 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,6 +10,3 @@ regex
 sentencepiece
 # For XLM
 sacremoses
-# For ROUGE
-nltk
-py-rouge
diff --git a/transformers/convert_bertextabs_original_pytorch_checkpoint_to_pytorch.py b/transformers/convert_bertextabs_original_pytorch_checkpoint_to_pytorch.py
deleted file mode 100644
index 4f158966e1..0000000000
--- a/transformers/convert_bertextabs_original_pytorch_checkpoint_to_pytorch.py
+++ /dev/null
@@ -1,158 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" Convert BertExtAbs's checkpoints """
-
-import argparse
-from collections import namedtuple
-import logging
-
-import torch
-
-from models.model_builder import AbsSummarizer  # The authors' implementation
-
-from transformers import BertConfig, Model2Model, BertModel, BertForMaskedLM
-
-
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
-
-
-BertExtAbsConfig = namedtuple(
-    "BertExtAbsConfig",
-    ["temp_dir", "large", "finetune_bert", "encoder", "share_emb", "max_pos", "enc_layers", "enc_hidden_size", "enc_heads", "enc_ff_size", "enc_dropout", "dec_layers", "dec_hidden_size", "dec_heads", "dec_ff_size", "dec_dropout"],
-)
-
-
-def convert_bertextabs_checkpoints(path_to_checkpoints, dump_path):
-    """ Copy/paste and tweak the pre-trained weights provided by the creators
-    of BertExtAbs for the internal architecture.
-    """
-
-    # Load checkpoints in memory
-    checkpoints = torch.load(path_to_checkpoints, lambda storage, loc: storage)
-
-    # Instantiate the authors' model with the pre-trained weights
-    config = BertExtAbsConfig(
-        temp_dir=".",
-        finetune_bert=False,
-        large=False,
-        share_emb=True,
-        encoder="bert",
-        max_pos=512,
-        enc_layers=6,
-        enc_hidden_size=512,
-        enc_heads=8,
-        enc_ff_size=512,
-        enc_dropout=0.2,
-        dec_layers=6,
-        dec_hidden_size=768,
-        dec_heads=8,
-        dec_ff_size=2048,
-        dec_dropout=0.2,
-    )
-    bertextabs = AbsSummarizer(config, torch.device("cpu"), checkpoints)
-    bertextabs.eval()
-
-    # Instantiate our version of the model
-    decoder_config = BertConfig(
-        hidden_size=config.dec_hidden_size,
-        num_hidden_layers=config.dec_layers,
-        num_attention_heads=config.dec_heads,
-        intermediate_size=config.dec_ff_size,
-        hidden_dropout_prob=config.dec_dropout,
-        attention_probs_dropout_prob=config.dec_dropout,
-        is_decoder=True,
-    )
-
-    decoder_model = BertForMaskedLM(decoder_config)
-    model = Model2Model.from_pretrained('bert-base-uncased', decoder_model=decoder_model)
-    model.eval()
-
-    # Let us now start the weight copying process
-    model.encoder.load_state_dict(bertextabs.bert.model.state_dict())
-
-    # Decoder
-
-    # Embeddings. The positional embeddings are equal to the word embedding plus a modulation
-    # that is computed at each forward pass. This may be a source of discrepancy.
-    model.decoder.bert.embeddings.word_embeddings.weight = bertextabs.decoder.embeddings.weight
-    model.decoder.bert.embeddings.position_embeddings.weight = bertextabs.decoder.embeddings.weight
-    model.decoder.bert.embeddings.token_type_embeddings.weight.data = torch.zeros_like(bertextabs.decoder.embeddings.weight)  # not defined for BertExtAbs decoder
-
-    # In the original code the LayerNorms are applied twice in the layers, at the beginning and between the
-    # attention layers.
-    model.decoder.bert.embeddings.LayerNorm.weight = bertextabs.decoder.transformer_layers[0].layer_norm_1.weight
-
-    for i in range(config.dec_layers):
-
-        # self attention
-        model.decoder.bert.encoder.layer[i].attention.self.query.weight = bertextabs.decoder.transformer_layers[i].self_attn.linear_query.weight
-        model.decoder.bert.encoder.layer[i].attention.self.key.weight = bertextabs.decoder.transformer_layers[i].self_attn.linear_keys.weight
-        model.decoder.bert.encoder.layer[i].attention.self.value.weight = bertextabs.decoder.transformer_layers[i].self_attn.linear_values.weight
-        model.decoder.bert.encoder.layer[i].attention.output.dense.weight = bertextabs.decoder.transformer_layers[i].self_attn.final_linear.weight
-        model.decoder.bert.encoder.layer[i].attention.output.LayerNorm.weight = bertextabs.decoder.transformer_layers[i].layer_norm_2.weight
-
-        # attention
-        model.decoder.bert.encoder.layer[i].crossattention.self.query.weight = bertextabs.decoder.transformer_layers[i].context_attn.linear_query.weight
-        model.decoder.bert.encoder.layer[i].crossattention.self.key.weight = bertextabs.decoder.transformer_layers[i].context_attn.linear_keys.weight
-        model.decoder.bert.encoder.layer[i].crossattention.self.value.weight = bertextabs.decoder.transformer_layers[i].context_attn.linear_values.weight
-        model.decoder.bert.encoder.layer[i].crossattention.output.dense.weight = bertextabs.decoder.transformer_layers[i].context_attn.final_linear.weight
-        model.decoder.bert.encoder.layer[i].crossattention.output.LayerNorm.weight = bertextabs.decoder.transformer_layers[i].feed_forward.layer_norm.weight
-
-        # intermediate
-        model.decoder.bert.encoder.layer[i].intermediate.dense.weight = bertextabs.decoder.transformer_layers[i].feed_forward.w_1.weight
-
-        # output
-        model.decoder.bert.encoder.layer[i].output.dense.weight = bertextabs.decoder.transformer_layers[i].feed_forward.w_2.weight
-
-        try:
-            model.decoder.bert.encoder.layer[i].output.LayerNorm.weight = bertextabs.decoder.transformer_layers[i + 1].layer_norm_1.weight
-        except IndexError:
-            model.decoder.bert.encoder.layer[i].output.LayerNorm.weight = bertextabs.decoder.layer_norm.weight
-
-    # LM Head
-    """
-    model.decoder.cls.predictions.transform.dense.weight
-    model.decoder.cls.predictions.transform.dense.biais
-    model.decoder.cls.predictions.transform.LayerNorm.weight
-    model.decoder.cls.predictions.transform.LayerNorm.biais
-    model.decoder.cls.predictions.decoder.weight
-    model.decoder.cls.predictions.decoder.biais
-    model.decoder.cls.predictions.biais.data
-    """
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--bertextabs_checkpoint_path",
-        default=None,
-        type=str,
-        required=True,
-        help="Path the official PyTorch dump.",
-    )
-    parser.add_argument(
-        "--pytorch_dump_folder_path",
-        default=None,
-        type=str,
-        required=True,
-        help="Path to the output PyTorch model.",
-    )
-    args = parser.parse_args()
-
-    convert_bertextabs_checkpoints(
-        args.bertextabs_checkpoint_path,
-        args.pytorch_dump_folder_path,
-    )
diff --git a/transformers/generate/__init__.py b/transformers/generate/__init__.py
deleted file mode 100644
index 21ac612155..0000000000
--- a/transformers/generate/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-from .beam_search import BeamSearch
diff --git a/transformers/modeling_encoder_decoder.py b/transformers/modeling_encoder_decoder.py
index 73322101d3..a884abd0a2 100644
--- a/transformers/modeling_encoder_decoder.py
+++ b/transformers/modeling_encoder_decoder.py
@@ -117,7 +117,8 @@ class PreTrainedEncoderDecoder(nn.Module):
         kwargs_common = {
             argument: value
             for argument, value in kwargs.items()
-            if not argument.startswith("encoder_") and not argument.startswith("decoder_")
+            if not argument.startswith("encoder_")
+            and not argument.startswith("decoder_")
         }
         kwargs_decoder = kwargs_common.copy()
         kwargs_encoder = kwargs_common.copy()
@@ -157,27 +158,14 @@ class PreTrainedEncoderDecoder(nn.Module):
 
         return model
 
-    def save_pretrained(self, save_directory, model_type="bert"):
-        """ Save an EncoderDecoder model and its configuration file in a format such
+    def save_pretrained(self, save_directory):
+        """ Save a Seq2Seq model and its configuration file in a format such
         that it can be loaded using `:func:`~transformers.PreTrainedEncoderDecoder.from_pretrained`
 
         We save the encoder' and decoder's parameters in two separate directories.
-
-        If we want the weight loader to function we need to preprend the model
-        type to the directories' names. As far as I know there is no simple way
-        to infer the type of the model (except maybe by parsing the class'
-        names, which is not very future-proof). For now, we ask the user to
-        specify the model type explicitly when saving the weights.
         """
-        encoder_path = os.path.join(save_directory, "{}_encoder".format(model_type))
-        if not os.path.exists(encoder_path):
-            os.makedirs(encoder_path)
-        self.encoder.save_pretrained(encoder_path)
-
-        decoder_path = os.path.join(save_directory, "{}_decoder".format(model_type))
-        if not os.path.exists(decoder_path):
-            os.makedirs(decoder_path)
-        self.decoder.save_pretrained(decoder_path)
+        self.encoder.save_pretrained(os.path.join(save_directory, "encoder"))
+        self.decoder.save_pretrained(os.path.join(save_directory, "decoder"))
 
     def forward(self, encoder_input_ids, decoder_input_ids, **kwargs):
         """ The forward pass on a seq2eq depends what we are performing:
@@ -205,7 +193,8 @@ class PreTrainedEncoderDecoder(nn.Module):
         kwargs_common = {
             argument: value
             for argument, value in kwargs.items()
-            if not argument.startswith("encoder_") and not argument.startswith("decoder_")
+            if not argument.startswith("encoder_")
+            and not argument.startswith("decoder_")
         }
         kwargs_decoder = kwargs_common.copy()
         kwargs_encoder = kwargs_common.copy()
@@ -228,7 +217,9 @@ class PreTrainedEncoderDecoder(nn.Module):
         encoder_hidden_states = kwargs_encoder.pop("hidden_states", None)
         if encoder_hidden_states is None:
             encoder_outputs = self.encoder(encoder_input_ids, **kwargs_encoder)
-            encoder_hidden_states = encoder_outputs[0]  # output the last layer hidden state
+            encoder_hidden_states = encoder_outputs[
+                0
+            ]  # output the last layer hidden state
         else:
             encoder_outputs = ()
 

From 1d189304624db17749aee23fa2345f009cc48215 Mon Sep 17 00:00:00 2001
From: Julien Chaumond <chaumond@gmail.com>
Date: Tue, 10 Dec 2019 01:32:42 +0000
Subject: [PATCH 65/91] Harmonize `no_cuda` flag with other scripts

---
 examples/summarization/run_summarization.py | 4 ++--
 requirements.txt                            | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/examples/summarization/run_summarization.py b/examples/summarization/run_summarization.py
index f58ce3bb43..3c339d0c30 100644
--- a/examples/summarization/run_summarization.py
+++ b/examples/summarization/run_summarization.py
@@ -272,7 +272,7 @@ def main():
     )
     # EVALUATION options
     parser.add_argument(
-        "--to_cpu",
+        "--no_cuda",
         default=False,
         type=bool,
         help="Whether to force the execution on CPU.",
@@ -314,7 +314,7 @@ def main():
     args = parser.parse_args()
 
     # Select device (distibuted not available)
-    args.device = torch.device("cuda" if torch.cuda.is_available() and not args.to_cpu else "cpu")
+    args.device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
 
     # Check the existence of directories
     if not args.summaries_output_dir:
diff --git a/requirements.txt b/requirements.txt
index 4a3162adce..9c43abc6d7 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -9,4 +9,4 @@ regex
 # For XLNet
 sentencepiece
 # For XLM
-sacremoses
+sacremoses
\ No newline at end of file

From 4b82c485de187896a38c441587b7bd4d04f2821e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remilouf@gmail.com>
Date: Tue, 10 Dec 2019 14:49:53 +0100
Subject: [PATCH 66/91] remove misplaced summarization documentation

---
 examples/README.md | 30 ------------------------------
 1 file changed, 30 deletions(-)

diff --git a/examples/README.md b/examples/README.md
index 620304ea77..b6b3908810 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -24,8 +24,6 @@ pip install -r ./examples/requirements.txt
 | [Multiple Choice](#multiple-choice) | Examples running BERT/XLNet/RoBERTa on the SWAG/RACE/ARC tasks. 
 | [Named Entity Recognition](#named-entity-recognition) | Using BERT for Named Entity Recognition (NER) on the CoNLL 2003 dataset, examples with distributed training.                                                                                  |
 | [XNLI](#xnli) | Examples running BERT/XLM on the XNLI benchmark. |
-| [Abstractive summarization](#abstractive-summarization) | Using the BertAbs
-model finetuned on the CNN/DailyMail dataset to generate summaries. |
 
 ## TensorFlow 2.0 Bert models on GLUE
 
@@ -646,34 +644,6 @@ micro avg     0.8722    0.8774    0.8748     13869
 macro avg     0.8712    0.8774    0.8740     13869
 ```
 
-## Abstractive summarization
-
-Based on the script
-[`run_summarization_finetuning.py`](https://github.com/huggingface/transformers/blob/master/examples/run_summarization_finetuning.py).
-
-Before running this script you should download **both** CNN and Daily Mail
-datasets from [Kyunghyun Cho's website](https://cs.nyu.edu/~kcho/DMQA/)  (the
-links next to "Stories") in the same folder. Then uncompress the archives by running:
-
-```bash
-tar -xvf cnn_stories.tgz && tar -xvf dailymail_stories.tgz
-```
-
-note that the finetuning script **will not work** if you do not download both
-datasets. We will refer as `$DATA_PATH` the path to where you uncompressed both
-archive.
-
-```bash
-export DATA_PATH=/path/to/dataset/
-
-python run_summarization_finetuning.py \
-    --output_dir=output \
-    --model_type=bert2bert \
-    --model_name_or_path=bert2bert \
-    --do_train \
-    --data_path=$DATA_PATH \
-```
-
 ## XNLI
 
 Based on the script [`run_xnli.py`](https://github.com/huggingface/transformers/blob/master/examples/run_xnli.py).

From dc4e9e5cb36ae9bf5185b49b1cbc9106857abd54 Mon Sep 17 00:00:00 2001
From: Lysandre <lysandre.debut@reseau.eseo.fr>
Date: Tue, 10 Dec 2019 19:21:20 +0000
Subject: [PATCH 67/91] DataParallel for SQuAD + fix XLM

---
 examples/run_squad.py                      | 6 +++++-
 transformers/data/metrics/squad_metrics.py | 7 ++++++-
 transformers/tokenization_xlm.py           | 4 ++++
 3 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/examples/run_squad.py b/examples/run_squad.py
index 2df29014ef..5e3f9663e2 100644
--- a/examples/run_squad.py
+++ b/examples/run_squad.py
@@ -299,10 +299,14 @@ def evaluate(args, model, tokenizer, prefix=""):
 
     # XLNet and XLM use a more complex post-processing procedure
     if args.model_type in ['xlnet', 'xlm']:
+
+        start_n_top = model.config.start_n_top if hasattr(model, "config") else model.module.config.start_n_top
+        end_n_top = model.config.end_n_top if hasattr(model, "config") else model.module.config.end_n_top
+
         predictions = compute_predictions_log_probs(examples, features, all_results, args.n_best_size,
                         args.max_answer_length, output_prediction_file,
                         output_nbest_file, output_null_log_odds_file,
-                        model.config.start_n_top, model.config.end_n_top,
+                        start_n_top, end_n_top,
                         args.version_2_with_negative, tokenizer, args.verbose_logging)
     else:
         predictions = compute_predictions_logits(examples, features, all_results, args.n_best_size,
diff --git a/transformers/data/metrics/squad_metrics.py b/transformers/data/metrics/squad_metrics.py
index 0755c0ab7a..7b03255f49 100644
--- a/transformers/data/metrics/squad_metrics.py
+++ b/transformers/data/metrics/squad_metrics.py
@@ -695,7 +695,12 @@ def compute_predictions_log_probs(
             tok_text = " ".join(tok_text.split())
             orig_text = " ".join(orig_tokens)
 
-            final_text = get_final_text(tok_text, orig_text, tokenizer.do_lower_case,
+            if hasattr(tokenizer, "do_lower_case"):
+                do_lower_case = tokenizer.do_lower_case
+            else:
+                do_lower_case = tokenizer.do_lowercase_and_remove_accent
+
+            final_text = get_final_text(tok_text, orig_text, do_lower_case,
                                         verbose_logging)
 
             if final_text in seen_predictions:
diff --git a/transformers/tokenization_xlm.py b/transformers/tokenization_xlm.py
index 6c9f8e5e5c..8def80bec4 100644
--- a/transformers/tokenization_xlm.py
+++ b/transformers/tokenization_xlm.py
@@ -549,6 +549,10 @@ class XLMTokenizer(PreTrainedTokenizer):
                                            additional_special_tokens=additional_special_tokens,
                                            **kwargs)
 
+
+        self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
+        self.max_len_sentences_pair = self.max_len - 3  # take into account special tokens 
+
         # cache of sm.MosesPunctNormalizer instance
         self.cache_moses_punct_normalizer = dict()
         # cache of sm.MosesTokenizer instance

From 6a73382706ce3c6905023872f63a680f0eb419a4 Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Tue, 10 Dec 2019 14:33:24 -0500
Subject: [PATCH 68/91] Complete warning + cleanup

---
 examples/run_squad.py              | 1 -
 transformers/tokenization_utils.py | 4 ++--
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/examples/run_squad.py b/examples/run_squad.py
index 5e3f9663e2..79c8537a4b 100644
--- a/examples/run_squad.py
+++ b/examples/run_squad.py
@@ -299,7 +299,6 @@ def evaluate(args, model, tokenizer, prefix=""):
 
     # XLNet and XLM use a more complex post-processing procedure
     if args.model_type in ['xlnet', 'xlm']:
-
         start_n_top = model.config.start_n_top if hasattr(model, "config") else model.module.config.start_n_top
         end_n_top = model.config.end_n_top if hasattr(model, "config") else model.module.config.end_n_top
 
diff --git a/transformers/tokenization_utils.py b/transformers/tokenization_utils.py
index f4395cd82c..cb931b0eaf 100644
--- a/transformers/tokenization_utils.py
+++ b/transformers/tokenization_utils.py
@@ -226,7 +226,7 @@ class PreTrainedTokenizer(object):
 
         self.max_len = max_len if max_len is not None else int(1e12)
 
-        # Padding side is right by default and over-riden in subclsses. If specified in the kwargs, it is changed.
+        # Padding side is right by default and overridden in subclasses. If specified in the kwargs, it is changed.
         self.padding_side = kwargs.pop('padding_side', self.padding_side)
         
         # Added tokens
@@ -1003,7 +1003,7 @@ class PreTrainedTokenizer(object):
         )
 
         if pad_to_max_length and max_length is None and self.max_len > 10000:
-            logger.warning("Sequence can't be padded as the maximum  ")
+            logger.warning("Sequence can't be padded as no maximum length is specified and the model maximum length is too high.")
 
         if needs_to_be_padded:
             difference = (max_length if max_length is not None else self.max_len) - len(encoded_inputs["input_ids"])

From 58d75aa310e872723ba92ee1f0cb575ae9e2eaef Mon Sep 17 00:00:00 2001
From: Leo Dirac <deepembedding@gmail.com>
Date: Tue, 10 Dec 2019 11:36:56 -0800
Subject: [PATCH 69/91] Progress indicator improvements when downloading
 pre-trained models.

---
 transformers/file_utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/transformers/file_utils.py b/transformers/file_utils.py
index 24abd60781..68de4e6e2f 100644
--- a/transformers/file_utils.py
+++ b/transformers/file_utils.py
@@ -21,7 +21,7 @@ import boto3
 from botocore.config import Config
 from botocore.exceptions import ClientError
 import requests
-from tqdm import tqdm
+from tqdm.auto import tqdm
 from contextlib import contextmanager
 
 logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
@@ -245,7 +245,7 @@ def http_get(url, temp_file, proxies=None, resume_size=0):
         return
     content_length = response.headers.get('Content-Length')
     total = resume_size + int(content_length) if content_length is not None else None
-    progress = tqdm(unit="B", total=total, initial=resume_size)
+    progress = tqdm(unit="B", unit_scale=True, total=total, initial=resume_size, desc="Downloading")
     for chunk in response.iter_content(chunk_size=1024):
         if chunk: # filter out keep-alive new chunks
             progress.update(len(chunk))

From 4c12860f7ae61659aed2675498350a386fc4e122 Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Wed, 11 Dec 2019 09:22:37 -0500
Subject: [PATCH 70/91] Remove misleading documentation

---
 transformers/tokenization_utils.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/transformers/tokenization_utils.py b/transformers/tokenization_utils.py
index cb931b0eaf..68a767fe82 100644
--- a/transformers/tokenization_utils.py
+++ b/transformers/tokenization_utils.py
@@ -628,7 +628,6 @@ class PreTrainedTokenizer(object):
             Take care of added tokens.
 
             text: The sequence to be encoded.
-            return_tokens_mapped_to_origin: (optional) Set to True to return the index of each token in the initial whitespace tokenization. (default False).
             **kwargs: passed to the child `self.tokenize()` method
         """
         def lowercase_text(t):

From 2e2f9fed554bb5f147ea3d9573004b447dd7c9e7 Mon Sep 17 00:00:00 2001
From: Julien Chaumond <chaumond@gmail.com>
Date: Wed, 11 Dec 2019 11:11:56 -0500
Subject: [PATCH 71/91] rm duplicate imports

---
 transformers/modeling_auto.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/transformers/modeling_auto.py b/transformers/modeling_auto.py
index b63e43d73b..6ba1aab7a3 100644
--- a/transformers/modeling_auto.py
+++ b/transformers/modeling_auto.py
@@ -28,7 +28,6 @@ from .modeling_xlm import XLMModel, XLMWithLMHeadModel, XLMForSequenceClassifica
 from .modeling_roberta import RobertaModel, RobertaForMaskedLM, RobertaForSequenceClassification
 from .modeling_distilbert import DistilBertModel, DistilBertForQuestionAnswering, DistilBertForMaskedLM, DistilBertForSequenceClassification
 from .modeling_camembert import CamembertModel, CamembertForMaskedLM, CamembertForSequenceClassification, CamembertForMultipleChoice
-from .modeling_camembert import CamembertModel, CamembertForMaskedLM, CamembertForSequenceClassification, CamembertForMultipleChoice
 from .modeling_albert import AlbertModel, AlbertForMaskedLM, AlbertForSequenceClassification, AlbertForQuestionAnswering
 
 from .modeling_utils import PreTrainedModel, SequenceSummary

From 29570db25ba9dd30e5ac9be68dbcad95434964ec Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Wed, 11 Dec 2019 17:19:18 +0100
Subject: [PATCH 72/91] allowing from_pretrained to load from url directly

---
 transformers/modeling_tf_utils.py | 4 +++-
 transformers/modeling_utils.py    | 7 +++++--
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/transformers/modeling_tf_utils.py b/transformers/modeling_tf_utils.py
index ed8fdb74c9..e7512b5bd6 100644
--- a/transformers/modeling_tf_utils.py
+++ b/transformers/modeling_tf_utils.py
@@ -259,8 +259,10 @@ class TFPreTrainedModel(tf.keras.Model):
                         pretrained_model_name_or_path))
             elif os.path.isfile(pretrained_model_name_or_path):
                 archive_file = pretrained_model_name_or_path
+            elif os.path.isfile(pretrained_model_name_or_path + ".index"):
+                archive_file = pretrained_model_name_or_path + ".index"
             else:
-                raise EnvironmentError("Error file {} not found".format(pretrained_model_name_or_path))
+                archive_file = pretrained_model_name_or_path
 
             # redirect to the cache, if necessary
             try:
diff --git a/transformers/modeling_utils.py b/transformers/modeling_utils.py
index 3ac568771e..9e7ca8d689 100644
--- a/transformers/modeling_utils.py
+++ b/transformers/modeling_utils.py
@@ -365,9 +365,12 @@ class PreTrainedModel(nn.Module):
                         pretrained_model_name_or_path))
             elif os.path.isfile(pretrained_model_name_or_path):
                 archive_file = pretrained_model_name_or_path
-            else:
-                assert from_tf, "Error finding file {}, no file or TF 1.X checkpoint found".format(pretrained_model_name_or_path)
+            elif os.path.isfile(pretrained_model_name_or_path + ".index"):
+                assert from_tf, "We found a TensorFlow checkpoint at {}, please set from_tf to True to load from this checkpoint".format(
+                    pretrained_model_name_or_path + ".index")
                 archive_file = pretrained_model_name_or_path + ".index"
+            else:
+                archive_file = pretrained_model_name_or_path
 
             # redirect to the cache, if necessary
             try:

From 030faccb8d45be9bdd2b4b80ff26f36dc41f622a Mon Sep 17 00:00:00 2001
From: Stefan Schweter <stefan.schweter@bsb-muenchen.de>
Date: Wed, 11 Dec 2019 17:44:21 +0100
Subject: [PATCH 73/91] doc: fix pretrained models table

---
 docs/source/pretrained_models.rst | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/docs/source/pretrained_models.rst b/docs/source/pretrained_models.rst
index dd61f11769..2fe1f8a314 100644
--- a/docs/source/pretrained_models.rst
+++ b/docs/source/pretrained_models.rst
@@ -169,35 +169,35 @@ Here is the full list of the currently provided pretrained models together with
 +-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 | ALBERT            | ``albert-base-v1``                                         | | 12 repeating layers, 128 embedding, 768-hidden, 12-heads, 11M parameters                                                            |
 |                   |                                                            | | ALBERT base model                                                                                                                   |
-|                   |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                             |
+|                   |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                                                         |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                   | ``albert-large-v1``                                        | | 24 repeating layers, 128 embedding, 1024-hidden, 16-heads, 17M parameters                                                           |
 |                   |                                                            | | ALBERT large model                                                                                                                  |
-|                   |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                             |
+|                   |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                                                         |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                   | ``albert-xlarge-v1``                                       | | 24 repeating layers, 128 embedding, 2048-hidden, 16-heads, 58M parameters                                                           |
 |                   |                                                            | | ALBERT xlarge model                                                                                                                 |
-|                   |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                             |
+|                   |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                                                         |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                   | ``albert-xxlarge-v1``                                      | | 12 repeating layer, 128 embedding, 4096-hidden, 64-heads, 223M parameters                                                           |
 |                   |                                                            | | ALBERT xxlarge model                                                                                                                |
-|                   |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                             |
+|                   |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                                                         |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                   | ``albert-base-v2``                                         | | 12 repeating layers, 128 embedding, 768-hidden, 12-heads, 11M parameters                                                            |
 |                   |                                                            | | ALBERT base model with no dropout, additional training data and longer training                                                     |
-|                   |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                             |
+|                   |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                                                         |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                   | ``albert-large-v2``                                        | | 24 repeating layers, 128 embedding, 1024-hidden, 16-heads, 17M parameters                                                           |
 |                   |                                                            | | ALBERT large model with no dropout, additional training data and longer training                                                    |
-|                   |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                             |
+|                   |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                                                         |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                   | ``albert-xlarge-v2``                                       | | 24 repeating layers, 128 embedding, 2048-hidden, 16-heads, 58M parameters                                                           |
 |                   |                                                            | | ALBERT xlarge model with no dropout, additional training data and longer training                                                   |
-|                   |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                             |
+|                   |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                                                         |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                   | ``albert-xxlarge-v2``                                      | | 12 repeating layer, 128 embedding, 4096-hidden, 64-heads, 223M parameters                                                           |
 |                   |                                                            | | ALBERT xxlarge model with no dropout, additional training data and longer training                                                  |
-|                   |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                             |
+|                   |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                                                         |
 +-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 
 

From c999a3e5050f1dc93d814abf352f3bf0c06572e7 Mon Sep 17 00:00:00 2001
From: Julien Chaumond <chaumond@gmail.com>
Date: Wed, 11 Dec 2019 12:29:58 -0500
Subject: [PATCH 74/91] Allow from_pretrained to take a remote identifier

---
 transformers/configuration_utils.py |  8 +++++---
 transformers/file_utils.py          | 20 ++++++++++++++++----
 transformers/modeling_utils.py      |  8 +++++---
 transformers/tokenization_utils.py  | 10 +++++-----
 4 files changed, 31 insertions(+), 15 deletions(-)

diff --git a/transformers/configuration_utils.py b/transformers/configuration_utils.py
index 08cee75d81..8ae30f2a48 100644
--- a/transformers/configuration_utils.py
+++ b/transformers/configuration_utils.py
@@ -24,7 +24,7 @@ import logging
 import os
 from io import open
 
-from .file_utils import cached_path, CONFIG_NAME
+from .file_utils import CONFIG_NAME, cached_path, is_remote_url, hf_bucket_url
 
 logger = logging.getLogger(__name__)
 
@@ -131,8 +131,10 @@ class PretrainedConfig(object):
             config_file = cls.pretrained_config_archive_map[pretrained_model_name_or_path]
         elif os.path.isdir(pretrained_model_name_or_path):
             config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME)
-        else:
+        elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path):
             config_file = pretrained_model_name_or_path
+        else:
+            config_file = hf_bucket_url(pretrained_model_name_or_path, postfix=CONFIG_NAME)
         # redirect to the cache, if necessary
         try:
             resolved_config_file = cached_path(config_file, cache_dir=cache_dir, force_download=force_download,
@@ -187,7 +189,7 @@ class PretrainedConfig(object):
 
     @classmethod
     def from_json_file(cls, json_file):
-        """Constructs a `BertConfig` from a json file of parameters."""
+        """Constructs a `Config` from a json file of parameters."""
         with open(json_file, "r", encoding='utf-8') as reader:
             text = reader.read()
         return cls.from_dict(json.loads(text))
diff --git a/transformers/file_utils.py b/transformers/file_utils.py
index 68de4e6e2f..5fd5e2ee39 100644
--- a/transformers/file_utils.py
+++ b/transformers/file_utils.py
@@ -73,6 +73,8 @@ TF2_WEIGHTS_NAME = 'tf_model.h5'
 TF_WEIGHTS_NAME = 'model.ckpt'
 CONFIG_NAME = "config.json"
 
+S3_BUCKET_PREFIX = "https://s3.amazonaws.com/models.huggingface.co/bert"
+
 def is_torch_available():
     return _torch_available
 
@@ -103,6 +105,18 @@ else:
             return fn
         return docstring_decorator
 
+
+def is_remote_url(url_or_filename):
+    parsed = urlparse(url_or_filename)
+    return parsed.scheme in ('http', 'https', 's3')
+
+def hf_bucket_url(identifier, postfix=None):
+    if postfix is None:
+        return "/".join((S3_BUCKET_PREFIX, identifier))
+    else:
+        return "/".join((S3_BUCKET_PREFIX, identifier, postfix))
+
+
 def url_to_filename(url, etag=None):
     """
     Convert `url` into a hashed filename in a repeatable way.
@@ -171,9 +185,7 @@ def cached_path(url_or_filename, cache_dir=None, force_download=False, proxies=N
     if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
         cache_dir = str(cache_dir)
 
-    parsed = urlparse(url_or_filename)
-
-    if parsed.scheme in ('http', 'https', 's3'):
+    if is_remote_url(url_or_filename):
         # URL, so get it from the cache (downloading if necessary)
         return get_from_cache(url_or_filename, cache_dir=cache_dir,
             force_download=force_download, proxies=proxies,
@@ -181,7 +193,7 @@ def cached_path(url_or_filename, cache_dir=None, force_download=False, proxies=N
     elif os.path.exists(url_or_filename):
         # File, and it exists.
         return url_or_filename
-    elif parsed.scheme == '':
+    elif urlparse(url_or_filename).scheme == '':
         # File, but it doesn't exist.
         raise EnvironmentError("file {} not found".format(url_or_filename))
     else:
diff --git a/transformers/modeling_utils.py b/transformers/modeling_utils.py
index 9e7ca8d689..eac4252336 100644
--- a/transformers/modeling_utils.py
+++ b/transformers/modeling_utils.py
@@ -31,7 +31,8 @@ from torch.nn import CrossEntropyLoss
 from torch.nn import functional as F
 
 from .configuration_utils import PretrainedConfig
-from .file_utils import cached_path, WEIGHTS_NAME, TF_WEIGHTS_NAME, TF2_WEIGHTS_NAME
+from .file_utils import (TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME, WEIGHTS_NAME,
+                         cached_path, hf_bucket_url, is_remote_url)
 
 logger = logging.getLogger(__name__)
 
@@ -363,14 +364,15 @@ class PreTrainedModel(nn.Module):
                     raise EnvironmentError("Error no file named {} found in directory {} or `from_tf` set to False".format(
                         [WEIGHTS_NAME, TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME + ".index"],
                         pretrained_model_name_or_path))
-            elif os.path.isfile(pretrained_model_name_or_path):
+            elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path):
                 archive_file = pretrained_model_name_or_path
             elif os.path.isfile(pretrained_model_name_or_path + ".index"):
                 assert from_tf, "We found a TensorFlow checkpoint at {}, please set from_tf to True to load from this checkpoint".format(
                     pretrained_model_name_or_path + ".index")
                 archive_file = pretrained_model_name_or_path + ".index"
             else:
-                archive_file = pretrained_model_name_or_path
+                archive_file = hf_bucket_url(pretrained_model_name_or_path, postfix=WEIGHTS_NAME)
+                # todo do we want to support TF checkpoints here?
 
             # redirect to the cache, if necessary
             try:
diff --git a/transformers/tokenization_utils.py b/transformers/tokenization_utils.py
index 68a767fe82..2b2cec0c15 100644
--- a/transformers/tokenization_utils.py
+++ b/transformers/tokenization_utils.py
@@ -25,7 +25,7 @@ import itertools
 import re
 from io import open
 
-from .file_utils import cached_path, is_tf_available, is_torch_available
+from .file_utils import cached_path, is_remote_url, hf_bucket_url, is_tf_available, is_torch_available
 
 if is_tf_available():
     import tensorflow as tf
@@ -327,12 +327,12 @@ class PreTrainedTokenizer(object):
                 if os.path.isdir(pretrained_model_name_or_path):
                     # If a directory is provided we look for the standard filenames
                     full_file_name = os.path.join(pretrained_model_name_or_path, file_name)
-                else:
+                elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path):
                     # If a path to a file is provided we use it (will only work for non-BPE tokenizer using a single vocabulary file)
                     full_file_name = pretrained_model_name_or_path
-                if not os.path.exists(full_file_name):
-                    logger.info("Didn't find file {}. We won't load it.".format(full_file_name))
-                    full_file_name = None
+                else:
+                    full_file_name = hf_bucket_url(pretrained_model_name_or_path, postfix=file_name)
+                
                 vocab_files[file_id] = full_file_name
 
             # Look for the additional tokens files

From 3d57c51111054adb01b2ea94bfd45237eb282431 Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Wed, 11 Dec 2019 15:10:17 -0500
Subject: [PATCH 75/91] Fix encode plus

---
 transformers/tokenization_utils.py | 39 ++++++++++++++++++------------
 1 file changed, 23 insertions(+), 16 deletions(-)

diff --git a/transformers/tokenization_utils.py b/transformers/tokenization_utils.py
index 68a767fe82..eace409555 100644
--- a/transformers/tokenization_utils.py
+++ b/transformers/tokenization_utils.py
@@ -916,7 +916,7 @@ class PreTrainedTokenizer(object):
             return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant
                 or PyTorch torch.Tensor instead of a list of python integers.
             return_token_type_ids: (optional) Set to False to avoid returning token_type_ids (default True).
-            return_attention_mask: (optional) Set to False to avoir returning attention mask (default True)
+            return_attention_mask: (optional) Set to False to avoid returning attention mask (default True)
             return_overflowing_tokens: (optional) Set to True to return overflowing token information (default False).
             return_special_tokens_mask: (optional) Set to True to return special tokens mask information (default False).
 
@@ -961,24 +961,13 @@ class PreTrainedTokenizer(object):
         if add_special_tokens:
             sequence = self.build_inputs_with_special_tokens(ids, pair_ids)
             token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids)
-            special_tokens_mask = self.get_special_tokens_mask(ids, pair_ids)
         else:
             sequence = ids + pair_ids if pair else ids
             token_type_ids = [0] * len(ids) + ([1] * len(pair_ids) if pair else [])
-            special_tokens_mask = [0] * (len(ids) + (len(pair_ids) if pair else 0))
+
         if return_special_tokens_mask:
             encoded_inputs["special_tokens_mask"] = self.get_special_tokens_mask(ids, pair_ids)
 
-        # Prepare inputs as tensors if asked
-        if return_tensors == 'tf' and is_tf_available():
-            sequence = tf.constant([sequence])
-            token_type_ids = tf.constant([token_type_ids])
-        elif return_tensors == 'pt' and is_torch_available():
-            sequence = torch.tensor([sequence])
-            token_type_ids = torch.tensor([token_type_ids])
-        elif return_tensors is not None:
-            logger.warning("Unable to convert output to tensors format {}, PyTorch or TensorFlow is not available.".format(return_tensors))
-
         encoded_inputs["input_ids"] = sequence
         if return_token_type_ids:
             encoded_inputs["token_type_ids"] = token_type_ids
@@ -1015,10 +1004,9 @@ class PreTrainedTokenizer(object):
                 if return_special_tokens_mask:
                     encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
                 encoded_inputs["input_ids"] = encoded_inputs["input_ids"] + [self.pad_token_id] * difference
-
             elif self.padding_side == 'left':
                 if return_attention_mask:
-                    encoded_inputs["attention_mask"] =  [0] * difference + [1] * len(encoded_inputs["input_ids"])
+                    encoded_inputs["attention_mask"] = [0] * difference + [1] * len(encoded_inputs["input_ids"])
                 if return_token_type_ids:
                     encoded_inputs["token_type_ids"] = [self.pad_token_type_id] * difference + encoded_inputs["token_type_ids"]
                 if return_special_tokens_mask:
@@ -1030,7 +1018,26 @@ class PreTrainedTokenizer(object):
             
         elif return_attention_mask:
             encoded_inputs["attention_mask"] = [1] * len(encoded_inputs["input_ids"])
-            
+
+        # Prepare inputs as tensors if asked
+        if return_tensors == 'tf' and is_tf_available():
+            encoded_inputs["input_ids"] = tf.constant([encoded_inputs["input_ids"]])
+            encoded_inputs["token_type_ids"] = tf.constant([encoded_inputs["token_type_ids"]])
+
+            if "attention_mask" in encoded_inputs:
+                encoded_inputs["attention_mask"] = tf.constant([encoded_inputs["attention_mask"]])
+
+        elif return_tensors == 'pt' and is_torch_available():
+            encoded_inputs["input_ids"] = torch.tensor([encoded_inputs["input_ids"]])
+            encoded_inputs["token_type_ids"] = torch.tensor([encoded_inputs["token_type_ids"]])
+
+            if "attention_mask" in encoded_inputs:
+                encoded_inputs["attention_mask"] = torch.tensor([encoded_inputs["attention_mask"]])
+        elif return_tensors is not None:
+            logger.warning(
+                "Unable to convert output to tensors format {}, PyTorch or TensorFlow is not available.".format(
+                    return_tensors))
+
         return encoded_inputs
 
     def truncate_sequences(self, ids, pair_ids=None, num_tokens_to_remove=0, truncation_strategy='longest_first', stride=0):

From 31e5b5ff2276c61af7eebb4c353934f8f675d728 Mon Sep 17 00:00:00 2001
From: Julien Chaumond <chaumond@gmail.com>
Date: Wed, 11 Dec 2019 15:22:02 -0500
Subject: [PATCH 76/91] Fix tests + first example of doc

---
 transformers/tokenization_utils.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/transformers/tokenization_utils.py b/transformers/tokenization_utils.py
index 2b2cec0c15..63d2cc5cb4 100644
--- a/transformers/tokenization_utils.py
+++ b/transformers/tokenization_utils.py
@@ -255,6 +255,7 @@ class PreTrainedTokenizer(object):
             pretrained_model_name_or_path: either:
 
                 - a string with the `shortcut name` of a predefined tokenizer to load from cache or download, e.g.: ``bert-base-uncased``.
+                - a string with the `identifier name` of a predefined tokenizer that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
                 - a path to a `directory` containing vocabulary files required by the tokenizer, for instance saved using the :func:`~transformers.PreTrainedTokenizer.save_pretrained` method, e.g.: ``./my_model_directory/``.
                 - (not applicable to all derived classes) a path or url to a single saved vocabulary file if and only if the tokenizer only requires a single vocabulary file (e.g. Bert, XLNet), e.g.: ``./my_model_directory/vocab.txt``.
 
@@ -282,6 +283,9 @@ class PreTrainedTokenizer(object):
             # Download vocabulary from S3 and cache.
             tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
 
+            # Download vocabulary from S3 (user-uploaded) and cache.
+            tokenizer = BertTokenizer.from_pretrained('dbmdz/bert-base-german-cased')
+
             # If vocabulary files are in a directory (e.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`)
             tokenizer = BertTokenizer.from_pretrained('./test/saved_model/')
 
@@ -327,6 +331,9 @@ class PreTrainedTokenizer(object):
                 if os.path.isdir(pretrained_model_name_or_path):
                     # If a directory is provided we look for the standard filenames
                     full_file_name = os.path.join(pretrained_model_name_or_path, file_name)
+                    if not os.path.exists(full_file_name):
+                        logger.info("Didn't find file {}. We won't load it.".format(full_file_name))
+                        full_file_name = None
                 elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path):
                     # If a path to a file is provided we use it (will only work for non-BPE tokenizer using a single vocabulary file)
                     full_file_name = pretrained_model_name_or_path

From 18e1f751f1d996c4fe01559ade1cd013186b81e4 Mon Sep 17 00:00:00 2001
From: Julien Chaumond <chaumond@gmail.com>
Date: Wed, 11 Dec 2019 17:07:46 -0500
Subject: [PATCH 77/91] TF support

---
 transformers/modeling_tf_utils.py | 9 ++++++---
 transformers/modeling_utils.py    | 3 ++-
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/transformers/modeling_tf_utils.py b/transformers/modeling_tf_utils.py
index e7512b5bd6..4a6d18f447 100644
--- a/transformers/modeling_tf_utils.py
+++ b/transformers/modeling_tf_utils.py
@@ -24,7 +24,8 @@ import os
 import tensorflow as tf
 
 from .configuration_utils import PretrainedConfig
-from .file_utils import cached_path, WEIGHTS_NAME, TF_WEIGHTS_NAME, TF2_WEIGHTS_NAME
+from .file_utils import (TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME, WEIGHTS_NAME,
+                         cached_path, hf_bucket_url, is_remote_url)
 from .modeling_tf_pytorch_utils import load_pytorch_checkpoint_in_tf2_model
 
 logger = logging.getLogger(__name__)
@@ -257,12 +258,14 @@ class TFPreTrainedModel(tf.keras.Model):
                     raise EnvironmentError("Error no file named {} found in directory {} or `from_pt` set to False".format(
                         [WEIGHTS_NAME, TF2_WEIGHTS_NAME],
                         pretrained_model_name_or_path))
-            elif os.path.isfile(pretrained_model_name_or_path):
+            elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path):
                 archive_file = pretrained_model_name_or_path
             elif os.path.isfile(pretrained_model_name_or_path + ".index"):
                 archive_file = pretrained_model_name_or_path + ".index"
             else:
-                archive_file = pretrained_model_name_or_path
+                archive_file = hf_bucket_url(pretrained_model_name_or_path, postfix=TF2_WEIGHTS_NAME)
+                if from_pt:
+                    raise EnvironmentError("Loading a TF model from a PyTorch checkpoint is not supported when using a model identifier name.")
 
             # redirect to the cache, if necessary
             try:
diff --git a/transformers/modeling_utils.py b/transformers/modeling_utils.py
index eac4252336..37088f8e67 100644
--- a/transformers/modeling_utils.py
+++ b/transformers/modeling_utils.py
@@ -372,7 +372,8 @@ class PreTrainedModel(nn.Module):
                 archive_file = pretrained_model_name_or_path + ".index"
             else:
                 archive_file = hf_bucket_url(pretrained_model_name_or_path, postfix=WEIGHTS_NAME)
-                # todo do we want to support TF checkpoints here?
+                if from_tf:
+                    raise EnvironmentError("Loading a PyTorch model from a TF checkpoint is not supported when using a model identifier name.")
 
             # redirect to the cache, if necessary
             try:

From 4f15e5a267201f86bdd9628cf58592d0e1cc86eb Mon Sep 17 00:00:00 2001
From: Julien Chaumond <chaumond@gmail.com>
Date: Wed, 11 Dec 2019 17:41:51 -0500
Subject: [PATCH 78/91] Add tests.

Maybe not the best possible place for the tests, lmk.
---
 transformers/tests/modeling_auto_test.py     | 7 ++++++-
 transformers/tests/modeling_tf_auto_test.py  | 7 ++++++-
 transformers/tests/tokenization_auto_test.py | 7 ++++++-
 transformers/tests/utils.py                  | 3 +++
 4 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/transformers/tests/modeling_auto_test.py b/transformers/tests/modeling_auto_test.py
index 9b7d920bc8..871a262fe8 100644
--- a/transformers/tests/modeling_auto_test.py
+++ b/transformers/tests/modeling_auto_test.py
@@ -22,7 +22,7 @@ import logging
 
 from transformers import is_torch_available
 
-from .utils import require_torch, slow
+from .utils import require_torch, slow, SMALL_MODEL_IDENTIFIER
 
 if is_torch_available():
     from transformers import (AutoConfig, BertConfig,
@@ -92,6 +92,11 @@ class AutoModelTest(unittest.TestCase):
             self.assertIsNotNone(model)
             self.assertIsInstance(model, BertForQuestionAnswering)
 
+    def test_from_pretrained_identifier(self):
+        logging.basicConfig(level=logging.INFO)
+        model = AutoModelWithLMHead.from_pretrained(SMALL_MODEL_IDENTIFIER)
+        self.assertIsInstance(model, BertForMaskedLM)
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/transformers/tests/modeling_tf_auto_test.py b/transformers/tests/modeling_tf_auto_test.py
index 7ea48015d9..7ab6eaa3d6 100644
--- a/transformers/tests/modeling_tf_auto_test.py
+++ b/transformers/tests/modeling_tf_auto_test.py
@@ -22,7 +22,7 @@ import logging
 
 from transformers import is_tf_available
 
-from .utils import require_tf, slow
+from .utils import require_tf, slow, SMALL_MODEL_IDENTIFIER
 
 if is_tf_available():
     from transformers import (AutoConfig, BertConfig,
@@ -93,6 +93,11 @@ class TFAutoModelTest(unittest.TestCase):
             self.assertIsNotNone(model)
             self.assertIsInstance(model, TFBertForQuestionAnswering)
 
+    def test_from_pretrained_identifier(self):
+        logging.basicConfig(level=logging.INFO)
+        model = TFAutoModelWithLMHead.from_pretrained(SMALL_MODEL_IDENTIFIER, force_download=True)
+        self.assertIsInstance(model, TFBertForMaskedLM)
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/transformers/tests/tokenization_auto_test.py b/transformers/tests/tokenization_auto_test.py
index 18346d2768..0a894cac04 100644
--- a/transformers/tests/tokenization_auto_test.py
+++ b/transformers/tests/tokenization_auto_test.py
@@ -23,7 +23,7 @@ import logging
 from transformers import AutoTokenizer, BertTokenizer, AutoTokenizer, GPT2Tokenizer
 from transformers import BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP
 
-from .utils import slow
+from .utils import slow, SMALL_MODEL_IDENTIFIER
 
 
 class AutoTokenizerTest(unittest.TestCase):
@@ -42,6 +42,11 @@ class AutoTokenizerTest(unittest.TestCase):
             self.assertIsInstance(tokenizer, GPT2Tokenizer)
             self.assertGreater(len(tokenizer), 0)
 
+    def test_tokenizer_from_pretrained_identifier(self):
+        logging.basicConfig(level=logging.INFO)
+        tokenizer = AutoTokenizer.from_pretrained(SMALL_MODEL_IDENTIFIER)
+        self.assertIsInstance(tokenizer, BertTokenizer)
+        self.assertEqual(len(tokenizer), 12)
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/transformers/tests/utils.py b/transformers/tests/utils.py
index 7a51ab612b..3aff1daf83 100644
--- a/transformers/tests/utils.py
+++ b/transformers/tests/utils.py
@@ -6,6 +6,9 @@ from distutils.util import strtobool
 from transformers.file_utils import _tf_available, _torch_available
 
 
+SMALL_MODEL_IDENTIFIER = "julien-c/bert-xsmall-dummy"
+
+
 try:
     run_slow = os.environ["RUN_SLOW"]
 except KeyError:

From c03c0dfd230a5174c536a58d6ba5e590ed1afcc4 Mon Sep 17 00:00:00 2001
From: Masatoshi Suzuki <sappukei.suzuki@gmail.com>
Date: Fri, 15 Nov 2019 17:24:56 +0900
Subject: [PATCH 79/91] Add support for Japanese BERT models by cl-tohoku

---
 docs/source/pretrained_models.rst          |  18 ++
 transformers/__init__.py                   |   1 +
 transformers/configuration_bert.py         |   4 +
 transformers/modeling_bert.py              |   8 +-
 transformers/modeling_tf_bert.py           |  16 +-
 transformers/tokenization_auto.py          |   3 +
 transformers/tokenization_bert_japanese.py | 247 +++++++++++++++++++++
 7 files changed, 289 insertions(+), 8 deletions(-)
 create mode 100644 transformers/tokenization_bert_japanese.py

diff --git a/docs/source/pretrained_models.rst b/docs/source/pretrained_models.rst
index 2fe1f8a314..d3498e057d 100644
--- a/docs/source/pretrained_models.rst
+++ b/docs/source/pretrained_models.rst
@@ -61,6 +61,24 @@ Here is the full list of the currently provided pretrained models together with
 |                   | ``bert-base-german-dbmdz-uncased``                         | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
 |                   |                                                            | | Trained on uncased German text by DBMDZ                                                                                             |
 |                   |                                                            | (see `details on dbmdz repository <https://github.com/dbmdz/german-bert>`__).                                                         |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``bert-base-japanese``                                     | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
+|                   |                                                            | | Trained on Japanese text. Text is tokenized with MeCab and WordPiece.                                                 |
+|                   |                                                            | | `MeCab <https://taku910.github.io/mecab/>`__ is required for tokenization.                                                            |
+|                   |                                                            | (see `details on cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__).                                             |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``bert-base-japanese-whole-word-masking``                  | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
+|                   |                                                            | | Trained on Japanese text using Whole-Word-Masking. Text is tokenized with MeCab and WordPiece.                        |
+|                   |                                                            | | `MeCab <https://taku910.github.io/mecab/>`__ is required for tokenization.                                                            |
+|                   |                                                            | (see `details on cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__).                                             |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``bert-base-japanese-char``                                | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
+|                   |                                                            | | Trained on Japanese text. Text is tokenized into characters.                                                          |
+|                   |                                                            | (see `details on cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__).                                             |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``bert-base-japanese-char-whole-word-masking``             | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
+|                   |                                                            | | Trained on Japanese text using Whole-Word-Masking. Text is tokenized into characters.                                 |
+|                   |                                                            | (see `details on cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__).                                             |
 +-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 | GPT               | ``openai-gpt``                                             | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
 |                   |                                                            | | OpenAI GPT English model                                                                                                            |
diff --git a/transformers/__init__.py b/transformers/__init__.py
index f9a28add5f..5d7b0b772c 100644
--- a/transformers/__init__.py
+++ b/transformers/__init__.py
@@ -37,6 +37,7 @@ if is_sklearn_available():
 from .tokenization_utils import (PreTrainedTokenizer)
 from .tokenization_auto import AutoTokenizer
 from .tokenization_bert import BertTokenizer, BasicTokenizer, WordpieceTokenizer
+from .tokenization_bert_japanese import BertJapaneseTokenizer, MecabTokenizer, CharacterTokenizer
 from .tokenization_openai import OpenAIGPTTokenizer
 from .tokenization_transfo_xl import (TransfoXLTokenizer, TransfoXLCorpus)
 from .tokenization_gpt2 import GPT2Tokenizer
diff --git a/transformers/configuration_bert.py b/transformers/configuration_bert.py
index d63be963eb..16f1f60404 100644
--- a/transformers/configuration_bert.py
+++ b/transformers/configuration_bert.py
@@ -42,6 +42,10 @@ BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
     'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-config.json",
     'bert-base-german-dbmdz-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-config.json",
     'bert-base-german-dbmdz-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-config.json",
+    'bert-base-japanese': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-config.json",
+    'bert-base-japanese-whole-word-masking': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-whole-word-masking-config.json",
+    'bert-base-japanese-char': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-char-config.json",
+    'bert-base-japanese-char-whole-word-masking': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-char-whole-word-masking-config.json"
 }
 
 
diff --git a/transformers/modeling_bert.py b/transformers/modeling_bert.py
index d84b0a1a7c..e2e115a015 100644
--- a/transformers/modeling_bert.py
+++ b/transformers/modeling_bert.py
@@ -48,6 +48,10 @@ BERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
     'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-pytorch_model.bin",
     'bert-base-german-dbmdz-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-pytorch_model.bin",
     'bert-base-german-dbmdz-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-pytorch_model.bin",
+    'bert-base-japanese': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-pytorch_model.bin",
+    'bert-base-japanese-whole-word-masking': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-whole-word-masking-pytorch_model.bin",
+    'bert-base-japanese-char': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-char-pytorch_model.bin",
+    'bert-base-japanese-char-whole-word-masking': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-char-whole-word-masking-pytorch_model.bin"
 }
 
 
@@ -1233,9 +1237,9 @@ class BertForQuestionAnswering(BertPreTrainedModel):
         question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
         input_text = "[CLS] " + question + " [SEP] " + text + " [SEP]"
         input_ids = tokenizer.encode(input_text)
-        token_type_ids = [0 if i <= input_ids.index(102) else 1 for i in range(len(input_ids))] 
+        token_type_ids = [0 if i <= input_ids.index(102) else 1 for i in range(len(input_ids))]
         start_scores, end_scores = model(torch.tensor([input_ids]), token_type_ids=torch.tensor([token_type_ids]))
-        all_tokens = tokenizer.convert_ids_to_tokens(input_ids)  
+        all_tokens = tokenizer.convert_ids_to_tokens(input_ids)
         print(' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1]))
         # a nice puppet
 
diff --git a/transformers/modeling_tf_bert.py b/transformers/modeling_tf_bert.py
index 5aa7bb3da2..27dd311a4d 100644
--- a/transformers/modeling_tf_bert.py
+++ b/transformers/modeling_tf_bert.py
@@ -48,6 +48,10 @@ TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
     'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-tf_model.h5",
     'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-tf_model.h5",
     'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-tf_model.h5",
+    'bert-base-japanese': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-tf_model.h5",
+    'bert-base-japanese-whole-word-masking': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-whole-word-masking-tf_model.h5",
+    'bert-base-japanese-char': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-char-tf_model.h5",
+    'bert-base-japanese-char-whole-word-masking': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-char-whole-word-masking-tf_model.h5"
 }
 
 
@@ -129,7 +133,7 @@ class TFBertEmbeddings(tf.keras.layers.Layer):
                 linear tensor, float32 with shape [batch_size, length, vocab_size].
         Raises:
             ValueError: if mode is not valid.
-        
+
         Shared weights logic adapted from
             https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24
         """
@@ -148,7 +152,7 @@ class TFBertEmbeddings(tf.keras.layers.Layer):
             input_shape = shape_list(input_ids)
         else:
             input_shape = shape_list(inputs_embeds)[:-1]
-        
+
         seq_length = input_shape[1]
         if position_ids is None:
             position_ids = tf.range(seq_length, dtype=tf.int32)[tf.newaxis, :]
@@ -246,7 +250,7 @@ class TFBertSelfAttention(tf.keras.layers.Layer):
         context_layer = tf.matmul(attention_probs, value_layer)
 
         context_layer = tf.transpose(context_layer, perm=[0, 2, 1, 3])
-        context_layer = tf.reshape(context_layer, 
+        context_layer = tf.reshape(context_layer,
                                   (batch_size, -1, self.all_head_size))  # (batch_size, seq_len_q, all_head_size)
 
         outputs = (context_layer, attention_probs) if self.output_attentions else (context_layer,)
@@ -591,7 +595,7 @@ BERT_START_DOCSTRING = r"""    The BERT model was proposed in
             `model({'input_ids': input_ids, 'token_type_ids': token_type_ids})`
 
     Parameters:
-        config (:class:`~transformers.BertConfig`): Model configuration class with all the parameters of the model. 
+        config (:class:`~transformers.BertConfig`): Model configuration class with all the parameters of the model.
             Initializing with a config file does not load the weights associated with the model, only the configuration.
             Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
 """
@@ -605,13 +609,13 @@ BERT_INPUTS_DOCSTRING = r"""
             (a) For sequence pairs:
 
                 ``tokens:         [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]``
-                
+
                 ``token_type_ids:   0   0  0    0    0     0       0   0   1  1  1  1   1   1``
 
             (b) For single sequences:
 
                 ``tokens:         [CLS] the dog is hairy . [SEP]``
-                
+
                 ``token_type_ids:   0   0   0   0  0     0   0``
 
             Bert is a model with absolute position embeddings so it's usually advised to pad the inputs on
diff --git a/transformers/tokenization_auto.py b/transformers/tokenization_auto.py
index b7c5046961..d63b7e783d 100644
--- a/transformers/tokenization_auto.py
+++ b/transformers/tokenization_auto.py
@@ -19,6 +19,7 @@ from __future__ import absolute_import, division, print_function, unicode_litera
 import logging
 
 from .tokenization_bert import BertTokenizer
+from .tokenization_bert_japanese import BertJapaneseTokenizer
 from .tokenization_openai import OpenAIGPTTokenizer
 from .tokenization_gpt2 import GPT2Tokenizer
 from .tokenization_ctrl import CTRLTokenizer
@@ -118,6 +119,8 @@ class AutoTokenizer(object):
             return CamembertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
         elif 'roberta' in pretrained_model_name_or_path:
             return RobertaTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
+        elif 'bert-japanese' in pretrained_model_name_or_path:
+            return BertJapaneseTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
         elif 'bert' in pretrained_model_name_or_path:
             return BertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
         elif 'openai-gpt' in pretrained_model_name_or_path:
diff --git a/transformers/tokenization_bert_japanese.py b/transformers/tokenization_bert_japanese.py
new file mode 100644
index 0000000000..8554a1c880
--- /dev/null
+++ b/transformers/tokenization_bert_japanese.py
@@ -0,0 +1,247 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes."""
+
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import collections
+import logging
+import os
+import unicodedata
+from io import open
+
+from .tokenization_bert import BertTokenizer, BasicTokenizer, WordpieceTokenizer, load_vocab
+from .tokenization_utils import PreTrainedTokenizer
+
+logger = logging.getLogger(__name__)
+
+VOCAB_FILES_NAMES = {'vocab_file': 'vocab.txt'}
+
+PRETRAINED_VOCAB_FILES_MAP = {
+    'vocab_file':
+    {
+        'bert-base-japanese': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-vocab.txt",
+        'bert-base-japanese-whole-word-masking': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-whole-word-masking-vocab.txt",
+        'bert-base-japanese-char': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-char-vocab.txt",
+        'bert-base-japanese-char-whole-word-masking': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-char-whole-word-masking-vocab.txt"
+    }
+}
+
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
+    'bert-base-japanese': 512,
+    'bert-base-japanese-whole-word-masking': 512,
+    'bert-base-japanese-char': 512,
+    'bert-base-japanese-char-whole-word-masking': 512
+}
+
+PRETRAINED_INIT_CONFIGURATION = {
+    'bert-base-japanese': {
+        'do_lower_case': False,
+        'word_tokenizer_type': 'mecab',
+        'subword_tokenizer_type': 'wordpiece'
+    },
+    'bert-base-japanese-whole-word-masking':{
+        'do_lower_case': False,
+        'word_tokenizer_type': 'mecab',
+        'subword_tokenizer_type': 'wordpiece'
+    },
+    'bert-base-japanese-char': {
+        'do_lower_case': False,
+        'word_tokenizer_type': 'mecab',
+        'subword_tokenizer_type': 'character'
+    },
+    'bert-base-japanese-char-whole-word-masking': {
+        'do_lower_case': False,
+        'word_tokenizer_type': 'mecab',
+        'subword_tokenizer_type': 'character'
+    }
+}
+
+
+class BertJapaneseTokenizer(BertTokenizer):
+    """BERT tokenizer for Japanese text"""
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+
+    def __init__(self, vocab_file, do_lower_case=False,
+                 do_word_tokenize=True, do_subword_tokenize=True,
+                 word_tokenizer_type='basic', subword_tokenizer_type='wordpiece',
+                 never_split=None, unk_token='[UNK]', sep_token='[SEP]',
+                 pad_token='[PAD]', cls_token='[CLS]', mask_token='[MASK]', **kwargs):
+        """Constructs a BertJapaneseTokenizer.
+
+        Args:
+            **vocab_file**: Path to a one-wordpiece-per-line vocabulary file.
+            **do_lower_case**: (`optional`) boolean (default False)
+                Whether to lower case the input.
+                Only has an effect when do_word_tokenize=True.
+            **do_word_tokenize**: (`optional`) boolean (default True)
+                Whether to do word tokenization.
+            **do_subword_tokenize**: (`optional`) boolean (default True)
+                Whether to do subword tokenization.
+            **word_tokenizer_type**: (`optional`) string (default "basic")
+                Type of word tokenizer.
+            **subword_tokenizer_type**: (`optional`) string (default "wordpiece")
+                Type of subword tokenizer.
+        """
+        super(BertTokenizer, self).__init__(unk_token=unk_token, sep_token=sep_token,
+                                            pad_token=pad_token, cls_token=cls_token,
+                                            mask_token=mask_token, **kwargs)
+        self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
+        self.max_len_sentences_pair = self.max_len - 3  # take into account special tokens
+
+        if not os.path.isfile(vocab_file):
+            raise ValueError(
+                "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "
+                "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file))
+        self.vocab = load_vocab(vocab_file)
+        self.ids_to_tokens = collections.OrderedDict(
+            [(ids, tok) for tok, ids in self.vocab.items()])
+
+        self.do_word_tokenize = do_word_tokenize
+        if do_word_tokenize:
+            if word_tokenizer_type == 'basic':
+                self.word_tokenizer = BasicTokenizer(do_lower_case=do_lower_case,
+                                                     never_split=never_split,
+                                                     tokenize_chinese_chars=False)
+            elif word_tokenizer_type == 'mecab':
+                self.word_tokenizer = MecabTokenizer(do_lower_case=do_lower_case,
+                                                     never_split=never_split)
+            else:
+                raise ValueError(
+                    "Invalid word_tokenizer_type '{}' is specified.".format(word_tokenizer_type))
+
+        self.do_subword_tokenize = do_subword_tokenize
+        if do_subword_tokenize:
+            if subword_tokenizer_type == 'wordpiece':
+                self.subword_tokenizer = WordpieceTokenizer(vocab=self.vocab,
+                                                            unk_token=self.unk_token)
+            elif subword_tokenizer_type == 'character':
+                self.subword_tokenizer = CharacterTokenizer(vocab=self.vocab,
+                                                            unk_token=self.unk_token)
+            else:
+                raise ValueError(
+                    "Invalid subword_tokenizer_type '{}' is specified.".format(subword_tokenizer_type))
+
+
+    def _tokenize(self, text):
+        if self.do_word_tokenize:
+            tokens = self.word_tokenizer.tokenize(text,
+                                                  never_split=self.all_special_tokens)
+        else:
+            tokens = [text]
+
+        if self.do_subword_tokenize:
+            split_tokens = [sub_token for token in tokens
+                            for sub_token in self.subword_tokenizer.tokenize(token)]
+        else:
+            split_tokens = tokens
+
+        return split_tokens
+
+
+class MecabTokenizer(object):
+    """Runs basic tokenization with MeCab morphological parser."""
+
+    def __init__(self, do_lower_case=False, never_split=None, normalize_text=True):
+        """Constructs a MecabTokenizer.
+
+        Args:
+            **do_lower_case**: (`optional`) boolean (default False)
+                Whether to lower case the input.
+            **never_split**: (`optional`) list of str
+                Kept for backward compatibility purposes.
+                Now implemented directly at the base class level (see :func:`PreTrainedTokenizer.tokenize`)
+                List of token not to split.
+            **normalize_text**: (`optional`) boolean (default True)
+                Whether to apply unicode normalization to text before tokenization.
+        """
+        self.do_lower_case = do_lower_case
+        self.never_split = never_split if never_split is not None else []
+        self.normalize_text = normalize_text
+
+        import MeCab
+        self.mecab = MeCab.Tagger()
+
+    def tokenize(self, text, never_split=None, **kwargs):
+        """Tokenizes a piece of text."""
+        if self.normalize_text:
+            text = unicodedata.normalize('NFKC', text)
+
+        never_split = self.never_split + (never_split if never_split is not None else [])
+        tokens = []
+
+        cursor = 0
+        for line in self.mecab.parse(text).split('\n'):
+            if line == 'EOS':
+                break
+
+            token, _ = line.split('\t')
+            token_start = text.index(token, cursor)
+            token_end = token_start + len(token)
+            if self.do_lower_case and token not in never_split:
+                token = token.lower()
+
+            tokens.append(token)
+            cursor = token_end
+
+        return tokens
+
+
+class CharacterTokenizer(object):
+    """Runs Character tokenization."""
+
+    def __init__(self, vocab, unk_token, normalize_text=True):
+        """Constructs a CharacterTokenizer.
+
+        Args:
+            **vocab**:
+                Vocabulary object.
+            **unk_token**: str
+                A special symbol for out-of-vocabulary token.
+            **normalize_text**: (`optional`) boolean (default True)
+                Whether to apply unicode normalization to text before tokenization.
+        """
+        self.vocab = vocab
+        self.unk_token = unk_token
+        self.normalize_text = normalize_text
+
+    def tokenize(self, text):
+        """Tokenizes a piece of text into characters.
+
+        For example:
+            input = "apple"
+            output = ["a", "p", "p", "l", "e"]
+        Args:
+            text: A single token or whitespace separated tokens.
+                This should have already been passed through `BasicTokenizer`.
+        Returns:
+            A list of characters.
+        """
+        if self.normalize_text:
+            text = unicodedata.normalize('NFKC', text)
+
+        output_tokens = []
+        for i, char in enumerate(text):
+            if char not in self.vocab:
+                output_tokens.append(self.unk_token)
+                continue
+
+            output_tokens.append(char)
+
+        return output_tokens

From 57b5cb3eaa850a212235fccbd4e5d002aede72b6 Mon Sep 17 00:00:00 2001
From: Masatoshi Suzuki <sappukei.suzuki@gmail.com>
Date: Wed, 20 Nov 2019 09:02:10 +0900
Subject: [PATCH 80/91] Fix loading BertJapaneseTokenizer

---
 transformers/tokenization_auto.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/transformers/tokenization_auto.py b/transformers/tokenization_auto.py
index d63b7e783d..f36a584521 100644
--- a/transformers/tokenization_auto.py
+++ b/transformers/tokenization_auto.py
@@ -73,6 +73,7 @@ class AutoTokenizer(object):
             - contains `albert`: AlbertTokenizer (ALBERT model)
             - contains `camembert`: CamembertTokenizer (CamemBERT model)
             - contains `roberta`: RobertaTokenizer (RoBERTa model)
+            - contains `bert-base-japanese`: BertJapaneseTokenizer (Bert model)
             - contains `bert`: BertTokenizer (Bert model)
             - contains `openai-gpt`: OpenAIGPTTokenizer (OpenAI GPT model)
             - contains `gpt2`: GPT2Tokenizer (OpenAI GPT-2 model)
@@ -119,7 +120,7 @@ class AutoTokenizer(object):
             return CamembertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
         elif 'roberta' in pretrained_model_name_or_path:
             return RobertaTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
-        elif 'bert-japanese' in pretrained_model_name_or_path:
+        elif 'bert-base-japanese' in pretrained_model_name_or_path:
             return BertJapaneseTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
         elif 'bert' in pretrained_model_name_or_path:
             return BertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)

From a09da4eeb0397dd66d61182177dd3b753d70e62a Mon Sep 17 00:00:00 2001
From: Masatoshi Suzuki <sappukei.suzuki@gmail.com>
Date: Fri, 29 Nov 2019 19:24:43 +0900
Subject: [PATCH 81/91] Add a test for Japanese BERT tokenizers

---
 .../tests/tokenization_bert_japanese_test.py  | 192 ++++++++++++++++++
 1 file changed, 192 insertions(+)
 create mode 100644 transformers/tests/tokenization_bert_japanese_test.py

diff --git a/transformers/tests/tokenization_bert_japanese_test.py b/transformers/tests/tokenization_bert_japanese_test.py
new file mode 100644
index 0000000000..6f66b96411
--- /dev/null
+++ b/transformers/tests/tokenization_bert_japanese_test.py
@@ -0,0 +1,192 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import os
+import unittest
+import pytest
+from io import open
+
+from transformers.tokenization_bert import WordpieceTokenizer
+from transformers.tokenization_bert_japanese import (BertJapaneseTokenizer,
+                                                     MecabTokenizer, CharacterTokenizer,
+                                                     VOCAB_FILES_NAMES)
+
+from .tokenization_tests_commons import CommonTestCases
+
+
+class BertJapaneseTokenizationTest(CommonTestCases.CommonTokenizerTester):
+
+    tokenizer_class = BertJapaneseTokenizer
+
+    def setUp(self):
+        super(BertJapaneseTokenizationTest, self).setUp()
+
+        vocab_tokens = [u"[UNK]", u"[CLS]", u"[SEP]",
+            u"こんにちは", u"こん", u"にちは", u"ばんは", u"##こん", u"##にちは", u"##ばんは",
+            u"世界", u"##世界", u"、", u"##、", u"。", u"##。"]
+
+        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
+        with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
+            vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
+
+    def get_tokenizer(self, **kwargs):
+        return BertJapaneseTokenizer.from_pretrained(self.tmpdirname, **kwargs)
+
+    def get_input_output_texts(self):
+        input_text = u"こんにちは、世界。 \nこんばんは、世界。"
+        output_text = u"こんにちは 、 世界 。 こんばんは 、 世界 。"
+        return input_text, output_text
+
+    def test_full_tokenizer(self):
+        tokenizer = self.tokenizer_class(self.vocab_file)
+
+        tokens = tokenizer.tokenize(u"こんにちは、世界。\nこんばんは、世界。")
+        self.assertListEqual(tokens,
+                             [u"こんにちは", u"、", u"世界", u"。",
+                              u"こん", u"##ばんは", u"、", u"世界", "。"])
+        self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens),
+                             [3, 12, 10, 14, 4, 9, 12, 10, 14])
+
+    def test_mecab_tokenizer(self):
+        tokenizer = MecabTokenizer()
+
+        self.assertListEqual(
+            tokenizer.tokenize(u" \tｱｯﾌﾟﾙストアでiPhone８ が  \n 発売された　。  "),
+                               [u"アップルストア", u"で", u"iPhone", u"8", u"が",
+                                u"発売", u"さ", u"れ", u"た", u"。"])
+
+    def test_mecab_tokenizer_lower(self):
+        tokenizer = MecabTokenizer(do_lower_case=True)
+
+        self.assertListEqual(
+            tokenizer.tokenize(u" \tｱｯﾌﾟﾙストアでiPhone８ が  \n 発売された　。  "),
+                               [u"アップルストア", u"で", u"iphone", u"8", u"が",
+                                u"発売", u"さ", u"れ", u"た", u"。"])
+
+    def test_mecab_tokenizer_no_normalize(self):
+        tokenizer = MecabTokenizer(normalize_text=False)
+
+        self.assertListEqual(
+            tokenizer.tokenize(u" \tｱｯﾌﾟﾙストアでiPhone８ が  \n 発売された　。  "),
+                               [u"ｱｯﾌﾟﾙストア", u"で", u"iPhone", u"８", u"が",
+                                u"発売", u"さ", u"れ", u"た", u"　", u"。"])
+
+    def test_wordpiece_tokenizer(self):
+        """Check greedy WordPiece splitting on a small Japanese vocabulary."""
+        vocab_tokens = [u"[UNK]", u"[CLS]", u"[SEP]",
+            u"こんにちは", u"こん", u"にちは", u"ばんは", u"##こん", u"##にちは", u"##ばんは"]
+
+        # Map each vocabulary entry to its position, as load_vocab would.
+        vocab = {token: i for i, token in enumerate(vocab_tokens)}
+        tokenizer = WordpieceTokenizer(vocab=vocab, unk_token=u"[UNK]")
+
+        self.assertListEqual(tokenizer.tokenize(u""), [])
+
+        self.assertListEqual(tokenizer.tokenize(u"こんにちは"),
+                             [u"こんにちは"])
+
+        self.assertListEqual(tokenizer.tokenize(u"こんばんは"),
+                             [u"こん", u"##ばんは"])
+
+        self.assertListEqual(tokenizer.tokenize(u"こんばんは こんばんにちは こんにちは"),
+                             [u"こん", u"##ばんは", u"[UNK]", u"こんにちは"])
+
+    @pytest.mark.slow
+    def test_sequence_builders(self):
+        tokenizer = self.tokenizer_class.from_pretrained("bert-base-japanese")
+
+        text = tokenizer.encode(u"ありがとう。", add_special_tokens=False)
+        text_2 = tokenizer.encode(u"どういたしまして。", add_special_tokens=False)
+        # Encode without special tokens first, then let the tokenizer add them for a single text and for a pair.
+        encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
+        encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
+
+        # 2 is for "[CLS]", 3 is for "[SEP]"
+        assert encoded_sentence == [2] + text + [3]
+        assert encoded_pair == [2] + text + [3] + text_2 + [3]
+
+
+class BertJapaneseCharacterTokenizationTest(CommonTestCases.CommonTokenizerTester):
+    """Tests for BertJapaneseTokenizer used with subword_tokenizer_type="character"."""
+    tokenizer_class = BertJapaneseTokenizer
+
+    def setUp(self):
+        super(BertJapaneseCharacterTokenizationTest, self).setUp()
+
+        vocab_tokens = [u"[UNK]", u"[CLS]", u"[SEP]",
+            u"こ", u"ん", u"に", u"ち", u"は", u"ば", u"世", u"界", u"、", u"。"]
+        # Write the vocabulary, one token per line, into self.tmpdirname so from_pretrained can load it.
+        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
+        with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
+            vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
+
+    def get_tokenizer(self, **kwargs):
+        return BertJapaneseTokenizer.from_pretrained(self.tmpdirname,
+                                                     subword_tokenizer_type="character",
+                                                     **kwargs)
+
+    def get_input_output_texts(self):
+        input_text = u"こんにちは、世界。 \nこんばんは、世界。"
+        output_text = u"こ ん に ち は 、 世 界 。 こ ん ば ん は 、 世 界 。"
+        return input_text, output_text
+
+    def test_full_tokenizer(self):
+        tokenizer = self.tokenizer_class(self.vocab_file,
+                                         subword_tokenizer_type="character")
+
+        tokens = tokenizer.tokenize(u"こんにちは、世界。 \nこんばんは、世界。")
+        self.assertListEqual(tokens,
+            [u"こ", u"ん", u"に", u"ち", u"は", u"、", u"世", u"界", u"。",
+             u"こ", u"ん", u"ば", u"ん", u"は", u"、", u"世", u"界", u"。"])
+        self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens),
+                             [3, 4, 5, 6, 7, 11, 9, 10, 12,
+                              3, 4, 8, 4, 7, 11, 9, 10, 12])
+
+    def test_character_tokenizer(self):
+        vocab_tokens = [u"[UNK]", u"[CLS]", u"[SEP]",
+            u"こ", u"ん", u"に", u"ち", u"は", u"ば", u"世", u"界", u"、", u"。"]
+        # NOTE: adjacent string literals concatenate silently, so every entry above needs its own comma.
+        vocab = {}
+        for (i, token) in enumerate(vocab_tokens):
+            vocab[token] = i
+        tokenizer = CharacterTokenizer(vocab=vocab, unk_token=u"[UNK]")
+
+        self.assertListEqual(tokenizer.tokenize(u""), [])
+
+        self.assertListEqual(tokenizer.tokenize(u"こんにちは"),
+                             [u"こ", u"ん", u"に", u"ち", u"は"])
+        # The final character of the next input is not in the vocabulary, so [UNK] is expected in its place.
+        self.assertListEqual(tokenizer.tokenize(u"こんにちほ"),
+                             [u"こ", u"ん", u"に", u"ち", u"[UNK]"])
+
+    @pytest.mark.slow
+    def test_sequence_builders(self):
+        tokenizer = self.tokenizer_class.from_pretrained("bert-base-japanese-char")
+
+        text = tokenizer.encode(u"ありがとう。", add_special_tokens=False)
+        text_2 = tokenizer.encode(u"どういたしまして。", add_special_tokens=False)
+
+        encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
+        encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
+
+        # 2 is for "[CLS]", 3 is for "[SEP]"
+        assert encoded_sentence == [2] + text + [3]
+        assert encoded_pair == [2] + text + [3] + text_2 + [3]
+
+
+
+if __name__ == '__main__':
+    unittest.main()

From 6a43dc9d7d592362d144209097e1d93876f8e88a Mon Sep 17 00:00:00 2001
From: Masatoshi Suzuki <sappukei.suzuki@gmail.com>
Date: Thu, 5 Dec 2019 11:19:02 +0900
Subject: [PATCH 82/91] Support Python 2

---
 transformers/tokenization_bert_japanese.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/transformers/tokenization_bert_japanese.py b/transformers/tokenization_bert_japanese.py
index 8554a1c880..1ce0e1d1cb 100644
--- a/transformers/tokenization_bert_japanese.py
+++ b/transformers/tokenization_bert_japanese.py
@@ -19,6 +19,7 @@ from __future__ import absolute_import, division, print_function, unicode_litera
 import collections
 import logging
 import os
+import six
 import unicodedata
 from io import open
 
@@ -186,8 +187,13 @@ class MecabTokenizer(object):
         never_split = self.never_split + (never_split if never_split is not None else [])
         tokens = []
 
+        if six.PY2:
+            mecab_output = self.mecab.parse(text.encode('utf-8')).decode('utf-8')
+        else:
+            mecab_output = self.mecab.parse(text)
+
         cursor = 0
-        for line in self.mecab.parse(text).split('\n'):
+        for line in mecab_output.split('\n'):
             if line == 'EOS':
                 break
 

From 597ba7feb384316081c96955196fcb7abb2edf06 Mon Sep 17 00:00:00 2001
From: Masatoshi Suzuki <sappukei.suzuki@gmail.com>
Date: Thu, 5 Dec 2019 11:30:40 +0900
Subject: [PATCH 83/91] Support testing Japanese BERT tokenizers

---
 .circleci/config.yml | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index 01e6d82b33..97f5f25606 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -13,6 +13,8 @@ jobs:
             - run: sudo pip install --progress-bar off .
             - run: sudo pip install pytest codecov pytest-cov
             - run: sudo pip install tensorboardX scikit-learn
+            - run: sudo apt-get -y install libmecab-dev mecab mecab-ipadic-utf8 swig
+            - run: sudo pip install mecab-python3
             - run: python -m pytest -sv ./transformers/tests/ --cov
             - run: codecov
     build_py3_torch:
@@ -27,6 +29,8 @@ jobs:
             - run: sudo pip install --progress-bar off .
             - run: sudo pip install pytest codecov pytest-cov
             - run: sudo pip install tensorboardX scikit-learn
+            - run: sudo apt-get -y install libmecab-dev mecab mecab-ipadic-utf8 swig
+            - run: sudo pip install mecab-python3
             - run: python -m pytest -sv ./transformers/tests/ --cov
             - run: python -m pytest -sv ./examples/
             - run: codecov
@@ -42,6 +46,8 @@ jobs:
             - run: sudo pip install --progress-bar off .
             - run: sudo pip install pytest codecov pytest-cov
             - run: sudo pip install tensorboardX scikit-learn
+            - run: sudo apt-get -y install libmecab-dev mecab mecab-ipadic-utf8 swig
+            - run: sudo pip install mecab-python3
             - run: python -m pytest -sv ./transformers/tests/ --cov
             - run: codecov
     build_py2_torch:
@@ -55,6 +61,8 @@ jobs:
             - run: sudo pip install torch
             - run: sudo pip install --progress-bar off .
             - run: sudo pip install pytest codecov pytest-cov
+            - run: sudo apt-get -y install libmecab-dev mecab mecab-ipadic-utf8 swig
+            - run: sudo pip install mecab-python
             - run: python -m pytest -sv ./transformers/tests/ --cov
             - run: codecov
     build_py2_tf:
@@ -68,6 +76,8 @@ jobs:
             - run: sudo pip install tensorflow
             - run: sudo pip install --progress-bar off .
             - run: sudo pip install pytest codecov pytest-cov
+            - run: sudo apt-get -y install libmecab-dev mecab mecab-ipadic-utf8 swig
+            - run: sudo pip install mecab-python
             - run: python -m pytest -sv ./transformers/tests/ --cov
             - run: codecov
     deploy_doc:

From d2100428d3652cefbffcf0bd00f0881090d26333 Mon Sep 17 00:00:00 2001
From: Julien Chaumond <chaumond@gmail.com>
Date: Tue, 10 Dec 2019 21:43:49 +0000
Subject: [PATCH 84/91] Update to new test infra and only run conditionally

---
 .circleci/config.yml                          | 20 ++++-----
 .../tests/tokenization_bert_japanese_test.py  |  9 ++--
 transformers/tests/utils.py                   | 42 +++++++++++++------
 3 files changed, 44 insertions(+), 27 deletions(-)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index 97f5f25606..7ca5f8121c 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -13,8 +13,6 @@ jobs:
             - run: sudo pip install --progress-bar off .
             - run: sudo pip install pytest codecov pytest-cov
             - run: sudo pip install tensorboardX scikit-learn
-            - run: sudo apt-get -y install libmecab-dev mecab mecab-ipadic-utf8 swig
-            - run: sudo pip install mecab-python3
             - run: python -m pytest -sv ./transformers/tests/ --cov
             - run: codecov
     build_py3_torch:
@@ -29,8 +27,6 @@ jobs:
             - run: sudo pip install --progress-bar off .
             - run: sudo pip install pytest codecov pytest-cov
             - run: sudo pip install tensorboardX scikit-learn
-            - run: sudo apt-get -y install libmecab-dev mecab mecab-ipadic-utf8 swig
-            - run: sudo pip install mecab-python3
             - run: python -m pytest -sv ./transformers/tests/ --cov
             - run: python -m pytest -sv ./examples/
             - run: codecov
@@ -46,8 +42,6 @@ jobs:
             - run: sudo pip install --progress-bar off .
             - run: sudo pip install pytest codecov pytest-cov
             - run: sudo pip install tensorboardX scikit-learn
-            - run: sudo apt-get -y install libmecab-dev mecab mecab-ipadic-utf8 swig
-            - run: sudo pip install mecab-python3
             - run: python -m pytest -sv ./transformers/tests/ --cov
             - run: codecov
     build_py2_torch:
@@ -61,8 +55,6 @@ jobs:
             - run: sudo pip install torch
             - run: sudo pip install --progress-bar off .
             - run: sudo pip install pytest codecov pytest-cov
-            - run: sudo apt-get -y install libmecab-dev mecab mecab-ipadic-utf8 swig
-            - run: sudo pip install mecab-python
             - run: python -m pytest -sv ./transformers/tests/ --cov
             - run: codecov
     build_py2_tf:
@@ -76,10 +68,18 @@ jobs:
             - run: sudo pip install tensorflow
             - run: sudo pip install --progress-bar off .
             - run: sudo pip install pytest codecov pytest-cov
-            - run: sudo apt-get -y install libmecab-dev mecab mecab-ipadic-utf8 swig
-            - run: sudo pip install mecab-python
             - run: python -m pytest -sv ./transformers/tests/ --cov
             - run: codecov
+    build_py3_custom_tokenizers:
+        working_directory: ~/transformers
+        docker:
+            - image: circleci/python:3.5
+        steps:
+            - checkout
+            - run: sudo pip install --progress-bar off .
+            - run: sudo pip install pytest
+            - run: sudo pip install mecab-python3
+            - run: python -m pytest -sv ./transformers/tests/tokenization_bert_japanese_test.py
     deploy_doc:
         working_directory: ~/transformers
         docker:
diff --git a/transformers/tests/tokenization_bert_japanese_test.py b/transformers/tests/tokenization_bert_japanese_test.py
index 6f66b96411..545193c7cc 100644
--- a/transformers/tests/tokenization_bert_japanese_test.py
+++ b/transformers/tests/tokenization_bert_japanese_test.py
@@ -16,7 +16,6 @@ from __future__ import absolute_import, division, print_function, unicode_litera
 
 import os
 import unittest
-import pytest
 from io import open
 
 from transformers.tokenization_bert import WordpieceTokenizer
@@ -25,8 +24,10 @@ from transformers.tokenization_bert_japanese import (BertJapaneseTokenizer,
                                                      VOCAB_FILES_NAMES)
 
 from .tokenization_tests_commons import CommonTestCases
+from .utils import slow, custom_tokenizers
 
 
+@custom_tokenizers
 class BertJapaneseTokenizationTest(CommonTestCases.CommonTokenizerTester):
 
     tokenizer_class = BertJapaneseTokenizer
@@ -104,7 +105,7 @@ class BertJapaneseTokenizationTest(CommonTestCases.CommonTokenizerTester):
         self.assertListEqual(tokenizer.tokenize(u"こんばんは こんばんにちは こんにちは"),
                              [u"こん", u"##ばんは", u"[UNK]", u"こんにちは"])
 
-    @pytest.mark.slow
+    @slow
     def test_sequence_builders(self):
         tokenizer = self.tokenizer_class.from_pretrained("bert-base-japanese")
 
@@ -172,7 +173,7 @@ class BertJapaneseCharacterTokenizationTest(CommonTestCases.CommonTokenizerTeste
         self.assertListEqual(tokenizer.tokenize(u"こんにちほ"),
                              [u"こ", u"ん", u"に", u"ち", u"[UNK]"])
 
-    @pytest.mark.slow
+    @slow
     def test_sequence_builders(self):
         tokenizer = self.tokenizer_class.from_pretrained("bert-base-japanese-char")
 
@@ -188,5 +189,3 @@ class BertJapaneseCharacterTokenizationTest(CommonTestCases.CommonTokenizerTeste
 
 
 
-if __name__ == '__main__':
-    unittest.main()
diff --git a/transformers/tests/utils.py b/transformers/tests/utils.py
index 7a51ab612b..2b97293ca7 100644
--- a/transformers/tests/utils.py
+++ b/transformers/tests/utils.py
@@ -6,18 +6,23 @@ from distutils.util import strtobool
 from transformers.file_utils import _tf_available, _torch_available
 
 
-try:
-    run_slow = os.environ["RUN_SLOW"]
-except KeyError:
-    # RUN_SLOW isn't set, default to skipping slow tests.
-    _run_slow_tests = False
-else:
-    # RUN_SLOW is set, convert it to True or False.
+def parse_flag_from_env(key, default=False):
     try:
-        _run_slow_tests = strtobool(run_slow)
-    except ValueError:
-        # More values are supported, but let's keep the message simple.
-        raise ValueError("If set, RUN_SLOW must be yes or no.")
+        value = os.environ[key]
+    except KeyError:
+        # The variable named by `key` isn't set: fall back to `default`.
+        _value = default
+    else:
+        # The variable is set: strtobool maps yes/no-style strings to 1/0.
+        try:
+            _value = strtobool(value)
+        except ValueError:
+            # More values are supported, but let's keep the message simple.
+            raise ValueError("If set, {} must be yes or no.".format(key))
+    return _value
+
+_run_slow_tests = parse_flag_from_env("RUN_SLOW", default=False)
+_run_custom_tokenizers = parse_flag_from_env("RUN_CUSTOM_TOKENIZERS", default=False)
 
 
 def slow(test_case):
@@ -33,6 +38,19 @@ def slow(test_case):
     return test_case
 
 
+def custom_tokenizers(test_case):
+    """
+    Decorator marking a test for a custom tokenizer.
+
+    Custom tokenizers require additional dependencies, and are skipped
+    by default. Set the RUN_CUSTOM_TOKENIZERS environment variable
+    to a truthy value to run them.
+    """
+    if not _run_custom_tokenizers:  # flag is parsed from the environment once, at module import
+        test_case = unittest.skip("test of custom tokenizers")(test_case)
+    return test_case
+
+
 def require_torch(test_case):
     """
     Decorator marking a test that requires PyTorch.
@@ -59,6 +77,6 @@ def require_tf(test_case):
 
 if _torch_available:
     # Set the USE_CUDA environment variable to select a GPU.
-    torch_device = "cuda" if os.environ.get("USE_CUDA") else "cpu"
+    torch_device = "cuda" if parse_flag_from_env("USE_CUDA") else "cpu"
 else:
     torch_device = None

From 95854c4a2f8d418a14e64b4edf64fc7363b1ff10 Mon Sep 17 00:00:00 2001
From: Julien Chaumond <chaumond@gmail.com>
Date: Tue, 10 Dec 2019 21:46:00 +0000
Subject: [PATCH 85/91] Actually run the tests

---
 .circleci/config.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index 7ca5f8121c..d8f624a0e5 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -79,7 +79,7 @@ jobs:
             - run: sudo pip install --progress-bar off .
             - run: sudo pip install pytest
             - run: sudo pip install mecab-python3
-            - run: python -m pytest -sv ./transformers/tests/tokenization_bert_japanese_test.py
+            - run: RUN_CUSTOM_TOKENIZERS=1 python -m pytest -sv ./transformers/tests/tokenization_bert_japanese_test.py
     deploy_doc:
         working_directory: ~/transformers
         docker:

From 9cb97c0c0f7215971bb5a39cd070e5bd89319bdf Mon Sep 17 00:00:00 2001
From: Julien Chaumond <chaumond@gmail.com>
Date: Tue, 10 Dec 2019 21:48:56 +0000
Subject: [PATCH 86/91] Actually run the tests

---
 .circleci/config.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index d8f624a0e5..9d6e02d580 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -101,6 +101,7 @@ workflows:
     version: 2
     build_and_test:
         jobs:
+            - build_py3_custom_tokenizers
             - build_py3_torch_and_tf
             - build_py3_torch
             - build_py3_tf

From 5505cf701477762cedf792e20344d29bc8bf6325 Mon Sep 17 00:00:00 2001
From: Julien Chaumond <chaumond@gmail.com>
Date: Tue, 10 Dec 2019 21:53:44 +0000
Subject: [PATCH 87/91] Run tests on Py2 too, for Lysandre

---
 .circleci/config.yml | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index 9d6e02d580..afc6d5ce44 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -80,6 +80,16 @@ jobs:
             - run: sudo pip install pytest
             - run: sudo pip install mecab-python3
             - run: RUN_CUSTOM_TOKENIZERS=1 python -m pytest -sv ./transformers/tests/tokenization_bert_japanese_test.py
+    build_py2_custom_tokenizers:
+        working_directory: ~/transformers
+        docker:
+            - image: circleci/python:2.7
+        steps:
+            - checkout
+            - run: sudo pip install --progress-bar off .
+            - run: sudo pip install pytest
+            - run: sudo pip install mecab-python
+            - run: RUN_CUSTOM_TOKENIZERS=1 python -m pytest -sv ./transformers/tests/tokenization_bert_japanese_test.py
     deploy_doc:
         working_directory: ~/transformers
         docker:
@@ -102,6 +112,7 @@ workflows:
     build_and_test:
         jobs:
             - build_py3_custom_tokenizers
+            - build_py2_custom_tokenizers
             - build_py3_torch_and_tf
             - build_py3_torch
             - build_py3_tf

From 371c5ddfad96689771465aff557152322190b60e Mon Sep 17 00:00:00 2001
From: Julien Chaumond <chaumond@gmail.com>
Date: Tue, 10 Dec 2019 21:55:43 +0000
Subject: [PATCH 88/91] Py2 tests for Lysandre

---
 .circleci/config.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index afc6d5ce44..c827a81fbb 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -88,6 +88,7 @@ jobs:
             - checkout
             - run: sudo pip install --progress-bar off .
             - run: sudo pip install pytest
+            - run: sudo apt-get -y install libmecab-dev mecab mecab-ipadic-utf8 swig
             - run: sudo pip install mecab-python
             - run: RUN_CUSTOM_TOKENIZERS=1 python -m pytest -sv ./transformers/tests/tokenization_bert_japanese_test.py
     deploy_doc:

From 36fc52a3b4b50885d5ec3bf259f81740e19d8b3c Mon Sep 17 00:00:00 2001
From: Julien Chaumond <chaumond@gmail.com>
Date: Tue, 10 Dec 2019 22:03:35 +0000
Subject: [PATCH 89/91] Update links to weights

---
 transformers/configuration_bert.py         | 8 ++++----
 transformers/modeling_bert.py              | 8 ++++----
 transformers/modeling_tf_bert.py           | 8 ++++----
 transformers/tokenization_bert_japanese.py | 8 ++++----
 4 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/transformers/configuration_bert.py b/transformers/configuration_bert.py
index 16f1f60404..01fcd88cb8 100644
--- a/transformers/configuration_bert.py
+++ b/transformers/configuration_bert.py
@@ -42,10 +42,10 @@ BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
     'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-config.json",
     'bert-base-german-dbmdz-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-config.json",
     'bert-base-german-dbmdz-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-config.json",
-    'bert-base-japanese': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-config.json",
-    'bert-base-japanese-whole-word-masking': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-whole-word-masking-config.json",
-    'bert-base-japanese-char': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-char-config.json",
-    'bert-base-japanese-char-whole-word-masking': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-char-whole-word-masking-config.json"
+    'bert-base-japanese': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-config.json",
+    'bert-base-japanese-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-config.json",
+    'bert-base-japanese-char': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-config.json",
+    'bert-base-japanese-char-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-config.json"
 }
 
 
diff --git a/transformers/modeling_bert.py b/transformers/modeling_bert.py
index e2e115a015..d0f35272ac 100644
--- a/transformers/modeling_bert.py
+++ b/transformers/modeling_bert.py
@@ -48,10 +48,10 @@ BERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
     'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-pytorch_model.bin",
     'bert-base-german-dbmdz-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-pytorch_model.bin",
     'bert-base-german-dbmdz-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-pytorch_model.bin",
-    'bert-base-japanese': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-pytorch_model.bin",
-    'bert-base-japanese-whole-word-masking': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-whole-word-masking-pytorch_model.bin",
-    'bert-base-japanese-char': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-char-pytorch_model.bin",
-    'bert-base-japanese-char-whole-word-masking': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-char-whole-word-masking-pytorch_model.bin"
+    'bert-base-japanese': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-pytorch_model.bin",
+    'bert-base-japanese-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-pytorch_model.bin",
+    'bert-base-japanese-char': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-pytorch_model.bin",
+    'bert-base-japanese-char-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-pytorch_model.bin"
 }
 
 
diff --git a/transformers/modeling_tf_bert.py b/transformers/modeling_tf_bert.py
index 27dd311a4d..7cc71f5063 100644
--- a/transformers/modeling_tf_bert.py
+++ b/transformers/modeling_tf_bert.py
@@ -48,10 +48,10 @@ TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
     'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-tf_model.h5",
     'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-tf_model.h5",
     'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-tf_model.h5",
-    'bert-base-japanese': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-tf_model.h5",
-    'bert-base-japanese-whole-word-masking': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-whole-word-masking-tf_model.h5",
-    'bert-base-japanese-char': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-char-tf_model.h5",
-    'bert-base-japanese-char-whole-word-masking': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-char-whole-word-masking-tf_model.h5"
+    'bert-base-japanese': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-tf_model.h5",
+    'bert-base-japanese-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-tf_model.h5",
+    'bert-base-japanese-char': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-tf_model.h5",
+    'bert-base-japanese-char-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-tf_model.h5"
 }
 
 
diff --git a/transformers/tokenization_bert_japanese.py b/transformers/tokenization_bert_japanese.py
index 1ce0e1d1cb..0ff45cbfe7 100644
--- a/transformers/tokenization_bert_japanese.py
+++ b/transformers/tokenization_bert_japanese.py
@@ -33,10 +33,10 @@ VOCAB_FILES_NAMES = {'vocab_file': 'vocab.txt'}
 PRETRAINED_VOCAB_FILES_MAP = {
     'vocab_file':
     {
-        'bert-base-japanese': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-vocab.txt",
-        'bert-base-japanese-whole-word-masking': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-whole-word-masking-vocab.txt",
-        'bert-base-japanese-char': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-char-vocab.txt",
-        'bert-base-japanese-char-whole-word-masking': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-char-whole-word-masking-vocab.txt"
+        'bert-base-japanese': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-vocab.txt",
+        'bert-base-japanese-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-vocab.txt",
+        'bert-base-japanese-char': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-vocab.txt",
+        'bert-base-japanese-char-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-vocab.txt"
     }
 }
 

From 1748fdf657ed804f3edc1e45077b703cd8d6e4c5 Mon Sep 17 00:00:00 2001
From: Julien Chaumond <chaumond@gmail.com>
Date: Wed, 11 Dec 2019 23:31:23 +0000
Subject: [PATCH 90/91] [doc] Fix rst table

---
 docs/source/pretrained_models.rst | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/docs/source/pretrained_models.rst b/docs/source/pretrained_models.rst
index d3498e057d..775772e896 100644
--- a/docs/source/pretrained_models.rst
+++ b/docs/source/pretrained_models.rst
@@ -63,22 +63,22 @@ Here is the full list of the currently provided pretrained models together with
 |                   |                                                            | (see `details on dbmdz repository <https://github.com/dbmdz/german-bert>`__).                                                         |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                   | ``bert-base-japanese``                                     | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
-|                   |                                                            | | Trained on Japanese text. Text is tokenized with MeCab and WordPiece.                                                 |
-|                   |                                                            | | `MeCab <https://taku910.github.io/mecab/>`__ is required for tokenization.                                                            |
-|                   |                                                            | (see `details on cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__).                                             |
+|                   |                                                            | | Trained on Japanese text. Text is tokenized with MeCab and WordPiece.                                                               |
+|                   |                                                            | | `MeCab <https://taku910.github.io/mecab/>`__ is required for tokenization.                                                          |
+|                   |                                                            | (see `details on cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__).                                               |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                   | ``bert-base-japanese-whole-word-masking``                  | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
-|                   |                                                            | | Trained on Japanese text using Whole-Word-Masking. Text is tokenized with MeCab and WordPiece.                        |
-|                   |                                                            | | `MeCab <https://taku910.github.io/mecab/>`__ is required for tokenization.                                                            |
-|                   |                                                            | (see `details on cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__).                                             |
+|                   |                                                            | | Trained on Japanese text using Whole-Word-Masking. Text is tokenized with MeCab and WordPiece.                                      |
+|                   |                                                            | | `MeCab <https://taku910.github.io/mecab/>`__ is required for tokenization.                                                          |
+|                   |                                                            | (see `details on cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__).                                               |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                   | ``bert-base-japanese-char``                                | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
-|                   |                                                            | | Trained on Japanese text. Text is tokenized into characters.                                                          |
-|                   |                                                            | (see `details on cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__).                                             |
+|                   |                                                            | | Trained on Japanese text. Text is tokenized into characters.                                                                        |
+|                   |                                                            | (see `details on cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__).                                               |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                   | ``bert-base-japanese-char-whole-word-masking``             | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
-|                   |                                                            | | Trained on Japanese text using Whole-Word-Masking. Text is tokenized into characters.                                 |
-|                   |                                                            | (see `details on cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__).                                             |
+|                   |                                                            | | Trained on Japanese text using Whole-Word-Masking. Text is tokenized into characters.                                               |
+|                   |                                                            | (see `details on cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__).                                               |
 +-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 | GPT               | ``openai-gpt``                                             | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
 |                   |                                                            | | OpenAI GPT English model                                                                                                            |

From 413f41921b650418798f7d5c246316c4e1e5eb5d Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Thu, 12 Dec 2019 07:34:42 +0100
Subject: [PATCH 91/91] fix merge

---
 transformers/tests/utils.py | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/transformers/tests/utils.py b/transformers/tests/utils.py
index daed431995..c950ad8f17 100644
--- a/transformers/tests/utils.py
+++ b/transformers/tests/utils.py
@@ -9,14 +9,6 @@ from transformers.file_utils import _tf_available, _torch_available
 SMALL_MODEL_IDENTIFIER = "julien-c/bert-xsmall-dummy"
 
 
-try:
-    run_slow = os.environ["RUN_SLOW"]
-except KeyError:
-    # RUN_SLOW isn't set, default to skipping slow tests.
-    _run_slow_tests = False
-else:
-    # RUN_SLOW is set, convert it to True or False.
-
 def parse_flag_from_env(key, default=False):
     try:
         value = os.environ[key]