From 9ecd83dace3961eaa161405814b00ea595c86451 Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Thu, 5 Dec 2019 14:44:57 -0500
Subject: [PATCH] Patch evaluation for impossible values + cleanup

---
 docs/source/main_classes/processors.rst |  4 ++--
 examples/run_squad.py                   | 25 +++++--------------------
 transformers/data/processors/squad.py   |  6 +++---
 transformers/tokenization_utils.py      |  2 +-
 4 files changed, 11 insertions(+), 26 deletions(-)

diff --git a/docs/source/main_classes/processors.rst b/docs/source/main_classes/processors.rst
index ce0eeb553a..e98910ae1b 100644
--- a/docs/source/main_classes/processors.rst
+++ b/docs/source/main_classes/processors.rst
@@ -55,7 +55,7 @@ Example usage
 ^^^^^^^^^^^^^^^^^^^^^^^^^
 
 An example using these processors is given in the
-`run_glue.py <https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_glue.py>`__ script.
+`run_glue.py <https://github.com/huggingface/transformers/blob/master/examples/run_glue.py>`__ script.
 
 
 
@@ -132,4 +132,4 @@ Example::
 
 
 Another example using these processors is given in the
-`run_squad.py <https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_squad.py>`__ script.
\ No newline at end of file
+`run_squad.py <https://github.com/huggingface/transformers/blob/master/examples/run_squad.py>`__ script.
\ No newline at end of file
diff --git a/examples/run_squad.py b/examples/run_squad.py
index 3f1b6a798f..5caff9ae4f 100644
--- a/examples/run_squad.py
+++ b/examples/run_squad.py
@@ -311,7 +311,8 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal
         str(args.max_seq_length)))
     if os.path.exists(cached_features_file) and not args.overwrite_cache and not output_examples:
         logger.info("Loading features from cached file %s", cached_features_file)
-        features = torch.load(cached_features_file)
+        features_and_dataset = torch.load(cached_features_file)
+        features, dataset = features_and_dataset["features"], features_and_dataset["dataset"]
     else:
         logger.info("Creating features from dataset file at %s", input_dir)
 
@@ -330,40 +331,24 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal
             processor = SquadV2Processor() if args.version_2_with_negative else SquadV1Processor()
             examples = processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
 
-        features = squad_convert_examples_to_features( 
+        features, dataset = squad_convert_examples_to_features( 
             examples=examples,
             tokenizer=tokenizer,
             max_seq_length=args.max_seq_length,
             doc_stride=args.doc_stride,
             max_query_length=args.max_query_length,
             is_training=not evaluate,
+            return_dataset='pt'
         )
 
 
         if args.local_rank in [-1, 0]:
             logger.info("Saving features into cached file %s", cached_features_file)
-            torch.save(features, cached_features_file)
+            torch.save({"features": features, "dataset": dataset}, cached_features_file)
 
     if args.local_rank == 0 and not evaluate:
         torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache
 
-    # Convert to Tensors and build dataset
-    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
-    all_input_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
-    all_segment_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
-    all_cls_index = torch.tensor([f.cls_index for f in features], dtype=torch.long)
-    all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float)
-    if evaluate:
-        all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
-        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
-                                all_example_index, all_cls_index, all_p_mask)
-    else:
-        all_start_positions = torch.tensor([f.start_position for f in features], dtype=torch.long)
-        all_end_positions = torch.tensor([f.end_position for f in features], dtype=torch.long)
-        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
-                                all_start_positions, all_end_positions,
-                                all_cls_index, all_p_mask)
-
     if output_examples:
         return dataset, examples, features
     return dataset
diff --git a/transformers/data/processors/squad.py b/transformers/data/processors/squad.py
index 338bae0c51..bb56aa792f 100644
--- a/transformers/data/processors/squad.py
+++ b/transformers/data/processors/squad.py
@@ -312,7 +312,7 @@ class SquadProcessor(DataProcessor):
         if not evaluate:
             answer = tensor_dict['answers']['text'][0].numpy().decode('utf-8')
             answer_start = tensor_dict['answers']['answer_start'][0].numpy()
-            answers = None
+            answers = []
         else:
             answers = [{
                 "answer_start": start.numpy(), 
@@ -408,7 +408,7 @@ class SquadProcessor(DataProcessor):
                     question_text = qa["question"]
                     start_position_character = None
                     answer_text = None
-                    answers = None
+                    answers = []
                     
                     if "is_impossible" in qa:
                         is_impossible = qa["is_impossible"]
@@ -469,7 +469,7 @@ class SquadExample(object):
                  answer_text,
                  start_position_character,
                  title,
-                 answers=None,
+                 answers=[],
                  is_impossible=False):
         self.qas_id = qas_id
         self.question_text = question_text
diff --git a/transformers/tokenization_utils.py b/transformers/tokenization_utils.py
index 41a611ea49..5ec173bbf6 100644
--- a/transformers/tokenization_utils.py
+++ b/transformers/tokenization_utils.py
@@ -194,7 +194,7 @@ class PreTrainedTokenizer(object):
 
     @property
     def pad_token_type_id(self):
-        """ Id of the padding token in the vocabulary. Log an error if used while not having been set. """
+        """ Id of the padding token type in the vocabulary."""
         return self._pad_token_type_id
 
     @property