From 3bd55199cdd9752be5fbd604d194d4f6ca923630 Mon Sep 17 00:00:00 2001
From: Funtowicz Morgan <mfuntowicz@users.noreply.github.com>
Date: Thu, 9 Jul 2020 15:11:40 +0200
Subject: [PATCH] QA pipeline BART compatible (#5496)

* Ensure padding and question cannot have higher probs than context.

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Add bart the the list of tokenizers adding two <sep> tokens for squad_convert_example_to_feature

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Format.

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Addressing @patrickvonplaten comments.

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Addressing @patrickvonplaten comments about masking non-context element when generating the answer.

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Addressing @sshleifer comments.

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Make sure we mask CLS after handling impossible answers

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Mask in the correct vectors ...

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>
---
 src/transformers/data/processors/squad.py | 10 +++++++++-
 src/transformers/pipelines.py             | 18 +++++++++++-------
 2 files changed, 20 insertions(+), 8 deletions(-)
diff --git a/src/transformers/data/processors/squad.py b/src/transformers/data/processors/squad.py
index 1c840639ca..12d96bae2b 100644
--- a/src/transformers/data/processors/squad.py
+++ b/src/transformers/data/processors/squad.py
@@ -12,6 +12,10 @@ from ...tokenization_bert import whitespace_tokenize
 from .utils import DataProcessor
 
 
+# Store the tokenizers which insert 2 separators tokens
+MULTI_SEP_TOKENS_TOKENIZERS_SET = {"roberta", "camembert", "bart"}
+
+
 if is_torch_available():
     import torch
     from torch.utils.data import TensorDataset
@@ -123,9 +127,13 @@ def squad_convert_example_to_features(example, max_seq_length, doc_stride, max_q
     truncated_query = tokenizer.encode(
         example.question_text, add_special_tokens=False, truncation=True, max_length=max_query_length
     )
+
+    # Tokenizers who insert 2 SEP tokens in-between <context> & <question> need to have special handling
+    # in the way they compute mask of added tokens.
+    tokenizer_type = type(tokenizer).__name__.replace("Tokenizer", "").lower()
     sequence_added_tokens = (
         tokenizer.max_len - tokenizer.max_len_single_sentence + 1
-        if "roberta" in str(type(tokenizer)) or "camembert" in str(type(tokenizer))
+        if tokenizer_type in MULTI_SEP_TOKENS_TOKENIZERS_SET
         else tokenizer.max_len - tokenizer.max_len_single_sentence
     )
     sequence_pair_added_tokens = tokenizer.max_len - tokenizer.max_len_sentences_pair
diff --git a/src/transformers/pipelines.py b/src/transformers/pipelines.py
index 7b7a56bfbd..f4302ff087 100755
--- a/src/transformers/pipelines.py
+++ b/src/transformers/pipelines.py
@@ -1297,14 +1297,15 @@ class QuestionAnsweringPipeline(Pipeline):
             min_null_score = 1000000  # large and positive
             answers = []
             for (feature, start_, end_) in zip(features, start, end):
-                # Mask padding and question
-                start_, end_ = (
-                    start_ * np.abs(np.array(feature.p_mask) - 1),
-                    end_ * np.abs(np.array(feature.p_mask) - 1),
-                )
+                # Ensure padded tokens & question tokens cannot belong to the set of candidate answers.
+                undesired_tokens = np.abs(np.array(feature.p_mask) - 1) & feature.attention_mask
 
-                # Mask CLS
-                start_[0] = end_[0] = 0
+                # Generate mask
+                undesired_tokens_mask = undesired_tokens == 0.0
+
+                # Make sure non-context indexes in the tensor cannot contribute to the softmax
+                start_ = np.where(undesired_tokens_mask, -10000.0, start_)
+                end_ = np.where(undesired_tokens_mask, -10000.0, end_)
 
                 # Normalize logits and spans to retrieve the answer
                 start_ = np.exp(start_ - np.log(np.sum(np.exp(start_), axis=-1, keepdims=True)))
@@ -1313,6 +1314,9 @@ class QuestionAnsweringPipeline(Pipeline):
                 if kwargs["handle_impossible_answer"]:
                     min_null_score = min(min_null_score, (start_[0] * end_[0]).item())
 
+                # Mask CLS
+                start_[0] = end_[0] = 0.0
+
                 starts, ends, scores = self.decode(start_, end_, kwargs["topk"], kwargs["max_answer_len"])
                 char_to_word = np.array(example.char_to_word_offset)