From 3bd55199cdd9752be5fbd604d194d4f6ca923630 Mon Sep 17 00:00:00 2001 From: Funtowicz Morgan Date: Thu, 9 Jul 2020 15:11:40 +0200 Subject: [PATCH] QA pipeline BART compatible (#5496) * Ensure padding and question cannot have higher probs than context. Signed-off-by: Morgan Funtowicz * Add bart to the list of tokenizers adding two tokens for squad_convert_example_to_feature Signed-off-by: Morgan Funtowicz * Format. Signed-off-by: Morgan Funtowicz * Addressing @patrickvonplaten comments. Signed-off-by: Morgan Funtowicz * Addressing @patrickvonplaten comments about masking non-context element when generating the answer. Signed-off-by: Morgan Funtowicz * Addressing @sshleifer comments. Signed-off-by: Morgan Funtowicz * Make sure we mask CLS after handling impossible answers Signed-off-by: Morgan Funtowicz * Mask in the correct vectors ... Signed-off-by: Morgan Funtowicz --- src/transformers/data/processors/squad.py | 10 +++++++++- src/transformers/pipelines.py | 18 +++++++++++------- 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/src/transformers/data/processors/squad.py b/src/transformers/data/processors/squad.py index 1c840639ca..12d96bae2b 100644 --- a/src/transformers/data/processors/squad.py +++ b/src/transformers/data/processors/squad.py @@ -12,6 +12,10 @@ from ...tokenization_bert import whitespace_tokenize from .utils import DataProcessor +# Store the tokenizers which insert 2 separators tokens +MULTI_SEP_TOKENS_TOKENIZERS_SET = {"roberta", "camembert", "bart"} + + if is_torch_available(): import torch from torch.utils.data import TensorDataset @@ -123,9 +127,13 @@ def squad_convert_example_to_features(example, max_seq_length, doc_stride, max_q truncated_query = tokenizer.encode( example.question_text, add_special_tokens=False, truncation=True, max_length=max_query_length ) + + # Tokenizers who insert 2 SEP tokens in-between & need to have special handling + # in the way they compute mask of added tokens. 
+ tokenizer_type = type(tokenizer).__name__.replace("Tokenizer", "").lower() sequence_added_tokens = ( tokenizer.max_len - tokenizer.max_len_single_sentence + 1 - if "roberta" in str(type(tokenizer)) or "camembert" in str(type(tokenizer)) + if tokenizer_type in MULTI_SEP_TOKENS_TOKENIZERS_SET else tokenizer.max_len - tokenizer.max_len_single_sentence ) sequence_pair_added_tokens = tokenizer.max_len - tokenizer.max_len_sentences_pair diff --git a/src/transformers/pipelines.py b/src/transformers/pipelines.py index 7b7a56bfbd..f4302ff087 100755 --- a/src/transformers/pipelines.py +++ b/src/transformers/pipelines.py @@ -1297,14 +1297,15 @@ class QuestionAnsweringPipeline(Pipeline): min_null_score = 1000000 # large and positive answers = [] for (feature, start_, end_) in zip(features, start, end): - # Mask padding and question - start_, end_ = ( - start_ * np.abs(np.array(feature.p_mask) - 1), - end_ * np.abs(np.array(feature.p_mask) - 1), - ) + # Ensure padded tokens & question tokens cannot belong to the set of candidate answers. + undesired_tokens = np.abs(np.array(feature.p_mask) - 1) & feature.attention_mask - # Mask CLS - start_[0] = end_[0] = 0 + # Generate mask + undesired_tokens_mask = undesired_tokens == 0.0 + + # Make sure non-context indexes in the tensor cannot contribute to the softmax + start_ = np.where(undesired_tokens_mask, -10000.0, start_) + end_ = np.where(undesired_tokens_mask, -10000.0, end_) # Normalize logits and spans to retrieve the answer start_ = np.exp(start_ - np.log(np.sum(np.exp(start_), axis=-1, keepdims=True))) @@ -1313,6 +1314,9 @@ class QuestionAnsweringPipeline(Pipeline): if kwargs["handle_impossible_answer"]: min_null_score = min(min_null_score, (start_[0] * end_[0]).item()) + # Mask CLS + start_[0] = end_[0] = 0.0 + starts, ends, scores = self.decode(start_, end_, kwargs["topk"], kwargs["max_answer_len"]) char_to_word = np.array(example.char_to_word_offset)