QA pipeline BART compatible (#5496)
* Ensure padding and question cannot have higher probs than context. Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com> * Add bart to the list of tokenizers adding two <sep> tokens for squad_convert_example_to_feature Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com> * Format. Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com> * Addressing @patrickvonplaten comments. Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com> * Addressing @patrickvonplaten comments about masking non-context element when generating the answer. Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com> * Addressing @sshleifer comments. Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com> * Make sure we mask CLS after handling impossible answers Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com> * Mask in the correct vectors ... Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>
This commit is contained in:
@@ -12,6 +12,10 @@ from ...tokenization_bert import whitespace_tokenize
|
|||||||
from .utils import DataProcessor
|
from .utils import DataProcessor
|
||||||
|
|
||||||
|
|
||||||
|
# Store the tokenizers which insert 2 separator tokens
|
||||||
|
MULTI_SEP_TOKENS_TOKENIZERS_SET = {"roberta", "camembert", "bart"}
|
||||||
|
|
||||||
|
|
||||||
if is_torch_available():
|
if is_torch_available():
|
||||||
import torch
|
import torch
|
||||||
from torch.utils.data import TensorDataset
|
from torch.utils.data import TensorDataset
|
||||||
@@ -123,9 +127,13 @@ def squad_convert_example_to_features(example, max_seq_length, doc_stride, max_q
|
|||||||
truncated_query = tokenizer.encode(
|
truncated_query = tokenizer.encode(
|
||||||
example.question_text, add_special_tokens=False, truncation=True, max_length=max_query_length
|
example.question_text, add_special_tokens=False, truncation=True, max_length=max_query_length
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Tokenizers who insert 2 SEP tokens in-between <context> & <question> need to have special handling
|
||||||
|
# in the way they compute mask of added tokens.
|
||||||
|
tokenizer_type = type(tokenizer).__name__.replace("Tokenizer", "").lower()
|
||||||
sequence_added_tokens = (
|
sequence_added_tokens = (
|
||||||
tokenizer.max_len - tokenizer.max_len_single_sentence + 1
|
tokenizer.max_len - tokenizer.max_len_single_sentence + 1
|
||||||
if "roberta" in str(type(tokenizer)) or "camembert" in str(type(tokenizer))
|
if tokenizer_type in MULTI_SEP_TOKENS_TOKENIZERS_SET
|
||||||
else tokenizer.max_len - tokenizer.max_len_single_sentence
|
else tokenizer.max_len - tokenizer.max_len_single_sentence
|
||||||
)
|
)
|
||||||
sequence_pair_added_tokens = tokenizer.max_len - tokenizer.max_len_sentences_pair
|
sequence_pair_added_tokens = tokenizer.max_len - tokenizer.max_len_sentences_pair
|
||||||
|
|||||||
@@ -1297,14 +1297,15 @@ class QuestionAnsweringPipeline(Pipeline):
|
|||||||
min_null_score = 1000000 # large and positive
|
min_null_score = 1000000 # large and positive
|
||||||
answers = []
|
answers = []
|
||||||
for (feature, start_, end_) in zip(features, start, end):
|
for (feature, start_, end_) in zip(features, start, end):
|
||||||
# Mask padding and question
|
# Ensure padded tokens & question tokens cannot belong to the set of candidate answers.
|
||||||
start_, end_ = (
|
undesired_tokens = np.abs(np.array(feature.p_mask) - 1) & feature.attention_mask
|
||||||
start_ * np.abs(np.array(feature.p_mask) - 1),
|
|
||||||
end_ * np.abs(np.array(feature.p_mask) - 1),
|
|
||||||
)
|
|
||||||
|
|
||||||
# Mask CLS
|
# Generate mask
|
||||||
start_[0] = end_[0] = 0
|
undesired_tokens_mask = undesired_tokens == 0.0
|
||||||
|
|
||||||
|
# Make sure non-context indexes in the tensor cannot contribute to the softmax
|
||||||
|
start_ = np.where(undesired_tokens_mask, -10000.0, start_)
|
||||||
|
end_ = np.where(undesired_tokens_mask, -10000.0, end_)
|
||||||
|
|
||||||
# Normalize logits and spans to retrieve the answer
|
# Normalize logits and spans to retrieve the answer
|
||||||
start_ = np.exp(start_ - np.log(np.sum(np.exp(start_), axis=-1, keepdims=True)))
|
start_ = np.exp(start_ - np.log(np.sum(np.exp(start_), axis=-1, keepdims=True)))
|
||||||
@@ -1313,6 +1314,9 @@ class QuestionAnsweringPipeline(Pipeline):
|
|||||||
if kwargs["handle_impossible_answer"]:
|
if kwargs["handle_impossible_answer"]:
|
||||||
min_null_score = min(min_null_score, (start_[0] * end_[0]).item())
|
min_null_score = min(min_null_score, (start_[0] * end_[0]).item())
|
||||||
|
|
||||||
|
# Mask CLS
|
||||||
|
start_[0] = end_[0] = 0.0
|
||||||
|
|
||||||
starts, ends, scores = self.decode(start_, end_, kwargs["topk"], kwargs["max_answer_len"])
|
starts, ends, scores = self.decode(start_, end_, kwargs["topk"], kwargs["max_answer_len"])
|
||||||
char_to_word = np.array(example.char_to_word_offset)
|
char_to_word = np.array(example.char_to_word_offset)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user