From edf0582c0be87b60f94f41c659ea779876efc7be Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Simon=20B=C3=B6hm?= Date: Fri, 17 Apr 2020 17:14:12 +0200 Subject: [PATCH] Fix token_type_id in BERT question-answering example (#3790) token_type_id is converted into the segment embedding. For question answering, this needs to highlight whether a token belongs to sequence 0 or 1. encode_plus takes care of correctly setting this parameter automatically. --- src/transformers/modeling_bert.py | 4 ++-- src/transformers/modeling_tf_bert.py | 14 ++++++++++---- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/src/transformers/modeling_bert.py b/src/transformers/modeling_bert.py index 1930c31d9d..3cfa13acdd 100644 --- a/src/transformers/modeling_bert.py +++ b/src/transformers/modeling_bert.py @@ -1406,8 +1406,8 @@ class BertForQuestionAnswering(BertPreTrainedModel): model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad') question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet" - input_ids = tokenizer.encode(question, text) - token_type_ids = [0 if i <= input_ids.index(102) else 1 for i in range(len(input_ids))] + encoding = tokenizer.encode_plus(question, text) + input_ids, token_type_ids = encoding["input_ids"], encoding["token_type_ids"] start_scores, end_scores = model(torch.tensor([input_ids]), token_type_ids=torch.tensor([token_type_ids])) all_tokens = tokenizer.convert_ids_to_tokens(input_ids) diff --git a/src/transformers/modeling_tf_bert.py b/src/transformers/modeling_tf_bert.py index 9ad828ee78..32082f77b1 100644 --- a/src/transformers/modeling_tf_bert.py +++ b/src/transformers/modeling_tf_bert.py @@ -1148,10 +1148,16 @@ class TFBertForQuestionAnswering(TFBertPreTrainedModel): from transformers import BertTokenizer, TFBertForQuestionAnswering tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') - model = TFBertForQuestionAnswering.from_pretrained('bert-base-uncased') - input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 - outputs = model(input_ids) - start_scores, end_scores = outputs[:2] + model = TFBertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad') + + question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet" + encoding = tokenizer.encode_plus(question, text) + input_ids, token_type_ids = encoding["input_ids"], encoding["token_type_ids"] + start_scores, end_scores = model(tf.constant(input_ids)[None, :], token_type_ids=tf.constant(token_type_ids)[None, :]) + + all_tokens = tokenizer.convert_ids_to_tokens(input_ids) + answer = ' '.join(all_tokens[tf.math.argmax(tf.squeeze(start_scores)) : tf.math.argmax(tf.squeeze(end_scores))+1]) + assert answer == "a nice puppet" """ outputs = self.bert(inputs, **kwargs)