From df52abe3733484bf62069e819680909e349bac72 Mon Sep 17 00:00:00 2001 From: erenup Date: Wed, 28 Aug 2019 16:36:21 +0800 Subject: [PATCH] add sep_token between question and choice --- .../utils_multiple_choice.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/examples/single_model_scripts/utils_multiple_choice.py b/examples/single_model_scripts/utils_multiple_choice.py index 34505195ed..6a9fa0e64f 100644 --- a/examples/single_model_scripts/utils_multiple_choice.py +++ b/examples/single_model_scripts/utils_multiple_choice.py @@ -329,7 +329,12 @@ def convert_examples_to_features(examples, label_list, max_seq_length, if example.question.find("_") != -1: tokens_b = tokenizer.tokenize(example.question.replace("_", ending)) else: - tokens_b = tokenizer.tokenize(example.question + " " + ending) + tokens_b = tokenizer.tokenize(example.question) + tokens_b += [sep_token] + if sep_token_extra: + tokens_b += [sep_token] + tokens_b += tokenizer.tokenize(ending) + special_tokens_count = 4 if sep_token_extra else 3 _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - special_tokens_count) @@ -425,10 +430,11 @@ def _truncate_seq_pair(tokens_a, tokens_b, max_length): total_length = len(tokens_a) + len(tokens_b) if total_length <= max_length: break - if len(tokens_a) > len(tokens_b): - tokens_a.pop() - else: - tokens_b.pop() + # if len(tokens_a) > len(tokens_b): + # tokens_a.pop() + # else: + # tokens_b.pop() + tokens_a.pop() processors = {