From ea52f82455a7ca0f979768204dfeb38b5fff13ad Mon Sep 17 00:00:00 2001 From: Lysandre Date: Mon, 18 Nov 2019 14:42:59 -0500 Subject: [PATCH 01/26] Moved some SQuAD logic to /data --- transformers/__init__.py | 3 +- transformers/data/__init__.py | 3 +- transformers/data/processors/__init__.py | 1 + transformers/data/processors/squad.py | 318 +++++++++++++++++++++++ 4 files changed, 323 insertions(+), 2 deletions(-) create mode 100644 transformers/data/processors/squad.py diff --git a/transformers/__init__.py b/transformers/__init__.py index 5c7b0a6197..b859e18c53 100644 --- a/transformers/__init__.py +++ b/transformers/__init__.py @@ -25,7 +25,8 @@ from .file_utils import (TRANSFORMERS_CACHE, PYTORCH_TRANSFORMERS_CACHE, PYTORCH from .data import (is_sklearn_available, InputExample, InputFeatures, DataProcessor, glue_output_modes, glue_convert_examples_to_features, - glue_processors, glue_tasks_num_labels) + glue_processors, glue_tasks_num_labels, + squad_convert_examples_to_features, SquadFeatures) if is_sklearn_available(): from .data import glue_compute_metrics diff --git a/transformers/data/__init__.py b/transformers/data/__init__.py index e910d6da2e..827d96ed29 100644 --- a/transformers/data/__init__.py +++ b/transformers/data/__init__.py @@ -1,5 +1,6 @@ -from .processors import InputExample, InputFeatures, DataProcessor +from .processors import InputExample, InputFeatures, DataProcessor, SquadFeatures from .processors import glue_output_modes, glue_processors, glue_tasks_num_labels, glue_convert_examples_to_features +from .processors import squad_convert_examples_to_features from .metrics import is_sklearn_available if is_sklearn_available(): diff --git a/transformers/data/processors/__init__.py b/transformers/data/processors/__init__.py index af38c54beb..4e322a2ca8 100644 --- a/transformers/data/processors/__init__.py +++ b/transformers/data/processors/__init__.py @@ -1,3 +1,4 @@ from .utils import InputExample, InputFeatures, DataProcessor from .glue import 
glue_output_modes, glue_processors, glue_tasks_num_labels, glue_convert_examples_to_features +from .squad import squad_convert_examples_to_features, SquadFeatures diff --git a/transformers/data/processors/squad.py b/transformers/data/processors/squad.py new file mode 100644 index 0000000000..c1a1034f17 --- /dev/null +++ b/transformers/data/processors/squad.py @@ -0,0 +1,318 @@ +from tqdm import tqdm +import collections +import logging +import os + +from .utils import DataProcessor, InputExample, InputFeatures +from ...file_utils import is_tf_available + +if is_tf_available(): + import tensorflow as tf + +logger = logging.getLogger(__name__) + +def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, + doc_stride, max_query_length, is_training, + cls_token_at_end=False, + cls_token='[CLS]', sep_token='[SEP]', pad_token=0, + sequence_a_segment_id=0, sequence_b_segment_id=1, + cls_token_segment_id=0, pad_token_segment_id=0, + mask_padding_with_zero=True, + sequence_a_is_doc=False): + """Loads a data file into a list of `InputBatch`s.""" + + # Defining helper methods + def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer, + orig_answer_text): + """Returns tokenized answer spans that better match the annotated answer.""" + tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text)) + + for new_start in range(input_start, input_end + 1): + for new_end in range(input_end, new_start - 1, -1): + text_span = " ".join(doc_tokens[new_start:(new_end + 1)]) + if text_span == tok_answer_text: + return (new_start, new_end) + + return (input_start, input_end) + def _check_is_max_context(doc_spans, cur_span_index, position): + """Check if this is the 'max context' doc span for the token.""" + best_score = None + best_span_index = None + for (span_index, doc_span) in enumerate(doc_spans): + end = doc_span.start + doc_span.length - 1 + if position < doc_span.start: + continue + if position > end: + continue + num_left_context = position - 
doc_span.start + num_right_context = end - position + score = min(num_left_context, num_right_context) + 0.01 * doc_span.length + if best_score is None or score > best_score: + best_score = score + best_span_index = span_index + + return cur_span_index == best_span_index + + unique_id = 1000000000 + + features = [] + for (example_index, example) in enumerate(tqdm(examples)): + query_tokens = tokenizer.tokenize(example.question_text) + + if len(query_tokens) > max_query_length: + query_tokens = query_tokens[0:max_query_length] + + tok_to_orig_index = [] + orig_to_tok_index = [] + all_doc_tokens = [] + for (i, token) in enumerate(example.doc_tokens): + orig_to_tok_index.append(len(all_doc_tokens)) + sub_tokens = tokenizer.tokenize(token) + for sub_token in sub_tokens: + tok_to_orig_index.append(i) + all_doc_tokens.append(sub_token) + + tok_start_position = None + tok_end_position = None + if is_training and example.is_impossible: + tok_start_position = -1 + tok_end_position = -1 + if is_training and not example.is_impossible: + tok_start_position = orig_to_tok_index[example.start_position] + if example.end_position < len(example.doc_tokens) - 1: + tok_end_position = orig_to_tok_index[example.end_position + 1] - 1 + else: + tok_end_position = len(all_doc_tokens) - 1 + (tok_start_position, tok_end_position) = _improve_answer_span( + all_doc_tokens, tok_start_position, tok_end_position, tokenizer, + example.orig_answer_text) + + # The -3 accounts for [CLS], [SEP] and [SEP] + max_tokens_for_doc = max_seq_length - len(query_tokens) - 3 + + # We can have documents that are longer than the maximum sequence length. + # To deal with this we do a sliding window approach, where we take chunks + # of the up to our max length with a stride of `doc_stride`. 
+ _DocSpan = collections.namedtuple( # pylint: disable=invalid-name + "DocSpan", ["start", "length"]) + doc_spans = [] + start_offset = 0 + while start_offset < len(all_doc_tokens): + length = len(all_doc_tokens) - start_offset + if length > max_tokens_for_doc: + length = max_tokens_for_doc + doc_spans.append(_DocSpan(start=start_offset, length=length)) + if start_offset + length == len(all_doc_tokens): + break + start_offset += min(length, doc_stride) + + for (doc_span_index, doc_span) in enumerate(doc_spans): + tokens = [] + token_to_orig_map = {} + token_is_max_context = {} + segment_ids = [] + + # p_mask: mask with 1 for token than cannot be in the answer (0 for token which can be in an answer) + # Original TF implem also keep the classification token (set to 0) (not sure why...) + p_mask = [] + + # CLS token at the beginning + if not cls_token_at_end: + tokens.append(cls_token) + segment_ids.append(cls_token_segment_id) + p_mask.append(0) + cls_index = 0 + + # XLNet: P SEP Q SEP CLS + # Others: CLS Q SEP P SEP + if not sequence_a_is_doc: + # Query + tokens += query_tokens + segment_ids += [sequence_a_segment_id] * len(query_tokens) + p_mask += [1] * len(query_tokens) + + # SEP token + tokens.append(sep_token) + segment_ids.append(sequence_a_segment_id) + p_mask.append(1) + + # Paragraph + for i in range(doc_span.length): + split_token_index = doc_span.start + i + token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index] + + is_max_context = _check_is_max_context(doc_spans, doc_span_index, + split_token_index) + token_is_max_context[len(tokens)] = is_max_context + tokens.append(all_doc_tokens[split_token_index]) + if not sequence_a_is_doc: + segment_ids.append(sequence_b_segment_id) + else: + segment_ids.append(sequence_a_segment_id) + p_mask.append(0) + paragraph_len = doc_span.length + + if sequence_a_is_doc: + # SEP token + tokens.append(sep_token) + segment_ids.append(sequence_a_segment_id) + p_mask.append(1) + + tokens += query_tokens + 
segment_ids += [sequence_b_segment_id] * len(query_tokens) + p_mask += [1] * len(query_tokens) + + # SEP token + tokens.append(sep_token) + segment_ids.append(sequence_b_segment_id) + p_mask.append(1) + + # CLS token at the end + if cls_token_at_end: + tokens.append(cls_token) + segment_ids.append(cls_token_segment_id) + p_mask.append(0) + cls_index = len(tokens) - 1 # Index of classification token + + input_ids = tokenizer.convert_tokens_to_ids(tokens) + + # The mask has 1 for real tokens and 0 for padding tokens. Only real + # tokens are attended to. + input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids) + + # Zero-pad up to the sequence length. + while len(input_ids) < max_seq_length: + input_ids.append(pad_token) + input_mask.append(0 if mask_padding_with_zero else 1) + segment_ids.append(pad_token_segment_id) + p_mask.append(1) + + assert len(input_ids) == max_seq_length + assert len(input_mask) == max_seq_length + assert len(segment_ids) == max_seq_length + + span_is_impossible = example.is_impossible + start_position = None + end_position = None + if is_training and not span_is_impossible: + # For training, if our document chunk does not contain an annotation + # we throw it out, since there is nothing to predict. 
+ doc_start = doc_span.start + doc_end = doc_span.start + doc_span.length - 1 + out_of_span = False + if not (tok_start_position >= doc_start and + tok_end_position <= doc_end): + out_of_span = True + if out_of_span: + start_position = 0 + end_position = 0 + span_is_impossible = True + else: + if sequence_a_is_doc: + doc_offset = 0 + else: + doc_offset = len(query_tokens) + 2 + start_position = tok_start_position - doc_start + doc_offset + end_position = tok_end_position - doc_start + doc_offset + + if is_training and span_is_impossible: + start_position = cls_index + end_position = cls_index + + if example_index < 20: + logger.info("*** Example ***") + logger.info("unique_id: %s" % (unique_id)) + logger.info("example_index: %s" % (example_index)) + logger.info("doc_span_index: %s" % (doc_span_index)) + logger.info("tokens: %s" % " ".join(tokens)) + logger.info("token_to_orig_map: %s" % " ".join([ + "%d:%d" % (x, y) for (x, y) in token_to_orig_map.items()])) + logger.info("token_is_max_context: %s" % " ".join([ + "%d:%s" % (x, y) for (x, y) in token_is_max_context.items() + ])) + logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) + logger.info( + "input_mask: %s" % " ".join([str(x) for x in input_mask])) + logger.info( + "segment_ids: %s" % " ".join([str(x) for x in segment_ids])) + if is_training and span_is_impossible: + logger.info("impossible example") + if is_training and not span_is_impossible: + answer_text = " ".join(tokens[start_position:(end_position + 1)]) + logger.info("start_position: %d" % (start_position)) + logger.info("end_position: %d" % (end_position)) + logger.info( + "answer: %s" % (answer_text)) + + features.append( + SquadFeatures( + unique_id=unique_id, + example_index=example_index, + doc_span_index=doc_span_index, + tokens=tokens, + token_to_orig_map=token_to_orig_map, + token_is_max_context=token_is_max_context, + input_ids=input_ids, + input_mask=input_mask, + segment_ids=segment_ids, + cls_index=cls_index, + 
p_mask=p_mask, + paragraph_len=paragraph_len, + start_position=start_position, + end_position=end_position, + is_impossible=span_is_impossible)) + unique_id += 1 + + return features + +class SquadFeatures(object): + """A single set of features of data.""" + + def __init__(self, + unique_id, + example_index, + doc_span_index, + tokens, + token_to_orig_map, + token_is_max_context, + input_ids, + input_mask, + segment_ids, + cls_index, + p_mask, + paragraph_len, + start_position=None, + end_position=None, + is_impossible=None): + self.unique_id = unique_id + self.example_index = example_index + self.doc_span_index = doc_span_index + self.tokens = tokens + self.token_to_orig_map = token_to_orig_map + self.token_is_max_context = token_is_max_context + self.input_ids = input_ids + self.input_mask = input_mask + self.segment_ids = segment_ids + self.cls_index = cls_index + self.p_mask = p_mask + self.paragraph_len = paragraph_len + self.start_position = start_position + self.end_position = end_position + self.is_impossible = is_impossible + + def __eq__(self, other): + return self.cls_index == other.cls_index and \ + self.doc_span_index == other.doc_span_index and \ + self.end_position == other.end_position and \ + self.example_index == other.example_index and \ + self.input_ids == other.input_ids and \ + self.input_mask == other.input_mask and \ + self.is_impossible == other.is_impossible and \ + self.p_mask == other.p_mask and \ + self.paragraph_len == other.paragraph_len and \ + self.segment_ids == other.segment_ids and \ + self.start_position == other.start_position and \ + self.token_is_max_context == other.token_is_max_context and \ + self.token_to_orig_map == other.token_to_orig_map and \ + self.tokens == other.tokens and \ + self.unique_id == other.unique_id \ No newline at end of file From 72e506b22e90feab6c410136bacc27f3d65284b9 Mon Sep 17 00:00:00 2001 From: Lysandre Date: Tue, 19 Nov 2019 09:49:55 -0500 Subject: [PATCH 02/26] wip --- examples/run_squad.py | 29 
+++++- transformers/__init__.py | 3 +- transformers/data/__init__.py | 2 +- transformers/data/processors/__init__.py | 2 +- transformers/data/processors/squad.py | 122 +++++++++++++++++++++++ transformers/tokenization_utils.py | 4 + 6 files changed, 157 insertions(+), 5 deletions(-) diff --git a/examples/run_squad.py b/examples/run_squad.py index 69088d73c3..d4219c3096 100644 --- a/examples/run_squad.py +++ b/examples/run_squad.py @@ -23,7 +23,6 @@ import os import random import glob import timeit - import numpy as np import torch from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, @@ -45,7 +44,7 @@ from transformers import (WEIGHTS_NAME, BertConfig, XLNetTokenizer, DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer) -from transformers import AdamW, get_linear_schedule_with_warmup +from transformers import AdamW, get_linear_schedule_with_warmup, squad_convert_examples_to_features, read_squad_examples as sread_squad_examples from utils_squad import (read_squad_examples, convert_examples_to_features, RawResult, write_predictions, @@ -309,6 +308,8 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal examples = read_squad_examples(input_file=input_file, is_training=not evaluate, version_2_with_negative=args.version_2_with_negative) + + examples = examples[:10] features = convert_examples_to_features(examples=examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, @@ -319,6 +320,30 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal pad_token_segment_id=3 if args.model_type in ['xlnet'] else 0, cls_token_at_end=True if args.model_type in ['xlnet'] else False, sequence_a_is_doc=True if args.model_type in ['xlnet'] else False) + + exampless = sread_squad_examples(input_file=input_file, + is_training=not evaluate, + version_2_with_negative=args.version_2_with_negative) + exampless = exampless[:10] + features2 = 
squad_convert_examples_to_features(examples=exampless, + tokenizer=tokenizer, + max_seq_length=args.max_seq_length, + doc_stride=args.doc_stride, + max_query_length=args.max_query_length, + is_training=not evaluate, + cls_token_segment_id=2 if args.model_type in ['xlnet'] else 0, + pad_token_segment_id=3 if args.model_type in ['xlnet'] else 0, + cls_token_at_end=True if args.model_type in ['xlnet'] else False, + sequence_a_is_doc=True if args.model_type in ['xlnet'] else False) + + print(features2) + + for i in range(len(features)): + assert features[i] == features2[i] + print("Equal") + + print("DONE") + if args.local_rank in [-1, 0]: logger.info("Saving features into cached file %s", cached_features_file) torch.save(features, cached_features_file) diff --git a/transformers/__init__.py b/transformers/__init__.py index b859e18c53..9a767913b3 100644 --- a/transformers/__init__.py +++ b/transformers/__init__.py @@ -26,7 +26,8 @@ from .data import (is_sklearn_available, InputExample, InputFeatures, DataProcessor, glue_output_modes, glue_convert_examples_to_features, glue_processors, glue_tasks_num_labels, - squad_convert_examples_to_features, SquadFeatures) + squad_convert_examples_to_features, SquadFeatures, + SquadExample, read_squad_examples) if is_sklearn_available(): from .data import glue_compute_metrics diff --git a/transformers/data/__init__.py b/transformers/data/__init__.py index 827d96ed29..50f2e768f4 100644 --- a/transformers/data/__init__.py +++ b/transformers/data/__init__.py @@ -1,6 +1,6 @@ from .processors import InputExample, InputFeatures, DataProcessor, SquadFeatures from .processors import glue_output_modes, glue_processors, glue_tasks_num_labels, glue_convert_examples_to_features -from .processors import squad_convert_examples_to_features +from .processors import squad_convert_examples_to_features, SquadExample, read_squad_examples from .metrics import is_sklearn_available if is_sklearn_available(): diff --git 
a/transformers/data/processors/__init__.py b/transformers/data/processors/__init__.py index 4e322a2ca8..924b4a1245 100644 --- a/transformers/data/processors/__init__.py +++ b/transformers/data/processors/__init__.py @@ -1,4 +1,4 @@ from .utils import InputExample, InputFeatures, DataProcessor from .glue import glue_output_modes, glue_processors, glue_tasks_num_labels, glue_convert_examples_to_features -from .squad import squad_convert_examples_to_features, SquadFeatures +from .squad import squad_convert_examples_to_features, SquadFeatures, SquadExample, read_squad_examples diff --git a/transformers/data/processors/squad.py b/transformers/data/processors/squad.py index c1a1034f17..1900e9f0ce 100644 --- a/transformers/data/processors/squad.py +++ b/transformers/data/processors/squad.py @@ -2,7 +2,9 @@ from tqdm import tqdm import collections import logging import os +import json +from ...tokenization_bert import BasicTokenizer, whitespace_tokenize from .utils import DataProcessor, InputExample, InputFeatures from ...file_utils import is_tf_available @@ -11,6 +13,7 @@ if is_tf_available(): logger = logging.getLogger(__name__) + def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, doc_stride, max_query_length, is_training, cls_token_at_end=False, @@ -265,6 +268,125 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, return features + +def read_squad_examples(input_file, is_training, version_2_with_negative): + """Read a SQuAD json file into a list of SquadExample.""" + with open(input_file, "r", encoding='utf-8') as reader: + input_data = json.load(reader)["data"] + + def is_whitespace(c): + if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F: + return True + return False + + examples = [] + for entry in input_data: + for paragraph in entry["paragraphs"]: + paragraph_text = paragraph["context"] + doc_tokens = [] + char_to_word_offset = [] + prev_is_whitespace = True + for c in paragraph_text: + if 
is_whitespace(c): + prev_is_whitespace = True + else: + if prev_is_whitespace: + doc_tokens.append(c) + else: + doc_tokens[-1] += c + prev_is_whitespace = False + char_to_word_offset.append(len(doc_tokens) - 1) + + for qa in paragraph["qas"]: + qas_id = qa["id"] + question_text = qa["question"] + start_position = None + end_position = None + orig_answer_text = None + is_impossible = False + if is_training: + if version_2_with_negative: + is_impossible = qa["is_impossible"] + if (len(qa["answers"]) != 1) and (not is_impossible): + raise ValueError( + "For training, each question should have exactly 1 answer.") + if not is_impossible: + answer = qa["answers"][0] + orig_answer_text = answer["text"] + answer_offset = answer["answer_start"] + answer_length = len(orig_answer_text) + start_position = char_to_word_offset[answer_offset] + end_position = char_to_word_offset[answer_offset + answer_length - 1] + # Only add answers where the text can be exactly recovered from the + # document. If this CAN'T happen it's likely due to weird Unicode + # stuff so we will just skip the example. + # + # Note that this means for training mode, every example is NOT + # guaranteed to be preserved. + actual_text = " ".join(doc_tokens[start_position:(end_position + 1)]) + cleaned_answer_text = " ".join( + whitespace_tokenize(orig_answer_text)) + if actual_text.find(cleaned_answer_text) == -1: + logger.warning("Could not find answer: '%s' vs. '%s'", + actual_text, cleaned_answer_text) + continue + else: + start_position = -1 + end_position = -1 + orig_answer_text = "" + + example = SquadExample( + qas_id=qas_id, + question_text=question_text, + doc_tokens=doc_tokens, + orig_answer_text=orig_answer_text, + start_position=start_position, + end_position=end_position, + is_impossible=is_impossible) + examples.append(example) + return examples + + +class SquadExample(object): + """ + A single training/test example for the Squad dataset. 
+ For examples without an answer, the start and end position are -1. + """ + + def __init__(self, + qas_id, + question_text, + doc_tokens, + orig_answer_text=None, + start_position=None, + end_position=None, + is_impossible=None): + self.qas_id = qas_id + self.question_text = question_text + self.doc_tokens = doc_tokens + self.orig_answer_text = orig_answer_text + self.start_position = start_position + self.end_position = end_position + self.is_impossible = is_impossible + + def __str__(self): + return self.__repr__() + + def __repr__(self): + s = "" + s += "qas_id: %s" % (self.qas_id) + s += ", question_text: %s" % ( + self.question_text) + s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens)) + if self.start_position: + s += ", start_position: %d" % (self.start_position) + if self.end_position: + s += ", end_position: %d" % (self.end_position) + if self.is_impossible: + s += ", is_impossible: %r" % (self.is_impossible) + return s + + class SquadFeatures(object): """A single set of features of data.""" diff --git a/transformers/tokenization_utils.py b/transformers/tokenization_utils.py index 4fa26a26f8..ba10e6b311 100644 --- a/transformers/tokenization_utils.py +++ b/transformers/tokenization_utils.py @@ -605,6 +605,10 @@ class PreTrainedTokenizer(object): vocabularies (BPE/SentencePieces/WordPieces). Take care of added tokens. + + text: The sequence to be encoded. + return_tokens_mapped_to_origin: (optional) Set to True to return the index of each token in the initial whitespace tokenization. (default False). 
+ **kwargs: passed to the child `self.tokenize()` method """ def split_on_token(tok, text): result = [] From 9f374c8252330bffd669c43749b5e937ed31d90a Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Fri, 22 Nov 2019 16:27:15 -0500 Subject: [PATCH 03/26] `encode` and `encode_plus` handle attention masks and padding --- .../tests/tokenization_tests_commons.py | 51 ++++++++++++ transformers/tokenization_utils.py | 77 ++++++++++++++++++- transformers/tokenization_xlnet.py | 1 + 3 files changed, 127 insertions(+), 2 deletions(-) diff --git a/transformers/tests/tokenization_tests_commons.py b/transformers/tests/tokenization_tests_commons.py index fdaf8cc137..d5b70d5266 100644 --- a/transformers/tests/tokenization_tests_commons.py +++ b/transformers/tests/tokenization_tests_commons.py @@ -335,3 +335,54 @@ class CommonTestCases: special_tokens_mask = tokenizer.get_special_tokens_mask(encoded_sequence_w_special, already_has_special_tokens=True) self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special)) self.assertEqual(special_tokens_mask_orig, special_tokens_mask) + + def test_padding_to_max_length(self): + tokenizer = self.get_tokenizer() + + sequence = "Sequence" + padding_size = 10 + padding_idx = tokenizer.pad_token_id + + # Check that it correctly pads when a maximum length is specified along with the padding flag set to True + encoded_sequence = tokenizer.encode(sequence) + sequence_length = len(encoded_sequence) + padded_sequence = tokenizer.encode(sequence, max_length=sequence_length + padding_size, pad_to_max_length=True) + padded_sequence_length = len(padded_sequence) + assert sequence_length + padding_size == padded_sequence_length + assert encoded_sequence + [padding_idx] * padding_size == padded_sequence + + # Check that nothing is done when a maximum length is not specified + encoded_sequence = tokenizer.encode(sequence) + sequence_length = len(encoded_sequence) + padded_sequence = tokenizer.encode(sequence, pad_to_max_length=True) + 
padded_sequence_length = len(padded_sequence) + assert sequence_length == padded_sequence_length + assert encoded_sequence == padded_sequence + + def test_encode_plus_with_padding(self): + tokenizer = self.get_tokenizer() + + sequence = "Sequence" + padding_size = 10 + padding_idx = tokenizer.pad_token_id + token_type_padding_idx = tokenizer.pad_token_type_id + + encoded_sequence = tokenizer.encode_plus(sequence, return_special_tokens_mask=True) + input_ids = encoded_sequence['input_ids'] + token_type_ids = encoded_sequence['token_type_ids'] + attention_mask = encoded_sequence['attention_mask'] + special_tokens_mask = encoded_sequence['special_tokens_mask'] + sequence_length = len(input_ids) + + padded_sequence = tokenizer.encode_plus(sequence, max_length=sequence_length + padding_size, pad_to_max_length=True, return_special_tokens_mask=True) + padded_input_ids = padded_sequence['input_ids'] + padded_token_type_ids = padded_sequence['token_type_ids'] + padded_attention_mask = padded_sequence['attention_mask'] + padded_special_tokens_mask = padded_sequence['special_tokens_mask'] + padded_sequence_length = len(padded_input_ids) + + assert sequence_length + padding_size == padded_sequence_length + assert input_ids + [padding_idx] * padding_size == padded_input_ids + assert token_type_ids + [token_type_padding_idx] * padding_size == padded_token_type_ids + assert attention_mask + [0] * padding_size == padded_attention_mask + assert special_tokens_mask + [1] * padding_size == padded_special_tokens_mask \ No newline at end of file diff --git a/transformers/tokenization_utils.py b/transformers/tokenization_utils.py index ba10e6b311..3214699e12 100644 --- a/transformers/tokenization_utils.py +++ b/transformers/tokenization_utils.py @@ -190,6 +190,11 @@ class PreTrainedTokenizer(object): """ Id of the padding token in the vocabulary. Log an error if used while not having been set. 
""" return self.convert_tokens_to_ids(self.pad_token) + @property + def pad_token_type_id(self): + """ Id of the padding token in the vocabulary. Log an error if used while not having been set. """ + return self._pad_token_type_id + @property def cls_token_id(self): """ Id of the classification token in the vocabulary. E.g. to extract a summary of an input sequence leveraging self-attention along the full depth of the model. Log an error if used while not having been set. """ @@ -213,6 +218,7 @@ class PreTrainedTokenizer(object): self._pad_token = None self._cls_token = None self._mask_token = None + self._pad_token_type_id = 0 self._additional_special_tokens = [] self.max_len = max_len if max_len is not None else int(1e12) @@ -696,6 +702,7 @@ class PreTrainedTokenizer(object): max_length=None, stride=0, truncation_strategy='longest_first', + pad_to_max_length=False, return_tensors=None, **kwargs): """ @@ -722,6 +729,8 @@ class PreTrainedTokenizer(object): - 'only_first': Only truncate the first sequence - 'only_second': Only truncate the second sequence - 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length) + pad_to_max_length: if set to `True`, the returned sequences will be padded according to the model's + padding index, up to their max length. If no max length is specified, no padding is done. return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant or PyTorch torch.Tensor instead of a list of python integers. 
**kwargs: passed to the `self.tokenize()` method @@ -732,6 +741,7 @@ class PreTrainedTokenizer(object): add_special_tokens=add_special_tokens, stride=stride, truncation_strategy=truncation_strategy, + pad_to_max_length=pad_to_max_length, return_tensors=return_tensors, **kwargs) @@ -744,7 +754,12 @@ class PreTrainedTokenizer(object): max_length=None, stride=0, truncation_strategy='longest_first', + pad_to_max_length=False, return_tensors=None, + return_token_type_ids=True, + return_attention_mask=True, + return_overflowing_tokens=False, + return_special_tokens_mask=False, **kwargs): """ Returns a dictionary containing the encoded sequence or sequence pair and additional informations: @@ -769,9 +784,37 @@ class PreTrainedTokenizer(object): - 'only_first': Only truncate the first sequence - 'only_second': Only truncate the second sequence - 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length) + pad_to_max_length: if set to `True`, the returned sequences will be padded according to the model's + padding index, up to their max length. If no max length is specified, no padding is done. return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant or PyTorch torch.Tensor instead of a list of python integers. + return_token_type_ids: (optional) Set to False to avoid returning token_type_ids (default True). + return_attention_mask: (optional) Set to False to avoir returning attention mask (default True) + return_overflowing_tokens: (optional) Set to True to return overflowing token information (default False). + return_special_tokens_mask: (optional) Set to True to return special tokens mask information (default False). 
**kwargs: passed to the `self.tokenize()` method + + Return: + A Dictionary of shape:: + + { + input_ids: list[int], + token_type_ids: list[int] if return_token_type_ids is True (default) + attention_mask: list[int] if return_attention_mask is True (default) + overflowing_tokens: list[int] if a ``max_length`` is specified and return_overflowing_tokens is True + num_truncated_tokens: int if a ``max_length`` is specified and return_overflowing_tokens is True + special_tokens_mask: list[int] if ``add_special_tokens`` if set to ``True`` and return_special_tokens_mask is True + } + + With the fields: + ``input_ids``: list of token ids to be fed to a model + ``token_type_ids``: list of token type ids to be fed to a model + ``attention_mask``: list of indices specifying which tokens should be attended to by the model + + ``overflowing_tokens``: list of overflowing tokens if a max length is specified. + ``num_truncated_tokens``: number of overflowing tokens a ``max_length`` is specified + ``special_tokens_mask``: if adding special tokens, this is a list of [0, 1], with 0 specifying special added + tokens and 1 specifying sequence tokens. 
""" def get_input_ids(text): @@ -790,13 +833,24 @@ class PreTrainedTokenizer(object): return self.prepare_for_model(first_ids, pair_ids=second_ids, max_length=max_length, + pad_to_max_length=pad_to_max_length, add_special_tokens=add_special_tokens, stride=stride, truncation_strategy=truncation_strategy, - return_tensors=return_tensors) + return_tensors=return_tensors, + return_attention_mask=return_attention_mask, + return_token_type_ids=return_token_type_ids, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask) def prepare_for_model(self, ids, pair_ids=None, max_length=None, add_special_tokens=True, stride=0, - truncation_strategy='longest_first', return_tensors=None): + truncation_strategy='longest_first', + pad_to_max_length=False, + return_tensors=None, + return_token_type_ids=True, + return_attention_mask=True, + return_overflowing_tokens=False, + return_special_tokens_mask=False): """ Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model. It adds special tokens, truncates @@ -819,8 +873,14 @@ class PreTrainedTokenizer(object): - 'only_first': Only truncate the first sequence - 'only_second': Only truncate the second sequence - 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length) + pad_to_max_length: if set to `True`, the returned sequences will be padded according to the model's + padding index, up to their max length. If no max length is specified, no padding is done. return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant or PyTorch torch.Tensor instead of a list of python integers. + return_token_type_ids: (optional) Set to False to avoid returning token_type_ids (default True). 
+ return_attention_mask: (optional) Set to False to avoid returning attention mask (default True) + return_overflowing_tokens: (optional) Set to True to return overflowing token information (default False). + return_special_tokens_mask: (optional) Set to True to return special tokens mask information (default False). Return: A Dictionary of shape:: @@ -883,6 +943,19 @@ class PreTrainedTokenizer(object): "for this model ({} > {}). Running this sequence through the model will result in " "indexing errors".format(len(ids), self.max_len)) + if pad_to_max_length and max_length and len(encoded_inputs["input_ids"]) < max_length: + difference = max_length - len(encoded_inputs["input_ids"]) + if return_attention_mask: + encoded_inputs["attention_mask"] = [1] * len(encoded_inputs["input_ids"]) + [0] * difference + if return_token_type_ids: + encoded_inputs["token_type_ids"] += [self.pad_token_type_id] * difference + if return_special_tokens_mask: + encoded_inputs["special_tokens_mask"] += [1] * difference + + encoded_inputs["input_ids"] += [self.pad_token_id] * difference + elif return_attention_mask: + encoded_inputs["attention_mask"] = [1] * len(encoded_inputs["input_ids"]) + return encoded_inputs def truncate_sequences(self, ids, pair_ids=None, num_tokens_to_remove=0, truncation_strategy='longest_first', stride=0): diff --git a/transformers/tokenization_xlnet.py b/transformers/tokenization_xlnet.py index a4f1a6e3ba..3ea71f4438 100644 --- a/transformers/tokenization_xlnet.py +++ b/transformers/tokenization_xlnet.py @@ -74,6 +74,7 @@ class XLNetTokenizer(PreTrainedTokenizer): self.max_len_single_sentence = self.max_len - 2 # take into account special tokens self.max_len_sentences_pair = self.max_len - 3 # take into account special tokens + self._pad_token_type_id = 3 try: import sentencepiece as spm From a7dafe2f41222469797f1a67232961d67bd2e519 Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Thu, 21 Nov 2019 11:30:40 -0500 Subject: [PATCH 04/26] Padding strategy (left and
right) rather than boolean flag --- .../tests/tokenization_tests_commons.py | 43 +++++++++++--- transformers/tokenization_utils.py | 58 ++++++++++++++----- 2 files changed, 77 insertions(+), 24 deletions(-) diff --git a/transformers/tests/tokenization_tests_commons.py b/transformers/tests/tokenization_tests_commons.py index d5b70d5266..40d68d0ab2 100644 --- a/transformers/tests/tokenization_tests_commons.py +++ b/transformers/tests/tokenization_tests_commons.py @@ -343,21 +343,33 @@ class CommonTestCases: padding_size = 10 padding_idx = tokenizer.pad_token_id - # Check that it correctly pads when a maximum length is specified along with the padding flag set to True + # RIGHT PADDING - Check that it correctly pads when a maximum length is specified along with the padding flag set to True encoded_sequence = tokenizer.encode(sequence) sequence_length = len(encoded_sequence) - padded_sequence = tokenizer.encode(sequence, max_length=sequence_length + padding_size, pad_to_max_length=True) + padded_sequence = tokenizer.encode(sequence, max_length=sequence_length + padding_size, padding_strategy='right') padded_sequence_length = len(padded_sequence) assert sequence_length + padding_size == padded_sequence_length assert encoded_sequence + [padding_idx] * padding_size == padded_sequence - # Check that nothing is done when a maximum length is not specified + # LEFT PADDING - Check that it correctly pads when a maximum length is specified along with the padding flag set to True encoded_sequence = tokenizer.encode(sequence) sequence_length = len(encoded_sequence) - padded_sequence = tokenizer.encode(sequence, pad_to_max_length=True) + padded_sequence = tokenizer.encode(sequence, max_length=sequence_length + padding_size, padding_strategy='left') padded_sequence_length = len(padded_sequence) - assert sequence_length == padded_sequence_length - assert encoded_sequence == padded_sequence + assert sequence_length + padding_size == padded_sequence_length + assert [padding_idx] * 
padding_size + encoded_sequence == padded_sequence + + # RIGHT & LEFT PADDING - Check that nothing is done when a maximum length is not specified + encoded_sequence = tokenizer.encode(sequence) + sequence_length = len(encoded_sequence) + padded_sequence_right = tokenizer.encode(sequence, padding_strategy='right') + padded_sequence_right_length = len(padded_sequence_right) + padded_sequence_left = tokenizer.encode(sequence, padding_strategy='left') + padded_sequence_left_length = len(padded_sequence_left) + assert sequence_length == padded_sequence_right_length + assert encoded_sequence == padded_sequence_right + assert sequence_length == padded_sequence_left_length + assert encoded_sequence == padded_sequence_left def test_encode_plus_with_padding(self): tokenizer = self.get_tokenizer() @@ -374,7 +386,8 @@ class CommonTestCases: special_tokens_mask = encoded_sequence['special_tokens_mask'] sequence_length = len(input_ids) - padded_sequence = tokenizer.encode_plus(sequence, max_length=sequence_length + padding_size, pad_to_max_length=True, return_special_tokens_mask=True) + # Test right padding + padded_sequence = tokenizer.encode_plus(sequence, max_length=sequence_length + padding_size, padding_strategy='right', return_special_tokens_mask=True) padded_input_ids = padded_sequence['input_ids'] padded_token_type_ids = padded_sequence['token_type_ids'] padded_attention_mask = padded_sequence['attention_mask'] @@ -385,4 +398,18 @@ class CommonTestCases: assert input_ids + [padding_idx] * padding_size == padded_input_ids assert token_type_ids + [token_type_padding_idx] * padding_size == padded_token_type_ids assert attention_mask + [0] * padding_size == padded_attention_mask - assert special_tokens_mask + [1] * padding_size == padded_special_tokens_mask \ No newline at end of file + assert special_tokens_mask + [1] * padding_size == padded_special_tokens_mask + + # Test left padding + padded_sequence = tokenizer.encode_plus(sequence, max_length=sequence_length + 
padding_size, padding_strategy='left', return_special_tokens_mask=True) + padded_input_ids = padded_sequence['input_ids'] + padded_token_type_ids = padded_sequence['token_type_ids'] + padded_attention_mask = padded_sequence['attention_mask'] + padded_special_tokens_mask = padded_sequence['special_tokens_mask'] + padded_sequence_length = len(padded_input_ids) + + assert sequence_length + padding_size == padded_sequence_length + assert [padding_idx] * padding_size + input_ids == padded_input_ids + assert [token_type_padding_idx] * padding_size + token_type_ids == padded_token_type_ids + assert [0] * padding_size + attention_mask == padded_attention_mask + assert [1] * padding_size + special_tokens_mask == padded_special_tokens_mask \ No newline at end of file diff --git a/transformers/tokenization_utils.py b/transformers/tokenization_utils.py index 3214699e12..dbbabd0e1a 100644 --- a/transformers/tokenization_utils.py +++ b/transformers/tokenization_utils.py @@ -702,7 +702,7 @@ class PreTrainedTokenizer(object): max_length=None, stride=0, truncation_strategy='longest_first', - pad_to_max_length=False, + padding_strategy=None, return_tensors=None, **kwargs): """ @@ -729,8 +729,12 @@ class PreTrainedTokenizer(object): - 'only_first': Only truncate the first sequence - 'only_second': Only truncate the second sequence - 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length) - pad_to_max_length: if set to `True`, the returned sequences will be padded according to the model's + padding_strategy: if set to a strategy, the returned sequences will be padded according to the model's padding index, up to their max length. If no max length is specified, no padding is done. + The strategies are handled by the following strings: + - 'left': pads on the left of the sequences + - 'right': pads on the right of the sequences + Defaults to None: no padding. 
return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant or PyTorch torch.Tensor instead of a list of python integers. **kwargs: passed to the `self.tokenize()` method @@ -741,7 +745,7 @@ class PreTrainedTokenizer(object): add_special_tokens=add_special_tokens, stride=stride, truncation_strategy=truncation_strategy, - pad_to_max_length=pad_to_max_length, + padding_strategy=padding_strategy, return_tensors=return_tensors, **kwargs) @@ -754,7 +758,7 @@ class PreTrainedTokenizer(object): max_length=None, stride=0, truncation_strategy='longest_first', - pad_to_max_length=False, + padding_strategy=None, return_tensors=None, return_token_type_ids=True, return_attention_mask=True, @@ -784,8 +788,12 @@ class PreTrainedTokenizer(object): - 'only_first': Only truncate the first sequence - 'only_second': Only truncate the second sequence - 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length) - pad_to_max_length: if set to `True`, the returned sequences will be padded according to the model's + padding_strategy: if set to a strategy, the returned sequences will be padded according to the model's padding index, up to their max length. If no max length is specified, no padding is done. + The strategies are handled by the following strings: + - 'left': pads on the left of the sequences + - 'right': pads on the right of the sequences + Defaults to None: no padding. return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant or PyTorch torch.Tensor instead of a list of python integers. return_token_type_ids: (optional) Set to False to avoid returning token_type_ids (default True). 
@@ -833,7 +841,7 @@ class PreTrainedTokenizer(object): return self.prepare_for_model(first_ids, pair_ids=second_ids, max_length=max_length, - pad_to_max_length=pad_to_max_length, + padding_strategy=padding_strategy, add_special_tokens=add_special_tokens, stride=stride, truncation_strategy=truncation_strategy, @@ -845,7 +853,7 @@ class PreTrainedTokenizer(object): def prepare_for_model(self, ids, pair_ids=None, max_length=None, add_special_tokens=True, stride=0, truncation_strategy='longest_first', - pad_to_max_length=False, + padding_strategy=None, return_tensors=None, return_token_type_ids=True, return_attention_mask=True, @@ -873,8 +881,12 @@ class PreTrainedTokenizer(object): - 'only_first': Only truncate the first sequence - 'only_second': Only truncate the second sequence - 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length) - pad_to_max_length: if set to `True`, the returned sequences will be padded according to the model's + padding_strategy: if set to a strategy, the returned sequences will be padded according to the model's padding index, up to their max length. If no max length is specified, no padding is done. + The strategies are handled by the following strings: + - 'left': pads on the left of the sequences + - 'right': pads on the right of the sequences + Defaults to None: no padding. return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant or PyTorch torch.Tensor instead of a list of python integers. return_token_type_ids: (optional) Set to False to avoid returning token_type_ids (default True). @@ -943,16 +955,30 @@ class PreTrainedTokenizer(object): "for this model ({} > {}). 
Running this sequence through the model will result in " "indexing errors".format(len(ids), self.max_len)) - if pad_to_max_length and max_length and len(encoded_inputs["input_ids"]) < max_length: + if padding_strategy is not None and max_length and len(encoded_inputs["input_ids"]) < max_length: difference = max_length - len(encoded_inputs["input_ids"]) - if return_attention_mask: - encoded_inputs["attention_mask"] = [1] * len(encoded_inputs["input_ids"]) + [0] * difference - if return_token_type_ids: - encoded_inputs["token_type_ids"] += [self.pad_token_type_id] * difference - if return_special_tokens_mask: - encoded_inputs["special_tokens_mask"] += [1] * difference - encoded_inputs["input_ids"] += [self.pad_token_id] * difference + if padding_strategy == 'right': + if return_attention_mask: + encoded_inputs["attention_mask"] = [1] * len(encoded_inputs["input_ids"]) + [0] * difference + if return_token_type_ids: + encoded_inputs["token_type_ids"] = encoded_inputs["token_type_ids"] + [self.pad_token_type_id] * difference + if return_special_tokens_mask: + encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference + encoded_inputs["input_ids"] = encoded_inputs["input_ids"] + [self.pad_token_id] * difference + + elif padding_strategy == 'left': + if return_attention_mask: + encoded_inputs["attention_mask"] = [0] * difference + [1] * len(encoded_inputs["input_ids"]) + if return_token_type_ids: + encoded_inputs["token_type_ids"] = [self.pad_token_type_id] * difference + encoded_inputs["token_type_ids"] + if return_special_tokens_mask: + encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"] + encoded_inputs["input_ids"] = [self.pad_token_id] * difference + encoded_inputs["input_ids"] + + else: + raise ValueError("Invalid padding strategy:" + str(padding_strategy)) + elif return_attention_mask: encoded_inputs["attention_mask"] = [1] * len(encoded_inputs["input_ids"]) From 
a5a8a6175fb5cc1e993366add026ba06386bde10 Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Thu, 21 Nov 2019 19:18:20 -0500 Subject: [PATCH 05/26] Works for BERT --- transformers/data/processors/squad.py | 507 ++++++++++++++++++++++---- 1 file changed, 432 insertions(+), 75 deletions(-) diff --git a/transformers/data/processors/squad.py b/transformers/data/processors/squad.py index 1900e9f0ce..a0f2408a16 100644 --- a/transformers/data/processors/squad.py +++ b/transformers/data/processors/squad.py @@ -3,6 +3,7 @@ import collections import logging import os import json +import numpy as np from ...tokenization_bert import BasicTokenizer, whitespace_tokenize from .utils import DataProcessor, InputExample, InputFeatures @@ -13,10 +14,68 @@ if is_tf_available(): logger = logging.getLogger(__name__) +def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer, + orig_answer_text): + """Returns tokenized answer spans that better match the annotated answer.""" + tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text)) + + for new_start in range(input_start, input_end + 1): + for new_end in range(input_end, new_start - 1, -1): + text_span = " ".join(doc_tokens[new_start:(new_end + 1)]) + if text_span == tok_answer_text: + return (new_start, new_end) + + return (input_start, input_end) + +def _check_is_max_context(doc_spans, cur_span_index, position): + """Check if this is the 'max context' doc span for the token.""" + best_score = None + best_span_index = None + for (span_index, doc_span) in enumerate(doc_spans): + end = doc_span.start + doc_span.length - 1 + if position < doc_span.start: + continue + if position > end: + continue + num_left_context = position - doc_span.start + num_right_context = end - position + score = min(num_left_context, num_right_context) + 0.01 * doc_span.length + if best_score is None or score > best_score: + best_score = score + best_span_index = span_index + + return cur_span_index == best_span_index + + +def 
_new_check_is_max_context(doc_spans, cur_span_index, position): + """Check if this is the 'max context' doc span for the token.""" + # if len(doc_spans) == 1: + # return True + best_score = None + best_span_index = None + for (span_index, doc_span) in enumerate(doc_spans): + end = doc_span["start"] + doc_span["length"] - 1 + if position < doc_span["start"]: + continue + if position > end: + continue + num_left_context = position - doc_span["start"] + num_right_context = end - position + score = min(num_left_context, num_right_context) + 0.01 * doc_span["length"] + if best_score is None or score > best_score: + best_score = score + best_span_index = span_index + + return cur_span_index == best_span_index + +def _is_whitespace(c): + if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F: + return True + return False def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, doc_stride, max_query_length, is_training, - cls_token_at_end=False, + cls_token_at_end=True, cls_token='[CLS]', sep_token='[SEP]', pad_token=0, sequence_a_segment_id=0, sequence_b_segment_id=1, cls_token_segment_id=0, pad_token_segment_id=0, @@ -24,57 +83,184 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, sequence_a_is_doc=False): """Loads a data file into a list of `InputBatch`s.""" - # Defining helper methods - def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer, - orig_answer_text): - """Returns tokenized answer spans that better match the annotated answer.""" - tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text)) - - for new_start in range(input_start, input_end + 1): - for new_end in range(input_end, new_start - 1, -1): - text_span = " ".join(doc_tokens[new_start:(new_end + 1)]) - if text_span == tok_answer_text: - return (new_start, new_end) - - return (input_start, input_end) - def _check_is_max_context(doc_spans, cur_span_index, position): - """Check if this is the 'max context' doc span for 
the token.""" - best_score = None - best_span_index = None - for (span_index, doc_span) in enumerate(doc_spans): - end = doc_span.start + doc_span.length - 1 - if position < doc_span.start: - continue - if position > end: - continue - num_left_context = position - doc_span.start - num_right_context = end - position - score = min(num_left_context, num_right_context) + 0.01 * doc_span.length - if best_score is None or score > best_score: - best_score = score - best_span_index = span_index - - return cur_span_index == best_span_index - + # Defining helper methods unique_id = 1000000000 features = [] + new_features = [] for (example_index, example) in enumerate(tqdm(examples)): - query_tokens = tokenizer.tokenize(example.question_text) - if len(query_tokens) > max_query_length: - query_tokens = query_tokens[0:max_query_length] + doc_tokens = [] + char_to_word_offset = [] + prev_is_whitespace = True + + # Split on whitespace so that different tokens may be attributed to their original position. + for c in example.context_text: + if _is_whitespace(c): + prev_is_whitespace = True + else: + if prev_is_whitespace: + doc_tokens.append(c) + else: + doc_tokens[-1] += c + prev_is_whitespace = False + char_to_word_offset.append(len(doc_tokens) - 1) + + if is_training: + # Get start and end position + answer_length = len(example.answer_text) + start_position = char_to_word_offset[example.start_position] + end_position = char_to_word_offset[example.start_position + answer_length - 1] + + # If the answer cannot be found in the text, then skip this example. + actual_text = " ".join(doc_tokens[start_position:(end_position + 1)]) + cleaned_answer_text = " ".join(whitespace_tokenize(example.answer_text)) + if actual_text.find(cleaned_answer_text) == -1: + logger.warning("Could not find answer: '%s' vs. 
'%s'", actual_text, cleaned_answer_text) + continue tok_to_orig_index = [] orig_to_tok_index = [] all_doc_tokens = [] - for (i, token) in enumerate(example.doc_tokens): + for (i, token) in enumerate(doc_tokens): orig_to_tok_index.append(len(all_doc_tokens)) sub_tokens = tokenizer.tokenize(token) for sub_token in sub_tokens: tok_to_orig_index.append(i) all_doc_tokens.append(sub_token) + spans = [] + + truncated_query = tokenizer.encode(example.question_text, add_special_tokens=False, max_length=max_query_length) + sequence_added_tokens = tokenizer.max_len - tokenizer.max_len_single_sentence + sequence_pair_added_tokens = tokenizer.max_len - tokenizer.max_len_sentences_pair + + encoded_dict = tokenizer.encode_plus( + truncated_query, + all_doc_tokens, + max_length=max_seq_length, + padding_strategy='right', + stride=max_seq_length - doc_stride - len(truncated_query) - sequence_pair_added_tokens, + return_overflowing_tokens=True, + truncation_strategy='only_second' + ) + + ids = encoded_dict['input_ids'] + print("Ids computes; position of the first padding", ids.index(tokenizer.pad_token_id) if tokenizer.pad_token_id in ids else None) + non_padded_ids = ids[:ids.index(tokenizer.pad_token_id)] if tokenizer.pad_token_id in ids else ids + paragraph_len = min(len(all_doc_tokens) - len(spans) * doc_stride, max_seq_length - len(truncated_query) - sequence_pair_added_tokens) + tokens = tokenizer.convert_ids_to_tokens(non_padded_ids) + + token_to_orig_map = {} + for i in range(paragraph_len): + token_to_orig_map[len(truncated_query) + sequence_added_tokens + i] = tok_to_orig_index[0 + i] + + encoded_dict["paragraph_len"] = paragraph_len + encoded_dict["tokens"] = tokens + encoded_dict["token_to_orig_map"] = token_to_orig_map + encoded_dict["truncated_query_with_special_tokens_length"] = len(truncated_query) + sequence_added_tokens + encoded_dict["token_is_max_context"] = {} + encoded_dict["start"] = 0 + encoded_dict["length"] = paragraph_len + + spans.append(encoded_dict) + 
print("YESSIR", len(spans) * doc_stride < len(all_doc_tokens), "overflowing_tokens" in encoded_dict) + while len(spans) * doc_stride < len(all_doc_tokens) and "overflowing_tokens" in encoded_dict: + + overflowing_tokens = encoded_dict['overflowing_tokens'] + + print("OVERFLOW", len(overflowing_tokens)) + + encoded_dict = tokenizer.encode_plus( + truncated_query, + overflowing_tokens, + max_length=max_seq_length, + return_overflowing_tokens=True, + padding_strategy='right', + stride=max_seq_length - doc_stride - len(truncated_query) - sequence_pair_added_tokens, + truncation_strategy='only_second' + ) + + ids = encoded_dict['input_ids'] + print("Ids computes; position of the first padding", ids.index(tokenizer.pad_token_id) if tokenizer.pad_token_id in ids else None) + + # Length of the document without the query + paragraph_len = min(len(all_doc_tokens) - len(spans) * doc_stride, max_seq_length - len(truncated_query) - sequence_pair_added_tokens) + + non_padded_ids = encoded_dict['input_ids'][:encoded_dict['input_ids'].index(tokenizer.pad_token_id)] + tokens = tokenizer.convert_ids_to_tokens(non_padded_ids) + + token_to_orig_map = {} + for i in range(paragraph_len): + token_to_orig_map[len(truncated_query) + sequence_added_tokens + i] = tok_to_orig_index[len(spans) * doc_stride + i] + + encoded_dict["paragraph_len"] = paragraph_len + encoded_dict["tokens"] = tokens + encoded_dict["token_to_orig_map"] = token_to_orig_map + encoded_dict["truncated_query_with_special_tokens_length"] = len(truncated_query) + sequence_added_tokens + encoded_dict["token_is_max_context"] = {} + encoded_dict["start"] = len(spans) * doc_stride + encoded_dict["length"] = paragraph_len + + # split_token_index = doc_span.start + i + # token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index] + + # is_max_context = _check_is_max_context(doc_spans, doc_span_index, + # split_token_index) + # token_is_max_context[len(tokens)] = is_max_context + # 
tokens.append(all_doc_tokens[split_token_index]) + + spans.append(encoded_dict) + + for doc_span_index in range(len(spans)): + for j in range(spans[doc_span_index]["paragraph_len"]): + is_max_context = _new_check_is_max_context(spans, doc_span_index, doc_span_index * doc_stride + j) + index = spans[doc_span_index]["truncated_query_with_special_tokens_length"] + j + spans[doc_span_index]["token_is_max_context"][index] = is_max_context + + print("new span", len(spans)) + for span in spans: + # Identify the position of the CLS token + cls_index = span['input_ids'].index(tokenizer.cls_token_id) + + # p_mask: mask with 1 for tokens that cannot be in the answer (0 for tokens which can be in an answer) + # Original TF implementation also keeps the classification token (set to 0) (not sure why...) + p_mask = np.array(span['token_type_ids']) + + # Convert all SEP indices to '0' before inversion + p_mask[np.where(np.array(span["input_ids"]) == tokenizer.sep_token_id)[0]] = 0 + + # Limit positive values to one + p_mask = 1 - np.minimum(p_mask, 1) + + # Set the CLS index to '0' + p_mask[cls_index] = 0 + + print("new features length", len(new_features)) + + new_features.append(NewSquadFeatures( + span['input_ids'], + span['attention_mask'], + span['token_type_ids'], + cls_index, + p_mask.tolist(), + + example_index=example_index, + unique_id=unique_id, + paragraph_len=span['paragraph_len'], + token_is_max_context=span["token_is_max_context"], + tokens=span["tokens"], + token_to_orig_map=span["token_to_orig_map"] + )) + + unique_id += 1 + + # tokenize ... 
+ query_tokens = tokenizer.tokenize(example.question_text) + + if len(query_tokens) > max_query_length: + query_tokens = query_tokens[0:max_query_length] + tok_start_position = None tok_end_position = None if is_training and example.is_impossible: @@ -82,7 +268,7 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, tok_end_position = -1 if is_training and not example.is_impossible: tok_start_position = orig_to_tok_index[example.start_position] - if example.end_position < len(example.doc_tokens) - 1: + if example.end_position < len(doc_tokens) - 1: tok_end_position = orig_to_tok_index[example.end_position + 1] - 1 else: tok_end_position = len(all_doc_tokens) - 1 @@ -101,14 +287,19 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, doc_spans = [] start_offset = 0 while start_offset < len(all_doc_tokens): + print("OLD DOC CREATION BEGIN", start_offset, len(all_doc_tokens)) length = len(all_doc_tokens) - start_offset if length > max_tokens_for_doc: length = max_tokens_for_doc doc_spans.append(_DocSpan(start=start_offset, length=length)) if start_offset + length == len(all_doc_tokens): + print("Done with this doc span, breaking out.", start_offset, length) break + print("CHOOSING OFFSET", length, doc_stride) start_offset += min(length, doc_stride) + print("OLD DOC CREATION END", start_offset) + print("old span", len(doc_spans)) for (doc_span_index, doc_span) in enumerate(doc_spans): tokens = [] token_to_orig_map = {} @@ -183,18 +374,20 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, # tokens are attended to. input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids) + + # Zero-pad up to the sequence length. 
while len(input_ids) < max_seq_length: input_ids.append(pad_token) input_mask.append(0 if mask_padding_with_zero else 1) segment_ids.append(pad_token_segment_id) p_mask.append(1) - + print("[OLD] Ids computed; position of the first padding", input_ids.index(tokenizer.pad_token_id) if tokenizer.pad_token_id in input_ids else None) assert len(input_ids) == max_seq_length assert len(input_mask) == max_seq_length assert len(segment_ids) == max_seq_length - span_is_impossible = example.is_impossible + span_is_impossible = example.is_impossible if hasattr(example, "is_impossible") else False start_position = None end_position = None if is_training and not span_is_impossible: @@ -222,31 +415,32 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, start_position = cls_index end_position = cls_index - if example_index < 20: - logger.info("*** Example ***") - logger.info("unique_id: %s" % (unique_id)) - logger.info("example_index: %s" % (example_index)) - logger.info("doc_span_index: %s" % (doc_span_index)) - logger.info("tokens: %s" % " ".join(tokens)) - logger.info("token_to_orig_map: %s" % " ".join([ - "%d:%d" % (x, y) for (x, y) in token_to_orig_map.items()])) - logger.info("token_is_max_context: %s" % " ".join([ - "%d:%s" % (x, y) for (x, y) in token_is_max_context.items() - ])) - logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) - logger.info( - "input_mask: %s" % " ".join([str(x) for x in input_mask])) - logger.info( - "segment_ids: %s" % " ".join([str(x) for x in segment_ids])) - if is_training and span_is_impossible: - logger.info("impossible example") - if is_training and not span_is_impossible: - answer_text = " ".join(tokens[start_position:(end_position + 1)]) - logger.info("start_position: %d" % (start_position)) - logger.info("end_position: %d" % (end_position)) - logger.info( - "answer: %s" % (answer_text)) + # if example_index < 20: + # logger.info("*** Example ***") + # logger.info("unique_id: %s" % (unique_id)) + 
# logger.info("example_index: %s" % (example_index)) + # logger.info("doc_span_index: %s" % (doc_span_index)) + # logger.info("tokens: %s" % str(tokens)) + # logger.info("token_to_orig_map: %s" % " ".join([ + # "%d:%d" % (x, y) for (x, y) in token_to_orig_map.items()])) + # logger.info("token_is_max_context: %s" % " ".join([ + # "%d:%s" % (x, y) for (x, y) in token_is_max_context.items() + # ])) + # logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) + # logger.info( + # "input_mask: %s" % " ".join([str(x) for x in input_mask])) + # logger.info( + # "segment_ids: %s" % " ".join([str(x) for x in segment_ids])) + # if is_training and span_is_impossible: + # logger.info("impossible example") + # if is_training and not span_is_impossible: + # answer_text = " ".join(tokens[start_position:(end_position + 1)]) + # logger.info("start_position: %d" % (start_position)) + # logger.info("end_position: %d" % (end_position)) + # logger.info( + # "answer: %s" % (answer_text)) + print("features length", len(features)) features.append( SquadFeatures( unique_id=unique_id, @@ -266,7 +460,48 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, is_impossible=span_is_impossible)) unique_id += 1 - return features + assert len(features) == len(new_features) + + assert len(features) == len(new_features) + for i in range(len(features)): + print(i) + feature, new_feature = features[i], new_features[i] + + input_ids = feature.input_ids + input_mask = feature.input_mask + segment_ids = feature.segment_ids + cls_index = feature.cls_index + p_mask = feature.p_mask + example_index = feature.example_index + paragraph_len = feature.paragraph_len + token_is_max_context = feature.token_is_max_context + tokens = feature.tokens + token_to_orig_map = feature.token_to_orig_map + + new_input_ids = new_feature.input_ids + new_input_mask = new_feature.attention_mask + new_segment_ids = new_feature.token_type_ids + new_cls_index = new_feature.cls_index + new_p_mask 
= new_feature.p_mask + new_example_index = new_feature.example_index + new_paragraph_len = new_feature.paragraph_len + new_token_is_max_context = new_feature.token_is_max_context + new_tokens = new_feature.tokens + new_token_to_orig_map = new_feature.token_to_orig_map + + assert input_ids == new_input_ids + assert input_mask == new_input_mask + assert segment_ids == new_segment_ids + assert cls_index == new_cls_index + assert p_mask == new_p_mask + assert example_index == new_example_index + assert paragraph_len == new_paragraph_len + assert token_is_max_context == new_token_is_max_context + assert tokens == new_tokens + assert token_to_orig_map == new_token_to_orig_map + + + return new_features def read_squad_examples(input_file, is_training, version_2_with_negative): @@ -347,6 +582,124 @@ def read_squad_examples(input_file, is_training, version_2_with_negative): return examples +class SquadV1Processor(DataProcessor): + """Processor for the SQuAD data set.""" + + def get_example_from_tensor_dict(self, tensor_dict): + """See base class.""" + return NewSquadExample( + tensor_dict['id'].numpy(), + tensor_dict['question'].numpy().decode('utf-8'), + tensor_dict['context'].numpy().decode('utf-8'), + tensor_dict['answers']['text'].numpy().decode('utf-8'), + tensor_dict['answers']['answers_start'].numpy().decode('utf-8'), + tensor_dict['title'].numpy().decode('utf-8') + ) + + def get_train_examples(self, data_dir): + """See base class.""" + with open(os.path.join(data_dir, "train-v1.1.json"), "r", encoding='utf-8') as reader: + input_data = json.load(reader)["data"] + return self._create_examples(input_data, "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + with open(os.path.join(data_dir, "dev-v1.1.json"), "r", encoding='utf-8') as reader: + input_data = json.load(reader)["data"] + return self._create_examples(input_data, "dev") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + def _create_examples(self, input_data, 
set_type): + """Creates examples for the training and dev sets.""" + + is_training = set_type == "train" + examples = [] + for entry in input_data: + title = entry['title'] + for paragraph in entry["paragraphs"]: + context_text = paragraph["context"] + for qa in paragraph["qas"]: + qas_id = qa["id"] + question_text = qa["question"] + start_position = None + answer_text = None + if is_training: + if (len(qa["answers"]) != 1): + raise ValueError( + "For training, each question should have exactly 1 answer.") + answer = qa["answers"][0] + answer_text = answer['text'] + start_position = answer['answer_start'] + + example = NewSquadExample( + qas_id=qas_id, + question_text=question_text, + context_text=context_text, + answer_text=answer_text, + start_position=start_position, + title=title + ) + examples.append(example) + return examples + + + +class NewSquadExample(object): + """ + A single training/test example for the Squad dataset, as loaded from disk. + """ + + def __init__(self, + qas_id, + question_text, + context_text, + answer_text, + start_position, + title): + self.qas_id = qas_id + self.question_text = question_text + self.context_text = context_text + self.answer_text = answer_text + self.start_position = start_position + self.title = title + + +class NewSquadFeatures(object): + """ + Single squad example features to be fed to a model. + Those features are model-specific. 
+ """ + + def __init__(self, + input_ids, + attention_mask, + token_type_ids, + cls_index, + p_mask, + + example_index, + unique_id, + paragraph_len, + token_is_max_context, + tokens, + token_to_orig_map + ): + self.input_ids = input_ids + self.attention_mask = attention_mask + self.token_type_ids = token_type_ids + self.cls_index = cls_index + self.p_mask = p_mask + + self.example_index = example_index + self.unique_id = unique_id + self.paragraph_len = paragraph_len + self.token_is_max_context = token_is_max_context + self.tokens = tokens + self.token_to_orig_map = token_to_orig_map + class SquadExample(object): """ A single training/test example for the Squad dataset. @@ -423,18 +776,22 @@ class SquadFeatures(object): self.is_impossible = is_impossible def __eq__(self, other): - return self.cls_index == other.cls_index and \ - self.doc_span_index == other.doc_span_index and \ - self.end_position == other.end_position and \ - self.example_index == other.example_index and \ + print(self.example_index == other.example_index) + print(self.input_ids == other.input_ids) + print(self.input_mask == other.attention_mask) + print(self.p_mask == other.p_mask) + print(self.paragraph_len == other.paragraph_len) + print(self.segment_ids == other.token_type_ids) + print(self.token_is_max_context == other.token_is_max_context) + print(self.token_to_orig_map == other.token_to_orig_map) + print(self.tokens == other.tokens) + + return self.example_index == other.example_index and \ self.input_ids == other.input_ids and \ - self.input_mask == other.input_mask and \ - self.is_impossible == other.is_impossible and \ + self.input_mask == other.attention_mask and \ self.p_mask == other.p_mask and \ self.paragraph_len == other.paragraph_len and \ - self.segment_ids == other.segment_ids and \ - self.start_position == other.start_position and \ + self.segment_ids == other.token_type_ids and \ self.token_is_max_context == other.token_is_max_context and \ self.token_to_orig_map == 
other.token_to_orig_map and \ - self.tokens == other.tokens and \ - self.unique_id == other.unique_id \ No newline at end of file + self.tokens == other.tokens \ No newline at end of file From c3ba6452377f085d0f59e15b97ac247bca24367e Mon Sep 17 00:00:00 2001 From: Lysandre Date: Fri, 22 Nov 2019 14:36:49 -0500 Subject: [PATCH 06/26] Works for XLNet --- examples/run_squad.py | 38 ++++-------- transformers/data/processors/squad.py | 84 +++++++++++++-------------- 2 files changed, 50 insertions(+), 72 deletions(-) diff --git a/examples/run_squad.py b/examples/run_squad.py index d4219c3096..634b566a46 100644 --- a/examples/run_squad.py +++ b/examples/run_squad.py @@ -16,6 +16,7 @@ """ Finetuning the library models for question-answering on SQuAD (DistilBERT, Bert, XLM, XLNet).""" from __future__ import absolute_import, division, print_function +from transformers.data.processors.squad import SquadV1Processor import argparse import logging @@ -46,8 +47,7 @@ from transformers import (WEIGHTS_NAME, BertConfig, from transformers import AdamW, get_linear_schedule_with_warmup, squad_convert_examples_to_features, read_squad_examples as sread_squad_examples -from utils_squad import (read_squad_examples, convert_examples_to_features, - RawResult, write_predictions, +from utils_squad import (RawResult, write_predictions, RawResultExtended, write_predictions_extended) # The follwing import is the official SQuAD evaluation script (2.0). 
@@ -289,7 +289,6 @@ def evaluate(args, model, tokenizer, prefix=""): results = evaluate_on_squad(evaluate_options) return results - def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False): if args.local_rank not in [-1, 0] and not evaluate: torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache @@ -308,9 +307,11 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal examples = read_squad_examples(input_file=input_file, is_training=not evaluate, version_2_with_negative=args.version_2_with_negative) - - examples = examples[:10] - features = convert_examples_to_features(examples=examples, + keep_n_examples = 1000 + processor = SquadV1Processor() + values = processor.get_dev_examples("examples/squad") + examples = values[:keep_n_examples] + features = squad_convert_examples_to_features(examples=exampless, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, @@ -320,29 +321,10 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal pad_token_segment_id=3 if args.model_type in ['xlnet'] else 0, cls_token_at_end=True if args.model_type in ['xlnet'] else False, sequence_a_is_doc=True if args.model_type in ['xlnet'] else False) - - exampless = sread_squad_examples(input_file=input_file, - is_training=not evaluate, - version_2_with_negative=args.version_2_with_negative) - exampless = exampless[:10] - features2 = squad_convert_examples_to_features(examples=exampless, - tokenizer=tokenizer, - max_seq_length=args.max_seq_length, - doc_stride=args.doc_stride, - max_query_length=args.max_query_length, - is_training=not evaluate, - cls_token_segment_id=2 if args.model_type in ['xlnet'] else 0, - pad_token_segment_id=3 if args.model_type in ['xlnet'] else 0, - cls_token_at_end=True if args.model_type in ['xlnet'] else False, - sequence_a_is_doc=True if args.model_type in ['xlnet'] 
else False) - - print(features2) - - for i in range(len(features)): - assert features[i] == features2[i] - print("Equal") - print("DONE") + + import sys + sys.exit() if args.local_rank in [-1, 0]: logger.info("Saving features into cached file %s", cached_features_file) diff --git a/transformers/data/processors/squad.py b/transformers/data/processors/squad.py index a0f2408a16..fb3d2ae4d4 100644 --- a/transformers/data/processors/squad.py +++ b/transformers/data/processors/squad.py @@ -83,6 +83,9 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, sequence_a_is_doc=False): """Loads a data file into a list of `InputBatch`s.""" + cls_token = tokenizer.cls_token + sep_token = tokenizer.sep_token + # Defining helper methods unique_id = 1000000000 @@ -136,24 +139,24 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, sequence_pair_added_tokens = tokenizer.max_len - tokenizer.max_len_sentences_pair encoded_dict = tokenizer.encode_plus( - truncated_query, - all_doc_tokens, + truncated_query if not sequence_a_is_doc else all_doc_tokens, + all_doc_tokens if not sequence_a_is_doc else truncated_query, max_length=max_seq_length, padding_strategy='right', stride=max_seq_length - doc_stride - len(truncated_query) - sequence_pair_added_tokens, return_overflowing_tokens=True, - truncation_strategy='only_second' + truncation_strategy='only_second' if not sequence_a_is_doc else 'only_first' ) ids = encoded_dict['input_ids'] - print("Ids computes; position of the first padding", ids.index(tokenizer.pad_token_id) if tokenizer.pad_token_id in ids else None) non_padded_ids = ids[:ids.index(tokenizer.pad_token_id)] if tokenizer.pad_token_id in ids else ids paragraph_len = min(len(all_doc_tokens) - len(spans) * doc_stride, max_seq_length - len(truncated_query) - sequence_pair_added_tokens) tokens = tokenizer.convert_ids_to_tokens(non_padded_ids) token_to_orig_map = {} for i in range(paragraph_len): - 
token_to_orig_map[len(truncated_query) + sequence_added_tokens + i] = tok_to_orig_index[0 + i] + index = len(truncated_query) + sequence_added_tokens + i if not sequence_a_is_doc else i + token_to_orig_map[index] = tok_to_orig_index[0 + i] encoded_dict["paragraph_len"] = paragraph_len encoded_dict["tokens"] = tokens @@ -164,35 +167,40 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, encoded_dict["length"] = paragraph_len spans.append(encoded_dict) - print("YESSIR", len(spans) * doc_stride < len(all_doc_tokens), "overflowing_tokens" in encoded_dict) + # print("YESSIR", len(spans) * doc_stride < len(all_doc_tokens), "overflowing_tokens" in encoded_dict) + while len(spans) * doc_stride < len(all_doc_tokens) and "overflowing_tokens" in encoded_dict: - - overflowing_tokens = encoded_dict['overflowing_tokens'] - - print("OVERFLOW", len(overflowing_tokens)) - + overflowing_tokens = encoded_dict["overflowing_tokens"] encoded_dict = tokenizer.encode_plus( - truncated_query, - overflowing_tokens, + truncated_query if not sequence_a_is_doc else overflowing_tokens, + overflowing_tokens if not sequence_a_is_doc else truncated_query, max_length=max_seq_length, return_overflowing_tokens=True, padding_strategy='right', stride=max_seq_length - doc_stride - len(truncated_query) - sequence_pair_added_tokens, - truncation_strategy='only_second' + truncation_strategy='only_second' if not sequence_a_is_doc else 'only_first' ) - ids = encoded_dict['input_ids'] - print("Ids computes; position of the first padding", ids.index(tokenizer.pad_token_id) if tokenizer.pad_token_id in ids else None) + # print("Ids computes; position of the first padding", ids.index(tokenizer.pad_token_id) if tokenizer.pad_token_id in ids else None) + + # print(encoded_dict["input_ids"].index(tokenizer.pad_token_id) if tokenizer.pad_token_id in encoded_dict["input_ids"] else None) + # print(len(spans) * doc_stride, len(all_doc_tokens)) + # Length of the document without the query 
paragraph_len = min(len(all_doc_tokens) - len(spans) * doc_stride, max_seq_length - len(truncated_query) - sequence_pair_added_tokens) - non_padded_ids = encoded_dict['input_ids'][:encoded_dict['input_ids'].index(tokenizer.pad_token_id)] + if tokenizer.pad_token_id in encoded_dict['input_ids']: + non_padded_ids = encoded_dict['input_ids'][:encoded_dict['input_ids'].index(tokenizer.pad_token_id)] + else: + non_padded_ids = encoded_dict['input_ids'] + tokens = tokenizer.convert_ids_to_tokens(non_padded_ids) token_to_orig_map = {} for i in range(paragraph_len): - token_to_orig_map[len(truncated_query) + sequence_added_tokens + i] = tok_to_orig_index[len(spans) * doc_stride + i] + index = len(truncated_query) + sequence_added_tokens + i if not sequence_a_is_doc else i + token_to_orig_map[index] = tok_to_orig_index[len(spans) * doc_stride + i] encoded_dict["paragraph_len"] = paragraph_len encoded_dict["tokens"] = tokens @@ -202,23 +210,14 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, encoded_dict["start"] = len(spans) * doc_stride encoded_dict["length"] = paragraph_len - # split_token_index = doc_span.start + i - # token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index] - - # is_max_context = _check_is_max_context(doc_spans, doc_span_index, - # split_token_index) - # token_is_max_context[len(tokens)] = is_max_context - # tokens.append(all_doc_tokens[split_token_index]) - spans.append(encoded_dict) for doc_span_index in range(len(spans)): for j in range(spans[doc_span_index]["paragraph_len"]): is_max_context = _new_check_is_max_context(spans, doc_span_index, doc_span_index * doc_stride + j) - index = spans[doc_span_index]["truncated_query_with_special_tokens_length"] + j + index = j if sequence_a_is_doc else spans[doc_span_index]["truncated_query_with_special_tokens_length"] + j spans[doc_span_index]["token_is_max_context"][index] = is_max_context - print("new span", len(spans)) for span in spans: # Identify the position of 
the CLS token cls_index = span['input_ids'].index(tokenizer.cls_token_id) @@ -227,17 +226,17 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, # Original TF implem also keep the classification token (set to 0) (not sure why...) p_mask = np.array(span['token_type_ids']) - # Convert all SEP indices to '0' before inversion - p_mask[np.where(np.array(span["input_ids"]) == tokenizer.sep_token_id)[0]] = 0 + p_mask = np.minimum(p_mask, 1) - # Limit positive values to one - p_mask = 1 - np.minimum(p_mask, 1) + if not sequence_a_is_doc: + # Limit positive values to one + p_mask = 1 - p_mask + + p_mask[np.where(np.array(span["input_ids"]) == tokenizer.sep_token_id)[0]] = 1 # Set the CLS index to '0' p_mask[cls_index] = 0 - print("new features length", len(new_features)) - new_features.append(NewSquadFeatures( span['input_ids'], span['attention_mask'], @@ -287,19 +286,15 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, doc_spans = [] start_offset = 0 while start_offset < len(all_doc_tokens): - print("OLD DOC CREATION BEGIN", start_offset, len(all_doc_tokens)) length = len(all_doc_tokens) - start_offset if length > max_tokens_for_doc: length = max_tokens_for_doc + # print("Start offset is", start_offset, len(all_doc_tokens), "length is", length) doc_spans.append(_DocSpan(start=start_offset, length=length)) if start_offset + length == len(all_doc_tokens): - print("Done with this doc span, breaking out.", start_offset, length) break - print("CHOOSING OFFSET", length, doc_stride) start_offset += min(length, doc_stride) - print("OLD DOC CREATION END", start_offset) - print("old span", len(doc_spans)) for (doc_span_index, doc_span) in enumerate(doc_spans): tokens = [] token_to_orig_map = {} @@ -382,7 +377,7 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, input_mask.append(0 if mask_padding_with_zero else 1) segment_ids.append(pad_token_segment_id) p_mask.append(1) - print("[OLD] Ids computed; 
position of the first padding", input_ids.index(tokenizer.pad_token_id) if tokenizer.pad_token_id in input_ids else None) + assert len(input_ids) == max_seq_length assert len(input_mask) == max_seq_length assert len(segment_ids) == max_seq_length @@ -440,7 +435,6 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, # logger.info( # "answer: %s" % (answer_text)) - print("features length", len(features)) features.append( SquadFeatures( unique_id=unique_id, @@ -464,10 +458,9 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, assert len(features) == len(new_features) for i in range(len(features)): - print(i) feature, new_feature = features[i], new_features[i] - input_ids = feature.input_ids + input_ids = [f if f not in [3,4,5] else 0 for f in feature.input_ids ] input_mask = feature.input_mask segment_ids = feature.segment_ids cls_index = feature.cls_index @@ -478,7 +471,7 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, tokens = feature.tokens token_to_orig_map = feature.token_to_orig_map - new_input_ids = new_feature.input_ids + new_input_ids = [f if f not in [3,4,5] else 0 for f in new_feature.input_ids] new_input_mask = new_feature.attention_mask new_segment_ids = new_feature.token_type_ids new_cls_index = new_feature.cls_index @@ -497,6 +490,9 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, assert example_index == new_example_index assert paragraph_len == new_paragraph_len assert token_is_max_context == new_token_is_max_context + + tokens = [t if tokenizer.convert_tokens_to_ids(t) is not tokenizer.unk_token_id else tokenizer.unk_token for t in tokens] + assert tokens == new_tokens assert token_to_orig_map == new_token_to_orig_map From e0e55bc550a16289763b4f656790e30ed86e428f Mon Sep 17 00:00:00 2001 From: Lysandre Date: Fri, 22 Nov 2019 16:18:18 -0500 Subject: [PATCH 07/26] Manage training example & refactor the refactor --- 
transformers/data/processors/squad.py | 368 ++++---------------------- 1 file changed, 51 insertions(+), 317 deletions(-) diff --git a/transformers/data/processors/squad.py b/transformers/data/processors/squad.py index fb3d2ae4d4..3d8f48c1bb 100644 --- a/transformers/data/processors/squad.py +++ b/transformers/data/processors/squad.py @@ -92,31 +92,14 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, features = [] new_features = [] for (example_index, example) in enumerate(tqdm(examples)): - - doc_tokens = [] - char_to_word_offset = [] - prev_is_whitespace = True - - # Split on whitespace so that different tokens may be attributed to their original position. - for c in example.context_text: - if _is_whitespace(c): - prev_is_whitespace = True - else: - if prev_is_whitespace: - doc_tokens.append(c) - else: - doc_tokens[-1] += c - prev_is_whitespace = False - char_to_word_offset.append(len(doc_tokens) - 1) - if is_training: # Get start and end position answer_length = len(example.answer_text) - start_position = char_to_word_offset[example.start_position] - end_position = char_to_word_offset[example.start_position + answer_length - 1] + start_position = example.start_position + end_position = example.end_position # If the answer cannot be found in the text, then skip this example. - actual_text = " ".join(doc_tokens[start_position:(end_position + 1)]) + actual_text = " ".join(example.doc_tokens[start_position:(end_position + 1)]) cleaned_answer_text = " ".join(whitespace_tokenize(example.answer_text)) if actual_text.find(cleaned_answer_text) == -1: logger.warning("Could not find answer: '%s' vs. 
'%s'", actual_text, cleaned_answer_text) @@ -125,7 +108,7 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, tok_to_orig_index = [] orig_to_tok_index = [] all_doc_tokens = [] - for (i, token) in enumerate(doc_tokens): + for (i, token) in enumerate(example.doc_tokens): orig_to_tok_index.append(len(all_doc_tokens)) sub_tokens = tokenizer.tokenize(token) for sub_token in sub_tokens: @@ -138,56 +121,19 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, sequence_added_tokens = tokenizer.max_len - tokenizer.max_len_single_sentence sequence_pair_added_tokens = tokenizer.max_len - tokenizer.max_len_sentences_pair - encoded_dict = tokenizer.encode_plus( - truncated_query if not sequence_a_is_doc else all_doc_tokens, - all_doc_tokens if not sequence_a_is_doc else truncated_query, - max_length=max_seq_length, - padding_strategy='right', - stride=max_seq_length - doc_stride - len(truncated_query) - sequence_pair_added_tokens, - return_overflowing_tokens=True, - truncation_strategy='only_second' if not sequence_a_is_doc else 'only_first' - ) - - ids = encoded_dict['input_ids'] - non_padded_ids = ids[:ids.index(tokenizer.pad_token_id)] if tokenizer.pad_token_id in ids else ids - paragraph_len = min(len(all_doc_tokens) - len(spans) * doc_stride, max_seq_length - len(truncated_query) - sequence_pair_added_tokens) - tokens = tokenizer.convert_ids_to_tokens(non_padded_ids) - - token_to_orig_map = {} - for i in range(paragraph_len): - index = len(truncated_query) + sequence_added_tokens + i if not sequence_a_is_doc else i - token_to_orig_map[index] = tok_to_orig_index[0 + i] - - encoded_dict["paragraph_len"] = paragraph_len - encoded_dict["tokens"] = tokens - encoded_dict["token_to_orig_map"] = token_to_orig_map - encoded_dict["truncated_query_with_special_tokens_length"] = len(truncated_query) + sequence_added_tokens - encoded_dict["token_is_max_context"] = {} - encoded_dict["start"] = 0 - encoded_dict["length"] = paragraph_len 
- - spans.append(encoded_dict) - # print("YESSIR", len(spans) * doc_stride < len(all_doc_tokens), "overflowing_tokens" in encoded_dict) - - while len(spans) * doc_stride < len(all_doc_tokens) and "overflowing_tokens" in encoded_dict: - overflowing_tokens = encoded_dict["overflowing_tokens"] + span_doc_tokens = all_doc_tokens + while len(spans) * doc_stride < len(all_doc_tokens): + encoded_dict = tokenizer.encode_plus( - truncated_query if not sequence_a_is_doc else overflowing_tokens, - overflowing_tokens if not sequence_a_is_doc else truncated_query, + truncated_query if not sequence_a_is_doc else span_doc_tokens, + span_doc_tokens if not sequence_a_is_doc else truncated_query, max_length=max_seq_length, return_overflowing_tokens=True, padding_strategy='right', stride=max_seq_length - doc_stride - len(truncated_query) - sequence_pair_added_tokens, truncation_strategy='only_second' if not sequence_a_is_doc else 'only_first' ) - ids = encoded_dict['input_ids'] - # print("Ids computes; position of the first padding", ids.index(tokenizer.pad_token_id) if tokenizer.pad_token_id in ids else None) - # print(encoded_dict["input_ids"].index(tokenizer.pad_token_id) if tokenizer.pad_token_id in encoded_dict["input_ids"] else None) - # print(len(spans) * doc_stride, len(all_doc_tokens)) - - - # Length of the document without the query paragraph_len = min(len(all_doc_tokens) - len(spans) * doc_stride, max_seq_length - len(truncated_query) - sequence_pair_added_tokens) if tokenizer.pad_token_id in encoded_dict['input_ids']: @@ -212,6 +158,10 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, spans.append(encoded_dict) + if "overflowing_tokens" not in encoded_dict: + break + span_doc_tokens = encoded_dict["overflowing_tokens"] + for doc_span_index in range(len(spans)): for j in range(spans[doc_span_index]["paragraph_len"]): is_max_context = _new_check_is_max_context(spans, doc_span_index, doc_span_index * doc_stride + j) @@ -254,249 +204,6 @@ def 
squad_convert_examples_to_features(examples, tokenizer, max_seq_length, unique_id += 1 - # tokenize ... - query_tokens = tokenizer.tokenize(example.question_text) - - if len(query_tokens) > max_query_length: - query_tokens = query_tokens[0:max_query_length] - - tok_start_position = None - tok_end_position = None - if is_training and example.is_impossible: - tok_start_position = -1 - tok_end_position = -1 - if is_training and not example.is_impossible: - tok_start_position = orig_to_tok_index[example.start_position] - if example.end_position < len(doc_tokens) - 1: - tok_end_position = orig_to_tok_index[example.end_position + 1] - 1 - else: - tok_end_position = len(all_doc_tokens) - 1 - (tok_start_position, tok_end_position) = _improve_answer_span( - all_doc_tokens, tok_start_position, tok_end_position, tokenizer, - example.orig_answer_text) - - # The -3 accounts for [CLS], [SEP] and [SEP] - max_tokens_for_doc = max_seq_length - len(query_tokens) - 3 - - # We can have documents that are longer than the maximum sequence length. - # To deal with this we do a sliding window approach, where we take chunks - # of the up to our max length with a stride of `doc_stride`. 
- _DocSpan = collections.namedtuple( # pylint: disable=invalid-name - "DocSpan", ["start", "length"]) - doc_spans = [] - start_offset = 0 - while start_offset < len(all_doc_tokens): - length = len(all_doc_tokens) - start_offset - if length > max_tokens_for_doc: - length = max_tokens_for_doc - # print("Start offset is", start_offset, len(all_doc_tokens), "length is", length) - doc_spans.append(_DocSpan(start=start_offset, length=length)) - if start_offset + length == len(all_doc_tokens): - break - start_offset += min(length, doc_stride) - - for (doc_span_index, doc_span) in enumerate(doc_spans): - tokens = [] - token_to_orig_map = {} - token_is_max_context = {} - segment_ids = [] - - # p_mask: mask with 1 for token than cannot be in the answer (0 for token which can be in an answer) - # Original TF implem also keep the classification token (set to 0) (not sure why...) - p_mask = [] - - # CLS token at the beginning - if not cls_token_at_end: - tokens.append(cls_token) - segment_ids.append(cls_token_segment_id) - p_mask.append(0) - cls_index = 0 - - # XLNet: P SEP Q SEP CLS - # Others: CLS Q SEP P SEP - if not sequence_a_is_doc: - # Query - tokens += query_tokens - segment_ids += [sequence_a_segment_id] * len(query_tokens) - p_mask += [1] * len(query_tokens) - - # SEP token - tokens.append(sep_token) - segment_ids.append(sequence_a_segment_id) - p_mask.append(1) - - # Paragraph - for i in range(doc_span.length): - split_token_index = doc_span.start + i - token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index] - - is_max_context = _check_is_max_context(doc_spans, doc_span_index, - split_token_index) - token_is_max_context[len(tokens)] = is_max_context - tokens.append(all_doc_tokens[split_token_index]) - if not sequence_a_is_doc: - segment_ids.append(sequence_b_segment_id) - else: - segment_ids.append(sequence_a_segment_id) - p_mask.append(0) - paragraph_len = doc_span.length - - if sequence_a_is_doc: - # SEP token - tokens.append(sep_token) - 
segment_ids.append(sequence_a_segment_id) - p_mask.append(1) - - tokens += query_tokens - segment_ids += [sequence_b_segment_id] * len(query_tokens) - p_mask += [1] * len(query_tokens) - - # SEP token - tokens.append(sep_token) - segment_ids.append(sequence_b_segment_id) - p_mask.append(1) - - # CLS token at the end - if cls_token_at_end: - tokens.append(cls_token) - segment_ids.append(cls_token_segment_id) - p_mask.append(0) - cls_index = len(tokens) - 1 # Index of classification token - - input_ids = tokenizer.convert_tokens_to_ids(tokens) - - # The mask has 1 for real tokens and 0 for padding tokens. Only real - # tokens are attended to. - input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids) - - - - # Zero-pad up to the sequence length. - while len(input_ids) < max_seq_length: - input_ids.append(pad_token) - input_mask.append(0 if mask_padding_with_zero else 1) - segment_ids.append(pad_token_segment_id) - p_mask.append(1) - - assert len(input_ids) == max_seq_length - assert len(input_mask) == max_seq_length - assert len(segment_ids) == max_seq_length - - span_is_impossible = example.is_impossible if hasattr(example, "is_impossible") else False - start_position = None - end_position = None - if is_training and not span_is_impossible: - # For training, if our document chunk does not contain an annotation - # we throw it out, since there is nothing to predict. 
- doc_start = doc_span.start - doc_end = doc_span.start + doc_span.length - 1 - out_of_span = False - if not (tok_start_position >= doc_start and - tok_end_position <= doc_end): - out_of_span = True - if out_of_span: - start_position = 0 - end_position = 0 - span_is_impossible = True - else: - if sequence_a_is_doc: - doc_offset = 0 - else: - doc_offset = len(query_tokens) + 2 - start_position = tok_start_position - doc_start + doc_offset - end_position = tok_end_position - doc_start + doc_offset - - if is_training and span_is_impossible: - start_position = cls_index - end_position = cls_index - - # if example_index < 20: - # logger.info("*** Example ***") - # logger.info("unique_id: %s" % (unique_id)) - # logger.info("example_index: %s" % (example_index)) - # logger.info("doc_span_index: %s" % (doc_span_index)) - # logger.info("tokens: %s" % str(tokens)) - # logger.info("token_to_orig_map: %s" % " ".join([ - # "%d:%d" % (x, y) for (x, y) in token_to_orig_map.items()])) - # logger.info("token_is_max_context: %s" % " ".join([ - # "%d:%s" % (x, y) for (x, y) in token_is_max_context.items() - # ])) - # logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) - # logger.info( - # "input_mask: %s" % " ".join([str(x) for x in input_mask])) - # logger.info( - # "segment_ids: %s" % " ".join([str(x) for x in segment_ids])) - # if is_training and span_is_impossible: - # logger.info("impossible example") - # if is_training and not span_is_impossible: - # answer_text = " ".join(tokens[start_position:(end_position + 1)]) - # logger.info("start_position: %d" % (start_position)) - # logger.info("end_position: %d" % (end_position)) - # logger.info( - # "answer: %s" % (answer_text)) - - features.append( - SquadFeatures( - unique_id=unique_id, - example_index=example_index, - doc_span_index=doc_span_index, - tokens=tokens, - token_to_orig_map=token_to_orig_map, - token_is_max_context=token_is_max_context, - input_ids=input_ids, - input_mask=input_mask, - 
segment_ids=segment_ids, - cls_index=cls_index, - p_mask=p_mask, - paragraph_len=paragraph_len, - start_position=start_position, - end_position=end_position, - is_impossible=span_is_impossible)) - unique_id += 1 - - assert len(features) == len(new_features) - - assert len(features) == len(new_features) - for i in range(len(features)): - feature, new_feature = features[i], new_features[i] - - input_ids = [f if f not in [3,4,5] else 0 for f in feature.input_ids ] - input_mask = feature.input_mask - segment_ids = feature.segment_ids - cls_index = feature.cls_index - p_mask = feature.p_mask - example_index = feature.example_index - paragraph_len = feature.paragraph_len - token_is_max_context = feature.token_is_max_context - tokens = feature.tokens - token_to_orig_map = feature.token_to_orig_map - - new_input_ids = [f if f not in [3,4,5] else 0 for f in new_feature.input_ids] - new_input_mask = new_feature.attention_mask - new_segment_ids = new_feature.token_type_ids - new_cls_index = new_feature.cls_index - new_p_mask = new_feature.p_mask - new_example_index = new_feature.example_index - new_paragraph_len = new_feature.paragraph_len - new_token_is_max_context = new_feature.token_is_max_context - new_tokens = new_feature.tokens - new_token_to_orig_map = new_feature.token_to_orig_map - - assert input_ids == new_input_ids - assert input_mask == new_input_mask - assert segment_ids == new_segment_ids - assert cls_index == new_cls_index - assert p_mask == new_p_mask - assert example_index == new_example_index - assert paragraph_len == new_paragraph_len - assert token_is_max_context == new_token_is_max_context - - tokens = [t if tokenizer.convert_tokens_to_ids(t) is not tokenizer.unk_token_id else tokenizer.unk_token for t in tokens] - - assert tokens == new_tokens - assert token_to_orig_map == new_token_to_orig_map - - return new_features @@ -592,35 +299,35 @@ class SquadV1Processor(DataProcessor): tensor_dict['title'].numpy().decode('utf-8') ) - def get_train_examples(self, 
data_dir): + def get_train_examples(self, data_dir, only_first=None): """See base class.""" with open(os.path.join(data_dir, "train-v1.1.json"), "r", encoding='utf-8') as reader: input_data = json.load(reader)["data"] - return self._create_examples(input_data, "train") + return self._create_examples(input_data, "train", only_first) - def get_dev_examples(self, data_dir): + def get_dev_examples(self, data_dir, only_first=None): """See base class.""" with open(os.path.join(data_dir, "dev-v1.1.json"), "r", encoding='utf-8') as reader: input_data = json.load(reader)["data"] - return self._create_examples(input_data, "dev") + return self._create_examples(input_data, "dev", only_first) def get_labels(self): """See base class.""" return ["0", "1"] - def _create_examples(self, input_data, set_type): + def _create_examples(self, input_data, set_type, only_first=None): """Creates examples for the training and dev sets.""" is_training = set_type == "train" examples = [] - for entry in input_data: + for entry in tqdm(input_data): title = entry['title'] for paragraph in entry["paragraphs"]: context_text = paragraph["context"] for qa in paragraph["qas"]: qas_id = qa["id"] question_text = qa["question"] - start_position = None + start_position_character = None answer_text = None if is_training: if (len(qa["answers"]) != 1): @@ -628,17 +335,20 @@ class SquadV1Processor(DataProcessor): "For training, each question should have exactly 1 answer.") answer = qa["answers"][0] answer_text = answer['text'] - start_position = answer['answer_start'] + start_position_character = answer['answer_start'] example = NewSquadExample( qas_id=qas_id, question_text=question_text, context_text=context_text, answer_text=answer_text, - start_position=start_position, + start_position_character=start_position_character, title=title ) examples.append(example) + + if only_first is not None and len(examples) > only_first: + return examples return examples @@ -653,14 +363,38 @@ class NewSquadExample(object): 
question_text, context_text, answer_text, - start_position, + start_position_character, title): self.qas_id = qas_id self.question_text = question_text self.context_text = context_text self.answer_text = answer_text - self.start_position = start_position self.title = title + self.is_impossible = False + + doc_tokens = [] + char_to_word_offset = [] + prev_is_whitespace = True + + # Split on whitespace so that different tokens may be attributed to their original position. + for c in self.context_text: + if _is_whitespace(c): + prev_is_whitespace = True + else: + if prev_is_whitespace: + doc_tokens.append(c) + else: + doc_tokens[-1] += c + prev_is_whitespace = False + char_to_word_offset.append(len(doc_tokens) - 1) + + self.doc_tokens = doc_tokens + self.char_to_word_offset = char_to_word_offset + + # Start end end positions only has a value during evaluation. + if start_position_character is not None: + self.start_position = char_to_word_offset[start_position_character] + self.end_position = char_to_word_offset[start_position_character + len(answer_text) - 1] class NewSquadFeatures(object): From 0669c1fcd15051ec6fe2d950079886faccf2fb33 Mon Sep 17 00:00:00 2001 From: Lysandre Date: Mon, 25 Nov 2019 19:22:21 -0500 Subject: [PATCH 08/26] SQuAD v2 BERT + XLNet --- transformers/__init__.py | 2 +- transformers/data/__init__.py | 2 +- transformers/data/processors/__init__.py | 2 +- transformers/data/processors/squad.py | 180 +++++++++++------------ 4 files changed, 92 insertions(+), 94 deletions(-) diff --git a/transformers/__init__.py b/transformers/__init__.py index 9a767913b3..f3f81f1dbe 100644 --- a/transformers/__init__.py +++ b/transformers/__init__.py @@ -27,7 +27,7 @@ from .data import (is_sklearn_available, glue_output_modes, glue_convert_examples_to_features, glue_processors, glue_tasks_num_labels, squad_convert_examples_to_features, SquadFeatures, - SquadExample, read_squad_examples) + SquadExample) if is_sklearn_available(): from .data import 
glue_compute_metrics diff --git a/transformers/data/__init__.py b/transformers/data/__init__.py index 50f2e768f4..b351bf625e 100644 --- a/transformers/data/__init__.py +++ b/transformers/data/__init__.py @@ -1,6 +1,6 @@ from .processors import InputExample, InputFeatures, DataProcessor, SquadFeatures from .processors import glue_output_modes, glue_processors, glue_tasks_num_labels, glue_convert_examples_to_features -from .processors import squad_convert_examples_to_features, SquadExample, read_squad_examples +from .processors import squad_convert_examples_to_features, SquadExample from .metrics import is_sklearn_available if is_sklearn_available(): diff --git a/transformers/data/processors/__init__.py b/transformers/data/processors/__init__.py index 924b4a1245..1e52776629 100644 --- a/transformers/data/processors/__init__.py +++ b/transformers/data/processors/__init__.py @@ -1,4 +1,4 @@ from .utils import InputExample, InputFeatures, DataProcessor from .glue import glue_output_modes, glue_processors, glue_tasks_num_labels, glue_convert_examples_to_features -from .squad import squad_convert_examples_to_features, SquadFeatures, SquadExample, read_squad_examples +from .squad import squad_convert_examples_to_features, SquadFeatures, SquadExample diff --git a/transformers/data/processors/squad.py b/transformers/data/processors/squad.py index 3d8f48c1bb..39ee00ae56 100644 --- a/transformers/data/processors/squad.py +++ b/transformers/data/processors/squad.py @@ -46,7 +46,6 @@ def _check_is_max_context(doc_spans, cur_span_index, position): return cur_span_index == best_span_index - def _new_check_is_max_context(doc_spans, cur_span_index, position): """Check if this is the 'max context' doc span for the token.""" # if len(doc_spans) == 1: @@ -92,7 +91,7 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, features = [] new_features = [] for (example_index, example) in enumerate(tqdm(examples)): - if is_training: + if is_training and not 
example.is_impossible: # Get start and end position answer_length = len(example.answer_text) start_position = example.start_position @@ -105,6 +104,7 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, logger.warning("Could not find answer: '%s' vs. '%s'", actual_text, cleaned_answer_text) continue + tok_to_orig_index = [] orig_to_tok_index = [] all_doc_tokens = [] @@ -115,6 +115,18 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, tok_to_orig_index.append(i) all_doc_tokens.append(sub_token) + + if is_training and not example.is_impossible: + tok_start_position = orig_to_tok_index[example.start_position] + if example.end_position < len(example.doc_tokens) - 1: + tok_end_position = orig_to_tok_index[example.end_position + 1] - 1 + else: + tok_end_position = len(all_doc_tokens) - 1 + + (tok_start_position, tok_end_position) = _improve_answer_span( + all_doc_tokens, tok_start_position, tok_end_position, tokenizer, example.answer_text + ) + spans = [] truncated_query = tokenizer.encode(example.question_text, add_special_tokens=False, max_length=max_query_length) @@ -187,6 +199,34 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, # Set the CLS index to '0' p_mask[cls_index] = 0 + + span_is_impossible = example.is_impossible + start_position = 0 + end_position = 0 + if is_training and not span_is_impossible: + # For training, if our document chunk does not contain an annotation + # we throw it out, since there is nothing to predict. 
+ doc_start = span["start"] + doc_end = span["start"] + span["length"] - 1 + out_of_span = False + + if not (tok_start_position >= doc_start and tok_end_position <= doc_end): + out_of_span = True + + if out_of_span: + start_position = cls_index + end_position = cls_index + span_is_impossible = True + else: + if sequence_a_is_doc: + doc_offset = 0 + else: + doc_offset = len(truncated_query) + sequence_added_tokens + + start_position = tok_start_position - doc_start + doc_offset + end_position = tok_end_position - doc_start + doc_offset + + new_features.append(NewSquadFeatures( span['input_ids'], span['attention_mask'], @@ -199,7 +239,10 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, paragraph_len=span['paragraph_len'], token_is_max_context=span["token_is_max_context"], tokens=span["tokens"], - token_to_orig_map=span["token_to_orig_map"] + token_to_orig_map=span["token_to_orig_map"], + + start_position=start_position, + end_position=end_position )) unique_id += 1 @@ -207,86 +250,10 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, return new_features -def read_squad_examples(input_file, is_training, version_2_with_negative): - """Read a SQuAD json file into a list of SquadExample.""" - with open(input_file, "r", encoding='utf-8') as reader: - input_data = json.load(reader)["data"] - - def is_whitespace(c): - if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F: - return True - return False - - examples = [] - for entry in input_data: - for paragraph in entry["paragraphs"]: - paragraph_text = paragraph["context"] - doc_tokens = [] - char_to_word_offset = [] - prev_is_whitespace = True - for c in paragraph_text: - if is_whitespace(c): - prev_is_whitespace = True - else: - if prev_is_whitespace: - doc_tokens.append(c) - else: - doc_tokens[-1] += c - prev_is_whitespace = False - char_to_word_offset.append(len(doc_tokens) - 1) - - for qa in paragraph["qas"]: - qas_id = qa["id"] - question_text 
= qa["question"] - start_position = None - end_position = None - orig_answer_text = None - is_impossible = False - if is_training: - if version_2_with_negative: - is_impossible = qa["is_impossible"] - if (len(qa["answers"]) != 1) and (not is_impossible): - raise ValueError( - "For training, each question should have exactly 1 answer.") - if not is_impossible: - answer = qa["answers"][0] - orig_answer_text = answer["text"] - answer_offset = answer["answer_start"] - answer_length = len(orig_answer_text) - start_position = char_to_word_offset[answer_offset] - end_position = char_to_word_offset[answer_offset + answer_length - 1] - # Only add answers where the text can be exactly recovered from the - # document. If this CAN'T happen it's likely due to weird Unicode - # stuff so we will just skip the example. - # - # Note that this means for training mode, every example is NOT - # guaranteed to be preserved. - actual_text = " ".join(doc_tokens[start_position:(end_position + 1)]) - cleaned_answer_text = " ".join( - whitespace_tokenize(orig_answer_text)) - if actual_text.find(cleaned_answer_text) == -1: - logger.warning("Could not find answer: '%s' vs. 
'%s'", - actual_text, cleaned_answer_text) - continue - else: - start_position = -1 - end_position = -1 - orig_answer_text = "" - - example = SquadExample( - qas_id=qas_id, - question_text=question_text, - doc_tokens=doc_tokens, - orig_answer_text=orig_answer_text, - start_position=start_position, - end_position=end_position, - is_impossible=is_impossible) - examples.append(example) - return examples - - -class SquadV1Processor(DataProcessor): +class SquadProcessor(DataProcessor): """Processor for the SQuAD data set.""" + train_file = None + dev_file = None def get_example_from_tensor_dict(self, tensor_dict): """See base class.""" @@ -301,13 +268,19 @@ class SquadV1Processor(DataProcessor): def get_train_examples(self, data_dir, only_first=None): """See base class.""" - with open(os.path.join(data_dir, "train-v1.1.json"), "r", encoding='utf-8') as reader: + if self.train_file is None: + raise ValueError("SquadProcessor should be instantiated via SquadV1Processor or SquadV2Processor") + + with open(os.path.join(data_dir, self.train_file), "r", encoding='utf-8') as reader: input_data = json.load(reader)["data"] return self._create_examples(input_data, "train", only_first) def get_dev_examples(self, data_dir, only_first=None): """See base class.""" - with open(os.path.join(data_dir, "dev-v1.1.json"), "r", encoding='utf-8') as reader: + if self.dev_file is None: + raise ValueError("SquadProcessor should be instantiated via SquadV1Processor or SquadV2Processor") + + with open(os.path.join(data_dir, self.dev_file), "r", encoding='utf-8') as reader: input_data = json.load(reader)["data"] return self._create_examples(input_data, "dev", only_first) @@ -329,7 +302,13 @@ class SquadV1Processor(DataProcessor): question_text = qa["question"] start_position_character = None answer_text = None - if is_training: + + if "is_impossible" in qa: + is_impossible = qa["is_impossible"] + else: + is_impossible = False + + if not is_impossible and is_training: if (len(qa["answers"]) != 1): 
raise ValueError( "For training, each question should have exactly 1 answer.") @@ -343,15 +322,25 @@ class SquadV1Processor(DataProcessor): context_text=context_text, answer_text=answer_text, start_position_character=start_position_character, - title=title + title=title, + is_impossible=is_impossible ) + examples.append(example) if only_first is not None and len(examples) > only_first: return examples return examples - +class SquadV1Processor(SquadProcessor): + train_file = "train-v1.1.json" + dev_file = "dev-v1.1.json" + + +class SquadV2Processor(SquadProcessor): + train_file = "train-v2.0.json" + dev_file = "dev-v2.0.json" + class NewSquadExample(object): """ @@ -364,13 +353,16 @@ class NewSquadExample(object): context_text, answer_text, start_position_character, - title): + title, + is_impossible=False): self.qas_id = qas_id self.question_text = question_text self.context_text = context_text self.answer_text = answer_text self.title = title - self.is_impossible = False + self.is_impossible = is_impossible + + self.start_position, self.end_position = 0, 0 doc_tokens = [] char_to_word_offset = [] @@ -392,7 +384,7 @@ class NewSquadExample(object): self.char_to_word_offset = char_to_word_offset # Start end end positions only has a value during evaluation. 
- if start_position_character is not None: + if start_position_character is not None and not is_impossible: self.start_position = char_to_word_offset[start_position_character] self.end_position = char_to_word_offset[start_position_character + len(answer_text) - 1] @@ -415,7 +407,10 @@ class NewSquadFeatures(object): paragraph_len, token_is_max_context, tokens, - token_to_orig_map + token_to_orig_map, + + start_position, + end_position ): self.input_ids = input_ids self.attention_mask = attention_mask @@ -430,6 +425,9 @@ class NewSquadFeatures(object): self.tokens = tokens self.token_to_orig_map = token_to_orig_map + self.start_position = start_position + self.end_position = end_position + class SquadExample(object): """ A single training/test example for the Squad dataset. From bd41e8292a4bd7db10eb036112019d93c50adcf5 Mon Sep 17 00:00:00 2001 From: Lysandre Date: Thu, 28 Nov 2019 16:03:56 -0500 Subject: [PATCH 09/26] Cleanup & Evaluation now works --- examples/run_squad.py | 44 +++++++++++---------------- transformers/data/processors/squad.py | 14 ++------- 2 files changed, 20 insertions(+), 38 deletions(-) diff --git a/examples/run_squad.py b/examples/run_squad.py index 634b566a46..545c3ad55a 100644 --- a/examples/run_squad.py +++ b/examples/run_squad.py @@ -16,7 +16,7 @@ """ Finetuning the library models for question-answering on SQuAD (DistilBERT, Bert, XLM, XLNet).""" from __future__ import absolute_import, division, print_function -from transformers.data.processors.squad import SquadV1Processor +from transformers.data.processors.squad import SquadV1Processor, SquadV2Processor import argparse import logging @@ -45,9 +45,9 @@ from transformers import (WEIGHTS_NAME, BertConfig, XLNetTokenizer, DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer) -from transformers import AdamW, get_linear_schedule_with_warmup, squad_convert_examples_to_features, read_squad_examples as sread_squad_examples +from transformers import AdamW, 
get_linear_schedule_with_warmup, squad_convert_examples_to_features -from utils_squad import (RawResult, write_predictions, +from utils_squad import (convert_examples_to_features as old_convert, read_squad_examples as old_read, RawResult, write_predictions, RawResultExtended, write_predictions_extended) # The follwing import is the official SQuAD evaluation script (2.0). @@ -304,28 +304,20 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal features = torch.load(cached_features_file) else: logger.info("Creating features from dataset file at %s", input_file) - examples = read_squad_examples(input_file=input_file, - is_training=not evaluate, - version_2_with_negative=args.version_2_with_negative) - keep_n_examples = 1000 - processor = SquadV1Processor() - values = processor.get_dev_examples("examples/squad") - examples = values[:keep_n_examples] - features = squad_convert_examples_to_features(examples=exampless, - tokenizer=tokenizer, - max_seq_length=args.max_seq_length, - doc_stride=args.doc_stride, - max_query_length=args.max_query_length, - is_training=not evaluate, - cls_token_segment_id=2 if args.model_type in ['xlnet'] else 0, - pad_token_segment_id=3 if args.model_type in ['xlnet'] else 0, - cls_token_at_end=True if args.model_type in ['xlnet'] else False, - sequence_a_is_doc=True if args.model_type in ['xlnet'] else False) - print("DONE") - import sys - sys.exit() - + processor = SquadV2Processor() + examples = processor.get_dev_examples("examples/squad") if evaluate else processor.get_train_examples("examples/squad") + features = squad_convert_examples_to_features( + examples=examples, + tokenizer=tokenizer, + max_seq_length=args.max_seq_length, + doc_stride=args.doc_stride, + max_query_length=args.max_query_length, + is_training=not evaluate, + sequence_a_is_doc=True if args.model_type in ['xlnet'] else False + ) + + if args.local_rank in [-1, 0]: logger.info("Saving features into cached file %s", cached_features_file) 
torch.save(features, cached_features_file) @@ -335,8 +327,8 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal # Convert to Tensors and build dataset all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) - all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long) - all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long) + all_input_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long) + all_segment_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long) all_cls_index = torch.tensor([f.cls_index for f in features], dtype=torch.long) all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float) if evaluate: diff --git a/transformers/data/processors/squad.py b/transformers/data/processors/squad.py index 39ee00ae56..3d5a3eca80 100644 --- a/transformers/data/processors/squad.py +++ b/transformers/data/processors/squad.py @@ -74,26 +74,16 @@ def _is_whitespace(c): def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, doc_stride, max_query_length, is_training, - cls_token_at_end=True, - cls_token='[CLS]', sep_token='[SEP]', pad_token=0, - sequence_a_segment_id=0, sequence_b_segment_id=1, - cls_token_segment_id=0, pad_token_segment_id=0, - mask_padding_with_zero=True, sequence_a_is_doc=False): """Loads a data file into a list of `InputBatch`s.""" - cls_token = tokenizer.cls_token - sep_token = tokenizer.sep_token - # Defining helper methods unique_id = 1000000000 features = [] - new_features = [] for (example_index, example) in enumerate(tqdm(examples)): if is_training and not example.is_impossible: # Get start and end position - answer_length = len(example.answer_text) start_position = example.start_position end_position = example.end_position @@ -227,7 +217,7 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, end_position = tok_end_position - doc_start + 
doc_offset - new_features.append(NewSquadFeatures( + features.append(NewSquadFeatures( span['input_ids'], span['attention_mask'], span['token_type_ids'], @@ -247,7 +237,7 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, unique_id += 1 - return new_features + return features class SquadProcessor(DataProcessor): From f671997ef74199823db83ed7b43340764888e129 Mon Sep 17 00:00:00 2001 From: Lysandre Date: Thu, 28 Nov 2019 17:17:20 -0500 Subject: [PATCH 10/26] Interface with TFDS --- transformers/data/processors/squad.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/transformers/data/processors/squad.py b/transformers/data/processors/squad.py index 3d5a3eca80..52c2c28add 100644 --- a/transformers/data/processors/squad.py +++ b/transformers/data/processors/squad.py @@ -246,16 +246,24 @@ class SquadProcessor(DataProcessor): dev_file = None def get_example_from_tensor_dict(self, tensor_dict): - """See base class.""" return NewSquadExample( - tensor_dict['id'].numpy(), + tensor_dict['id'].numpy().decode("utf-8"), tensor_dict['question'].numpy().decode('utf-8'), tensor_dict['context'].numpy().decode('utf-8'), - tensor_dict['answers']['text'].numpy().decode('utf-8'), - tensor_dict['answers']['answers_start'].numpy().decode('utf-8'), + tensor_dict['answers']['text'][0].numpy().decode('utf-8'), + tensor_dict['answers']['answer_start'][0].numpy(), tensor_dict['title'].numpy().decode('utf-8') ) + def get_examples_from_dataset(self, dataset): + """See base class.""" + + examples = [] + for tensor_dict in tqdm(dataset): + examples.append(self.get_example_from_tensor_dict(tensor_dict)) + + return examples + def get_train_examples(self, data_dir, only_first=None): """See base class.""" if self.train_file is None: From 0b84b9fd8a728ca46e4109aa38a11b25f87a09bf Mon Sep 17 00:00:00 2001 From: Lysandre Date: Thu, 28 Nov 2019 17:38:52 -0500 Subject: [PATCH 11/26] Add processors to __init__ --- transformers/__init__.py | 2 
+- transformers/data/__init__.py | 2 +- transformers/data/processors/__init__.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/transformers/__init__.py b/transformers/__init__.py index f3f81f1dbe..aefa3f1921 100644 --- a/transformers/__init__.py +++ b/transformers/__init__.py @@ -27,7 +27,7 @@ from .data import (is_sklearn_available, glue_output_modes, glue_convert_examples_to_features, glue_processors, glue_tasks_num_labels, squad_convert_examples_to_features, SquadFeatures, - SquadExample) + SquadExample, SquadV1Processor, SquadV2Processor) if is_sklearn_available(): from .data import glue_compute_metrics diff --git a/transformers/data/__init__.py b/transformers/data/__init__.py index b351bf625e..ea3a4e9fbb 100644 --- a/transformers/data/__init__.py +++ b/transformers/data/__init__.py @@ -1,6 +1,6 @@ from .processors import InputExample, InputFeatures, DataProcessor, SquadFeatures from .processors import glue_output_modes, glue_processors, glue_tasks_num_labels, glue_convert_examples_to_features -from .processors import squad_convert_examples_to_features, SquadExample +from .processors import squad_convert_examples_to_features, SquadExample, SquadV1Processor, SquadV2Processor from .metrics import is_sklearn_available if is_sklearn_available(): diff --git a/transformers/data/processors/__init__.py b/transformers/data/processors/__init__.py index 1e52776629..2470e7a06d 100644 --- a/transformers/data/processors/__init__.py +++ b/transformers/data/processors/__init__.py @@ -1,4 +1,4 @@ from .utils import InputExample, InputFeatures, DataProcessor from .glue import glue_output_modes, glue_processors, glue_tasks_num_labels, glue_convert_examples_to_features -from .squad import squad_convert_examples_to_features, SquadFeatures, SquadExample +from .squad import squad_convert_examples_to_features, SquadFeatures, SquadExample, SquadV1Processor, SquadV2Processor From 1e9ac5a7cfeb48ff6a1cf20e07941fc8c59b391d Mon Sep 17 00:00:00 2001 From: Lysandre 
Date: Thu, 28 Nov 2019 17:43:47 -0500 Subject: [PATCH 12/26] New -> normal --- transformers/data/processors/squad.py | 106 ++------------------------ 1 file changed, 5 insertions(+), 101 deletions(-) diff --git a/transformers/data/processors/squad.py b/transformers/data/processors/squad.py index 52c2c28add..f414d41925 100644 --- a/transformers/data/processors/squad.py +++ b/transformers/data/processors/squad.py @@ -217,7 +217,7 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, end_position = tok_end_position - doc_start + doc_offset - features.append(NewSquadFeatures( + features.append(SquadFeatures( span['input_ids'], span['attention_mask'], span['token_type_ids'], @@ -246,7 +246,7 @@ class SquadProcessor(DataProcessor): dev_file = None def get_example_from_tensor_dict(self, tensor_dict): - return NewSquadExample( + return SquadExample( tensor_dict['id'].numpy().decode("utf-8"), tensor_dict['question'].numpy().decode('utf-8'), tensor_dict['context'].numpy().decode('utf-8'), @@ -314,7 +314,7 @@ class SquadProcessor(DataProcessor): answer_text = answer['text'] start_position_character = answer['answer_start'] - example = NewSquadExample( + example = SquadExample( qas_id=qas_id, question_text=question_text, context_text=context_text, @@ -340,7 +340,7 @@ class SquadV2Processor(SquadProcessor): dev_file = "dev-v2.0.json" -class NewSquadExample(object): +class SquadExample(object): """ A single training/test example for the Squad dataset, as loaded from disk. """ @@ -387,7 +387,7 @@ class NewSquadExample(object): self.end_position = char_to_word_offset[start_position_character + len(answer_text) - 1] -class NewSquadFeatures(object): +class SquadFeatures(object): """ Single squad example features to be fed to a model. Those features are model-specific. 
@@ -425,99 +425,3 @@ class NewSquadFeatures(object): self.start_position = start_position self.end_position = end_position - -class SquadExample(object): - """ - A single training/test example for the Squad dataset. - For examples without an answer, the start and end position are -1. - """ - - def __init__(self, - qas_id, - question_text, - doc_tokens, - orig_answer_text=None, - start_position=None, - end_position=None, - is_impossible=None): - self.qas_id = qas_id - self.question_text = question_text - self.doc_tokens = doc_tokens - self.orig_answer_text = orig_answer_text - self.start_position = start_position - self.end_position = end_position - self.is_impossible = is_impossible - - def __str__(self): - return self.__repr__() - - def __repr__(self): - s = "" - s += "qas_id: %s" % (self.qas_id) - s += ", question_text: %s" % ( - self.question_text) - s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens)) - if self.start_position: - s += ", start_position: %d" % (self.start_position) - if self.end_position: - s += ", end_position: %d" % (self.end_position) - if self.is_impossible: - s += ", is_impossible: %r" % (self.is_impossible) - return s - - -class SquadFeatures(object): - """A single set of features of data.""" - - def __init__(self, - unique_id, - example_index, - doc_span_index, - tokens, - token_to_orig_map, - token_is_max_context, - input_ids, - input_mask, - segment_ids, - cls_index, - p_mask, - paragraph_len, - start_position=None, - end_position=None, - is_impossible=None): - self.unique_id = unique_id - self.example_index = example_index - self.doc_span_index = doc_span_index - self.tokens = tokens - self.token_to_orig_map = token_to_orig_map - self.token_is_max_context = token_is_max_context - self.input_ids = input_ids - self.input_mask = input_mask - self.segment_ids = segment_ids - self.cls_index = cls_index - self.p_mask = p_mask - self.paragraph_len = paragraph_len - self.start_position = start_position - self.end_position = end_position - 
self.is_impossible = is_impossible - - def __eq__(self, other): - print(self.example_index == other.example_index) - print(self.input_ids == other.input_ids) - print(self.input_mask == other.attention_mask) - print(self.p_mask == other.p_mask) - print(self.paragraph_len == other.paragraph_len) - print(self.segment_ids == other.token_type_ids) - print(self.token_is_max_context == other.token_is_max_context) - print(self.token_to_orig_map == other.token_to_orig_map) - print(self.tokens == other.tokens) - - return self.example_index == other.example_index and \ - self.input_ids == other.input_ids and \ - self.input_mask == other.attention_mask and \ - self.p_mask == other.p_mask and \ - self.paragraph_len == other.paragraph_len and \ - self.segment_ids == other.token_type_ids and \ - self.token_is_max_context == other.token_is_max_context and \ - self.token_to_orig_map == other.token_to_orig_map and \ - self.tokens == other.tokens \ No newline at end of file From 285b1241e38cdafb6b0dadd1d1afc19493318074 Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Tue, 3 Dec 2019 15:00:49 -0500 Subject: [PATCH 13/26] Added SquadResult --- transformers/data/processors/squad.py | 71 +++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) diff --git a/transformers/data/processors/squad.py b/transformers/data/processors/squad.py index f414d41925..afbe4270f5 100644 --- a/transformers/data/processors/squad.py +++ b/transformers/data/processors/squad.py @@ -425,3 +425,74 @@ class SquadFeatures(object): self.start_position = start_position self.end_position = end_position + + + +class SquadResult(object): + """ + Constructs a SquadResult which can be used to evaluate a model's output on the SQuAD dataset. + + Args: + result: The result output by a model on a SQuAD inference. These results may be complex (5 values) as the ones output by + XLNet or XLM or may be simple like the other models (2 values). 
They may be passed as a list or as a dict, with the + following accepted formats: + + `dict` output by a simple model: + { + "start_logits": int, + "end_logits": int, + "unique_id": string + } + `list` output by a simple model: + [start_logits, end_logits, unique_id] + + `dict` output by a complex model: + { + "start_top_log_probs": float, + "start_top_index": int, + "end_top_log_probs": float, + "end_top_index": int, + "cls_logits": int, + "unique_id": string + } + `list` output by a complex model: + [start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits, unique_id] + + See `run_squad.py` for an example. + """ + def __init__(self, result): + if isinstance(result, dict): + if "start_logits" in result and "end_logits" in result: + self.start_logits = result["start_logits"] + self.end_logits = result["end_logits"] + + elif "start_top_log_probs" in result and "start_top_index" in result: + self.start_top_log_probs = result["start_top_log_probs"] + self.start_top_index = result["start_top_index"] + self.end_top_log_probs = result["end_top_log_probs"] + self.end_top_index = result["end_top_index"] + self.cls_logits = result["cls_logits"] + + else: + raise ValueError("SquadResult instantiated with wrong values.") + + self.unique_id = result["unique_id"] + elif isinstance(result, list): + if len(result) == 3: + self.start_logits = result[0] + self.end_logits = result[1] + + elif len(result) == 6: + self.start_top_log_probs = result[0] + self.start_top_index = result[1] + self.end_top_log_probs = result[2] + self.end_top_index = result[3] + self.cls_logits = result[4] + + else: + raise ValueError("SquadResult instantiated with wrong values.") + + self.unique_id = result[-1] + + else: + raise ValueError("SquadResult instantiated with wrong values. 
Should be a dictionary or a list.") From c835bc85c2f51f4da5eab4f1481a25b052bf6d61 Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Tue, 3 Dec 2019 15:28:16 -0500 Subject: [PATCH 14/26] Compute predictions --- transformers/data/metrics/squad_metrics.py | 335 +++++++++++++++++++++ 1 file changed, 335 insertions(+) create mode 100644 transformers/data/metrics/squad_metrics.py diff --git a/transformers/data/metrics/squad_metrics.py b/transformers/data/metrics/squad_metrics.py new file mode 100644 index 0000000000..d4c5a8ec5b --- /dev/null +++ b/transformers/data/metrics/squad_metrics.py @@ -0,0 +1,335 @@ +import json +import logging +import math +import collections +from io import open +from tqdm import tqdm + +from transformers.tokenization_bert import BasicTokenizer, whitespace_tokenize + +logger = logging.getLogger(__name__) + + +def compute_predictions(all_examples, all_features, all_results, n_best_size, + max_answer_length, do_lower_case, output_prediction_file, + output_nbest_file, output_null_log_odds_file, verbose_logging, + version_2_with_negative, null_score_diff_threshold): + """Write final predictions to the json file and log-odds of null if needed.""" + logger.info("Writing predictions to: %s" % (output_prediction_file)) + logger.info("Writing nbest to: %s" % (output_nbest_file)) + + example_index_to_features = collections.defaultdict(list) + for feature in all_features: + example_index_to_features[feature.example_index].append(feature) + + unique_id_to_result = {} + for result in all_results: + unique_id_to_result[result.unique_id] = result + + _PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name + "PrelimPrediction", + ["feature_index", "start_index", "end_index", "start_logit", "end_logit"]) + + all_predictions = collections.OrderedDict() + all_nbest_json = collections.OrderedDict() + scores_diff_json = collections.OrderedDict() + + for (example_index, example) in enumerate(all_examples): + features = 
example_index_to_features[example_index] + + prelim_predictions = [] + # keep track of the minimum score of null start+end of position 0 + score_null = 1000000 # large and positive + min_null_feature_index = 0 # the paragraph slice with min null score + null_start_logit = 0 # the start logit at the slice with min null score + null_end_logit = 0 # the end logit at the slice with min null score + for (feature_index, feature) in enumerate(features): + result = unique_id_to_result[feature.unique_id] + start_indexes = _get_best_indexes(result.start_logits, n_best_size) + end_indexes = _get_best_indexes(result.end_logits, n_best_size) + # if we could have irrelevant answers, get the min score of irrelevant + if version_2_with_negative: + feature_null_score = result.start_logits[0] + result.end_logits[0] + if feature_null_score < score_null: + score_null = feature_null_score + min_null_feature_index = feature_index + null_start_logit = result.start_logits[0] + null_end_logit = result.end_logits[0] + for start_index in start_indexes: + for end_index in end_indexes: + # We could hypothetically create invalid predictions, e.g., predict + # that the start of the span is in the question. We throw out all + # invalid predictions. 
+ if start_index >= len(feature.tokens): + continue + if end_index >= len(feature.tokens): + continue + if start_index not in feature.token_to_orig_map: + continue + if end_index not in feature.token_to_orig_map: + continue + if not feature.token_is_max_context.get(start_index, False): + continue + if end_index < start_index: + continue + length = end_index - start_index + 1 + if length > max_answer_length: + continue + prelim_predictions.append( + _PrelimPrediction( + feature_index=feature_index, + start_index=start_index, + end_index=end_index, + start_logit=result.start_logits[start_index], + end_logit=result.end_logits[end_index])) + if version_2_with_negative: + prelim_predictions.append( + _PrelimPrediction( + feature_index=min_null_feature_index, + start_index=0, + end_index=0, + start_logit=null_start_logit, + end_logit=null_end_logit)) + prelim_predictions = sorted( + prelim_predictions, + key=lambda x: (x.start_logit + x.end_logit), + reverse=True) + + _NbestPrediction = collections.namedtuple( # pylint: disable=invalid-name + "NbestPrediction", ["text", "start_logit", "end_logit"]) + + seen_predictions = {} + nbest = [] + for pred in prelim_predictions: + if len(nbest) >= n_best_size: + break + feature = features[pred.feature_index] + if pred.start_index > 0: # this is a non-null prediction + tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1)] + orig_doc_start = feature.token_to_orig_map[pred.start_index] + orig_doc_end = feature.token_to_orig_map[pred.end_index] + orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end + 1)] + tok_text = " ".join(tok_tokens) + + # De-tokenize WordPieces that have been split off. 
+ tok_text = tok_text.replace(" ##", "") + tok_text = tok_text.replace("##", "") + + # Clean whitespace + tok_text = tok_text.strip() + tok_text = " ".join(tok_text.split()) + orig_text = " ".join(orig_tokens) + + final_text = get_final_text(tok_text, orig_text, do_lower_case, verbose_logging) + if final_text in seen_predictions: + continue + + seen_predictions[final_text] = True + else: + final_text = "" + seen_predictions[final_text] = True + + nbest.append( + _NbestPrediction( + text=final_text, + start_logit=pred.start_logit, + end_logit=pred.end_logit)) + # if we didn't include the empty option in the n-best, include it + if version_2_with_negative: + if "" not in seen_predictions: + nbest.append( + _NbestPrediction( + text="", + start_logit=null_start_logit, + end_logit=null_end_logit)) + + # In very rare edge cases we could only have single null prediction. + # So we just create a nonce prediction in this case to avoid failure. + if len(nbest)==1: + nbest.insert(0, + _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0)) + + # In very rare edge cases we could have no valid predictions. So we + # just create a nonce prediction in this case to avoid failure. 
+ if not nbest: + nbest.append( + _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0)) + + assert len(nbest) >= 1 + + total_scores = [] + best_non_null_entry = None + for entry in nbest: + total_scores.append(entry.start_logit + entry.end_logit) + if not best_non_null_entry: + if entry.text: + best_non_null_entry = entry + + probs = _compute_softmax(total_scores) + + nbest_json = [] + for (i, entry) in enumerate(nbest): + output = collections.OrderedDict() + output["text"] = entry.text + output["probability"] = probs[i] + output["start_logit"] = entry.start_logit + output["end_logit"] = entry.end_logit + nbest_json.append(output) + + assert len(nbest_json) >= 1 + + if not version_2_with_negative: + all_predictions[example.qas_id] = nbest_json[0]["text"] + else: + # predict "" iff the null score - the score of best non-null > threshold + score_diff = score_null - best_non_null_entry.start_logit - ( + best_non_null_entry.end_logit) + scores_diff_json[example.qas_id] = score_diff + if score_diff > null_score_diff_threshold: + all_predictions[example.qas_id] = "" + else: + all_predictions[example.qas_id] = best_non_null_entry.text + all_nbest_json[example.qas_id] = nbest_json + + with open(output_prediction_file, "w") as writer: + writer.write(json.dumps(all_predictions, indent=4) + "\n") + + with open(output_nbest_file, "w") as writer: + writer.write(json.dumps(all_nbest_json, indent=4) + "\n") + + if version_2_with_negative: + with open(output_null_log_odds_file, "w") as writer: + writer.write(json.dumps(scores_diff_json, indent=4) + "\n") + + return all_predictions + + +def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False): + """Project the tokenized prediction back to the original text.""" + + # When we created the data, we kept track of the alignment between original + # (whitespace tokenized) tokens and our WordPiece tokenized tokens. 
So + # now `orig_text` contains the span of our original text corresponding to the + # span that we predicted. + # + # However, `orig_text` may contain extra characters that we don't want in + # our prediction. + # + # For example, let's say: + # pred_text = steve smith + # orig_text = Steve Smith's + # + # We don't want to return `orig_text` because it contains the extra "'s". + # + # We don't want to return `pred_text` because it's already been normalized + # (the SQuAD eval script also does punctuation stripping/lower casing but + # our tokenizer does additional normalization like stripping accent + # characters). + # + # What we really want to return is "Steve Smith". + # + # Therefore, we have to apply a semi-complicated alignment heuristic between + # `pred_text` and `orig_text` to get a character-to-character alignment. This + # can fail in certain cases in which case we just return `orig_text`. + + def _strip_spaces(text): + ns_chars = [] + ns_to_s_map = collections.OrderedDict() + for (i, c) in enumerate(text): + if c == " ": + continue + ns_to_s_map[len(ns_chars)] = i + ns_chars.append(c) + ns_text = "".join(ns_chars) + return (ns_text, ns_to_s_map) + + # We first tokenize `orig_text`, strip whitespace from the result + # and `pred_text`, and check if they are the same length. If they are + # NOT the same length, the heuristic has failed. If they are the same + # length, we assume the characters are one-to-one aligned. 
+ tokenizer = BasicTokenizer(do_lower_case=do_lower_case) + + tok_text = " ".join(tokenizer.tokenize(orig_text)) + + start_position = tok_text.find(pred_text) + if start_position == -1: + if verbose_logging: + logger.info( + "Unable to find text: '%s' in '%s'" % (pred_text, orig_text)) + return orig_text + end_position = start_position + len(pred_text) - 1 + + (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text) + (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text) + + if len(orig_ns_text) != len(tok_ns_text): + if verbose_logging: + logger.info("Length not equal after stripping spaces: '%s' vs '%s'", + orig_ns_text, tok_ns_text) + return orig_text + + # We then project the characters in `pred_text` back to `orig_text` using + # the character-to-character alignment. + tok_s_to_ns_map = {} + for (i, tok_index) in tok_ns_to_s_map.items(): + tok_s_to_ns_map[tok_index] = i + + orig_start_position = None + if start_position in tok_s_to_ns_map: + ns_start_position = tok_s_to_ns_map[start_position] + if ns_start_position in orig_ns_to_s_map: + orig_start_position = orig_ns_to_s_map[ns_start_position] + + if orig_start_position is None: + if verbose_logging: + logger.info("Couldn't map start position") + return orig_text + + orig_end_position = None + if end_position in tok_s_to_ns_map: + ns_end_position = tok_s_to_ns_map[end_position] + if ns_end_position in orig_ns_to_s_map: + orig_end_position = orig_ns_to_s_map[ns_end_position] + + if orig_end_position is None: + if verbose_logging: + logger.info("Couldn't map end position") + return orig_text + + output_text = orig_text[orig_start_position:(orig_end_position + 1)] + return output_text + + +def _get_best_indexes(logits, n_best_size): + """Get the n-best logits from a list.""" + index_and_score = sorted(enumerate(logits), key=lambda x: x[1], reverse=True) + + best_indexes = [] + for i in range(len(index_and_score)): + if i >= n_best_size: + break + best_indexes.append(index_and_score[i][0]) + return 
best_indexes + + +def _compute_softmax(scores): + """Compute softmax probability over raw logits.""" + if not scores: + return [] + + max_score = None + for score in scores: + if max_score is None or score > max_score: + max_score = score + + exp_scores = [] + total_sum = 0.0 + for score in scores: + x = math.exp(score - max_score) + exp_scores.append(x) + total_sum += x + + probs = [] + for score in exp_scores: + probs.append(score / total_sum) + return probs From de276de1c1a469a58a25383a35a239d02459a978 Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Tue, 3 Dec 2019 17:15:51 -0500 Subject: [PATCH 15/26] Working evaluation --- examples/run_squad.py | 43 +- transformers/data/metrics/squad_metrics.py | 588 +++++++++++++++++---- transformers/data/processors/squad.py | 19 +- 3 files changed, 507 insertions(+), 143 deletions(-) diff --git a/examples/run_squad.py b/examples/run_squad.py index 545c3ad55a..b7952487dc 100644 --- a/examples/run_squad.py +++ b/examples/run_squad.py @@ -16,7 +16,8 @@ """ Finetuning the library models for question-answering on SQuAD (DistilBERT, Bert, XLM, XLNet).""" from __future__ import absolute_import, division, print_function -from transformers.data.processors.squad import SquadV1Processor, SquadV2Processor +from transformers.data.processors.squad import SquadV1Processor, SquadV2Processor, SquadResult +from transformers.data.metrics.squad_metrics import compute_predictions, compute_predictions_extended, squad_evaluate import argparse import logging @@ -230,9 +231,11 @@ def evaluate(args, model, tokenizer, prefix=""): model.eval() batch = tuple(t.to(args.device) for t in batch) with torch.no_grad(): - inputs = {'input_ids': batch[0], - 'attention_mask': batch[1] - } + inputs = { + 'input_ids': batch[0], + 'attention_mask': batch[1] + } + if args.model_type != 'distilbert': inputs['token_type_ids'] = None if args.model_type == 'xlm' else batch[2] # XLM don't use segment_ids example_indices = batch[3] @@ -244,18 +247,8 @@ def evaluate(args, 
model, tokenizer, prefix=""): for i, example_index in enumerate(example_indices): eval_feature = features[example_index.item()] unique_id = int(eval_feature.unique_id) - if args.model_type in ['xlnet', 'xlm']: - # XLNet uses a more complex post-processing procedure - result = RawResultExtended(unique_id = unique_id, - start_top_log_probs = to_list(outputs[0][i]), - start_top_index = to_list(outputs[1][i]), - end_top_log_probs = to_list(outputs[2][i]), - end_top_index = to_list(outputs[3][i]), - cls_logits = to_list(outputs[4][i])) - else: - result = RawResult(unique_id = unique_id, - start_logits = to_list(outputs[0][i]), - end_logits = to_list(outputs[1][i])) + + result = SquadResult([to_list(output[i]) for output in outputs] + [unique_id]) all_results.append(result) evalTime = timeit.default_timer() - start_time @@ -271,22 +264,18 @@ def evaluate(args, model, tokenizer, prefix=""): if args.model_type in ['xlnet', 'xlm']: # XLNet uses a more complex post-processing procedure - write_predictions_extended(examples, features, all_results, args.n_best_size, + predictions = compute_predictions_extended(examples, features, all_results, args.n_best_size, args.max_answer_length, output_prediction_file, output_nbest_file, output_null_log_odds_file, args.predict_file, model.config.start_n_top, model.config.end_n_top, args.version_2_with_negative, tokenizer, args.verbose_logging) else: - write_predictions(examples, features, all_results, args.n_best_size, + predictions = compute_predictions(examples, features, all_results, args.n_best_size, args.max_answer_length, args.do_lower_case, output_prediction_file, output_nbest_file, output_null_log_odds_file, args.verbose_logging, args.version_2_with_negative, args.null_score_diff_threshold) - # Evaluate with the official SQuAD script - evaluate_options = EVAL_OPTS(data_file=args.predict_file, - pred_file=output_prediction_file, - na_prob_file=output_null_log_odds_file) - results = evaluate_on_squad(evaluate_options) + results = 
squad_evaluate(examples, predictions) return results def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False): @@ -306,8 +295,12 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal logger.info("Creating features from dataset file at %s", input_file) processor = SquadV2Processor() - examples = processor.get_dev_examples("examples/squad") if evaluate else processor.get_train_examples("examples/squad") - features = squad_convert_examples_to_features( + examples = processor.get_dev_examples("examples/squad", only_first=100) if evaluate else processor.get_train_examples("examples/squad") + # import tensorflow_datasets as tfds + # tfds_examples = tfds.load("squad") + # examples = SquadV1Processor().get_examples_from_dataset(tfds_examples["validation"]) + + features = squad_convert_examples_to_features( examples=examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, diff --git a/transformers/data/metrics/squad_metrics.py b/transformers/data/metrics/squad_metrics.py index d4c5a8ec5b..83647a20d0 100644 --- a/transformers/data/metrics/squad_metrics.py +++ b/transformers/data/metrics/squad_metrics.py @@ -1,15 +1,323 @@ +""" Very heavily inspired by the official evaluation script for SQuAD version 2.0 which was +modified by XLNet authors to update `find_best_threshold` scripts for SQuAD V2.0 + +In addition to basic functionality, we also compute additional statistics and +plot precision-recall curves if an additional na_prob.json file is provided. +This file is expected to map question ID's to the model's predicted probability +that a question is unanswerable. 
+""" + + import json import logging import math import collections from io import open from tqdm import tqdm +import string +import re from transformers.tokenization_bert import BasicTokenizer, whitespace_tokenize logger = logging.getLogger(__name__) +def normalize_answer(s): + """Lower text and remove punctuation, articles and extra whitespace.""" + def remove_articles(text): + regex = re.compile(r'\b(a|an|the)\b', re.UNICODE) + return re.sub(regex, ' ', text) + + def white_space_fix(text): + return ' '.join(text.split()) + + def remove_punc(text): + exclude = set(string.punctuation) + return ''.join(ch for ch in text if ch not in exclude) + + def lower(text): + return text.lower() + return white_space_fix(remove_articles(remove_punc(lower(s)))) + + +def get_tokens(s): + if not s: + return [] + return normalize_answer(s).split() + + +def compute_exact(a_gold, a_pred): + return int(normalize_answer(a_gold) == normalize_answer(a_pred)) + + +def compute_f1(a_gold, a_pred): + gold_toks = get_tokens(a_gold) + pred_toks = get_tokens(a_pred) + common = collections.Counter(gold_toks) & collections.Counter(pred_toks) + num_same = sum(common.values()) + if len(gold_toks) == 0 or len(pred_toks) == 0: + # If either is no-answer, then F1 is 1 if they agree, 0 otherwise + return int(gold_toks == pred_toks) + if num_same == 0: + return 0 + precision = 1.0 * num_same / len(pred_toks) + recall = 1.0 * num_same / len(gold_toks) + f1 = (2 * precision * recall) / (precision + recall) + return f1 + + +def get_raw_scores(examples, preds): + """ + Computes the exact and f1 scores from the examples and the model predictions + """ + exact_scores = {} + f1_scores = {} + + for example in examples: + qas_id = example.qas_id + gold_answers = [answer['text'] for answer in example.answers if normalize_answer(answer['text'])] + + if not gold_answers: + # For unanswerable questions, only correct answer is empty string + gold_answers = [''] + + if qas_id not in preds: + print('Missing prediction 
for %s' % qas_id) + continue + + prediction = preds[qas_id] + exact_scores[qas_id] = max(compute_exact(a, prediction) for a in gold_answers) + f1_scores[qas_id] = max(compute_f1(a, prediction) for a in gold_answers) + + return exact_scores, f1_scores + + +def apply_no_ans_threshold(scores, na_probs, qid_to_has_ans, na_prob_thresh): + new_scores = {} + for qid, s in scores.items(): + pred_na = na_probs[qid] > na_prob_thresh + if pred_na: + new_scores[qid] = float(not qid_to_has_ans[qid]) + else: + new_scores[qid] = s + return new_scores + + +def make_eval_dict(exact_scores, f1_scores, qid_list=None): + if not qid_list: + total = len(exact_scores) + return collections.OrderedDict([ + ('exact', 100.0 * sum(exact_scores.values()) / total), + ('f1', 100.0 * sum(f1_scores.values()) / total), + ('total', total), + ]) + else: + total = len(qid_list) + return collections.OrderedDict([ + ('exact', 100.0 * sum(exact_scores[k] for k in qid_list) / total), + ('f1', 100.0 * sum(f1_scores[k] for k in qid_list) / total), + ('total', total), + ]) + + +def merge_eval(main_eval, new_eval, prefix): + for k in new_eval: + main_eval['%s_%s' % (prefix, k)] = new_eval[k] + + +def find_best_thresh(preds, scores, na_probs, qid_to_has_ans): + num_no_ans = sum(1 for k in qid_to_has_ans if not qid_to_has_ans[k]) + cur_score = num_no_ans + best_score = cur_score + best_thresh = 0.0 + qid_list = sorted(na_probs, key=lambda k: na_probs[k]) + for _, qid in enumerate(qid_list): + if qid not in scores: + continue + if qid_to_has_ans[qid]: + diff = scores[qid] + else: + if preds[qid]: + diff = -1 + else: + diff = 0 + cur_score += diff + if cur_score > best_score: + best_score = cur_score + best_thresh = na_probs[qid] + return 100.0 * best_score / len(scores), best_thresh + + +def find_all_best_thresh(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans): + best_exact, exact_thresh = find_best_thresh(preds, exact_raw, na_probs, qid_to_has_ans) + best_f1, f1_thresh = find_best_thresh(preds, 
f1_raw, na_probs, qid_to_has_ans) + + main_eval['best_exact'] = best_exact + main_eval['best_exact_thresh'] = exact_thresh + main_eval['best_f1'] = best_f1 + main_eval['best_f1_thresh'] = f1_thresh + + +def squad_evaluate(examples, preds, no_answer_probs=None, no_answer_probability_threshold=1.0): + qas_id_to_has_answer = {example.qas_id: bool(example.answers) for example in examples} + has_answer_qids = [qas_id for qas_id, has_answer in qas_id_to_has_answer.items() if has_answer] + no_answer_qids = [qas_id for qas_id, has_answer in qas_id_to_has_answer.items() if not has_answer] + + if no_answer_probs is None: + no_answer_probs = {k: 0.0 for k in preds} + + exact, f1 = get_raw_scores(examples, preds) + + exact_threshold = apply_no_ans_threshold(exact, no_answer_probs, qas_id_to_has_answer, no_answer_probability_threshold) + f1_threshold = apply_no_ans_threshold(f1, no_answer_probs, qas_id_to_has_answer, no_answer_probability_threshold) + + evaluation = make_eval_dict(exact_threshold, f1_threshold) + + if has_answer_qids: + has_ans_eval = make_eval_dict(exact_threshold, f1_threshold, qid_list=has_answer_qids) + merge_eval(evaluation, has_ans_eval, 'HasAns') + + if no_answer_qids: + no_ans_eval = make_eval_dict(exact_threshold, f1_threshold, qid_list=no_answer_qids) + merge_eval(evaluation, no_ans_eval, 'NoAns') + + if no_answer_probs: + find_all_best_thresh(evaluation, preds, exact, f1, no_answer_probs, qas_id_to_has_answer) + + return evaluation + + +def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False): + """Project the tokenized prediction back to the original text.""" + + # When we created the data, we kept track of the alignment between original + # (whitespace tokenized) tokens and our WordPiece tokenized tokens. So + # now `orig_text` contains the span of our original text corresponding to the + # span that we predicted. + # + # However, `orig_text` may contain extra characters that we don't want in + # our prediction. 
+ # + # For example, let's say: + # pred_text = steve smith + # orig_text = Steve Smith's + # + # We don't want to return `orig_text` because it contains the extra "'s". + # + # We don't want to return `pred_text` because it's already been normalized + # (the SQuAD eval script also does punctuation stripping/lower casing but + # our tokenizer does additional normalization like stripping accent + # characters). + # + # What we really want to return is "Steve Smith". + # + # Therefore, we have to apply a semi-complicated alignment heuristic between + # `pred_text` and `orig_text` to get a character-to-character alignment. This + # can fail in certain cases in which case we just return `orig_text`. + + def _strip_spaces(text): + ns_chars = [] + ns_to_s_map = collections.OrderedDict() + for (i, c) in enumerate(text): + if c == " ": + continue + ns_to_s_map[len(ns_chars)] = i + ns_chars.append(c) + ns_text = "".join(ns_chars) + return (ns_text, ns_to_s_map) + + # We first tokenize `orig_text`, strip whitespace from the result + # and `pred_text`, and check if they are the same length. If they are + # NOT the same length, the heuristic has failed. If they are the same + # length, we assume the characters are one-to-one aligned. 
+ tokenizer = BasicTokenizer(do_lower_case=do_lower_case) + + tok_text = " ".join(tokenizer.tokenize(orig_text)) + + start_position = tok_text.find(pred_text) + if start_position == -1: + if verbose_logging: + logger.info( + "Unable to find text: '%s' in '%s'" % (pred_text, orig_text)) + return orig_text + end_position = start_position + len(pred_text) - 1 + + (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text) + (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text) + + if len(orig_ns_text) != len(tok_ns_text): + if verbose_logging: + logger.info("Length not equal after stripping spaces: '%s' vs '%s'", + orig_ns_text, tok_ns_text) + return orig_text + + # We then project the characters in `pred_text` back to `orig_text` using + # the character-to-character alignment. + tok_s_to_ns_map = {} + for (i, tok_index) in tok_ns_to_s_map.items(): + tok_s_to_ns_map[tok_index] = i + + orig_start_position = None + if start_position in tok_s_to_ns_map: + ns_start_position = tok_s_to_ns_map[start_position] + if ns_start_position in orig_ns_to_s_map: + orig_start_position = orig_ns_to_s_map[ns_start_position] + + if orig_start_position is None: + if verbose_logging: + logger.info("Couldn't map start position") + return orig_text + + orig_end_position = None + if end_position in tok_s_to_ns_map: + ns_end_position = tok_s_to_ns_map[end_position] + if ns_end_position in orig_ns_to_s_map: + orig_end_position = orig_ns_to_s_map[ns_end_position] + + if orig_end_position is None: + if verbose_logging: + logger.info("Couldn't map end position") + return orig_text + + output_text = orig_text[orig_start_position:(orig_end_position + 1)] + return output_text + + +def _get_best_indexes(logits, n_best_size): + """Get the n-best logits from a list.""" + index_and_score = sorted(enumerate(logits), key=lambda x: x[1], reverse=True) + + best_indexes = [] + for i in range(len(index_and_score)): + if i >= n_best_size: + break + best_indexes.append(index_and_score[i][0]) + return 
best_indexes + + +def _compute_softmax(scores): + """Compute softmax probability over raw logits.""" + if not scores: + return [] + + max_score = None + for score in scores: + if max_score is None or score > max_score: + max_score = score + + exp_scores = [] + total_sum = 0.0 + for score in scores: + x = math.exp(score - max_score) + exp_scores.append(x) + total_sum += x + + probs = [] + for score in exp_scores: + probs.append(score / total_sum) + return probs + + def compute_predictions(all_examples, all_features, all_results, n_best_size, max_answer_length, do_lower_case, output_prediction_file, output_nbest_file, output_null_log_odds_file, verbose_logging, @@ -204,132 +512,192 @@ def compute_predictions(all_examples, all_features, all_results, n_best_size, return all_predictions -def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False): - """Project the tokenized prediction back to the original text.""" +def compute_predictions_extended(all_examples, all_features, all_results, n_best_size, + max_answer_length, output_prediction_file, + output_nbest_file, + output_null_log_odds_file, orig_data_file, + start_n_top, end_n_top, version_2_with_negative, + tokenizer, verbose_logging): + """ XLNet write prediction logic (more complex than Bert's). + Write final predictions to the json file and log-odds of null if needed. - # When we created the data, we kept track of the alignment between original - # (whitespace tokenized) tokens and our WordPiece tokenized tokens. So - # now `orig_text` contains the span of our original text corresponding to the - # span that we predicted. - # - # However, `orig_text` may contain extra characters that we don't want in - # our prediction. - # - # For example, let's say: - # pred_text = steve smith - # orig_text = Steve Smith's - # - # We don't want to return `orig_text` because it contains the extra "'s". 
- # - # We don't want to return `pred_text` because it's already been normalized - # (the SQuAD eval script also does punctuation stripping/lower casing but - # our tokenizer does additional normalization like stripping accent - # characters). - # - # What we really want to return is "Steve Smith". - # - # Therefore, we have to apply a semi-complicated alignment heuristic between - # `pred_text` and `orig_text` to get a character-to-character alignment. This - # can fail in certain cases in which case we just return `orig_text`. + Requires utils_squad_evaluate.py + """ + _PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name + "PrelimPrediction", + ["feature_index", "start_index", "end_index", + "start_log_prob", "end_log_prob"]) - def _strip_spaces(text): - ns_chars = [] - ns_to_s_map = collections.OrderedDict() - for (i, c) in enumerate(text): - if c == " ": + _NbestPrediction = collections.namedtuple( # pylint: disable=invalid-name + "NbestPrediction", ["text", "start_log_prob", "end_log_prob"]) + + logger.info("Writing predictions to: %s", output_prediction_file) + # logger.info("Writing nbest to: %s" % (output_nbest_file)) + + example_index_to_features = collections.defaultdict(list) + for feature in all_features: + example_index_to_features[feature.example_index].append(feature) + + unique_id_to_result = {} + for result in all_results: + unique_id_to_result[result.unique_id] = result + + all_predictions = collections.OrderedDict() + all_nbest_json = collections.OrderedDict() + scores_diff_json = collections.OrderedDict() + + for (example_index, example) in enumerate(all_examples): + features = example_index_to_features[example_index] + + prelim_predictions = [] + # keep track of the minimum score of null start+end of position 0 + score_null = 1000000 # large and positive + + for (feature_index, feature) in enumerate(features): + result = unique_id_to_result[feature.unique_id] + + cur_null_score = result.cls_logits + + # if we could have 
irrelevant answers, get the min score of irrelevant + score_null = min(score_null, cur_null_score) + + for i in range(start_n_top): + for j in range(end_n_top): + start_log_prob = result.start_top_log_probs[i] + start_index = result.start_top_index[i] + + j_index = i * end_n_top + j + + end_log_prob = result.end_top_log_probs[j_index] + end_index = result.end_top_index[j_index] + + # We could hypothetically create invalid predictions, e.g., predict + # that the start of the span is in the question. We throw out all + # invalid predictions. + if start_index >= feature.paragraph_len - 1: + continue + if end_index >= feature.paragraph_len - 1: + continue + + if not feature.token_is_max_context.get(start_index, False): + continue + if end_index < start_index: + continue + length = end_index - start_index + 1 + if length > max_answer_length: + continue + + prelim_predictions.append( + _PrelimPrediction( + feature_index=feature_index, + start_index=start_index, + end_index=end_index, + start_log_prob=start_log_prob, + end_log_prob=end_log_prob)) + + prelim_predictions = sorted( + prelim_predictions, + key=lambda x: (x.start_log_prob + x.end_log_prob), + reverse=True) + + seen_predictions = {} + nbest = [] + for pred in prelim_predictions: + if len(nbest) >= n_best_size: + break + feature = features[pred.feature_index] + + # XLNet un-tokenizer + # Let's keep it simple for now and see if we need all this later. 
+ # + # tok_start_to_orig_index = feature.tok_start_to_orig_index + # tok_end_to_orig_index = feature.tok_end_to_orig_index + # start_orig_pos = tok_start_to_orig_index[pred.start_index] + # end_orig_pos = tok_end_to_orig_index[pred.end_index] + # paragraph_text = example.paragraph_text + # final_text = paragraph_text[start_orig_pos: end_orig_pos + 1].strip() + + # Previously used Bert untokenizer + tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1)] + orig_doc_start = feature.token_to_orig_map[pred.start_index] + orig_doc_end = feature.token_to_orig_map[pred.end_index] + orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end + 1)] + tok_text = tokenizer.convert_tokens_to_string(tok_tokens) + + # Clean whitespace + tok_text = tok_text.strip() + tok_text = " ".join(tok_text.split()) + orig_text = " ".join(orig_tokens) + + final_text = get_final_text(tok_text, orig_text, tokenizer.do_lower_case, + verbose_logging) + + if final_text in seen_predictions: continue - ns_to_s_map[len(ns_chars)] = i - ns_chars.append(c) - ns_text = "".join(ns_chars) - return (ns_text, ns_to_s_map) - # We first tokenize `orig_text`, strip whitespace from the result - # and `pred_text`, and check if they are the same length. If they are - # NOT the same length, the heuristic has failed. If they are the same - # length, we assume the characters are one-to-one aligned. - tokenizer = BasicTokenizer(do_lower_case=do_lower_case) + seen_predictions[final_text] = True - tok_text = " ".join(tokenizer.tokenize(orig_text)) + nbest.append( + _NbestPrediction( + text=final_text, + start_log_prob=pred.start_log_prob, + end_log_prob=pred.end_log_prob)) - start_position = tok_text.find(pred_text) - if start_position == -1: - if verbose_logging: - logger.info( - "Unable to find text: '%s' in '%s'" % (pred_text, orig_text)) - return orig_text - end_position = start_position + len(pred_text) - 1 + # In very rare edge cases we could have no valid predictions. 
So we + # just create a nonce prediction in this case to avoid failure. + if not nbest: + nbest.append( + _NbestPrediction(text="", start_log_prob=-1e6, + end_log_prob=-1e6)) - (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text) - (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text) + total_scores = [] + best_non_null_entry = None + for entry in nbest: + total_scores.append(entry.start_log_prob + entry.end_log_prob) + if not best_non_null_entry: + best_non_null_entry = entry - if len(orig_ns_text) != len(tok_ns_text): - if verbose_logging: - logger.info("Length not equal after stripping spaces: '%s' vs '%s'", - orig_ns_text, tok_ns_text) - return orig_text + probs = _compute_softmax(total_scores) - # We then project the characters in `pred_text` back to `orig_text` using - # the character-to-character alignment. - tok_s_to_ns_map = {} - for (i, tok_index) in tok_ns_to_s_map.items(): - tok_s_to_ns_map[tok_index] = i + nbest_json = [] + for (i, entry) in enumerate(nbest): + output = collections.OrderedDict() + output["text"] = entry.text + output["probability"] = probs[i] + output["start_log_prob"] = entry.start_log_prob + output["end_log_prob"] = entry.end_log_prob + nbest_json.append(output) - orig_start_position = None - if start_position in tok_s_to_ns_map: - ns_start_position = tok_s_to_ns_map[start_position] - if ns_start_position in orig_ns_to_s_map: - orig_start_position = orig_ns_to_s_map[ns_start_position] + assert len(nbest_json) >= 1 + assert best_non_null_entry is not None - if orig_start_position is None: - if verbose_logging: - logger.info("Couldn't map start position") - return orig_text + score_diff = score_null + scores_diff_json[example.qas_id] = score_diff + # note(zhiliny): always predict best_non_null_entry + # and the evaluation script will search for the best threshold + all_predictions[example.qas_id] = best_non_null_entry.text - orig_end_position = None - if end_position in tok_s_to_ns_map: - ns_end_position = 
tok_s_to_ns_map[end_position] - if ns_end_position in orig_ns_to_s_map: - orig_end_position = orig_ns_to_s_map[ns_end_position] + all_nbest_json[example.qas_id] = nbest_json - if orig_end_position is None: - if verbose_logging: - logger.info("Couldn't map end position") - return orig_text + with open(output_prediction_file, "w") as writer: + writer.write(json.dumps(all_predictions, indent=4) + "\n") - output_text = orig_text[orig_start_position:(orig_end_position + 1)] - return output_text + with open(output_nbest_file, "w") as writer: + writer.write(json.dumps(all_nbest_json, indent=4) + "\n") + if version_2_with_negative: + with open(output_null_log_odds_file, "w") as writer: + writer.write(json.dumps(scores_diff_json, indent=4) + "\n") -def _get_best_indexes(logits, n_best_size): - """Get the n-best logits from a list.""" - index_and_score = sorted(enumerate(logits), key=lambda x: x[1], reverse=True) + with open(orig_data_file, "r", encoding='utf-8') as reader: + orig_data = json.load(reader)["data"] - best_indexes = [] - for i in range(len(index_and_score)): - if i >= n_best_size: - break - best_indexes.append(index_and_score[i][0]) - return best_indexes + qid_to_has_ans = make_qid_to_has_ans(orig_data) + has_ans_qids = [k for k, v in qid_to_has_ans.items() if v] + no_ans_qids = [k for k, v in qid_to_has_ans.items() if not v] + exact_raw, f1_raw = get_raw_scores(orig_data, all_predictions) + out_eval = {} + find_all_best_thresh_v2(out_eval, all_predictions, exact_raw, f1_raw, scores_diff_json, qid_to_has_ans) -def _compute_softmax(scores): - """Compute softmax probability over raw logits.""" - if not scores: - return [] - - max_score = None - for score in scores: - if max_score is None or score > max_score: - max_score = score - - exp_scores = [] - total_sum = 0.0 - for score in scores: - x = math.exp(score - max_score) - exp_scores.append(x) - total_sum += x - - probs = [] - for score in exp_scores: - probs.append(score / total_sum) - return probs + return 
out_eval diff --git a/transformers/data/processors/squad.py b/transformers/data/processors/squad.py index afbe4270f5..70dc9faf54 100644 --- a/transformers/data/processors/squad.py +++ b/transformers/data/processors/squad.py @@ -306,13 +306,13 @@ class SquadProcessor(DataProcessor): else: is_impossible = False - if not is_impossible and is_training: - if (len(qa["answers"]) != 1): - raise ValueError( - "For training, each question should have exactly 1 answer.") - answer = qa["answers"][0] - answer_text = answer['text'] - start_position_character = answer['answer_start'] + if not is_impossible: + if is_training: + answer = qa["answers"][0] + answer_text = answer['text'] + start_position_character = answer['answer_start'] + else: + answers = qa["answers"] example = SquadExample( qas_id=qas_id, @@ -321,7 +321,8 @@ class SquadProcessor(DataProcessor): answer_text=answer_text, start_position_character=start_position_character, title=title, - is_impossible=is_impossible + is_impossible=is_impossible, + answers=answers ) examples.append(example) @@ -352,6 +353,7 @@ class SquadExample(object): answer_text, start_position_character, title, + answers=None, is_impossible=False): self.qas_id = qas_id self.question_text = question_text @@ -359,6 +361,7 @@ class SquadExample(object): self.answer_text = answer_text self.title = title self.is_impossible = is_impossible + self.answers = answers self.start_position, self.end_position = 0, 0 From 9ddc3f1a1227fc9cbe4e5a5c20b21546e438dfb1 Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Wed, 4 Dec 2019 10:37:00 -0500 Subject: [PATCH 16/26] Naming update + XLNet/XLM evaluation --- examples/run_squad.py | 6 +- transformers/data/metrics/squad_metrics.py | 97 ++++++++++++++++++---- 2 files changed, 85 insertions(+), 18 deletions(-) diff --git a/examples/run_squad.py b/examples/run_squad.py index b7952487dc..a9ef5c6ba2 100644 --- a/examples/run_squad.py +++ b/examples/run_squad.py @@ -17,7 +17,7 @@ from __future__ import absolute_import, 
division, print_function from transformers.data.processors.squad import SquadV1Processor, SquadV2Processor, SquadResult -from transformers.data.metrics.squad_metrics import compute_predictions, compute_predictions_extended, squad_evaluate +from transformers.data.metrics.squad_metrics import compute_predictions_logits, compute_predictions_log_probs, squad_evaluate import argparse import logging @@ -264,13 +264,13 @@ def evaluate(args, model, tokenizer, prefix=""): if args.model_type in ['xlnet', 'xlm']: # XLNet uses a more complex post-processing procedure - predictions = compute_predictions_extended(examples, features, all_results, args.n_best_size, + predictions = compute_predictions_log_probs(examples, features, all_results, args.n_best_size, args.max_answer_length, output_prediction_file, output_nbest_file, output_null_log_odds_file, args.predict_file, model.config.start_n_top, model.config.end_n_top, args.version_2_with_negative, tokenizer, args.verbose_logging) else: - predictions = compute_predictions(examples, features, all_results, args.n_best_size, + predictions = compute_predictions_logits(examples, features, all_results, args.n_best_size, args.max_answer_length, args.do_lower_case, output_prediction_file, output_nbest_file, output_null_log_odds_file, args.verbose_logging, args.version_2_with_negative, args.null_score_diff_threshold) diff --git a/transformers/data/metrics/squad_metrics.py b/transformers/data/metrics/squad_metrics.py index 83647a20d0..1f120d354a 100644 --- a/transformers/data/metrics/squad_metrics.py +++ b/transformers/data/metrics/squad_metrics.py @@ -125,6 +125,53 @@ def merge_eval(main_eval, new_eval, prefix): main_eval['%s_%s' % (prefix, k)] = new_eval[k] +def find_best_thresh_v2(preds, scores, na_probs, qid_to_has_ans): + num_no_ans = sum(1 for k in qid_to_has_ans if not qid_to_has_ans[k]) + cur_score = num_no_ans + best_score = cur_score + best_thresh = 0.0 + qid_list = sorted(na_probs, key=lambda k: na_probs[k]) + for i, qid in 
enumerate(qid_list): + if qid not in scores: + continue + if qid_to_has_ans[qid]: + diff = scores[qid] + else: + if preds[qid]: + diff = -1 + else: + diff = 0 + cur_score += diff + if cur_score > best_score: + best_score = cur_score + best_thresh = na_probs[qid] + + has_ans_score, has_ans_cnt = 0, 0 + for qid in qid_list: + if not qid_to_has_ans[qid]: + continue + has_ans_cnt += 1 + + if qid not in scores: + continue + has_ans_score += scores[qid] + + return 100.0 * best_score / len(scores), best_thresh, 1.0 * has_ans_score / has_ans_cnt + + +def find_all_best_thresh_v2(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans): + best_exact, exact_thresh, has_ans_exact = find_best_thresh_v2( + preds, exact_raw, na_probs, qid_to_has_ans) + best_f1, f1_thresh, has_ans_f1 = find_best_thresh_v2( + preds, f1_raw, na_probs, qid_to_has_ans) + main_eval['best_exact'] = best_exact + main_eval['best_exact_thresh'] = exact_thresh + main_eval['best_f1'] = best_f1 + main_eval['best_f1_thresh'] = f1_thresh + main_eval['has_ans_exact'] = has_ans_exact + main_eval['has_ans_f1'] = has_ans_f1 + + def find_best_thresh(preds, scores, na_probs, qid_to_has_ans): num_no_ans = sum(1 for k in qid_to_has_ans if not qid_to_has_ans[k]) cur_score = num_no_ans @@ -318,10 +365,20 @@ def _compute_softmax(scores): return probs -def compute_predictions(all_examples, all_features, all_results, n_best_size, - max_answer_length, do_lower_case, output_prediction_file, - output_nbest_file, output_null_log_odds_file, verbose_logging, - version_2_with_negative, null_score_diff_threshold): +def compute_predictions_logits( + all_examples, + all_features, + all_results, + n_best_size, + max_answer_length, + do_lower_case, + output_prediction_file, + output_nbest_file, + output_null_log_odds_file, + verbose_logging, + version_2_with_negative, + null_score_diff_threshold +): """Write final predictions to the json file and log-odds of null if needed.""" logger.info("Writing predictions to: %s" % 
(output_prediction_file)) logger.info("Writing nbest to: %s" % (output_nbest_file)) @@ -450,12 +507,12 @@ def compute_predictions(all_examples, all_features, all_results, n_best_size, text="", start_logit=null_start_logit, end_logit=null_end_logit)) - + # In very rare edge cases we could only have single null prediction. # So we just create a nonce prediction in this case to avoid failure. - if len(nbest)==1: + if len(nbest) == 1: nbest.insert(0, - _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0)) + _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0)) # In very rare edge cases we could have no valid predictions. So we # just create a nonce prediction in this case to avoid failure. @@ -512,12 +569,22 @@ def compute_predictions(all_examples, all_features, all_results, n_best_size, return all_predictions -def compute_predictions_extended(all_examples, all_features, all_results, n_best_size, - max_answer_length, output_prediction_file, - output_nbest_file, - output_null_log_odds_file, orig_data_file, - start_n_top, end_n_top, version_2_with_negative, - tokenizer, verbose_logging): +def compute_predictions_log_probs( + all_examples, + all_features, + all_results, + n_best_size, + max_answer_length, + output_prediction_file, + output_nbest_file, + output_null_log_odds_file, + orig_data_file, + start_n_top, + end_n_top, + version_2_with_negative, + tokenizer, + verbose_logging +): """ XLNet write prediction logic (more complex than Bert's). Write final predictions to the json file and log-odds of null if needed. 
@@ -526,7 +593,7 @@ def compute_predictions_extended(all_examples, all_features, all_results, n_best _PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name "PrelimPrediction", ["feature_index", "start_index", "end_index", - "start_log_prob", "end_log_prob"]) + "start_log_prob", "end_log_prob"]) _NbestPrediction = collections.namedtuple( # pylint: disable=invalid-name "NbestPrediction", ["text", "start_log_prob", "end_log_prob"]) @@ -609,7 +676,7 @@ def compute_predictions_extended(all_examples, all_features, all_results, n_best # XLNet un-tokenizer # Let's keep it simple for now and see if we need all this later. - # + # # tok_start_to_orig_index = feature.tok_start_to_orig_index # tok_end_to_orig_index = feature.tok_end_to_orig_index # start_orig_pos = tok_start_to_orig_index[pred.start_index] From bf119c0568dfc1ea5ce0a34359e33ca002266e96 Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Wed, 4 Dec 2019 11:34:59 -0500 Subject: [PATCH 17/26] TFDS dataset can now be evaluated --- transformers/data/processors/squad.py | 34 ++++++++++++++++++++------- 1 file changed, 25 insertions(+), 9 deletions(-) diff --git a/transformers/data/processors/squad.py b/transformers/data/processors/squad.py index 70dc9faf54..2e50ac8a8c 100644 --- a/transformers/data/processors/squad.py +++ b/transformers/data/processors/squad.py @@ -245,22 +245,37 @@ class SquadProcessor(DataProcessor): train_file = None dev_file = None - def get_example_from_tensor_dict(self, tensor_dict): + def get_example_from_tensor_dict(self, tensor_dict, evaluate=False): + + if not evaluate: + answer = tensor_dict['answers']['text'][0].numpy().decode('utf-8') + answer_start = tensor_dict['answers']['answer_start'][0].numpy() + answers = None + else: + answers = [{ + "answer_start": start.numpy(), + "text": text.numpy().decode('utf-8') + } for start, text in zip(tensor_dict['answers']["answer_start"], tensor_dict['answers']["text"])] + + answer = None + answer_start = None + return SquadExample( 
- tensor_dict['id'].numpy().decode("utf-8"), - tensor_dict['question'].numpy().decode('utf-8'), - tensor_dict['context'].numpy().decode('utf-8'), - tensor_dict['answers']['text'][0].numpy().decode('utf-8'), - tensor_dict['answers']['answer_start'][0].numpy(), - tensor_dict['title'].numpy().decode('utf-8') + qas_id=tensor_dict['id'].numpy().decode("utf-8"), + question_text=tensor_dict['question'].numpy().decode('utf-8'), + context_text=tensor_dict['context'].numpy().decode('utf-8'), + answer_text=answer, + start_position_character=answer_start, + title=tensor_dict['title'].numpy().decode('utf-8'), + answers=answers ) - def get_examples_from_dataset(self, dataset): + def get_examples_from_dataset(self, dataset, evaluate=False): """See base class.""" examples = [] for tensor_dict in tqdm(dataset): - examples.append(self.get_example_from_tensor_dict(tensor_dict)) + examples.append(self.get_example_from_tensor_dict(tensor_dict, evaluate=evaluate)) return examples @@ -300,6 +315,7 @@ class SquadProcessor(DataProcessor): question_text = qa["question"] start_position_character = None answer_text = None + answers = None if "is_impossible" in qa: is_impossible = qa["is_impossible"] From cca75e788485e8a2a1c44a445c6aba0fb2dfaf56 Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Wed, 4 Dec 2019 15:42:29 -0500 Subject: [PATCH 18/26] Kill the demon spawn --- examples/run_squad.py | 23 +++++++- transformers/data/processors/squad.py | 75 +++++---------------------- 2 files changed, 34 insertions(+), 64 deletions(-) diff --git a/examples/run_squad.py b/examples/run_squad.py index a9ef5c6ba2..2f86322196 100644 --- a/examples/run_squad.py +++ b/examples/run_squad.py @@ -248,7 +248,28 @@ def evaluate(args, model, tokenizer, prefix=""): eval_feature = features[example_index.item()] unique_id = int(eval_feature.unique_id) - result = SquadResult([to_list(output[i]) for output in outputs] + [unique_id]) + output = [to_list(output[i]) for output in outputs] + + if len(output) >= 5: + 
start_logits = output[0] + start_top_index = output[1] + end_logits = output[2] + end_top_index = output[3], + cls_logits = output[4] + + result = SquadResult( + unique_id, start_logits, end_logits, + start_top_index=start_top_index, + end_top_index=end_top_index, + cls_logits=cls_logits + ) + + else: + start_logits, end_logits = output + result = SquadResult( + unique_id, start_logits, end_logits + ) + all_results.append(result) evalTime = timeit.default_timer() - start_time diff --git a/transformers/data/processors/squad.py b/transformers/data/processors/squad.py index 2e50ac8a8c..9306189eb4 100644 --- a/transformers/data/processors/squad.py +++ b/transformers/data/processors/squad.py @@ -446,72 +446,21 @@ class SquadFeatures(object): self.end_position = end_position - class SquadResult(object): """ Constructs a SquadResult which can be used to evaluate a model's output on the SQuAD dataset. Args: - result: The result output by a model on a SQuAD inference. These results may be complex (5 values) as the ones output by - XLNet or XLM or may be simple like the other models (2 values). They may be passed as a list or as a dict, with the - following accepted formats: - - `dict` output by a simple model: - { - "start_logits": int, - "end_logits": int, - "unique_id": string - } - `list` output by a simple model: - [start_logits, end_logits, unique_id] - - `dict` output by a complex model: - { - "start_top_log_probs": float, - "start_top_index": int, - "end_top_log_probs": float, - "end_top_index": int, - "cls_logits": int, - "unique_id": string - } - `list` output by a complex model: - [start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits, unique_id] - - See `run_squad.py` for an example. + unique_id: The unique identifier corresponding to that example. 
+ start_logits: The logits corresponding to the start of the answer + end_logits: The logits corresponding to the end of the answer """ - def __init__(self, result): - if isinstance(result, dict): - if "start_logits" in result and "end_logits" in result: - self.start_logits = result["start_logits"] - self.end_logits = result["end_logits"] - - elif "start_top_log_probs" in result and "start_top_index" in result: - self.start_top_log_probs = result["start_top_log_probs"] - self.start_top_index = result["start_top_index"] - self.end_top_log_probs = result["end_top_log_probs"] - self.end_top_index = result["end_top_index"] - self.cls_logits = result["cls_logits"] - - else: - raise ValueError("SquadResult instantiated with wrong values.") - - self.unique_id = result["unique_id"] - elif isinstance(result, list): - if len(result) == 3: - self.start_logits = result[0] - self.end_logits = result[1] - - elif len(result) == 6: - self.start_top_log_probs = result[0] - self.start_top_index = result[1] - self.end_top_log_probs = result[2] - self.end_top_index = result[3] - self.cls_logits = result[4] - - else: - raise ValueError("SquadResult instantiated with wrong values.") - - self.unique_id = result[-1] - - else: - raise ValueError("SquadResult instantiated with wrong values. 
Should be a dictionary or a list.") + def __init__(self, unique_id, start_logits, end_logits, start_top_index=None, end_top_index=None, cls_logits=None): + self.start_top_log_probs = start_logits + self.end_top_log_probs = end_logits + self.unique_id = unique_id + + if start_top_index: + self.start_top_index = start_top_index + self.end_top_index = end_top_index + self.cls_logits = cls_logits \ No newline at end of file From a7ca6d738b7801c680bd25d9e910f962d3f8bf2d Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Wed, 4 Dec 2019 15:43:34 -0500 Subject: [PATCH 19/26] Padding side is tokenizer-dependant --- transformers/data/processors/squad.py | 11 ++-- .../tests/tokenization_tests_commons.py | 21 +++++-- transformers/tokenization_utils.py | 60 ++++++++++++------- transformers/tokenization_xlnet.py | 1 + 4 files changed, 58 insertions(+), 35 deletions(-) diff --git a/transformers/data/processors/squad.py b/transformers/data/processors/squad.py index 9306189eb4..6599c54330 100644 --- a/transformers/data/processors/squad.py +++ b/transformers/data/processors/squad.py @@ -73,8 +73,7 @@ def _is_whitespace(c): return False def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, - doc_stride, max_query_length, is_training, - sequence_a_is_doc=False): + doc_stride, max_query_length, is_training): """Loads a data file into a list of `InputBatch`s.""" # Defining helper methods @@ -127,13 +126,13 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, while len(spans) * doc_stride < len(all_doc_tokens): encoded_dict = tokenizer.encode_plus( - truncated_query if not sequence_a_is_doc else span_doc_tokens, - span_doc_tokens if not sequence_a_is_doc else truncated_query, + truncated_query if tokenizer.padding_side == "right" else span_doc_tokens, + span_doc_tokens if tokenizer.padding_side == "right" else truncated_query, max_length=max_seq_length, return_overflowing_tokens=True, - padding_strategy='right', + pad_to_max_length=True, 
stride=max_seq_length - doc_stride - len(truncated_query) - sequence_pair_added_tokens, - truncation_strategy='only_second' if not sequence_a_is_doc else 'only_first' + truncation_strategy='only_second' if tokenizer.padding_side == "right" else 'only_first' ) paragraph_len = min(len(all_doc_tokens) - len(spans) * doc_stride, max_seq_length - len(truncated_query) - sequence_pair_added_tokens) diff --git a/transformers/tests/tokenization_tests_commons.py b/transformers/tests/tokenization_tests_commons.py index 40d68d0ab2..6592005c67 100644 --- a/transformers/tests/tokenization_tests_commons.py +++ b/transformers/tests/tokenization_tests_commons.py @@ -344,17 +344,19 @@ class CommonTestCases: padding_idx = tokenizer.pad_token_id # RIGHT PADDING - Check that it correctly pads when a maximum length is specified along with the padding flag set to True + tokenizer.padding_side = "right" encoded_sequence = tokenizer.encode(sequence) sequence_length = len(encoded_sequence) - padded_sequence = tokenizer.encode(sequence, max_length=sequence_length + padding_size, padding_strategy='right') + padded_sequence = tokenizer.encode(sequence, max_length=sequence_length + padding_size, pad_to_max_length=True) padded_sequence_length = len(padded_sequence) assert sequence_length + padding_size == padded_sequence_length assert encoded_sequence + [padding_idx] * padding_size == padded_sequence # LEFT PADDING - Check that it correctly pads when a maximum length is specified along with the padding flag set to True + tokenizer.padding_side = "left" encoded_sequence = tokenizer.encode(sequence) sequence_length = len(encoded_sequence) - padded_sequence = tokenizer.encode(sequence, max_length=sequence_length + padding_size, padding_strategy='left') + padded_sequence = tokenizer.encode(sequence, max_length=sequence_length + padding_size, pad_to_max_length=True) padded_sequence_length = len(padded_sequence) assert sequence_length + padding_size == padded_sequence_length assert [padding_idx] * 
padding_size + encoded_sequence == padded_sequence @@ -362,10 +364,15 @@ class CommonTestCases: # RIGHT & LEFT PADDING - Check that nothing is done when a maximum length is not specified encoded_sequence = tokenizer.encode(sequence) sequence_length = len(encoded_sequence) - padded_sequence_right = tokenizer.encode(sequence, padding_strategy='right') + + tokenizer.padding_side = "right" + padded_sequence_right = tokenizer.encode(sequence, pad_to_max_length=True) padded_sequence_right_length = len(padded_sequence_right) - padded_sequence_left = tokenizer.encode(sequence, padding_strategy='left') + + tokenizer.padding_side = "left" + padded_sequence_left = tokenizer.encode(sequence, pad_to_max_length=True) padded_sequence_left_length = len(padded_sequence_left) + assert sequence_length == padded_sequence_right_length assert encoded_sequence == padded_sequence_right assert sequence_length == padded_sequence_left_length @@ -387,7 +394,8 @@ class CommonTestCases: sequence_length = len(input_ids) # Test right padding - padded_sequence = tokenizer.encode_plus(sequence, max_length=sequence_length + padding_size, padding_strategy='right', return_special_tokens_mask=True) + tokenizer.padding_side = "right" + padded_sequence = tokenizer.encode_plus(sequence, max_length=sequence_length + padding_size, pad_to_max_length=True, return_special_tokens_mask=True) padded_input_ids = padded_sequence['input_ids'] padded_token_type_ids = padded_sequence['token_type_ids'] padded_attention_mask = padded_sequence['attention_mask'] @@ -401,7 +409,8 @@ class CommonTestCases: assert special_tokens_mask + [1] * padding_size == padded_special_tokens_mask # Test left padding - padded_sequence = tokenizer.encode_plus(sequence, max_length=sequence_length + padding_size, padding_strategy='left', return_special_tokens_mask=True) + tokenizer.padding_side = "left" + padded_sequence = tokenizer.encode_plus(sequence, max_length=sequence_length + padding_size, pad_to_max_length=True, 
return_special_tokens_mask=True) padded_input_ids = padded_sequence['input_ids'] padded_token_type_ids = padded_sequence['token_type_ids'] padded_attention_mask = padded_sequence['attention_mask'] diff --git a/transformers/tokenization_utils.py b/transformers/tokenization_utils.py index dbbabd0e1a..41a611ea49 100644 --- a/transformers/tokenization_utils.py +++ b/transformers/tokenization_utils.py @@ -77,6 +77,8 @@ class PreTrainedTokenizer(object): "pad_token", "cls_token", "mask_token", "additional_special_tokens"] + padding_side = "right" + @property def bos_token(self): """ Beginning of sentence token (string). Log an error if used while not having been set. """ @@ -223,6 +225,9 @@ class PreTrainedTokenizer(object): self.max_len = max_len if max_len is not None else int(1e12) + # Padding side is right by default and over-riden in subclsses. If specified in the kwargs, it is changed. + self.padding_side = kwargs.pop('padding_side', self.padding_side) + # Added tokens self.added_tokens_encoder = {} self.added_tokens_decoder = {} @@ -702,7 +707,7 @@ class PreTrainedTokenizer(object): max_length=None, stride=0, truncation_strategy='longest_first', - padding_strategy=None, + pad_to_max_length=False, return_tensors=None, **kwargs): """ @@ -729,12 +734,12 @@ class PreTrainedTokenizer(object): - 'only_first': Only truncate the first sequence - 'only_second': Only truncate the second sequence - 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length) - padding_strategy: if set to a strategy, the returned sequences will be padded according to the model's - padding index, up to their max length. If no max length is specified, no padding is done. - The strategies are handled by the following strings: + pad_to_max_length: if set to True, the returned sequences will be padded according to the model's padding side and + padding index, up to their max length. 
If no max length is specified, the padding is done up to the model's max length. + The tokenizer padding sides are handled by the following strings: - 'left': pads on the left of the sequences - 'right': pads on the right of the sequences - Defaults to None: no padding. + Defaults to False: no padding. return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant or PyTorch torch.Tensor instead of a list of python integers. **kwargs: passed to the `self.tokenize()` method @@ -745,7 +750,7 @@ class PreTrainedTokenizer(object): add_special_tokens=add_special_tokens, stride=stride, truncation_strategy=truncation_strategy, - padding_strategy=padding_strategy, + pad_to_max_length=pad_to_max_length, return_tensors=return_tensors, **kwargs) @@ -758,7 +763,7 @@ class PreTrainedTokenizer(object): max_length=None, stride=0, truncation_strategy='longest_first', - padding_strategy=None, + pad_to_max_length=False, return_tensors=None, return_token_type_ids=True, return_attention_mask=True, @@ -788,12 +793,12 @@ class PreTrainedTokenizer(object): - 'only_first': Only truncate the first sequence - 'only_second': Only truncate the second sequence - 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length) - padding_strategy: if set to a strategy, the returned sequences will be padded according to the model's - padding index, up to their max length. If no max length is specified, no padding is done. - The strategies are handled by the following strings: + pad_to_max_length: if set to True, the returned sequences will be padded according to the model's padding side and + padding index, up to their max length. If no max length is specified, the padding is done up to the model's max length. + The tokenizer padding sides are handled by the following strings: - 'left': pads on the left of the sequences - 'right': pads on the right of the sequences - Defaults to None: no padding. 
+ Defaults to False: no padding. return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant or PyTorch torch.Tensor instead of a list of python integers. return_token_type_ids: (optional) Set to False to avoid returning token_type_ids (default True). @@ -841,7 +846,7 @@ class PreTrainedTokenizer(object): return self.prepare_for_model(first_ids, pair_ids=second_ids, max_length=max_length, - padding_strategy=padding_strategy, + pad_to_max_length=pad_to_max_length, add_special_tokens=add_special_tokens, stride=stride, truncation_strategy=truncation_strategy, @@ -853,7 +858,7 @@ class PreTrainedTokenizer(object): def prepare_for_model(self, ids, pair_ids=None, max_length=None, add_special_tokens=True, stride=0, truncation_strategy='longest_first', - padding_strategy=None, + pad_to_max_length=False, return_tensors=None, return_token_type_ids=True, return_attention_mask=True, @@ -881,12 +886,12 @@ class PreTrainedTokenizer(object): - 'only_first': Only truncate the first sequence - 'only_second': Only truncate the second sequence - 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length) - padding_strategy: if set to a strategy, the returned sequences will be padded according to the model's - padding index, up to their max length. If no max length is specified, no padding is done. - The strategies are handled by the following strings: + pad_to_max_length: if set to True, the returned sequences will be padded according to the model's padding side and + padding index, up to their max length. If no max length is specified, the padding is done up to the model's max length. + The tokenizer padding sides are handled by the following strings: - 'left': pads on the left of the sequences - - 'right': pads on the right of the sequences - Defaults to None: no padding. + - 'right': pads on the right of the sequences + Defaults to False: no padding. 
return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant or PyTorch torch.Tensor instead of a list of python integers. return_token_type_ids: (optional) Set to False to avoid returning token_type_ids (default True). @@ -955,10 +960,19 @@ class PreTrainedTokenizer(object): "for this model ({} > {}). Running this sequence through the model will result in " "indexing errors".format(len(ids), self.max_len)) - if padding_strategy is not None and max_length and len(encoded_inputs["input_ids"]) < max_length: - difference = max_length - len(encoded_inputs["input_ids"]) + needs_to_be_padded = pad_to_max_length and ( + max_length and len(encoded_inputs["input_ids"]) < max_length + or + max_length is None and len(encoded_inputs["input_ids"]) < self.max_len and self.max_len <= 10000 + ) - if padding_strategy == 'right': + if pad_to_max_length and max_length is None and self.max_len > 10000: + logger.warning("Sequence can't be padded as the maximum ") + + if needs_to_be_padded: + difference = (max_length if max_length is not None else self.max_len) - len(encoded_inputs["input_ids"]) + + if self.padding_side == 'right': if return_attention_mask: encoded_inputs["attention_mask"] = [1] * len(encoded_inputs["input_ids"]) + [0] * difference if return_token_type_ids: @@ -967,7 +981,7 @@ class PreTrainedTokenizer(object): encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference encoded_inputs["input_ids"] = encoded_inputs["input_ids"] + [self.pad_token_id] * difference - elif padding_strategy == 'left': + elif self.padding_side == 'left': if return_attention_mask: encoded_inputs["attention_mask"] = [0] * difference + [1] * len(encoded_inputs["input_ids"]) if return_token_type_ids: @@ -977,7 +991,7 @@ class PreTrainedTokenizer(object): encoded_inputs["input_ids"] = [self.pad_token_id] * difference + encoded_inputs["input_ids"] else: - raise ValueError("Invalid padding strategy:" + str(padding_strategy)) 
+ raise ValueError("Invalid padding strategy:" + str(self.padding_side)) elif return_attention_mask: encoded_inputs["attention_mask"] = [1] * len(encoded_inputs["input_ids"]) diff --git a/transformers/tokenization_xlnet.py b/transformers/tokenization_xlnet.py index 3ea71f4438..1c43c0943a 100644 --- a/transformers/tokenization_xlnet.py +++ b/transformers/tokenization_xlnet.py @@ -60,6 +60,7 @@ class XLNetTokenizer(PreTrainedTokenizer): vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + padding_side = "left" def __init__(self, vocab_file, do_lower_case=False, remove_space=True, keep_accents=False, From f7e4a7cdfa6bcf6ec7c33fd1d40d307278b1c13a Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Wed, 4 Dec 2019 16:24:15 -0500 Subject: [PATCH 20/26] Cleanup --- examples/run_squad.py | 32 ++-- examples/test_examples.py | 3 +- .../{dev-v2.0-small.json => dev-v2.0.json} | 0 examples/tests_samples/SQUAD/train-v2.0.json | 140 ++++++++++++++++++ transformers/data/metrics/squad_metrics.py | 4 +- transformers/data/processors/squad.py | 36 ++++- 6 files changed, 191 insertions(+), 24 deletions(-) rename examples/tests_samples/SQUAD/{dev-v2.0-small.json => dev-v2.0.json} (100%) create mode 100644 examples/tests_samples/SQUAD/train-v2.0.json diff --git a/examples/run_squad.py b/examples/run_squad.py index 2f86322196..3f1b6a798f 100644 --- a/examples/run_squad.py +++ b/examples/run_squad.py @@ -304,8 +304,8 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache # Load data features from cache or dataset file - input_file = args.predict_file if evaluate else args.train_file - cached_features_file = os.path.join(os.path.dirname(input_file), 'cached_{}_{}_{}'.format( + input_dir = args.data_dir if args.data_dir 
else "." + cached_features_file = os.path.join(input_dir, 'cached_{}_{}_{}'.format( 'dev' if evaluate else 'train', list(filter(None, args.model_name_or_path.split('/'))).pop(), str(args.max_seq_length))) @@ -313,13 +313,22 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal logger.info("Loading features from cached file %s", cached_features_file) features = torch.load(cached_features_file) else: - logger.info("Creating features from dataset file at %s", input_file) + logger.info("Creating features from dataset file at %s", input_dir) - processor = SquadV2Processor() - examples = processor.get_dev_examples("examples/squad", only_first=100) if evaluate else processor.get_train_examples("examples/squad") - # import tensorflow_datasets as tfds - # tfds_examples = tfds.load("squad") - # examples = SquadV1Processor().get_examples_from_dataset(tfds_examples["validation"]) + if not args.data_dir: + try: + import tensorflow_datasets as tfds + except ImportError: + raise ImportError("If not data_dir is specified, tensorflow_datasets needs to be installed.") + + if args.version_2_with_negative: + logger.warn("tensorflow_datasets does not handle version 2 of SQuAD.") + + tfds_examples = tfds.load("squad") + examples = SquadV1Processor().get_examples_from_dataset(tfds_examples, evaluate=evaluate) + else: + processor = SquadV2Processor() if args.version_2_with_negative else SquadV1Processor() + examples = processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir) features = squad_convert_examples_to_features( examples=examples, @@ -328,7 +337,6 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=not evaluate, - sequence_a_is_doc=True if args.model_type in ['xlnet'] else False ) @@ -365,10 +373,6 @@ def main(): parser = argparse.ArgumentParser() ## Required parameters - 
parser.add_argument("--train_file", default=None, type=str, required=True, - help="SQuAD json for training. E.g., train-v1.1.json") - parser.add_argument("--predict_file", default=None, type=str, required=True, - help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json") parser.add_argument("--model_type", default=None, type=str, required=True, help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys())) parser.add_argument("--model_name_or_path", default=None, type=str, required=True, @@ -377,6 +381,8 @@ def main(): help="The output directory where the model checkpoints and predictions will be written.") ## Other parameters + parser.add_argument("--data_dir", default=None, type=str, + help="The input data dir. Should contain the .json files for the task. If not specified, will run with tensorflow_datasets.") parser.add_argument("--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name") parser.add_argument("--tokenizer_name", default="", type=str, diff --git a/examples/test_examples.py b/examples/test_examples.py index b04d722b7b..632d2f728e 100644 --- a/examples/test_examples.py +++ b/examples/test_examples.py @@ -72,8 +72,7 @@ class ExamplesTests(unittest.TestCase): logger.addHandler(stream_handler) testargs = ["run_squad.py", - "--train_file=./examples/tests_samples/SQUAD/dev-v2.0-small.json", - "--predict_file=./examples/tests_samples/SQUAD/dev-v2.0-small.json", + "--data_dir=./examples/tests_samples/SQUAD", "--model_name=bert-base-uncased", "--output_dir=./examples/tests_samples/temp_dir", "--max_steps=10", diff --git a/examples/tests_samples/SQUAD/dev-v2.0-small.json b/examples/tests_samples/SQUAD/dev-v2.0.json similarity index 100% rename from examples/tests_samples/SQUAD/dev-v2.0-small.json rename to examples/tests_samples/SQUAD/dev-v2.0.json diff --git a/examples/tests_samples/SQUAD/train-v2.0.json b/examples/tests_samples/SQUAD/train-v2.0.json new file mode 100644 index 
0000000000..834d9ee660 --- /dev/null +++ b/examples/tests_samples/SQUAD/train-v2.0.json @@ -0,0 +1,140 @@ +{ + "version": "v2.0", + "data": [{ + "title": "Normans", + "paragraphs": [{ + "qas": [{ + "question": "In what country is Normandy located?", + "id": "56ddde6b9a695914005b9628", + "answers": [{ + "text": "France", + "answer_start": 159 + }], + "is_impossible": false + }, { + "question": "When were the Normans in Normandy?", + "id": "56ddde6b9a695914005b9629", + "answers": [{ + "text": "10th and 11th centuries", + "answer_start": 94 + }], + "is_impossible": false + }, { + "question": "From which countries did the Norse originate?", + "id": "56ddde6b9a695914005b962a", + "answers": [{ + "text": "Denmark, Iceland and Norway", + "answer_start": 256 + }], + "is_impossible": false + }, { + "plausible_answers": [{ + "text": "Rollo", + "answer_start": 308 + }], + "question": "Who did King Charles III swear fealty to?", + "id": "5ad39d53604f3c001a3fe8d3", + "answers": [], + "is_impossible": true + }, { + "plausible_answers": [{ + "text": "10th century", + "answer_start": 671 + }], + "question": "When did the Frankish identity emerge?", + "id": "5ad39d53604f3c001a3fe8d4", + "answers": [], + "is_impossible": true + }], + "context": "The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse (\"Norman\" comes from \"Norseman\") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries." 
+ }, { + "qas": [{ + "question": "Who was the duke in the battle of Hastings?", + "id": "56dddf4066d3e219004dad5f", + "answers": [{ + "text": "William the Conqueror", + "answer_start": 1022 + }], + "is_impossible": false + }, { + "plausible_answers": [{ + "text": "Antioch", + "answer_start": 1295 + }], + "question": "What principality did William the conquerer found?", + "id": "5ad3a266604f3c001a3fea2b", + "answers": [], + "is_impossible": true + }], + "context": "The Norman dynasty had a major political, cultural and military impact on medieval Europe and even the Near East. The Normans were famed for their martial spirit and eventually for their Christian piety, becoming exponents of the Catholic orthodoxy into which they assimilated. They adopted the Gallo-Romance language of the Frankish land they settled, their dialect becoming known as Norman, Normaund or Norman French, an important literary language. The Duchy of Normandy, which they formed by treaty with the French crown, was a great fief of medieval France, and under Richard I of Normandy was forged into a cohesive and formidable principality in feudal tenure. The Normans are noted both for their culture, such as their unique Romanesque architecture and musical traditions, and for their significant military accomplishments and innovations. Norman adventurers founded the Kingdom of Sicily under Roger II after conquering southern Italy on the Saracens and Byzantines, and an expedition on behalf of their duke, William the Conqueror, led to the Norman conquest of England at the Battle of Hastings in 1066. Norman cultural and military influence spread from these new European centres to the Crusader states of the Near East, where their prince Bohemond I founded the Principality of Antioch in the Levant, to Scotland and Wales in Great Britain, to Ireland, and to the coasts of north Africa and the Canary Islands." 
+ }] + }, { + "title": "Computational_complexity_theory", + "paragraphs": [{ + "qas": [{ + "question": "What branch of theoretical computer science deals with broadly classifying computational problems by difficulty and class of relationship?", + "id": "56e16182e3433e1400422e28", + "answers": [{ + "text": "Computational complexity theory", + "answer_start": 0 + }], + "is_impossible": false + }, { + "plausible_answers": [{ + "text": "algorithm", + "answer_start": 472 + }], + "question": "What is a manual application of mathematical steps?", + "id": "5ad5316b5b96ef001a10ab76", + "answers": [], + "is_impossible": true + }], + "context": "Computational complexity theory is a branch of the theory of computation in theoretical computer science that focuses on classifying computational problems according to their inherent difficulty, and relating those classes to each other. A computational problem is understood to be a task that is in principle amenable to being solved by a computer, which is equivalent to stating that the problem may be solved by mechanical application of mathematical steps, such as an algorithm." 
+ }, { + "qas": [{ + "question": "What measure of a computational problem broadly defines the inherent difficulty of the solution?", + "id": "56e16839cd28a01900c67887", + "answers": [{ + "text": "if its solution requires significant resources", + "answer_start": 46 + }], + "is_impossible": false + }, { + "question": "What method is used to intuitively assess or quantify the amount of resources required to solve a computational problem?", + "id": "56e16839cd28a01900c67888", + "answers": [{ + "text": "mathematical models of computation", + "answer_start": 176 + }], + "is_impossible": false + }, { + "question": "What are two basic primary resources used to guage complexity?", + "id": "56e16839cd28a01900c67889", + "answers": [{ + "text": "time and storage", + "answer_start": 305 + }], + "is_impossible": false + }, { + "plausible_answers": [{ + "text": "the number of gates in a circuit", + "answer_start": 436 + }], + "question": "What unit is measured to determine circuit simplicity?", + "id": "5ad532575b96ef001a10ab7f", + "answers": [], + "is_impossible": true + }, { + "plausible_answers": [{ + "text": "the number of processors", + "answer_start": 502 + }], + "question": "What number is used in perpendicular computing?", + "id": "5ad532575b96ef001a10ab80", + "answers": [], + "is_impossible": true + }], + "context": "A problem is regarded as inherently difficult if its solution requires significant resources, whatever the algorithm used. The theory formalizes this intuition, by introducing mathematical models of computation to study these problems and quantifying the amount of resources needed to solve them, such as time and storage. Other complexity measures are also used, such as the amount of communication (used in communication complexity), the number of gates in a circuit (used in circuit complexity) and the number of processors (used in parallel computing). 
One of the roles of computational complexity theory is to determine the practical limits on what computers can and cannot do." + }] + }] +} \ No newline at end of file diff --git a/transformers/data/metrics/squad_metrics.py b/transformers/data/metrics/squad_metrics.py index 1f120d354a..f8449df045 100644 --- a/transformers/data/metrics/squad_metrics.py +++ b/transformers/data/metrics/squad_metrics.py @@ -630,12 +630,12 @@ def compute_predictions_log_probs( for i in range(start_n_top): for j in range(end_n_top): - start_log_prob = result.start_top_log_probs[i] + start_log_prob = result.start_logits[i] start_index = result.start_top_index[i] j_index = i * end_n_top + j - end_log_prob = result.end_top_log_probs[j_index] + end_log_prob = result.end_logits[j_index] end_index = result.end_top_index[j_index] # We could hypothetically create invalid predictions, e.g., predict diff --git a/transformers/data/processors/squad.py b/transformers/data/processors/squad.py index 6599c54330..dd2d9d25c0 100644 --- a/transformers/data/processors/squad.py +++ b/transformers/data/processors/squad.py @@ -146,7 +146,7 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, token_to_orig_map = {} for i in range(paragraph_len): - index = len(truncated_query) + sequence_added_tokens + i if not sequence_a_is_doc else i + index = len(truncated_query) + sequence_added_tokens + i if tokenizer.padding_side == "right" else i token_to_orig_map[index] = tok_to_orig_index[len(spans) * doc_stride + i] encoded_dict["paragraph_len"] = paragraph_len @@ -166,7 +166,7 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, for doc_span_index in range(len(spans)): for j in range(spans[doc_span_index]["paragraph_len"]): is_max_context = _new_check_is_max_context(spans, doc_span_index, doc_span_index * doc_stride + j) - index = j if sequence_a_is_doc else spans[doc_span_index]["truncated_query_with_special_tokens_length"] + j + index = j if tokenizer.padding_side 
== "left" else spans[doc_span_index]["truncated_query_with_special_tokens_length"] + j spans[doc_span_index]["token_is_max_context"][index] = is_max_context for span in spans: @@ -179,7 +179,7 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, p_mask = np.minimum(p_mask, 1) - if not sequence_a_is_doc: + if tokenizer.padding_side == "right": # Limit positive values to one p_mask = 1 - p_mask @@ -207,7 +207,7 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, end_position = cls_index span_is_impossible = True else: - if sequence_a_is_doc: + if tokenizer.padding_side == "left": doc_offset = 0 else: doc_offset = len(truncated_query) + sequence_added_tokens @@ -270,7 +270,29 @@ class SquadProcessor(DataProcessor): ) def get_examples_from_dataset(self, dataset, evaluate=False): - """See base class.""" + """ + Creates a list of :class:`~transformers.data.processors.squad.SquadExample` using a TFDS dataset. + + Args: + dataset: The tfds dataset loaded from `tensorflow_datasets.load("squad")` + evaluate: boolean specifying if in evaluation mode or in training mode + + Returns: + List of SquadExample + + Examples:: + + import tensorflow_datasets as tfds + dataset = tfds.load("squad") + + training_examples = get_examples_from_dataset(dataset, evaluate=False) + evaluation_examples = get_examples_from_dataset(dataset, evaluate=True) + """ + + if evaluate: + dataset = dataset["validation"] + else: + dataset = dataset["train"] examples = [] for tensor_dict in tqdm(dataset): @@ -455,8 +477,8 @@ class SquadResult(object): end_logits: The logits corresponding to the end of the answer """ def __init__(self, unique_id, start_logits, end_logits, start_top_index=None, end_top_index=None, cls_logits=None): - self.start_top_log_probs = start_logits - self.end_top_log_probs = end_logits + self.start_logits = start_logits + self.end_logits = end_logits self.unique_id = unique_id if start_top_index: From 
33508ae310f101a2534d3e97ea23fda93e25ef38 Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Wed, 4 Dec 2019 16:26:45 -0500 Subject: [PATCH 21/26] Remove `only_first` --- transformers/data/processors/squad.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/transformers/data/processors/squad.py b/transformers/data/processors/squad.py index dd2d9d25c0..09a79db471 100644 --- a/transformers/data/processors/squad.py +++ b/transformers/data/processors/squad.py @@ -300,29 +300,29 @@ class SquadProcessor(DataProcessor): return examples - def get_train_examples(self, data_dir, only_first=None): + def get_train_examples(self, data_dir): """See base class.""" if self.train_file is None: raise ValueError("SquadProcessor should be instantiated via SquadV1Processor or SquadV2Processor") with open(os.path.join(data_dir, self.train_file), "r", encoding='utf-8') as reader: input_data = json.load(reader)["data"] - return self._create_examples(input_data, "train", only_first) + return self._create_examples(input_data, "train") - def get_dev_examples(self, data_dir, only_first=None): + def get_dev_examples(self, data_dir): """See base class.""" if self.dev_file is None: raise ValueError("SquadProcessor should be instantiated via SquadV1Processor or SquadV2Processor") with open(os.path.join(data_dir, self.dev_file), "r", encoding='utf-8') as reader: input_data = json.load(reader)["data"] - return self._create_examples(input_data, "dev", only_first) + return self._create_examples(input_data, "dev") def get_labels(self): """See base class.""" return ["0", "1"] - def _create_examples(self, input_data, set_type, only_first=None): + def _create_examples(self, input_data, set_type): """Creates examples for the training and dev sets.""" is_training = set_type == "train" @@ -363,9 +363,6 @@ class SquadProcessor(DataProcessor): ) examples.append(example) - - if only_first is not None and len(examples) > only_first: - return examples return examples class 
SquadV1Processor(SquadProcessor): From 7a03519975e4f0b6698bf1221c2263ed0f8d795c Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Wed, 4 Dec 2019 17:24:35 -0500 Subject: [PATCH 22/26] Documentation --- docs/source/main_classes/processors.rst | 79 +++++++++++++++++- transformers/data/processors/squad.py | 104 ++++++++++++++++++++---- 2 files changed, 164 insertions(+), 19 deletions(-) diff --git a/docs/source/main_classes/processors.rst b/docs/source/main_classes/processors.rst index a85c126956..ce0eeb553a 100644 --- a/docs/source/main_classes/processors.rst +++ b/docs/source/main_classes/processors.rst @@ -55,4 +55,81 @@ Example usage ^^^^^^^^^^^^^^^^^^^^^^^^^ An example using these processors is given in the -`run_glue.py `__ script. \ No newline at end of file +`run_glue.py `__ script. + + + +SQuAD +~~~~~~~~~~~~~~~~~~~~~ + +`The Stanford Question Answering Dataset (SQuAD) `__ is a benchmark that evaluates +the performance of models on question answering. Two versions are available, v1.1 and v2.0. The first version (v1.1) was released together with the paper +`SQuAD: 100,000+ Questions for Machine Comprehension of Text `__. The second version (v2.0) was released alongside +the paper `Know What You Don't Know: Unanswerable Questions for SQuAD `__. + +This library hosts a processor for each of the two versions: + +Processors +^^^^^^^^^^^^^^^^^^^^^^^^^ + +Those processors are: + - :class:`~transformers.data.processors.squad.SquadV1Processor` + - :class:`~transformers.data.processors.squad.SquadV2Processor` + +They both inherit from the abstract class :class:`~transformers.data.processors.squad.SquadProcessor` + +.. autoclass:: transformers.data.processors.squad.SquadProcessor + :members: + +Additionally, the following method can be used to convert SQuAD examples into :class:`~transformers.data.processors.squad.SquadFeatures` +that can be used as model inputs. + +..
automethod:: transformers.data.processors.squad.squad_convert_examples_to_features + +These processors as well as the aforementioned method can be used with files containing the data as well as with the `tensorflow_datasets` package. +Examples are given below. + +Example usage +^^^^^^^^^^^^^^^^^^^^^^^^^ + +Here is an example using the processors as well as the conversion method using data files: + +Example:: + + # Loading a V2 processor + processor = SquadV2Processor() + examples = processor.get_dev_examples(squad_v2_data_dir) + + # Loading a V1 processor + processor = SquadV1Processor() + examples = processor.get_dev_examples(squad_v1_data_dir) + + features = squad_convert_examples_to_features( + examples=examples, + tokenizer=tokenizer, + max_seq_length=max_seq_length, + doc_stride=args.doc_stride, + max_query_length=max_query_length, + is_training=not evaluate, + ) + +Using `tensorflow_datasets` is as easy as using a data file: + +Example:: + + # tensorflow_datasets only handles Squad V1. + tfds_examples = tfds.load("squad") + examples = SquadV1Processor().get_examples_from_dataset(tfds_examples, evaluate=evaluate) + + features = squad_convert_examples_to_features( + examples=examples, + tokenizer=tokenizer, + max_seq_length=max_seq_length, + doc_stride=args.doc_stride, + max_query_length=max_query_length, + is_training=not evaluate, + ) + + +Another example using these processors is given in the +`run_squad.py `__ script.
\ No newline at end of file diff --git a/transformers/data/processors/squad.py b/transformers/data/processors/squad.py index 09a79db471..b17e626c98 100644 --- a/transformers/data/processors/squad.py +++ b/transformers/data/processors/squad.py @@ -74,7 +74,35 @@ def _is_whitespace(c): def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, doc_stride, max_query_length, is_training): - """Loads a data file into a list of `InputBatch`s.""" + """ + Converts a list of examples into a list of features that can be directly given as input to a model. + It is model-dependant and takes advantage of many of the tokenizer's features to create the model's inputs. + + Args: + examples: list of :class:`~transformers.data.processors.squad.SquadExample` + tokenizer: an instance of a child of :class:`~transformers.PreTrainedTokenizer` + max_seq_length: The maximum sequence length of the inputs. + doc_stride: The stride used when the context is too large and is split across several features. + max_query_length: The maximum length of the query. + is_training: wheter to create features for model evaluation or model training. + + Returns: + list of :class:`~transformers.data.processors.squad.SquadFeatures` + + Example:: + + processor = SquadV2Processor() + examples = processor.get_dev_examples(data_dir) + + features = squad_convert_examples_to_features( + examples=examples, + tokenizer=tokenizer, + max_seq_length=args.max_seq_length, + doc_stride=args.doc_stride, + max_query_length=args.max_query_length, + is_training=not evaluate, + ) + """ # Defining helper methods unique_id = 1000000000 @@ -240,12 +268,14 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, class SquadProcessor(DataProcessor): - """Processor for the SQuAD data set.""" + """ + Processor for the SQuAD data set. + Overriden by SquadV1Processor and SquadV2Processor, used by the version 1.1 and version 2.0 of SQuAD, respectively. 
+ """ train_file = None dev_file = None - def get_example_from_tensor_dict(self, tensor_dict, evaluate=False): - + def _get_example_from_tensor_dict(self, tensor_dict, evaluate=False): if not evaluate: answer = tensor_dict['answers']['text'][0].numpy().decode('utf-8') answer_start = tensor_dict['answers']['answer_start'][0].numpy() @@ -296,35 +326,44 @@ class SquadProcessor(DataProcessor): examples = [] for tensor_dict in tqdm(dataset): - examples.append(self.get_example_from_tensor_dict(tensor_dict, evaluate=evaluate)) + examples.append(self._get_example_from_tensor_dict(tensor_dict, evaluate=evaluate)) return examples - def get_train_examples(self, data_dir): - """See base class.""" + def get_train_examples(self, data_dir, filename=None): + """ + Returns the training examples from the data directory. + + Args: + data_dir: Directory containing the data files used for training and evaluating. + filename: None by default, specify this if the training file has a different name than the original one + which is `train-v1.1.json` and `train-v2.0.json` for squad versions 1.1 and 2.0 respectively. + + """ if self.train_file is None: raise ValueError("SquadProcessor should be instantiated via SquadV1Processor or SquadV2Processor") - with open(os.path.join(data_dir, self.train_file), "r", encoding='utf-8') as reader: + with open(os.path.join(data_dir, self.train_file if filename is None else filename), "r", encoding='utf-8') as reader: input_data = json.load(reader)["data"] return self._create_examples(input_data, "train") - def get_dev_examples(self, data_dir): - """See base class.""" + def get_dev_examples(self, data_dir, filename=None): + """ + Returns the evaluation example from the data directory. + + Args: + data_dir: Directory containing the data files used for training and evaluating. 
+ filename: None by default, specify this if the evaluation file has a different name than the original one + which is `dev-v1.1.json` and `dev-v2.0.json` for squad versions 1.1 and 2.0 respectively. + """ if self.dev_file is None: raise ValueError("SquadProcessor should be instantiated via SquadV1Processor or SquadV2Processor") - with open(os.path.join(data_dir, self.dev_file), "r", encoding='utf-8') as reader: + with open(os.path.join(data_dir, self.dev_file if filename is not None else filename), "r", encoding='utf-8') as reader: input_data = json.load(reader)["data"] return self._create_examples(input_data, "dev") - def get_labels(self): - """See base class.""" - return ["0", "1"] - def _create_examples(self, input_data, set_type): - """Creates examples for the training and dev sets.""" - is_training = set_type == "train" examples = [] for entry in tqdm(input_data): @@ -378,6 +417,16 @@ class SquadV2Processor(SquadProcessor): class SquadExample(object): """ A single training/test example for the Squad dataset, as loaded from disk. + + Args: + qas_id: The example's unique identifier + question_text: The question string + context_text: The context string + answer_text: The answer string + start_position_character: The character position of the start of the answer + title: The title of the example + answers: None by default, this is used during evaluation. Holds answers as well as their start positions. + is_impossible: False by default, set to True if the example has no possible answer. """ def __init__(self, @@ -427,7 +476,26 @@ class SquadExample(object): class SquadFeatures(object): """ Single squad example features to be fed to a model. - Those features are model-specific. + Those features are model-specific and can be crafted from :class:`~transformers.data.processors.squad.SquadExample` + using the :func:`~transformers.data.processors.squad.squad_convert_examples_to_features` method.
+ + Args: + input_ids: Indices of input sequence tokens in the vocabulary. + attention_mask: Mask to avoid performing attention on padding token indices. + token_type_ids: Segment token indices to indicate first and second portions of the inputs. + cls_index: the index of the CLS token. + p_mask: Mask identifying tokens that can be answers vs. tokens that cannot. + Mask with 1 for tokens that cannot be in the answer and 0 for tokens that can be in an answer + example_index: the index of the example + unique_id: The unique Feature identifier + paragraph_len: The length of the context + token_is_max_context: List of booleans identifying which tokens have their maximum context in this feature object. + If a token does not have its maximum context in this feature object, it means that another feature object + has more information related to that token and should be prioritized over this feature for that token. + tokens: list of tokens corresponding to the input ids + token_to_orig_map: mapping between the tokens and the original text, needed in order to identify the answer.
+ start_position: start of the answer token index + end_position: end of the answer token index """ def __init__(self, From ce158a076f7089bf11d44e1581f5bcab4dcc5396 Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Wed, 4 Dec 2019 17:55:52 -0500 Subject: [PATCH 23/26] Return dataset (pytorch) --- transformers/data/processors/squad.py | 41 ++++++++++++++++++++++++--- 1 file changed, 37 insertions(+), 4 deletions(-) diff --git a/transformers/data/processors/squad.py b/transformers/data/processors/squad.py index b17e626c98..338bae0c51 100644 --- a/transformers/data/processors/squad.py +++ b/transformers/data/processors/squad.py @@ -7,7 +7,11 @@ import numpy as np from ...tokenization_bert import BasicTokenizer, whitespace_tokenize from .utils import DataProcessor, InputExample, InputFeatures -from ...file_utils import is_tf_available +from ...file_utils import is_tf_available, is_torch_available + +if is_torch_available: + import torch + from torch.utils.data import TensorDataset if is_tf_available(): import tensorflow as tf @@ -73,7 +77,8 @@ def _is_whitespace(c): return False def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, - doc_stride, max_query_length, is_training): + doc_stride, max_query_length, is_training, + return_dataset=False): """ Converts a list of examples into a list of features that can be directly given as input to a model. It is model-dependant and takes advantage of many of the tokenizer's features to create the model's inputs. @@ -84,7 +89,10 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, max_seq_length: The maximum sequence length of the inputs. doc_stride: The stride used when the context is too large and is split across several features. max_query_length: The maximum length of the query. - is_training: wheter to create features for model evaluation or model training. + is_training: whether to create features for model evaluation or model training. + return_dataset: Default False. 
Either 'pt' or 'tf'. + if 'pt': returns a torch.data.TensorDataset, + if 'tf': returns a tf.data.Dataset Returns: list of :class:`~transformers.data.processors.squad.SquadFeatures` @@ -264,6 +272,31 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, unique_id += 1 + if return_dataset == 'pt': + if not is_torch_available(): + raise ImportError("Pytorch must be installed to return a pytorch dataset.") + + # Convert to Tensors and build dataset + all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) + all_input_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long) + all_segment_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long) + all_cls_index = torch.tensor([f.cls_index for f in features], dtype=torch.long) + all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float) + + if not is_training: + all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long) + dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, + all_example_index, all_cls_index, all_p_mask) + else: + all_start_positions = torch.tensor([f.start_position for f in features], dtype=torch.long) + all_end_positions = torch.tensor([f.end_position for f in features], dtype=torch.long) + dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, + all_start_positions, all_end_positions, + all_cls_index, all_p_mask) + + return features, dataset + + return features @@ -359,7 +392,7 @@ class SquadProcessor(DataProcessor): if self.dev_file is None: raise ValueError("SquadProcessor should be instantiated via SquadV1Processor or SquadV2Processor") - with open(os.path.join(data_dir, self.dev_file if filename is not None else filename), "r", encoding='utf-8') as reader: + with open(os.path.join(data_dir, self.dev_file if filename is None else filename), "r", encoding='utf-8') as reader: input_data = json.load(reader)["data"] return 
self._create_examples(input_data, "dev") From 9ecd83dace3961eaa161405814b00ea595c86451 Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Thu, 5 Dec 2019 14:44:57 -0500 Subject: [PATCH 24/26] Patch evaluation for impossible values + cleanup --- docs/source/main_classes/processors.rst | 4 ++-- examples/run_squad.py | 25 +++++-------------------- transformers/data/processors/squad.py | 6 +++--- transformers/tokenization_utils.py | 2 +- 4 files changed, 11 insertions(+), 26 deletions(-) diff --git a/docs/source/main_classes/processors.rst b/docs/source/main_classes/processors.rst index ce0eeb553a..e98910ae1b 100644 --- a/docs/source/main_classes/processors.rst +++ b/docs/source/main_classes/processors.rst @@ -55,7 +55,7 @@ Example usage ^^^^^^^^^^^^^^^^^^^^^^^^^ An example using these processors is given in the -`run_glue.py `__ script. +`run_glue.py `__ script. @@ -132,4 +132,4 @@ Example:: Another example using these processors is given in the -`run_squad.py `__ script. \ No newline at end of file +`run_squad.py `__ script. 
\ No newline at end of file diff --git a/examples/run_squad.py b/examples/run_squad.py index 3f1b6a798f..5caff9ae4f 100644 --- a/examples/run_squad.py +++ b/examples/run_squad.py @@ -311,7 +311,8 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal str(args.max_seq_length))) if os.path.exists(cached_features_file) and not args.overwrite_cache and not output_examples: logger.info("Loading features from cached file %s", cached_features_file) - features = torch.load(cached_features_file) + features_and_dataset = torch.load(cached_features_file) + features, dataset = features_and_dataset["features"], features_and_dataset["dataset"] else: logger.info("Creating features from dataset file at %s", input_dir) @@ -330,40 +331,24 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal processor = SquadV2Processor() if args.version_2_with_negative else SquadV1Processor() examples = processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir) - features = squad_convert_examples_to_features( + features, dataset = squad_convert_examples_to_features( examples=examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=not evaluate, + return_dataset='pt' ) if args.local_rank in [-1, 0]: logger.info("Saving features into cached file %s", cached_features_file) - torch.save(features, cached_features_file) + torch.save({"features": features, "dataset": dataset}, cached_features_file) if args.local_rank == 0 and not evaluate: torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache - # Convert to Tensors and build dataset - all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) - all_input_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long) - all_segment_ids = 
torch.tensor([f.token_type_ids for f in features], dtype=torch.long) - all_cls_index = torch.tensor([f.cls_index for f in features], dtype=torch.long) - all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float) - if evaluate: - all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long) - dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, - all_example_index, all_cls_index, all_p_mask) - else: - all_start_positions = torch.tensor([f.start_position for f in features], dtype=torch.long) - all_end_positions = torch.tensor([f.end_position for f in features], dtype=torch.long) - dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, - all_start_positions, all_end_positions, - all_cls_index, all_p_mask) - if output_examples: return dataset, examples, features return dataset diff --git a/transformers/data/processors/squad.py b/transformers/data/processors/squad.py index 338bae0c51..bb56aa792f 100644 --- a/transformers/data/processors/squad.py +++ b/transformers/data/processors/squad.py @@ -312,7 +312,7 @@ class SquadProcessor(DataProcessor): if not evaluate: answer = tensor_dict['answers']['text'][0].numpy().decode('utf-8') answer_start = tensor_dict['answers']['answer_start'][0].numpy() - answers = None + answers = [] else: answers = [{ "answer_start": start.numpy(), @@ -408,7 +408,7 @@ class SquadProcessor(DataProcessor): question_text = qa["question"] start_position_character = None answer_text = None - answers = None + answers = [] if "is_impossible" in qa: is_impossible = qa["is_impossible"] @@ -469,7 +469,7 @@ class SquadExample(object): answer_text, start_position_character, title, - answers=None, + answers=[], is_impossible=False): self.qas_id = qas_id self.question_text = question_text diff --git a/transformers/tokenization_utils.py b/transformers/tokenization_utils.py index 41a611ea49..5ec173bbf6 100644 --- a/transformers/tokenization_utils.py +++ b/transformers/tokenization_utils.py @@ -194,7 
+194,7 @@ class PreTrainedTokenizer(object): @property def pad_token_type_id(self): - """ Id of the padding token in the vocabulary. Log an error if used while not having been set. """ + """ Id of the padding token type in the vocabulary.""" return self._pad_token_type_id @property From e9217da5ff711cf84d150b35d3f8a5c17f1641f7 Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Thu, 5 Dec 2019 16:01:51 -0500 Subject: [PATCH 25/26] Cleanup Improve global visibility on the run_squad script, remove unused files and fixes related to XLNet. --- examples/run_squad.py | 69 +- examples/utils_squad.py | 1017 -------------------- examples/utils_squad_evaluate.py | 330 ------- transformers/data/metrics/squad_metrics.py | 14 +- transformers/data/processors/squad.py | 2 +- 5 files changed, 45 insertions(+), 1387 deletions(-) delete mode 100644 examples/utils_squad.py delete mode 100644 examples/utils_squad_evaluate.py diff --git a/examples/run_squad.py b/examples/run_squad.py index 5caff9ae4f..6d32211c0c 100644 --- a/examples/run_squad.py +++ b/examples/run_squad.py @@ -27,8 +27,7 @@ import glob import timeit import numpy as np import torch -from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, - TensorDataset) +from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, TensorDataset) from torch.utils.data.distributed import DistributedSampler try: @@ -48,14 +47,6 @@ from transformers import (WEIGHTS_NAME, BertConfig, from transformers import AdamW, get_linear_schedule_with_warmup, squad_convert_examples_to_features -from utils_squad import (convert_examples_to_features as old_convert, read_squad_examples as old_read, RawResult, write_predictions, - RawResultExtended, write_predictions_extended) - -# The follwing import is the official SQuAD evaluation script (2.0). 
-# You can remove it from the dependencies if you are using this script outside of the library -# We've added it here for automated tests (see examples/test_examples.py file) -from utils_squad_evaluate import EVAL_OPTS, main as evaluate_on_squad - logger = logging.getLogger(__name__) ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) \ @@ -98,14 +89,16 @@ def train(args, train_dataset, model, tokenizer): optimizer_grouped_parameters = [ {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay}, {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} - ] + ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) + if args.fp16: try: from apex import amp except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") + model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) # multi-gpu training (should be after apex fp16 initialization) @@ -133,20 +126,26 @@ def train(args, train_dataset, model, tokenizer): model.zero_grad() train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]) set_seed(args) # Added here for reproductibility (even between python 2 and 3) + for _ in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]) for step, batch in enumerate(epoch_iterator): model.train() batch = tuple(t.to(args.device) for t in batch) - inputs = {'input_ids': batch[0], - 'attention_mask': batch[1], - 'start_positions': batch[3], - 'end_positions': batch[4]} + + inputs = { + 'input_ids': batch[0], + 'attention_mask': batch[1], + 'start_positions': batch[3], + 'end_positions': batch[4] + } + if 
args.model_type != 'distilbert': inputs['token_type_ids'] = None if args.model_type == 'xlm' else batch[2] + if args.model_type in ['xlnet', 'xlm']: - inputs.update({'cls_index': batch[5], - 'p_mask': batch[6]}) + inputs.update({'cls_index': batch[5], 'p_mask': batch[6]}) + outputs = model(**inputs) loss = outputs[0] # model outputs are always tuple in transformers (see doc) @@ -173,8 +172,8 @@ def train(args, train_dataset, model, tokenizer): model.zero_grad() global_step += 1 + # Log metrics if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0: - # Log metrics if args.local_rank == -1 and args.evaluate_during_training: # Only evaluate when single GPU otherwise metrics may not average well results = evaluate(args, model, tokenizer) for key, value in results.items(): @@ -183,8 +182,8 @@ def train(args, train_dataset, model, tokenizer): tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step) logging_loss = tr_loss + # Save model checkpoint if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0: - # Save model checkpoint output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step)) if not os.path.exists(output_dir): os.makedirs(output_dir) @@ -213,6 +212,7 @@ def evaluate(args, model, tokenizer, prefix=""): os.makedirs(args.output_dir) args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu) + # Note that DistributedSampler samples randomly eval_sampler = SequentialSampler(dataset) if args.local_rank == -1 else DistributedSampler(dataset) eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size) @@ -225,11 +225,14 @@ def evaluate(args, model, tokenizer, prefix=""): logger.info("***** Running evaluation {} *****".format(prefix)) logger.info(" Num examples = %d", len(dataset)) logger.info(" Batch size = %d", args.eval_batch_size) + all_results = [] start_time = timeit.default_timer() + 
for batch in tqdm(eval_dataloader, desc="Evaluating"): model.eval() batch = tuple(t.to(args.device) for t in batch) + with torch.no_grad(): inputs = { 'input_ids': batch[0], @@ -238,10 +241,13 @@ def evaluate(args, model, tokenizer, prefix=""): if args.model_type != 'distilbert': inputs['token_type_ids'] = None if args.model_type == 'xlm' else batch[2] # XLM don't use segment_ids + example_indices = batch[3] + + # XLNet and XLM use more arguments for their predictions if args.model_type in ['xlnet', 'xlm']: - inputs.update({'cls_index': batch[4], - 'p_mask': batch[5]}) + inputs.update({'cls_index': batch[4], 'p_mask': batch[5]}) + outputs = model(**inputs) for i, example_index in enumerate(example_indices): @@ -250,11 +256,13 @@ def evaluate(args, model, tokenizer, prefix=""): output = [to_list(output[i]) for output in outputs] + # Some models (XLNet, XLM) use 5 arguments for their predictions, while the other "simpler" + # models only use two. if len(output) >= 5: start_logits = output[0] start_top_index = output[1] end_logits = output[2] - end_top_index = output[3], + end_top_index = output[3] cls_logits = output[4] result = SquadResult( @@ -278,16 +286,17 @@ def evaluate(args, model, tokenizer, prefix=""): # Compute predictions output_prediction_file = os.path.join(args.output_dir, "predictions_{}.json".format(prefix)) output_nbest_file = os.path.join(args.output_dir, "nbest_predictions_{}.json".format(prefix)) + if args.version_2_with_negative: output_null_log_odds_file = os.path.join(args.output_dir, "null_odds_{}.json".format(prefix)) else: output_null_log_odds_file = None + # XLNet and XLM use a more complex post-processing procedure if args.model_type in ['xlnet', 'xlm']: - # XLNet uses a more complex post-processing procedure predictions = compute_predictions_log_probs(examples, features, all_results, args.n_best_size, args.max_answer_length, output_prediction_file, - output_nbest_file, output_null_log_odds_file, args.predict_file, + output_nbest_file, 
output_null_log_odds_file, model.config.start_n_top, model.config.end_n_top, args.version_2_with_negative, tokenizer, args.verbose_logging) else: @@ -296,6 +305,7 @@ def evaluate(args, model, tokenizer, prefix=""): output_nbest_file, output_null_log_odds_file, args.verbose_logging, args.version_2_with_negative, args.null_score_diff_threshold) + # Compute the F1 and exact scores. results = squad_evaluate(examples, predictions) return results @@ -308,7 +318,10 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal cached_features_file = os.path.join(input_dir, 'cached_{}_{}_{}'.format( 'dev' if evaluate else 'train', list(filter(None, args.model_name_or_path.split('/'))).pop(), - str(args.max_seq_length))) + str(args.max_seq_length)) + ) + + # Init features and dataset from cache if it exists if os.path.exists(cached_features_file) and not args.overwrite_cache and not output_examples: logger.info("Loading features from cached file %s", cached_features_file) features_and_dataset = torch.load(cached_features_file) @@ -341,7 +354,6 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal return_dataset='pt' ) - if args.local_rank in [-1, 0]: logger.info("Saving features into cached file %s", cached_features_file) torch.save({"features": features, "dataset": dataset}, cached_features_file) @@ -452,6 +464,11 @@ def main(): parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") args = parser.parse_args() + args.predict_file = os.path.join(args.output_dir, 'predictions_{}_{}.txt'.format( + list(filter(None, args.model_name_or_path.split('/'))).pop(), + str(args.max_seq_length)) + ) + if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir: raise ValueError("Output directory ({}) already exists and is not empty. 
Use --overwrite_output_dir to overcome.".format(args.output_dir)) diff --git a/examples/utils_squad.py b/examples/utils_squad.py deleted file mode 100644 index 4f1c581588..0000000000 --- a/examples/utils_squad.py +++ /dev/null @@ -1,1017 +0,0 @@ - -# coding=utf-8 -# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. -# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Load SQuAD dataset. """ - -from __future__ import absolute_import, division, print_function - -import json -import logging -import math -import collections -from io import open -from tqdm import tqdm - -from transformers.tokenization_bert import BasicTokenizer, whitespace_tokenize - -# Required by XLNet evaluation method to compute optimal threshold (see write_predictions_extended() method) -from utils_squad_evaluate import find_all_best_thresh_v2, make_qid_to_has_ans, get_raw_scores - -logger = logging.getLogger(__name__) - - -class SquadExample(object): - """ - A single training/test example for the Squad dataset. - For examples without an answer, the start and end position are -1. 
- """ - - def __init__(self, - qas_id, - question_text, - doc_tokens, - orig_answer_text=None, - start_position=None, - end_position=None, - is_impossible=None): - self.qas_id = qas_id - self.question_text = question_text - self.doc_tokens = doc_tokens - self.orig_answer_text = orig_answer_text - self.start_position = start_position - self.end_position = end_position - self.is_impossible = is_impossible - - def __str__(self): - return self.__repr__() - - def __repr__(self): - s = "" - s += "qas_id: %s" % (self.qas_id) - s += ", question_text: %s" % ( - self.question_text) - s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens)) - if self.start_position: - s += ", start_position: %d" % (self.start_position) - if self.end_position: - s += ", end_position: %d" % (self.end_position) - if self.is_impossible: - s += ", is_impossible: %r" % (self.is_impossible) - return s - - -class InputFeatures(object): - """A single set of features of data.""" - - def __init__(self, - unique_id, - example_index, - doc_span_index, - tokens, - token_to_orig_map, - token_is_max_context, - input_ids, - input_mask, - segment_ids, - cls_index, - p_mask, - paragraph_len, - start_position=None, - end_position=None, - is_impossible=None): - self.unique_id = unique_id - self.example_index = example_index - self.doc_span_index = doc_span_index - self.tokens = tokens - self.token_to_orig_map = token_to_orig_map - self.token_is_max_context = token_is_max_context - self.input_ids = input_ids - self.input_mask = input_mask - self.segment_ids = segment_ids - self.cls_index = cls_index - self.p_mask = p_mask - self.paragraph_len = paragraph_len - self.start_position = start_position - self.end_position = end_position - self.is_impossible = is_impossible - - -def read_squad_examples(input_file, is_training, version_2_with_negative): - """Read a SQuAD json file into a list of SquadExample.""" - with open(input_file, "r", encoding='utf-8') as reader: - input_data = json.load(reader)["data"] - - def 
is_whitespace(c): - if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F: - return True - return False - - examples = [] - for entry in input_data: - for paragraph in entry["paragraphs"]: - paragraph_text = paragraph["context"] - doc_tokens = [] - char_to_word_offset = [] - prev_is_whitespace = True - for c in paragraph_text: - if is_whitespace(c): - prev_is_whitespace = True - else: - if prev_is_whitespace: - doc_tokens.append(c) - else: - doc_tokens[-1] += c - prev_is_whitespace = False - char_to_word_offset.append(len(doc_tokens) - 1) - - for qa in paragraph["qas"]: - qas_id = qa["id"] - question_text = qa["question"] - start_position = None - end_position = None - orig_answer_text = None - is_impossible = False - if is_training: - if version_2_with_negative: - is_impossible = qa["is_impossible"] - if (len(qa["answers"]) != 1) and (not is_impossible): - raise ValueError( - "For training, each question should have exactly 1 answer.") - if not is_impossible: - answer = qa["answers"][0] - orig_answer_text = answer["text"] - answer_offset = answer["answer_start"] - answer_length = len(orig_answer_text) - start_position = char_to_word_offset[answer_offset] - end_position = char_to_word_offset[answer_offset + answer_length - 1] - # Only add answers where the text can be exactly recovered from the - # document. If this CAN'T happen it's likely due to weird Unicode - # stuff so we will just skip the example. - # - # Note that this means for training mode, every example is NOT - # guaranteed to be preserved. - actual_text = " ".join(doc_tokens[start_position:(end_position + 1)]) - cleaned_answer_text = " ".join( - whitespace_tokenize(orig_answer_text)) - if actual_text.find(cleaned_answer_text) == -1: - logger.warning("Could not find answer: '%s' vs. 
'%s'", - actual_text, cleaned_answer_text) - continue - else: - start_position = -1 - end_position = -1 - orig_answer_text = "" - - example = SquadExample( - qas_id=qas_id, - question_text=question_text, - doc_tokens=doc_tokens, - orig_answer_text=orig_answer_text, - start_position=start_position, - end_position=end_position, - is_impossible=is_impossible) - examples.append(example) - return examples - - -def convert_examples_to_features(examples, tokenizer, max_seq_length, - doc_stride, max_query_length, is_training, - cls_token_at_end=False, - cls_token='[CLS]', sep_token='[SEP]', pad_token=0, - sequence_a_segment_id=0, sequence_b_segment_id=1, - cls_token_segment_id=0, pad_token_segment_id=0, - mask_padding_with_zero=True, - sequence_a_is_doc=False): - """Loads a data file into a list of `InputBatch`s.""" - - unique_id = 1000000000 - # cnt_pos, cnt_neg = 0, 0 - # max_N, max_M = 1024, 1024 - # f = np.zeros((max_N, max_M), dtype=np.float32) - - features = [] - for (example_index, example) in enumerate(tqdm(examples)): - - # if example_index % 100 == 0: - # logger.info('Converting %s/%s pos %s neg %s', example_index, len(examples), cnt_pos, cnt_neg) - - query_tokens = tokenizer.tokenize(example.question_text) - - if len(query_tokens) > max_query_length: - query_tokens = query_tokens[0:max_query_length] - - tok_to_orig_index = [] - orig_to_tok_index = [] - all_doc_tokens = [] - for (i, token) in enumerate(example.doc_tokens): - orig_to_tok_index.append(len(all_doc_tokens)) - sub_tokens = tokenizer.tokenize(token) - for sub_token in sub_tokens: - tok_to_orig_index.append(i) - all_doc_tokens.append(sub_token) - - tok_start_position = None - tok_end_position = None - if is_training and example.is_impossible: - tok_start_position = -1 - tok_end_position = -1 - if is_training and not example.is_impossible: - tok_start_position = orig_to_tok_index[example.start_position] - if example.end_position < len(example.doc_tokens) - 1: - tok_end_position = 
orig_to_tok_index[example.end_position + 1] - 1 - else: - tok_end_position = len(all_doc_tokens) - 1 - (tok_start_position, tok_end_position) = _improve_answer_span( - all_doc_tokens, tok_start_position, tok_end_position, tokenizer, - example.orig_answer_text) - - # The -3 accounts for [CLS], [SEP] and [SEP] - max_tokens_for_doc = max_seq_length - len(query_tokens) - 3 - assert max_tokens_for_doc > 0 - - # We can have documents that are longer than the maximum sequence length. - # To deal with this we do a sliding window approach, where we take chunks - # of the up to our max length with a stride of `doc_stride`. - _DocSpan = collections.namedtuple( # pylint: disable=invalid-name - "DocSpan", ["start", "length"]) - doc_spans = [] - start_offset = 0 - while start_offset < len(all_doc_tokens): - length = len(all_doc_tokens) - start_offset - if length > max_tokens_for_doc: - length = max_tokens_for_doc - doc_spans.append(_DocSpan(start=start_offset, length=length)) - if start_offset + length == len(all_doc_tokens): - break - start_offset += min(length, doc_stride) - - for (doc_span_index, doc_span) in enumerate(doc_spans): - tokens = [] - token_to_orig_map = {} - token_is_max_context = {} - segment_ids = [] - - # p_mask: mask with 1 for token than cannot be in the answer (0 for token which can be in an answer) - # Original TF implem also keep the classification token (set to 0) (not sure why...) 
- p_mask = [] - - # CLS token at the beginning - if not cls_token_at_end: - tokens.append(cls_token) - segment_ids.append(cls_token_segment_id) - p_mask.append(0) - cls_index = 0 - - # XLNet: P SEP Q SEP CLS - # Others: CLS Q SEP P SEP - if not sequence_a_is_doc: - # Query - tokens += query_tokens - segment_ids += [sequence_a_segment_id] * len(query_tokens) - p_mask += [1] * len(query_tokens) - - # SEP token - tokens.append(sep_token) - segment_ids.append(sequence_a_segment_id) - p_mask.append(1) - - # Paragraph - for i in range(doc_span.length): - split_token_index = doc_span.start + i - token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index] - - is_max_context = _check_is_max_context(doc_spans, doc_span_index, - split_token_index) - token_is_max_context[len(tokens)] = is_max_context - tokens.append(all_doc_tokens[split_token_index]) - if not sequence_a_is_doc: - segment_ids.append(sequence_b_segment_id) - else: - segment_ids.append(sequence_a_segment_id) - p_mask.append(0) - paragraph_len = doc_span.length - - if sequence_a_is_doc: - # SEP token - tokens.append(sep_token) - segment_ids.append(sequence_a_segment_id) - p_mask.append(1) - - tokens += query_tokens - segment_ids += [sequence_b_segment_id] * len(query_tokens) - p_mask += [1] * len(query_tokens) - - # SEP token - tokens.append(sep_token) - segment_ids.append(sequence_b_segment_id) - p_mask.append(1) - - # CLS token at the end - if cls_token_at_end: - tokens.append(cls_token) - segment_ids.append(cls_token_segment_id) - p_mask.append(0) - cls_index = len(tokens) - 1 # Index of classification token - - input_ids = tokenizer.convert_tokens_to_ids(tokens) - - # The mask has 1 for real tokens and 0 for padding tokens. Only real - # tokens are attended to. - input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids) - - # Zero-pad up to the sequence length. 
- while len(input_ids) < max_seq_length: - input_ids.append(pad_token) - input_mask.append(0 if mask_padding_with_zero else 1) - segment_ids.append(pad_token_segment_id) - p_mask.append(1) - - assert len(input_ids) == max_seq_length - assert len(input_mask) == max_seq_length - assert len(segment_ids) == max_seq_length - - span_is_impossible = example.is_impossible - start_position = None - end_position = None - if is_training and not span_is_impossible: - # For training, if our document chunk does not contain an annotation - # we throw it out, since there is nothing to predict. - doc_start = doc_span.start - doc_end = doc_span.start + doc_span.length - 1 - out_of_span = False - if not (tok_start_position >= doc_start and - tok_end_position <= doc_end): - out_of_span = True - if out_of_span: - start_position = 0 - end_position = 0 - span_is_impossible = True - else: - if sequence_a_is_doc: - doc_offset = 0 - else: - doc_offset = len(query_tokens) + 2 - start_position = tok_start_position - doc_start + doc_offset - end_position = tok_end_position - doc_start + doc_offset - - if is_training and span_is_impossible: - start_position = cls_index - end_position = cls_index - - if example_index < 20: - logger.info("*** Example ***") - logger.info("unique_id: %s" % (unique_id)) - logger.info("example_index: %s" % (example_index)) - logger.info("doc_span_index: %s" % (doc_span_index)) - logger.info("tokens: %s" % " ".join(tokens)) - logger.info("token_to_orig_map: %s" % " ".join([ - "%d:%d" % (x, y) for (x, y) in token_to_orig_map.items()])) - logger.info("token_is_max_context: %s" % " ".join([ - "%d:%s" % (x, y) for (x, y) in token_is_max_context.items() - ])) - logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) - logger.info( - "input_mask: %s" % " ".join([str(x) for x in input_mask])) - logger.info( - "segment_ids: %s" % " ".join([str(x) for x in segment_ids])) - if is_training and span_is_impossible: - logger.info("impossible example") - if is_training 
and not span_is_impossible: - answer_text = " ".join(tokens[start_position:(end_position + 1)]) - logger.info("start_position: %d" % (start_position)) - logger.info("end_position: %d" % (end_position)) - logger.info( - "answer: %s" % (answer_text)) - - features.append( - InputFeatures( - unique_id=unique_id, - example_index=example_index, - doc_span_index=doc_span_index, - tokens=tokens, - token_to_orig_map=token_to_orig_map, - token_is_max_context=token_is_max_context, - input_ids=input_ids, - input_mask=input_mask, - segment_ids=segment_ids, - cls_index=cls_index, - p_mask=p_mask, - paragraph_len=paragraph_len, - start_position=start_position, - end_position=end_position, - is_impossible=span_is_impossible)) - unique_id += 1 - - return features - - -def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer, - orig_answer_text): - """Returns tokenized answer spans that better match the annotated answer.""" - - # The SQuAD annotations are character based. We first project them to - # whitespace-tokenized words. But then after WordPiece tokenization, we can - # often find a "better match". For example: - # - # Question: What year was John Smith born? - # Context: The leader was John Smith (1895-1943). - # Answer: 1895 - # - # The original whitespace-tokenized answer will be "(1895-1943).". However - # after tokenization, our tokens will be "( 1895 - 1943 ) .". So we can match - # the exact answer, 1895. - # - # However, this is not always possible. Consider the following: - # - # Question: What country is the top exporter of electornics? - # Context: The Japanese electronics industry is the lagest in the world. - # Answer: Japan - # - # In this case, the annotator chose "Japan" as a character sub-span of - # the word "Japanese". Since our WordPiece tokenizer does not split - # "Japanese", we just use "Japanese" as the annotation. This is fairly rare - # in SQuAD, but does happen. 
- tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text)) - - for new_start in range(input_start, input_end + 1): - for new_end in range(input_end, new_start - 1, -1): - text_span = " ".join(doc_tokens[new_start:(new_end + 1)]) - if text_span == tok_answer_text: - return (new_start, new_end) - - return (input_start, input_end) - - -def _check_is_max_context(doc_spans, cur_span_index, position): - """Check if this is the 'max context' doc span for the token.""" - - # Because of the sliding window approach taken to scoring documents, a single - # token can appear in multiple documents. E.g. - # Doc: the man went to the store and bought a gallon of milk - # Span A: the man went to the - # Span B: to the store and bought - # Span C: and bought a gallon of - # ... - # - # Now the word 'bought' will have two scores from spans B and C. We only - # want to consider the score with "maximum context", which we define as - # the *minimum* of its left and right context (the *sum* of left and - # right context will always be the same, of course). - # - # In the example the maximum context for 'bought' would be span C since - # it has 1 left context and 3 right context, while span B has 4 left context - # and 0 right context. 
- best_score = None - best_span_index = None - for (span_index, doc_span) in enumerate(doc_spans): - end = doc_span.start + doc_span.length - 1 - if position < doc_span.start: - continue - if position > end: - continue - num_left_context = position - doc_span.start - num_right_context = end - position - score = min(num_left_context, num_right_context) + 0.01 * doc_span.length - if best_score is None or score > best_score: - best_score = score - best_span_index = span_index - - return cur_span_index == best_span_index - - -RawResult = collections.namedtuple("RawResult", - ["unique_id", "start_logits", "end_logits"]) - -def write_predictions(all_examples, all_features, all_results, n_best_size, - max_answer_length, do_lower_case, output_prediction_file, - output_nbest_file, output_null_log_odds_file, verbose_logging, - version_2_with_negative, null_score_diff_threshold): - """Write final predictions to the json file and log-odds of null if needed.""" - logger.info("Writing predictions to: %s" % (output_prediction_file)) - logger.info("Writing nbest to: %s" % (output_nbest_file)) - - example_index_to_features = collections.defaultdict(list) - for feature in all_features: - example_index_to_features[feature.example_index].append(feature) - - unique_id_to_result = {} - for result in all_results: - unique_id_to_result[result.unique_id] = result - - _PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name - "PrelimPrediction", - ["feature_index", "start_index", "end_index", "start_logit", "end_logit"]) - - all_predictions = collections.OrderedDict() - all_nbest_json = collections.OrderedDict() - scores_diff_json = collections.OrderedDict() - - for (example_index, example) in enumerate(all_examples): - features = example_index_to_features[example_index] - - prelim_predictions = [] - # keep track of the minimum score of null start+end of position 0 - score_null = 1000000 # large and positive - min_null_feature_index = 0 # the paragraph slice with min null 
score - null_start_logit = 0 # the start logit at the slice with min null score - null_end_logit = 0 # the end logit at the slice with min null score - for (feature_index, feature) in enumerate(features): - result = unique_id_to_result[feature.unique_id] - start_indexes = _get_best_indexes(result.start_logits, n_best_size) - end_indexes = _get_best_indexes(result.end_logits, n_best_size) - # if we could have irrelevant answers, get the min score of irrelevant - if version_2_with_negative: - feature_null_score = result.start_logits[0] + result.end_logits[0] - if feature_null_score < score_null: - score_null = feature_null_score - min_null_feature_index = feature_index - null_start_logit = result.start_logits[0] - null_end_logit = result.end_logits[0] - for start_index in start_indexes: - for end_index in end_indexes: - # We could hypothetically create invalid predictions, e.g., predict - # that the start of the span is in the question. We throw out all - # invalid predictions. - if start_index >= len(feature.tokens): - continue - if end_index >= len(feature.tokens): - continue - if start_index not in feature.token_to_orig_map: - continue - if end_index not in feature.token_to_orig_map: - continue - if not feature.token_is_max_context.get(start_index, False): - continue - if end_index < start_index: - continue - length = end_index - start_index + 1 - if length > max_answer_length: - continue - prelim_predictions.append( - _PrelimPrediction( - feature_index=feature_index, - start_index=start_index, - end_index=end_index, - start_logit=result.start_logits[start_index], - end_logit=result.end_logits[end_index])) - if version_2_with_negative: - prelim_predictions.append( - _PrelimPrediction( - feature_index=min_null_feature_index, - start_index=0, - end_index=0, - start_logit=null_start_logit, - end_logit=null_end_logit)) - prelim_predictions = sorted( - prelim_predictions, - key=lambda x: (x.start_logit + x.end_logit), - reverse=True) - - _NbestPrediction = 
collections.namedtuple( # pylint: disable=invalid-name - "NbestPrediction", ["text", "start_logit", "end_logit"]) - - seen_predictions = {} - nbest = [] - for pred in prelim_predictions: - if len(nbest) >= n_best_size: - break - feature = features[pred.feature_index] - if pred.start_index > 0: # this is a non-null prediction - tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1)] - orig_doc_start = feature.token_to_orig_map[pred.start_index] - orig_doc_end = feature.token_to_orig_map[pred.end_index] - orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end + 1)] - tok_text = " ".join(tok_tokens) - - # De-tokenize WordPieces that have been split off. - tok_text = tok_text.replace(" ##", "") - tok_text = tok_text.replace("##", "") - - # Clean whitespace - tok_text = tok_text.strip() - tok_text = " ".join(tok_text.split()) - orig_text = " ".join(orig_tokens) - - final_text = get_final_text(tok_text, orig_text, do_lower_case, verbose_logging) - if final_text in seen_predictions: - continue - - seen_predictions[final_text] = True - else: - final_text = "" - seen_predictions[final_text] = True - - nbest.append( - _NbestPrediction( - text=final_text, - start_logit=pred.start_logit, - end_logit=pred.end_logit)) - # if we didn't include the empty option in the n-best, include it - if version_2_with_negative: - if "" not in seen_predictions: - nbest.append( - _NbestPrediction( - text="", - start_logit=null_start_logit, - end_logit=null_end_logit)) - - # In very rare edge cases we could only have single null prediction. - # So we just create a nonce prediction in this case to avoid failure. - if len(nbest)==1: - nbest.insert(0, - _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0)) - - # In very rare edge cases we could have no valid predictions. So we - # just create a nonce prediction in this case to avoid failure. 
- if not nbest: - nbest.append( - _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0)) - - assert len(nbest) >= 1 - - total_scores = [] - best_non_null_entry = None - for entry in nbest: - total_scores.append(entry.start_logit + entry.end_logit) - if not best_non_null_entry: - if entry.text: - best_non_null_entry = entry - - probs = _compute_softmax(total_scores) - - nbest_json = [] - for (i, entry) in enumerate(nbest): - output = collections.OrderedDict() - output["text"] = entry.text - output["probability"] = probs[i] - output["start_logit"] = entry.start_logit - output["end_logit"] = entry.end_logit - nbest_json.append(output) - - assert len(nbest_json) >= 1 - - if not version_2_with_negative: - all_predictions[example.qas_id] = nbest_json[0]["text"] - else: - # predict "" iff the null score - the score of best non-null > threshold - score_diff = score_null - best_non_null_entry.start_logit - ( - best_non_null_entry.end_logit) - scores_diff_json[example.qas_id] = score_diff - if score_diff > null_score_diff_threshold: - all_predictions[example.qas_id] = "" - else: - all_predictions[example.qas_id] = best_non_null_entry.text - all_nbest_json[example.qas_id] = nbest_json - - with open(output_prediction_file, "w") as writer: - writer.write(json.dumps(all_predictions, indent=4) + "\n") - - with open(output_nbest_file, "w") as writer: - writer.write(json.dumps(all_nbest_json, indent=4) + "\n") - - if version_2_with_negative: - with open(output_null_log_odds_file, "w") as writer: - writer.write(json.dumps(scores_diff_json, indent=4) + "\n") - - return all_predictions - - -# For XLNet (and XLM which uses the same head) -RawResultExtended = collections.namedtuple("RawResultExtended", - ["unique_id", "start_top_log_probs", "start_top_index", - "end_top_log_probs", "end_top_index", "cls_logits"]) - - -def write_predictions_extended(all_examples, all_features, all_results, n_best_size, - max_answer_length, output_prediction_file, - output_nbest_file, - 
output_null_log_odds_file, orig_data_file, - start_n_top, end_n_top, version_2_with_negative, - tokenizer, verbose_logging): - """ XLNet write prediction logic (more complex than Bert's). - Write final predictions to the json file and log-odds of null if needed. - - Requires utils_squad_evaluate.py - """ - _PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name - "PrelimPrediction", - ["feature_index", "start_index", "end_index", - "start_log_prob", "end_log_prob"]) - - _NbestPrediction = collections.namedtuple( # pylint: disable=invalid-name - "NbestPrediction", ["text", "start_log_prob", "end_log_prob"]) - - logger.info("Writing predictions to: %s", output_prediction_file) - # logger.info("Writing nbest to: %s" % (output_nbest_file)) - - example_index_to_features = collections.defaultdict(list) - for feature in all_features: - example_index_to_features[feature.example_index].append(feature) - - unique_id_to_result = {} - for result in all_results: - unique_id_to_result[result.unique_id] = result - - all_predictions = collections.OrderedDict() - all_nbest_json = collections.OrderedDict() - scores_diff_json = collections.OrderedDict() - - for (example_index, example) in enumerate(all_examples): - features = example_index_to_features[example_index] - - prelim_predictions = [] - # keep track of the minimum score of null start+end of position 0 - score_null = 1000000 # large and positive - - for (feature_index, feature) in enumerate(features): - result = unique_id_to_result[feature.unique_id] - - cur_null_score = result.cls_logits - - # if we could have irrelevant answers, get the min score of irrelevant - score_null = min(score_null, cur_null_score) - - for i in range(start_n_top): - for j in range(end_n_top): - start_log_prob = result.start_top_log_probs[i] - start_index = result.start_top_index[i] - - j_index = i * end_n_top + j - - end_log_prob = result.end_top_log_probs[j_index] - end_index = result.end_top_index[j_index] - - # We could 
hypothetically create invalid predictions, e.g., predict - # that the start of the span is in the question. We throw out all - # invalid predictions. - if start_index >= feature.paragraph_len - 1: - continue - if end_index >= feature.paragraph_len - 1: - continue - - if not feature.token_is_max_context.get(start_index, False): - continue - if end_index < start_index: - continue - length = end_index - start_index + 1 - if length > max_answer_length: - continue - - prelim_predictions.append( - _PrelimPrediction( - feature_index=feature_index, - start_index=start_index, - end_index=end_index, - start_log_prob=start_log_prob, - end_log_prob=end_log_prob)) - - prelim_predictions = sorted( - prelim_predictions, - key=lambda x: (x.start_log_prob + x.end_log_prob), - reverse=True) - - seen_predictions = {} - nbest = [] - for pred in prelim_predictions: - if len(nbest) >= n_best_size: - break - feature = features[pred.feature_index] - - # XLNet un-tokenizer - # Let's keep it simple for now and see if we need all this later. 
- # - # tok_start_to_orig_index = feature.tok_start_to_orig_index - # tok_end_to_orig_index = feature.tok_end_to_orig_index - # start_orig_pos = tok_start_to_orig_index[pred.start_index] - # end_orig_pos = tok_end_to_orig_index[pred.end_index] - # paragraph_text = example.paragraph_text - # final_text = paragraph_text[start_orig_pos: end_orig_pos + 1].strip() - - # Previously used Bert untokenizer - tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1)] - orig_doc_start = feature.token_to_orig_map[pred.start_index] - orig_doc_end = feature.token_to_orig_map[pred.end_index] - orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end + 1)] - tok_text = tokenizer.convert_tokens_to_string(tok_tokens) - - # Clean whitespace - tok_text = tok_text.strip() - tok_text = " ".join(tok_text.split()) - orig_text = " ".join(orig_tokens) - - final_text = get_final_text(tok_text, orig_text, tokenizer.do_lower_case, - verbose_logging) - - if final_text in seen_predictions: - continue - - seen_predictions[final_text] = True - - nbest.append( - _NbestPrediction( - text=final_text, - start_log_prob=pred.start_log_prob, - end_log_prob=pred.end_log_prob)) - - # In very rare edge cases we could have no valid predictions. So we - # just create a nonce prediction in this case to avoid failure. 
- if not nbest: - nbest.append( - _NbestPrediction(text="", start_log_prob=-1e6, - end_log_prob=-1e6)) - - total_scores = [] - best_non_null_entry = None - for entry in nbest: - total_scores.append(entry.start_log_prob + entry.end_log_prob) - if not best_non_null_entry: - best_non_null_entry = entry - - probs = _compute_softmax(total_scores) - - nbest_json = [] - for (i, entry) in enumerate(nbest): - output = collections.OrderedDict() - output["text"] = entry.text - output["probability"] = probs[i] - output["start_log_prob"] = entry.start_log_prob - output["end_log_prob"] = entry.end_log_prob - nbest_json.append(output) - - assert len(nbest_json) >= 1 - assert best_non_null_entry is not None - - score_diff = score_null - scores_diff_json[example.qas_id] = score_diff - # note(zhiliny): always predict best_non_null_entry - # and the evaluation script will search for the best threshold - all_predictions[example.qas_id] = best_non_null_entry.text - - all_nbest_json[example.qas_id] = nbest_json - - with open(output_prediction_file, "w") as writer: - writer.write(json.dumps(all_predictions, indent=4) + "\n") - - with open(output_nbest_file, "w") as writer: - writer.write(json.dumps(all_nbest_json, indent=4) + "\n") - - if version_2_with_negative: - with open(output_null_log_odds_file, "w") as writer: - writer.write(json.dumps(scores_diff_json, indent=4) + "\n") - - with open(orig_data_file, "r", encoding='utf-8') as reader: - orig_data = json.load(reader)["data"] - - qid_to_has_ans = make_qid_to_has_ans(orig_data) - has_ans_qids = [k for k, v in qid_to_has_ans.items() if v] - no_ans_qids = [k for k, v in qid_to_has_ans.items() if not v] - exact_raw, f1_raw = get_raw_scores(orig_data, all_predictions) - out_eval = {} - - find_all_best_thresh_v2(out_eval, all_predictions, exact_raw, f1_raw, scores_diff_json, qid_to_has_ans) - - return out_eval - - -def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False): - """Project the tokenized prediction back to 
the original text.""" - - # When we created the data, we kept track of the alignment between original - # (whitespace tokenized) tokens and our WordPiece tokenized tokens. So - # now `orig_text` contains the span of our original text corresponding to the - # span that we predicted. - # - # However, `orig_text` may contain extra characters that we don't want in - # our prediction. - # - # For example, let's say: - # pred_text = steve smith - # orig_text = Steve Smith's - # - # We don't want to return `orig_text` because it contains the extra "'s". - # - # We don't want to return `pred_text` because it's already been normalized - # (the SQuAD eval script also does punctuation stripping/lower casing but - # our tokenizer does additional normalization like stripping accent - # characters). - # - # What we really want to return is "Steve Smith". - # - # Therefore, we have to apply a semi-complicated alignment heuristic between - # `pred_text` and `orig_text` to get a character-to-character alignment. This - # can fail in certain cases in which case we just return `orig_text`. - - def _strip_spaces(text): - ns_chars = [] - ns_to_s_map = collections.OrderedDict() - for (i, c) in enumerate(text): - if c == " ": - continue - ns_to_s_map[len(ns_chars)] = i - ns_chars.append(c) - ns_text = "".join(ns_chars) - return (ns_text, ns_to_s_map) - - # We first tokenize `orig_text`, strip whitespace from the result - # and `pred_text`, and check if they are the same length. If they are - # NOT the same length, the heuristic has failed. If they are the same - # length, we assume the characters are one-to-one aligned. 
- tokenizer = BasicTokenizer(do_lower_case=do_lower_case) - - tok_text = " ".join(tokenizer.tokenize(orig_text)) - - start_position = tok_text.find(pred_text) - if start_position == -1: - if verbose_logging: - logger.info( - "Unable to find text: '%s' in '%s'" % (pred_text, orig_text)) - return orig_text - end_position = start_position + len(pred_text) - 1 - - (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text) - (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text) - - if len(orig_ns_text) != len(tok_ns_text): - if verbose_logging: - logger.info("Length not equal after stripping spaces: '%s' vs '%s'", - orig_ns_text, tok_ns_text) - return orig_text - - # We then project the characters in `pred_text` back to `orig_text` using - # the character-to-character alignment. - tok_s_to_ns_map = {} - for (i, tok_index) in tok_ns_to_s_map.items(): - tok_s_to_ns_map[tok_index] = i - - orig_start_position = None - if start_position in tok_s_to_ns_map: - ns_start_position = tok_s_to_ns_map[start_position] - if ns_start_position in orig_ns_to_s_map: - orig_start_position = orig_ns_to_s_map[ns_start_position] - - if orig_start_position is None: - if verbose_logging: - logger.info("Couldn't map start position") - return orig_text - - orig_end_position = None - if end_position in tok_s_to_ns_map: - ns_end_position = tok_s_to_ns_map[end_position] - if ns_end_position in orig_ns_to_s_map: - orig_end_position = orig_ns_to_s_map[ns_end_position] - - if orig_end_position is None: - if verbose_logging: - logger.info("Couldn't map end position") - return orig_text - - output_text = orig_text[orig_start_position:(orig_end_position + 1)] - return output_text - - -def _get_best_indexes(logits, n_best_size): - """Get the n-best logits from a list.""" - index_and_score = sorted(enumerate(logits), key=lambda x: x[1], reverse=True) - - best_indexes = [] - for i in range(len(index_and_score)): - if i >= n_best_size: - break - best_indexes.append(index_and_score[i][0]) - return 
best_indexes - - -def _compute_softmax(scores): - """Compute softmax probability over raw logits.""" - if not scores: - return [] - - max_score = None - for score in scores: - if max_score is None or score > max_score: - max_score = score - - exp_scores = [] - total_sum = 0.0 - for score in scores: - x = math.exp(score - max_score) - exp_scores.append(x) - total_sum += x - - probs = [] - for score in exp_scores: - probs.append(score / total_sum) - return probs diff --git a/examples/utils_squad_evaluate.py b/examples/utils_squad_evaluate.py deleted file mode 100644 index ed162e6fe6..0000000000 --- a/examples/utils_squad_evaluate.py +++ /dev/null @@ -1,330 +0,0 @@ -""" Official evaluation script for SQuAD version 2.0. - Modified by XLNet authors to update `find_best_threshold` scripts for SQuAD V2.0 - -In addition to basic functionality, we also compute additional statistics and -plot precision-recall curves if an additional na_prob.json file is provided. -This file is expected to map question ID's to the model's predicted probability -that a question is unanswerable. 
-""" -import argparse -import collections -import json -import numpy as np -import os -import re -import string -import sys - -class EVAL_OPTS(): - def __init__(self, data_file, pred_file, out_file="", - na_prob_file="na_prob.json", na_prob_thresh=1.0, - out_image_dir=None, verbose=False): - self.data_file = data_file - self.pred_file = pred_file - self.out_file = out_file - self.na_prob_file = na_prob_file - self.na_prob_thresh = na_prob_thresh - self.out_image_dir = out_image_dir - self.verbose = verbose - -OPTS = None - -def parse_args(): - parser = argparse.ArgumentParser('Official evaluation script for SQuAD version 2.0.') - parser.add_argument('data_file', metavar='data.json', help='Input data JSON file.') - parser.add_argument('pred_file', metavar='pred.json', help='Model predictions.') - parser.add_argument('--out-file', '-o', metavar='eval.json', - help='Write accuracy metrics to file (default is stdout).') - parser.add_argument('--na-prob-file', '-n', metavar='na_prob.json', - help='Model estimates of probability of no answer.') - parser.add_argument('--na-prob-thresh', '-t', type=float, default=1.0, - help='Predict "" if no-answer probability exceeds this (default = 1.0).') - parser.add_argument('--out-image-dir', '-p', metavar='out_images', default=None, - help='Save precision-recall curves to directory.') - parser.add_argument('--verbose', '-v', action='store_true') - if len(sys.argv) == 1: - parser.print_help() - sys.exit(1) - return parser.parse_args() - -def make_qid_to_has_ans(dataset): - qid_to_has_ans = {} - for article in dataset: - for p in article['paragraphs']: - for qa in p['qas']: - qid_to_has_ans[qa['id']] = bool(qa['answers']) - return qid_to_has_ans - -def normalize_answer(s): - """Lower text and remove punctuation, articles and extra whitespace.""" - def remove_articles(text): - regex = re.compile(r'\b(a|an|the)\b', re.UNICODE) - return re.sub(regex, ' ', text) - def white_space_fix(text): - return ' '.join(text.split()) - def 
remove_punc(text): - exclude = set(string.punctuation) - return ''.join(ch for ch in text if ch not in exclude) - def lower(text): - return text.lower() - return white_space_fix(remove_articles(remove_punc(lower(s)))) - -def get_tokens(s): - if not s: return [] - return normalize_answer(s).split() - -def compute_exact(a_gold, a_pred): - return int(normalize_answer(a_gold) == normalize_answer(a_pred)) - -def compute_f1(a_gold, a_pred): - gold_toks = get_tokens(a_gold) - pred_toks = get_tokens(a_pred) - common = collections.Counter(gold_toks) & collections.Counter(pred_toks) - num_same = sum(common.values()) - if len(gold_toks) == 0 or len(pred_toks) == 0: - # If either is no-answer, then F1 is 1 if they agree, 0 otherwise - return int(gold_toks == pred_toks) - if num_same == 0: - return 0 - precision = 1.0 * num_same / len(pred_toks) - recall = 1.0 * num_same / len(gold_toks) - f1 = (2 * precision * recall) / (precision + recall) - return f1 - -def get_raw_scores(dataset, preds): - exact_scores = {} - f1_scores = {} - for article in dataset: - for p in article['paragraphs']: - for qa in p['qas']: - qid = qa['id'] - gold_answers = [a['text'] for a in qa['answers'] - if normalize_answer(a['text'])] - if not gold_answers: - # For unanswerable questions, only correct answer is empty string - gold_answers = [''] - if qid not in preds: - print('Missing prediction for %s' % qid) - continue - a_pred = preds[qid] - # Take max over all gold answers - exact_scores[qid] = max(compute_exact(a, a_pred) for a in gold_answers) - f1_scores[qid] = max(compute_f1(a, a_pred) for a in gold_answers) - return exact_scores, f1_scores - -def apply_no_ans_threshold(scores, na_probs, qid_to_has_ans, na_prob_thresh): - new_scores = {} - for qid, s in scores.items(): - pred_na = na_probs[qid] > na_prob_thresh - if pred_na: - new_scores[qid] = float(not qid_to_has_ans[qid]) - else: - new_scores[qid] = s - return new_scores - -def make_eval_dict(exact_scores, f1_scores, qid_list=None): - if not 
qid_list: - total = len(exact_scores) - return collections.OrderedDict([ - ('exact', 100.0 * sum(exact_scores.values()) / total), - ('f1', 100.0 * sum(f1_scores.values()) / total), - ('total', total), - ]) - else: - total = len(qid_list) - return collections.OrderedDict([ - ('exact', 100.0 * sum(exact_scores[k] for k in qid_list) / total), - ('f1', 100.0 * sum(f1_scores[k] for k in qid_list) / total), - ('total', total), - ]) - -def merge_eval(main_eval, new_eval, prefix): - for k in new_eval: - main_eval['%s_%s' % (prefix, k)] = new_eval[k] - -def plot_pr_curve(precisions, recalls, out_image, title): - plt.step(recalls, precisions, color='b', alpha=0.2, where='post') - plt.fill_between(recalls, precisions, step='post', alpha=0.2, color='b') - plt.xlabel('Recall') - plt.ylabel('Precision') - plt.xlim([0.0, 1.05]) - plt.ylim([0.0, 1.05]) - plt.title(title) - plt.savefig(out_image) - plt.clf() - -def make_precision_recall_eval(scores, na_probs, num_true_pos, qid_to_has_ans, - out_image=None, title=None): - qid_list = sorted(na_probs, key=lambda k: na_probs[k]) - true_pos = 0.0 - cur_p = 1.0 - cur_r = 0.0 - precisions = [1.0] - recalls = [0.0] - avg_prec = 0.0 - for i, qid in enumerate(qid_list): - if qid_to_has_ans[qid]: - true_pos += scores[qid] - cur_p = true_pos / float(i+1) - cur_r = true_pos / float(num_true_pos) - if i == len(qid_list) - 1 or na_probs[qid] != na_probs[qid_list[i+1]]: - # i.e., if we can put a threshold after this point - avg_prec += cur_p * (cur_r - recalls[-1]) - precisions.append(cur_p) - recalls.append(cur_r) - if out_image: - plot_pr_curve(precisions, recalls, out_image, title) - return {'ap': 100.0 * avg_prec} - -def run_precision_recall_analysis(main_eval, exact_raw, f1_raw, na_probs, - qid_to_has_ans, out_image_dir): - if out_image_dir and not os.path.exists(out_image_dir): - os.makedirs(out_image_dir) - num_true_pos = sum(1 for v in qid_to_has_ans.values() if v) - if num_true_pos == 0: - return - pr_exact = make_precision_recall_eval( - 
exact_raw, na_probs, num_true_pos, qid_to_has_ans, - out_image=os.path.join(out_image_dir, 'pr_exact.png'), - title='Precision-Recall curve for Exact Match score') - pr_f1 = make_precision_recall_eval( - f1_raw, na_probs, num_true_pos, qid_to_has_ans, - out_image=os.path.join(out_image_dir, 'pr_f1.png'), - title='Precision-Recall curve for F1 score') - oracle_scores = {k: float(v) for k, v in qid_to_has_ans.items()} - pr_oracle = make_precision_recall_eval( - oracle_scores, na_probs, num_true_pos, qid_to_has_ans, - out_image=os.path.join(out_image_dir, 'pr_oracle.png'), - title='Oracle Precision-Recall curve (binary task of HasAns vs. NoAns)') - merge_eval(main_eval, pr_exact, 'pr_exact') - merge_eval(main_eval, pr_f1, 'pr_f1') - merge_eval(main_eval, pr_oracle, 'pr_oracle') - -def histogram_na_prob(na_probs, qid_list, image_dir, name): - if not qid_list: - return - x = [na_probs[k] for k in qid_list] - weights = np.ones_like(x) / float(len(x)) - plt.hist(x, weights=weights, bins=20, range=(0.0, 1.0)) - plt.xlabel('Model probability of no-answer') - plt.ylabel('Proportion of dataset') - plt.title('Histogram of no-answer probability: %s' % name) - plt.savefig(os.path.join(image_dir, 'na_prob_hist_%s.png' % name)) - plt.clf() - -def find_best_thresh(preds, scores, na_probs, qid_to_has_ans): - num_no_ans = sum(1 for k in qid_to_has_ans if not qid_to_has_ans[k]) - cur_score = num_no_ans - best_score = cur_score - best_thresh = 0.0 - qid_list = sorted(na_probs, key=lambda k: na_probs[k]) - for i, qid in enumerate(qid_list): - if qid not in scores: continue - if qid_to_has_ans[qid]: - diff = scores[qid] - else: - if preds[qid]: - diff = -1 - else: - diff = 0 - cur_score += diff - if cur_score > best_score: - best_score = cur_score - best_thresh = na_probs[qid] - return 100.0 * best_score / len(scores), best_thresh - -def find_best_thresh_v2(preds, scores, na_probs, qid_to_has_ans): - num_no_ans = sum(1 for k in qid_to_has_ans if not qid_to_has_ans[k]) - cur_score = 
num_no_ans - best_score = cur_score - best_thresh = 0.0 - qid_list = sorted(na_probs, key=lambda k: na_probs[k]) - for i, qid in enumerate(qid_list): - if qid not in scores: continue - if qid_to_has_ans[qid]: - diff = scores[qid] - else: - if preds[qid]: - diff = -1 - else: - diff = 0 - cur_score += diff - if cur_score > best_score: - best_score = cur_score - best_thresh = na_probs[qid] - - has_ans_score, has_ans_cnt = 0, 0 - for qid in qid_list: - if not qid_to_has_ans[qid]: continue - has_ans_cnt += 1 - - if qid not in scores: continue - has_ans_score += scores[qid] - - return 100.0 * best_score / len(scores), best_thresh, 1.0 * has_ans_score / has_ans_cnt - -def find_all_best_thresh(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans): - best_exact, exact_thresh = find_best_thresh(preds, exact_raw, na_probs, qid_to_has_ans) - best_f1, f1_thresh = find_best_thresh(preds, f1_raw, na_probs, qid_to_has_ans) - main_eval['best_exact'] = best_exact - main_eval['best_exact_thresh'] = exact_thresh - main_eval['best_f1'] = best_f1 - main_eval['best_f1_thresh'] = f1_thresh - -def find_all_best_thresh_v2(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans): - best_exact, exact_thresh, has_ans_exact = find_best_thresh_v2(preds, exact_raw, na_probs, qid_to_has_ans) - best_f1, f1_thresh, has_ans_f1 = find_best_thresh_v2(preds, f1_raw, na_probs, qid_to_has_ans) - main_eval['best_exact'] = best_exact - main_eval['best_exact_thresh'] = exact_thresh - main_eval['best_f1'] = best_f1 - main_eval['best_f1_thresh'] = f1_thresh - main_eval['has_ans_exact'] = has_ans_exact - main_eval['has_ans_f1'] = has_ans_f1 - -def main(OPTS): - with open(OPTS.data_file) as f: - dataset_json = json.load(f) - dataset = dataset_json['data'] - with open(OPTS.pred_file) as f: - preds = json.load(f) - if OPTS.na_prob_file: - with open(OPTS.na_prob_file) as f: - na_probs = json.load(f) - else: - na_probs = {k: 0.0 for k in preds} - qid_to_has_ans = make_qid_to_has_ans(dataset) # maps 
qid to True/False - has_ans_qids = [k for k, v in qid_to_has_ans.items() if v] - no_ans_qids = [k for k, v in qid_to_has_ans.items() if not v] - exact_raw, f1_raw = get_raw_scores(dataset, preds) - exact_thresh = apply_no_ans_threshold(exact_raw, na_probs, qid_to_has_ans, - OPTS.na_prob_thresh) - f1_thresh = apply_no_ans_threshold(f1_raw, na_probs, qid_to_has_ans, - OPTS.na_prob_thresh) - out_eval = make_eval_dict(exact_thresh, f1_thresh) - if has_ans_qids: - has_ans_eval = make_eval_dict(exact_thresh, f1_thresh, qid_list=has_ans_qids) - merge_eval(out_eval, has_ans_eval, 'HasAns') - if no_ans_qids: - no_ans_eval = make_eval_dict(exact_thresh, f1_thresh, qid_list=no_ans_qids) - merge_eval(out_eval, no_ans_eval, 'NoAns') - if OPTS.na_prob_file: - find_all_best_thresh(out_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans) - if OPTS.na_prob_file and OPTS.out_image_dir: - run_precision_recall_analysis(out_eval, exact_raw, f1_raw, na_probs, - qid_to_has_ans, OPTS.out_image_dir) - histogram_na_prob(na_probs, has_ans_qids, OPTS.out_image_dir, 'hasAns') - histogram_na_prob(na_probs, no_ans_qids, OPTS.out_image_dir, 'noAns') - if OPTS.out_file: - with open(OPTS.out_file, 'w') as f: - json.dump(out_eval, f) - else: - print(json.dumps(out_eval, indent=2)) - return out_eval - -if __name__ == '__main__': - OPTS = parse_args() - if OPTS.out_image_dir: - import matplotlib - matplotlib.use('Agg') - import matplotlib.pyplot as plt - main(OPTS) diff --git a/transformers/data/metrics/squad_metrics.py b/transformers/data/metrics/squad_metrics.py index f8449df045..0755c0ab7a 100644 --- a/transformers/data/metrics/squad_metrics.py +++ b/transformers/data/metrics/squad_metrics.py @@ -578,7 +578,6 @@ def compute_predictions_log_probs( output_prediction_file, output_nbest_file, output_null_log_odds_file, - orig_data_file, start_n_top, end_n_top, version_2_with_negative, @@ -756,15 +755,4 @@ def compute_predictions_log_probs( with open(output_null_log_odds_file, "w") as writer: 
writer.write(json.dumps(scores_diff_json, indent=4) + "\n") - with open(orig_data_file, "r", encoding='utf-8') as reader: - orig_data = json.load(reader)["data"] - - qid_to_has_ans = make_qid_to_has_ans(orig_data) - has_ans_qids = [k for k, v in qid_to_has_ans.items() if v] - no_ans_qids = [k for k, v in qid_to_has_ans.items() if not v] - exact_raw, f1_raw = get_raw_scores(orig_data, all_predictions) - out_eval = {} - - find_all_best_thresh_v2(out_eval, all_predictions, exact_raw, f1_raw, scores_diff_json, qid_to_has_ans) - - return out_eval + return all_predictions diff --git a/transformers/data/processors/squad.py b/transformers/data/processors/squad.py index bb56aa792f..3d7f832540 100644 --- a/transformers/data/processors/squad.py +++ b/transformers/data/processors/squad.py @@ -9,7 +9,7 @@ from ...tokenization_bert import BasicTokenizer, whitespace_tokenize from .utils import DataProcessor, InputExample, InputFeatures from ...file_utils import is_tf_available, is_torch_available -if is_torch_available: +if is_torch_available(): import torch from torch.utils.data import TensorDataset From 2a4ef098d65939d436e2a5efbb518fb807b6b1b6 Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Mon, 9 Dec 2019 10:46:47 -0500 Subject: [PATCH 26/26] Add ALBERT and XLM to SQuAD script --- examples/run_squad.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/examples/run_squad.py b/examples/run_squad.py index a8ac1d1b05..2df29014ef 100644 --- a/examples/run_squad.py +++ b/examples/run_squad.py @@ -44,7 +44,9 @@ from transformers import (WEIGHTS_NAME, BertConfig, XLNetForQuestionAnswering, XLNetTokenizer, DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer, - AlbertConfig, AlbertForQuestionAnswering, AlbertTokenizer) + AlbertConfig, AlbertForQuestionAnswering, AlbertTokenizer, + XLMConfig, XLMForQuestionAnswering, XLMTokenizer, + ) from transformers import AdamW, get_linear_schedule_with_warmup, squad_convert_examples_to_features @@ -58,7 
+60,8 @@ MODEL_CLASSES = { 'xlnet': (XLNetConfig, XLNetForQuestionAnswering, XLNetTokenizer), 'xlm': (XLMConfig, XLMForQuestionAnswering, XLMTokenizer), 'distilbert': (DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer), - 'albert': (AlbertConfig, AlbertForQuestionAnswering, AlbertTokenizer) + 'albert': (AlbertConfig, AlbertForQuestionAnswering, AlbertTokenizer), + 'xlm': (XLMConfig, XLMForQuestionAnswering, XLMTokenizer) } def set_seed(args):