From 9ecd83dace3961eaa161405814b00ea595c86451 Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Thu, 5 Dec 2019 14:44:57 -0500 Subject: [PATCH] Patch evaluation for impossible values + cleanup --- docs/source/main_classes/processors.rst | 4 ++-- examples/run_squad.py | 25 +++++-------------------- transformers/data/processors/squad.py | 6 +++--- transformers/tokenization_utils.py | 2 +- 4 files changed, 11 insertions(+), 26 deletions(-) diff --git a/docs/source/main_classes/processors.rst b/docs/source/main_classes/processors.rst index ce0eeb553a..e98910ae1b 100644 --- a/docs/source/main_classes/processors.rst +++ b/docs/source/main_classes/processors.rst @@ -55,7 +55,7 @@ Example usage ^^^^^^^^^^^^^^^^^^^^^^^^^ An example using these processors is given in the -`run_glue.py `__ script. +`run_glue.py `__ script. @@ -132,4 +132,4 @@ Example:: Another example using these processors is given in the -`run_squad.py `__ script. \ No newline at end of file +`run_squad.py `__ script. \ No newline at end of file diff --git a/examples/run_squad.py b/examples/run_squad.py index 3f1b6a798f..5caff9ae4f 100644 --- a/examples/run_squad.py +++ b/examples/run_squad.py @@ -311,7 +311,8 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal str(args.max_seq_length))) if os.path.exists(cached_features_file) and not args.overwrite_cache and not output_examples: logger.info("Loading features from cached file %s", cached_features_file) - features = torch.load(cached_features_file) + features_and_dataset = torch.load(cached_features_file) + features, dataset = features_and_dataset["features"], features_and_dataset["dataset"] else: logger.info("Creating features from dataset file at %s", input_dir) @@ -330,40 +331,24 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal processor = SquadV2Processor() if args.version_2_with_negative else SquadV1Processor() examples = processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir) - features = squad_convert_examples_to_features( + features, dataset = squad_convert_examples_to_features( examples=examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=not evaluate, + return_dataset='pt' ) if args.local_rank in [-1, 0]: logger.info("Saving features into cached file %s", cached_features_file) - torch.save(features, cached_features_file) + torch.save({"features": features, "dataset": dataset}, cached_features_file) if args.local_rank == 0 and not evaluate: torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache - # Convert to Tensors and build dataset - all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) - all_input_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long) - all_segment_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long) - all_cls_index = torch.tensor([f.cls_index for f in features], dtype=torch.long) - all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float) - if evaluate: - all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long) - dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, - all_example_index, all_cls_index, all_p_mask) - else: - all_start_positions = torch.tensor([f.start_position for f in features], dtype=torch.long) - all_end_positions = torch.tensor([f.end_position for f in features], dtype=torch.long) - dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, - all_start_positions, all_end_positions, - all_cls_index, all_p_mask) - if output_examples: return dataset, examples, features return dataset diff --git a/transformers/data/processors/squad.py b/transformers/data/processors/squad.py index 338bae0c51..bb56aa792f 100644 --- a/transformers/data/processors/squad.py +++ b/transformers/data/processors/squad.py @@ -312,7 +312,7 @@ class SquadProcessor(DataProcessor): if not evaluate: answer = tensor_dict['answers']['text'][0].numpy().decode('utf-8') answer_start = tensor_dict['answers']['answer_start'][0].numpy() - answers = None + answers = [] else: answers = [{ "answer_start": start.numpy(), @@ -408,7 +408,7 @@ class SquadProcessor(DataProcessor): question_text = qa["question"] start_position_character = None answer_text = None - answers = None + answers = [] if "is_impossible" in qa: is_impossible = qa["is_impossible"] @@ -469,7 +469,7 @@ class SquadExample(object): answer_text, start_position_character, title, - answers=None, + answers=[], is_impossible=False): self.qas_id = qas_id self.question_text = question_text diff --git a/transformers/tokenization_utils.py b/transformers/tokenization_utils.py index 41a611ea49..5ec173bbf6 100644 --- a/transformers/tokenization_utils.py +++ b/transformers/tokenization_utils.py @@ -194,7 +194,7 @@ class PreTrainedTokenizer(object): @property def pad_token_type_id(self): - """ Id of the padding token in the vocabulary. Log an error if used while not having been set. """ + """ Id of the padding token type in the vocabulary.""" return self._pad_token_type_id @property