From 4ed0fa3676ad8900eaa982a6c5c2ad6b75c8ea46 Mon Sep 17 00:00:00 2001 From: FilipposVentirozos <58438004+FilipposVentirozos@users.noreply.github.com> Date: Wed, 12 Oct 2022 13:33:44 +0100 Subject: [PATCH] Fix pytorch seq2seq qa (#19258) * Fixed the `--answer_column` typo (`answer` -> `answers`) in the SQuAD v2 README example * Fixed preprocess_validation_function so that the labels are repeated for every feature produced when a long context is truncated into overflowing spans * Rolled back the trainer_seq2seq_qa.py for UnboundLocalError: local variable 'metrics' referenced before assignment Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- examples/pytorch/question-answering/README.md | 2 +- .../question-answering/run_seq2seq_qa.py | 33 ++++++++++--------- 2 files changed, 18 insertions(+), 17 deletions(-) diff --git a/examples/pytorch/question-answering/README.md b/examples/pytorch/question-answering/README.md index f6e660e972..6b86a4effa 100644 --- a/examples/pytorch/question-answering/README.md +++ b/examples/pytorch/question-answering/README.md @@ -115,7 +115,7 @@ python run_seq2seq_qa.py \ --dataset_name squad_v2 \ --context_column context \ --question_column question \ - --answer_column answer \ + --answer_column answers \ --do_train \ --do_eval \ --per_device_train_batch_size 12 \ diff --git a/examples/pytorch/question-answering/run_seq2seq_qa.py b/examples/pytorch/question-answering/run_seq2seq_qa.py index a889870dab..0bf551d17a 100644 --- a/examples/pytorch/question-answering/run_seq2seq_qa.py +++ b/examples/pytorch/question-answering/run_seq2seq_qa.py @@ -484,26 +484,12 @@ def main(): max_length=max_seq_length, padding=padding, truncation=True, - return_offsets_mapping=True, return_overflowing_tokens=True, + return_offsets_mapping=True, ) - # Tokenize targets with the `text_target` keyword argument labels = tokenizer(text_target=targets, max_length=max_answer_length, padding=padding, truncation=True) - # Since one example might give us several features if it has a long context, we need a map from a feature to - # its corresponding 
example. This key gives us just that. - sample_mapping = model_inputs.pop("overflow_to_sample_mapping") - - # For evaluation, we will need to convert our predictions to substrings of the context, so we keep the - # corresponding example_id and we will store the offset mappings. - model_inputs["example_id"] = [] - - for i in range(len(model_inputs["input_ids"])): - # One example can give several spans, this is the index of the example containing this span of text. - sample_index = sample_mapping[i] - model_inputs["example_id"].append(examples["id"][sample_index]) - # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore # padding in the loss. if padding == "max_length" and data_args.ignore_pad_token_for_loss: @@ -511,8 +497,23 @@ def main(): [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"] ] - model_inputs["labels"] = labels["input_ids"] + # Since one example might give us several features if it has a long context, we need a map from a feature to + # its corresponding example. This key gives us just that. + sample_mapping = model_inputs.pop("overflow_to_sample_mapping") + # For evaluation, we will need to convert our predictions to substrings of the context, so we keep the + # corresponding example_id and we will store the offset mappings. + model_inputs["example_id"] = [] + # Augment the overflowing tokens to the labels + labels_out = [] + + for i in range(len(model_inputs["input_ids"])): + # One example can give several spans, this is the index of the example containing this span of text. + sample_index = sample_mapping[i] + model_inputs["example_id"].append(examples["id"][sample_index]) + labels_out.append(labels["input_ids"][sample_index]) + + model_inputs["labels"] = labels_out return model_inputs if training_args.do_train: