From 4ed0fa3676ad8900eaa982a6c5c2ad6b75c8ea46 Mon Sep 17 00:00:00 2001
From: FilipposVentirozos
 <58438004+FilipposVentirozos@users.noreply.github.com>
Date: Wed, 12 Oct 2022 13:33:44 +0100
Subject: [PATCH] Fix pytorch seq2seq qa (#19258)

* Fixed a typo in the SQuAD example command in the README (`--answer_column answers`)

* Fixed preprocess_validation_function so that each example's labels are repeated for every overflowing feature produced by truncation, keeping the labels aligned with the inputs

* Rolled back the changes to trainer_seq2seq_qa.py because of `UnboundLocalError: local variable 'metrics' referenced before assignment`

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
---
 examples/pytorch/question-answering/README.md |  2 +-
 .../question-answering/run_seq2seq_qa.py      | 33 ++++++++++---------
 2 files changed, 18 insertions(+), 17 deletions(-)

diff --git a/examples/pytorch/question-answering/README.md b/examples/pytorch/question-answering/README.md
index f6e660e972..6b86a4effa 100644
--- a/examples/pytorch/question-answering/README.md
+++ b/examples/pytorch/question-answering/README.md
@@ -115,7 +115,7 @@ python run_seq2seq_qa.py \
   --dataset_name squad_v2 \
   --context_column context \
   --question_column question \
-  --answer_column answer \
+  --answer_column answers \
   --do_train \
   --do_eval \
   --per_device_train_batch_size 12 \
diff --git a/examples/pytorch/question-answering/run_seq2seq_qa.py b/examples/pytorch/question-answering/run_seq2seq_qa.py
index a889870dab..0bf551d17a 100644
--- a/examples/pytorch/question-answering/run_seq2seq_qa.py
+++ b/examples/pytorch/question-answering/run_seq2seq_qa.py
@@ -484,26 +484,12 @@ def main():
             max_length=max_seq_length,
             padding=padding,
             truncation=True,
-            return_offsets_mapping=True,
             return_overflowing_tokens=True,
+            return_offsets_mapping=True,
         )
-
         # Tokenize targets with the `text_target` keyword argument
         labels = tokenizer(text_target=targets, max_length=max_answer_length, padding=padding, truncation=True)
 
-        # Since one example might give us several features if it has a long context, we need a map from a feature to
-        # its corresponding example. This key gives us just that.
-        sample_mapping = model_inputs.pop("overflow_to_sample_mapping")
-
-        # For evaluation, we will need to convert our predictions to substrings of the context, so we keep the
-        # corresponding example_id and we will store the offset mappings.
-        model_inputs["example_id"] = []
-
-        for i in range(len(model_inputs["input_ids"])):
-            # One example can give several spans, this is the index of the example containing this span of text.
-            sample_index = sample_mapping[i]
-            model_inputs["example_id"].append(examples["id"][sample_index])
-
         # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
         # padding in the loss.
         if padding == "max_length" and data_args.ignore_pad_token_for_loss:
@@ -511,8 +497,23 @@ def main():
                 [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
             ]
 
-        model_inputs["labels"] = labels["input_ids"]
+        # Since one example might give us several features if it has a long context, we need a map from a feature to
+        # its corresponding example. This key gives us just that.
+        sample_mapping = model_inputs.pop("overflow_to_sample_mapping")
 
+        # For evaluation, we will need to convert our predictions to substrings of the context, so we keep the
+        # corresponding example_id and we will store the offset mappings.
+        model_inputs["example_id"] = []
+        # Repeat each example's labels once per overflowing feature so labels stay aligned with the inputs
+        labels_out = []
+
+        for i in range(len(model_inputs["input_ids"])):
+            # One example can give several spans, this is the index of the example containing this span of text.
+            sample_index = sample_mapping[i]
+            model_inputs["example_id"].append(examples["id"][sample_index])
+            labels_out.append(labels["input_ids"][sample_index])
+
+        model_inputs["labels"] = labels_out
         return model_inputs
 
     if training_args.do_train: