From 964a1b6b7de8a83414917b5344f85d79bb0be808 Mon Sep 17 00:00:00 2001
From: jeffhataws <jthuynh@amazon.com>
Date: Tue, 22 Apr 2025 03:13:25 -0700
Subject: [PATCH] Fix ValueError when eval_do_concat_batches=False with
 examples (#37621)

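When `eval_do_concat_batches=False`, `compute_metrics` in the run_glue.py and
run_ner.py examples now joins the per-batch predictions and labels before
computing metrics.

Issue: https://github.com/huggingface/transformers/issues/37593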

Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>
---
 examples/pytorch/text-classification/run_glue.py | 6 +++++-
 examples/pytorch/token-classification/run_ner.py | 3 +++
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/examples/pytorch/text-classification/run_glue.py b/examples/pytorch/text-classification/run_glue.py
index c293038085..e152cd9911 100755
--- a/examples/pytorch/text-classification/run_glue.py
+++ b/examples/pytorch/text-classification/run_glue.py
@@ -508,8 +508,12 @@ def main():
     # predictions and label_ids field) and has to return a dictionary string to float.
     def compute_metrics(p: EvalPrediction):
         preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
+        labels = p.label_ids
+        if not training_args.eval_do_concat_batches:
+            preds = np.concatenate(preds, axis=0)
+            labels = np.concatenate(p.label_ids, axis=0)
         preds = np.squeeze(preds) if is_regression else np.argmax(preds, axis=1)
-        result = metric.compute(predictions=preds, references=p.label_ids)
+        result = metric.compute(predictions=preds, references=labels)
         if len(result) > 1:
             result["combined_score"] = np.mean(list(result.values())).item()
         return result
diff --git a/examples/pytorch/token-classification/run_ner.py b/examples/pytorch/token-classification/run_ner.py
index bbe85cff2e..28c344de27 100755
--- a/examples/pytorch/token-classification/run_ner.py
+++ b/examples/pytorch/token-classification/run_ner.py
@@ -529,6 +529,9 @@ def main():
 
     def compute_metrics(p):
         predictions, labels = p
+        if not training_args.eval_do_concat_batches:
+            predictions = np.hstack(predictions)
+            labels = np.hstack(labels)
         predictions = np.argmax(predictions, axis=2)
 
         # Remove ignored index (special tokens)