Make the GLUE and NER example compute_metrics functions handle evaluation
outputs that were not concatenated across batches.

When training_args.eval_do_concat_batches is false, both functions join the
per-batch predictions and labels themselves before computing the metric.

NOTE(review): run_ner.py joins batches with np.hstack, which concatenates
arrays of two or more dimensions along axis 1 rather than axis 0. Presumably
every batch must then share the same leading dimension -- confirm the
behaviour when the final batch is smaller than the others.

diff --git a/examples/pytorch/text-classification/run_glue.py b/examples/pytorch/text-classification/run_glue.py
index c293038085..e152cd9911 100755
--- a/examples/pytorch/text-classification/run_glue.py
+++ b/examples/pytorch/text-classification/run_glue.py
@@ -508,8 +508,12 @@ def main():
     # predictions and label_ids field) and has to return a dictionary string to float.
     def compute_metrics(p: EvalPrediction):
         preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
+        labels = p.label_ids
+        if not training_args.eval_do_concat_batches:
+            preds = np.concatenate(preds, axis=0)
+            labels = np.concatenate(p.label_ids, axis=0)
         preds = np.squeeze(preds) if is_regression else np.argmax(preds, axis=1)
-        result = metric.compute(predictions=preds, references=p.label_ids)
+        result = metric.compute(predictions=preds, references=labels)
         if len(result) > 1:
             result["combined_score"] = np.mean(list(result.values())).item()
         return result
diff --git a/examples/pytorch/token-classification/run_ner.py b/examples/pytorch/token-classification/run_ner.py
index bbe85cff2e..28c344de27 100755
--- a/examples/pytorch/token-classification/run_ner.py
+++ b/examples/pytorch/token-classification/run_ner.py
@@ -529,6 +529,9 @@ def main():
 
     def compute_metrics(p):
         predictions, labels = p
+        if not training_args.eval_do_concat_batches:
+            predictions = np.hstack(predictions)
+            labels = np.hstack(labels)
         predictions = np.argmax(predictions, axis=2)
 
         # Remove ignored index (special tokens)