From 705d65368fb28246534ef636fe62c008f4fb2682 Mon Sep 17 00:00:00 2001
From: Zachary Mueller
Date: Wed, 20 Apr 2022 17:26:27 -0400
Subject: [PATCH] Fix multiproc metrics in no_trainer examples (#16865)

---
Review notes (ignored by `git am`, not part of the commit message):
* The last-batch check now compares `step` against `len(eval_dataloader) - 1`.
  `enumerate()` never yields `len(eval_dataloader)`, so the duplicated samples
  in the final gathered batch were never trimmed.
* The summarization and translation scripts count samples with
  `len(decoded_labels)`. The decoded labels are expected to be plain lists
  (no `.shape`); `len()` is correct for lists and arrays alike.
* Added/removed line counts are unchanged, so hunk headers and the diffstat
  still hold. The post-image blob ids on the `index` lines are kept as
  generated and no longer match the amended content (only relevant to
  3-way application).

 .../run_image_classification_no_trainer.py     | 13 +++++++++++--
 .../multiple-choice/run_swag_no_trainer.py     | 13 +++++++++++--
 .../run_semantic_segmentation_no_trainer.py    | 15 +++++++++++++--
 .../run_summarization_no_trainer.py            | 18 +++++++++++++++---
 .../text-classification/run_glue_no_trainer.py | 13 +++++++++++--
 .../token-classification/run_ner_no_trainer.py | 12 +++++++++---
 .../translation/run_translation_no_trainer.py  |  9 +++++++++
 7 files changed, 79 insertions(+), 14 deletions(-)

diff --git a/examples/pytorch/image-classification/run_image_classification_no_trainer.py b/examples/pytorch/image-classification/run_image_classification_no_trainer.py
index 6fea55a842..40c38c6316 100644
--- a/examples/pytorch/image-classification/run_image_classification_no_trainer.py
+++ b/examples/pytorch/image-classification/run_image_classification_no_trainer.py
@@ -457,12 +457,21 @@ def main():
                 break
 
         model.eval()
+        samples_seen = 0
         for step, batch in enumerate(eval_dataloader):
             outputs = model(**batch)
             predictions = outputs.logits.argmax(dim=-1)
+            predictions, references = accelerator.gather((predictions, batch["labels"]))
+            # If we are in a multiprocess environment, the last batch has duplicates
+            if accelerator.num_processes > 1:
+                if step == len(eval_dataloader) - 1:
+                    predictions = predictions[: len(eval_dataloader.dataset) - samples_seen]
+                    references = references[: len(eval_dataloader.dataset) - samples_seen]
+                else:
+                    samples_seen += references.shape[0]
             metric.add_batch(
-                predictions=accelerator.gather(predictions),
-                references=accelerator.gather(batch["labels"]),
+                predictions=predictions,
+                references=references,
             )
 
         eval_metric = metric.compute()
diff --git a/examples/pytorch/multiple-choice/run_swag_no_trainer.py b/examples/pytorch/multiple-choice/run_swag_no_trainer.py
index 63b799c092..0efdc8c2ec 100755
--- a/examples/pytorch/multiple-choice/run_swag_no_trainer.py
+++ b/examples/pytorch/multiple-choice/run_swag_no_trainer.py
@@ -559,13 +559,22 @@ def main():
                 break
 
         model.eval()
+        samples_seen = 0
         for step, batch in enumerate(eval_dataloader):
             with torch.no_grad():
                 outputs = model(**batch)
             predictions = outputs.logits.argmax(dim=-1)
+            predictions, references = accelerator.gather((predictions, batch["labels"]))
+            # If we are in a multiprocess environment, the last batch has duplicates
+            if accelerator.num_processes > 1:
+                if step == len(eval_dataloader) - 1:
+                    predictions = predictions[: len(eval_dataloader.dataset) - samples_seen]
+                    references = references[: len(eval_dataloader.dataset) - samples_seen]
+                else:
+                    samples_seen += references.shape[0]
             metric.add_batch(
-                predictions=accelerator.gather(predictions),
-                references=accelerator.gather(batch["labels"]),
+                predictions=predictions,
+                references=references,
             )
 
         eval_metric = metric.compute()
diff --git a/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py b/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py
index 223a42c188..7bd3754581 100644
--- a/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py
+++ b/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py
@@ -567,6 +567,7 @@ def main():
 
         logger.info("***** Running evaluation *****")
         model.eval()
+        samples_seen = 0
         for step, batch in enumerate(tqdm(eval_dataloader, disable=not accelerator.is_local_main_process)):
             outputs = model(**batch)
 
@@ -575,9 +576,19 @@ def main():
             )
             predictions = upsampled_logits.argmax(dim=1)
 
+            predictions, references = accelerator.gather((predictions, batch["labels"]))
+
+            # If we are in a multiprocess environment, the last batch has duplicates
+            if accelerator.num_processes > 1:
+                if step == len(eval_dataloader) - 1:
+                    predictions = predictions[: len(eval_dataloader.dataset) - samples_seen]
+                    references = references[: len(eval_dataloader.dataset) - samples_seen]
+                else:
+                    samples_seen += references.shape[0]
+
             metric.add_batch(
-                predictions=accelerator.gather(predictions),
-                references=accelerator.gather(batch["labels"]),
+                predictions=predictions,
+                references=references,
             )
 
         eval_metrics = metric.compute(
diff --git a/examples/pytorch/summarization/run_summarization_no_trainer.py b/examples/pytorch/summarization/run_summarization_no_trainer.py
index 48c0aada34..eb18648b3e 100644
--- a/examples/pytorch/summarization/run_summarization_no_trainer.py
+++ b/examples/pytorch/summarization/run_summarization_no_trainer.py
@@ -628,6 +628,7 @@ def main():
             "max_length": args.val_max_target_length if args is not None else config.max_length,
             "num_beams": args.num_beams,
         }
+        samples_seen = 0
         for step, batch in enumerate(eval_dataloader):
             with torch.no_grad():
                 generated_tokens = accelerator.unwrap_model(model).generate(
@@ -644,8 +645,9 @@ def main():
                     # If we did not pad to max length, we need to pad the labels too
                     labels = accelerator.pad_across_processes(batch["labels"], dim=1, pad_index=tokenizer.pad_token_id)
 
-                generated_tokens = accelerator.gather(generated_tokens).cpu().numpy()
-                labels = accelerator.gather(labels).cpu().numpy()
+                generated_tokens, labels = accelerator.gather((generated_tokens, labels))
+                generated_tokens = generated_tokens.cpu().numpy()
+                labels = labels.cpu().numpy()
 
                 if args.ignore_pad_token_for_loss:
                     # Replace -100 in the labels as we can't decode them.
@@ -656,8 +658,18 @@ def main():
                 decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
 
                 decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
+                # If we are in a multiprocess environment, the last batch has duplicates
+                if accelerator.num_processes > 1:
+                    if step == len(eval_dataloader) - 1:
+                        decoded_preds = decoded_preds[: len(eval_dataloader.dataset) - samples_seen]
+                        decoded_labels = decoded_labels[: len(eval_dataloader.dataset) - samples_seen]
+                    else:
+                        samples_seen += len(decoded_labels)
 
-                metric.add_batch(predictions=decoded_preds, references=decoded_labels)
+                metric.add_batch(
+                    predictions=decoded_preds,
+                    references=decoded_labels,
+                )
         result = metric.compute(use_stemmer=True)
         # Extract a few results from ROUGE
         result = {key: value.mid.fmeasure * 100 for key, value in result.items()}
diff --git a/examples/pytorch/text-classification/run_glue_no_trainer.py b/examples/pytorch/text-classification/run_glue_no_trainer.py
index 0842b462f9..b0b6c9ce52 100644
--- a/examples/pytorch/text-classification/run_glue_no_trainer.py
+++ b/examples/pytorch/text-classification/run_glue_no_trainer.py
@@ -506,12 +506,21 @@ def main():
                 break
 
         model.eval()
+        samples_seen = 0
         for step, batch in enumerate(eval_dataloader):
             outputs = model(**batch)
             predictions = outputs.logits.argmax(dim=-1) if not is_regression else outputs.logits.squeeze()
+            predictions, references = accelerator.gather((predictions, batch["labels"]))
+            # If we are in a multiprocess environment, the last batch has duplicates
+            if accelerator.num_processes > 1:
+                if step == len(eval_dataloader) - 1:
+                    predictions = predictions[: len(eval_dataloader.dataset) - samples_seen]
+                    references = references[: len(eval_dataloader.dataset) - samples_seen]
+                else:
+                    samples_seen += references.shape[0]
             metric.add_batch(
-                predictions=accelerator.gather(predictions),
-                references=accelerator.gather(batch["labels"]),
+                predictions=predictions,
+                references=references,
             )
 
         eval_metric = metric.compute()
diff --git a/examples/pytorch/token-classification/run_ner_no_trainer.py b/examples/pytorch/token-classification/run_ner_no_trainer.py
index 6351d26256..735ec5bd62 100755
--- a/examples/pytorch/token-classification/run_ner_no_trainer.py
+++ b/examples/pytorch/token-classification/run_ner_no_trainer.py
@@ -658,6 +658,7 @@ def main():
                 break
 
         model.eval()
+        samples_seen = 0
         for step, batch in enumerate(eval_dataloader):
             with torch.no_grad():
                 outputs = model(**batch)
@@ -666,9 +667,14 @@ def main():
             if not args.pad_to_max_length:  # necessary to pad predictions and labels for being gathered
                 predictions = accelerator.pad_across_processes(predictions, dim=1, pad_index=-100)
                 labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)
-
-            predictions_gathered = accelerator.gather(predictions)
-            labels_gathered = accelerator.gather(labels)
+            predictions_gathered, labels_gathered = accelerator.gather((predictions, labels))
+            # If we are in a multiprocess environment, the last batch has duplicates
+            if accelerator.num_processes > 1:
+                if step == len(eval_dataloader) - 1:
+                    predictions_gathered = predictions_gathered[: len(eval_dataloader.dataset) - samples_seen]
+                    labels_gathered = labels_gathered[: len(eval_dataloader.dataset) - samples_seen]
+                else:
+                    samples_seen += labels_gathered.shape[0]
             preds, refs = get_labels(predictions_gathered, labels_gathered)
             metric.add_batch(
                 predictions=preds,
diff --git a/examples/pytorch/translation/run_translation_no_trainer.py b/examples/pytorch/translation/run_translation_no_trainer.py
index 706d7637fd..9df81d65f6 100644
--- a/examples/pytorch/translation/run_translation_no_trainer.py
+++ b/examples/pytorch/translation/run_translation_no_trainer.py
@@ -613,6 +613,7 @@ def main():
             "max_length": args.val_max_target_length if args is not None else config.max_length,
             "num_beams": args.num_beams,
         }
+        samples_seen = 0
         for step, batch in enumerate(eval_dataloader):
             with torch.no_grad():
                 generated_tokens = accelerator.unwrap_model(model).generate(
@@ -641,6 +642,14 @@ def main():
 
                 decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
 
+                # If we are in a multiprocess environment, the last batch has duplicates
+                if accelerator.num_processes > 1:
+                    if step == len(eval_dataloader) - 1:
+                        decoded_preds = decoded_preds[: len(eval_dataloader.dataset) - samples_seen]
+                        decoded_labels = decoded_labels[: len(eval_dataloader.dataset) - samples_seen]
+                    else:
+                        samples_seen += len(decoded_labels)
+
                 metric.add_batch(predictions=decoded_preds, references=decoded_labels)
         eval_metric = metric.compute()
         logger.info({"bleu": eval_metric["score"]})