Distributed eval: SequentialDistributedSampler + gather all results (#4243)

* Distributed eval: SequentialDistributedSampler + gather all results * For consistency only write to disk from world_master Close https://github.com/huggingface/transformers/issues/4272 * Working distributed eval * Hook into scripts * Fix #3721 again * TPU.mesh_reduce: stay in tensor space Thanks @jysohn23 * Just a small comment * whitespace * torch.hub: pip install packaging * Add test scenarii
2020-05-18 22:02:39 -04:00
parent 4c06893610
commit 5e7fe8b585
7 changed files with 280 additions and 83 deletions
--- a/examples/text-classification/run_glue.py
+++ b/examples/text-classification/run_glue.py
@@ -166,7 +166,7 @@ def main():

    # Evaluation
    results = {}
-    if training_args.do_eval and training_args.local_rank in [-1, 0]:
+    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        # Loop to handle MNLI double evaluation (matched, mis-matched)
@@ -181,11 +181,12 @@ def main():
            output_eval_file = os.path.join(
                training_args.output_dir, f"eval_results_{eval_dataset.args.task_name}.txt"
            )
-            with open(output_eval_file, "w") as writer:
-                logger.info("***** Eval results {} *****".format(eval_dataset.args.task_name))
-                for key, value in result.items():
-                    logger.info("  %s = %s", key, value)
-                    writer.write("%s = %s\n" % (key, value))
+            if trainer.is_world_master():
+                with open(output_eval_file, "w") as writer:
+                    logger.info("***** Eval results {} *****".format(eval_dataset.args.task_name))
+                    for key, value in result.items():
+                        logger.info("  %s = %s", key, value)
+                        writer.write("%s = %s\n" % (key, value))

            results.update(result)