Distributed eval: SequentialDistributedSampler + gather all results (#4243)

* Distributed eval: SequentialDistributedSampler + gather all results * For consistency only write to disk from world_master Close https://github.com/huggingface/transformers/issues/4272 * Working distributed eval * Hook into scripts * Fix #3721 again * TPU.mesh_reduce: stay in tensor space Thanks @jysohn23 * Just a small comment * whitespace * torch.hub: pip install packaging * Add test scenarii
2020-05-18 22:02:39 -04:00
parent 4c06893610
commit 5e7fe8b585
7 changed files with 280 additions and 83 deletions
--- a/examples/language-modeling/run_language_modeling.py
+++ b/examples/language-modeling/run_language_modeling.py
@@ -251,7 +251,7 @@ def main():

    # Evaluation
    results = {}
-    if training_args.do_eval and training_args.local_rank in [-1, 0]:
+    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        eval_output = trainer.evaluate()
@@ -260,11 +260,12 @@ def main():
        result = {"perplexity": perplexity}

        output_eval_file = os.path.join(training_args.output_dir, "eval_results_lm.txt")
-        with open(output_eval_file, "w") as writer:
-            logger.info("***** Eval results *****")
-            for key in sorted(result.keys()):
-                logger.info("  %s = %s", key, str(result[key]))
-                writer.write("%s = %s\n" % (key, str(result[key])))
+        if trainer.is_world_master():
+            with open(output_eval_file, "w") as writer:
+                logger.info("***** Eval results *****")
+                for key in sorted(result.keys()):
+                    logger.info("  %s = %s", key, str(result[key]))
+                    writer.write("%s = %s\n" % (key, str(result[key])))

        results.update(result)