Add tests for no_trainer and fix existing examples (#16656)

* Fixed some bugs involving saving during epochs * Added tests mimicking the existing examples tests * Added in json exporting to all `no_trainer` examples for consistency
2022-04-08 10:03:56 -04:00
parent ab229663b5
commit d57da99237
11 changed files with 414 additions and 22 deletions
--- a/examples/pytorch/language-modeling/run_clm_no_trainer.py
+++ b/examples/pytorch/language-modeling/run_clm_no_trainer.py
@@ -23,6 +23,7 @@ https://huggingface.co/models?filter=text-generation
 # You can also adapt this script on your own causal language modeling task. Pointers for this are left as comments.

 import argparse
+import json
 import logging
 import math
 import os
@@ -537,7 +538,10 @@ def main():

            if isinstance(checkpointing_steps, int):
                if completed_steps % checkpointing_steps == 0:
-                    accelerator.save_state(f"step_{completed_steps}")
+                    output_dir = f"step_{completed_steps}"
+                    if args.output_dir is not None:
+                        output_dir = os.path.join(args.output_dir, output_dir)
+                    accelerator.save_state(output_dir)

            if completed_steps >= args.max_train_steps:
                break
@@ -581,7 +585,10 @@ def main():
                )

        if args.checkpointing_steps == "epoch":
-            accelerator.save_state(f"epoch_{epoch}")
+            output_dir = f"epoch_{epoch}"
+            if args.output_dir is not None:
+                output_dir = os.path.join(args.output_dir, output_dir)
+            accelerator.save_state(output_dir)

    if args.output_dir is not None:
        accelerator.wait_for_everyone()
@@ -592,6 +599,9 @@ def main():
            if args.push_to_hub:
                repo.push_to_hub(commit_message="End of training", auto_lfs_prune=True)

+        with open(os.path.join(args.output_dir, "all_results.json"), "w") as f:
+            json.dump({"perplexity": perplexity}, f)
+

 if __name__ == "__main__":
    main()
--- a/examples/pytorch/language-modeling/run_mlm_no_trainer.py
+++ b/examples/pytorch/language-modeling/run_mlm_no_trainer.py
@@ -23,6 +23,7 @@ https://huggingface.co/models?filter=fill-mask
 # You can also adapt this script on your own mlm task. Pointers for this are left as comments.

 import argparse
+import json
 import logging
 import math
 import os
@@ -457,9 +458,11 @@ def main():
    train_dataset = tokenized_datasets["train"]
    eval_dataset = tokenized_datasets["validation"]

-    # Log a few random samples from the training set:
-    for index in random.sample(range(len(train_dataset)), 3):
-        logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")
+    # Conditional for small test subsets
+    if len(train_dataset) > 3:
+        # Log a few random samples from the training set:
+        for index in random.sample(range(len(train_dataset)), 3):
+            logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")

    # Data collator
    # This one will take care of randomly masking the tokens.
@@ -581,7 +584,10 @@ def main():

            if isinstance(checkpointing_steps, int):
                if completed_steps % checkpointing_steps == 0:
-                    accelerator.save_state(f"step_{completed_steps}")
+                    output_dir = f"step_{completed_steps}"
+                    if args.output_dir is not None:
+                        output_dir = os.path.join(args.output_dir, output_dir)
+                    accelerator.save_state(output_dir)

            if completed_steps >= args.max_train_steps:
                break
@@ -625,7 +631,10 @@ def main():
                )

        if args.checkpointing_steps == "epoch":
-            accelerator.save_state(f"epoch_{epoch}")
+            output_dir = f"epoch_{epoch}"
+            if args.output_dir is not None:
+                output_dir = os.path.join(args.output_dir, output_dir)
+            accelerator.save_state(output_dir)

    if args.output_dir is not None:
        accelerator.wait_for_everyone()
@@ -636,6 +645,9 @@ def main():
            if args.push_to_hub:
                repo.push_to_hub(commit_message="End of training", auto_lfs_prune=True)

+        with open(os.path.join(args.output_dir, "all_results.json"), "w") as f:
+            json.dump({"perplexity": perplexity}, f)
+

 if __name__ == "__main__":
    main()