Improve notrainer examples (#17449)

* improve no-trainer examples * Trigger CI * adding comment to clarify tracker init on main process * Trigger CI * Trigger CI * Trigger CI
2022-05-28 00:06:31 +05:30
parent 7999ec125f
commit d156898f3b
10 changed files with 310 additions and 118 deletions
--- a/examples/pytorch/language-modeling/run_clm_no_trainer.py
+++ b/examples/pytorch/language-modeling/run_clm_no_trainer.py
@@ -45,7 +45,6 @@ from huggingface_hub import Repository
 from transformers import (
    CONFIG_MAPPING,
    MODEL_MAPPING,
-    AdamW,
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
@@ -94,7 +93,7 @@ def parse_args():
        "--model_name_or_path",
        type=str,
        help="Path to pretrained model or model identifier from huggingface.co/models.",
-        required=True,
+        required=False,
    )
    parser.add_argument(
        "--config_name",
@@ -206,7 +205,17 @@ def parse_args():
    parser.add_argument(
        "--with_tracking",
        action="store_true",
-        help="Whether to load in all available experiment trackers from the environment and use them for logging.",
+        help="Whether to enable experiment trackers for logging.",
+    )
+    parser.add_argument(
+        "--report_to",
+        type=str,
+        default="all",
+        help=(
+            'The integration to report the results and logs to. Supported platforms are `"tensorboard"`,'
+            ' `"wandb"` and `"comet_ml"`. Use `"all"` (default) to report to all integrations.'
+            "Only applicable when `--with_tracking` is passed."
+        ),
    )
    args = parser.parse_args()

@@ -231,8 +240,11 @@ def main():
    args = parse_args()

    # Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
-    # If we're using tracking, we also need to initialize it here and it will pick up all supported trackers in the environment
-    accelerator = Accelerator(log_with="all", logging_dir=args.output_dir) if args.with_tracking else Accelerator()
+    # If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers
+    # in the environment
+    accelerator = (
+        Accelerator(log_with=args.report_to, logging_dir=args.output_dir) if args.with_tracking else Accelerator()
+    )
    # Make one log on every process with the configuration for debugging.
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
@@ -451,7 +463,7 @@ def main():
            "weight_decay": 0.0,
        },
    ]
-    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
+    optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=args.learning_rate)

    # On TPU, the tie weights in our model have been disconnected, so we need to restore the ties.
    if accelerator.distributed_type == DistributedType.TPU:
@@ -488,12 +500,15 @@ def main():
    else:
        checkpointing_steps = None

-    # We need to initialize the trackers we use, and also store our configuration
+    # We need to initialize the trackers we use, and also store our configuration.
+    # We initialize the trackers only on main process because `accelerator.log`
+    # only logs on main process and we don't want empty logs/runs on other processes.
    if args.with_tracking:
-        experiment_config = vars(args)
-        # TensorBoard cannot log Enums, need the raw value
-        experiment_config["lr_scheduler_type"] = experiment_config["lr_scheduler_type"].value
-        accelerator.init_trackers("clm_no_trainer", experiment_config)
+        if accelerator.is_main_process:
+            experiment_config = vars(args)
+            # TensorBoard cannot log Enums, need the raw value
+            experiment_config["lr_scheduler_type"] = experiment_config["lr_scheduler_type"].value
+            accelerator.init_trackers("clm_no_trainer", experiment_config)

    # Train!
    total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
@@ -577,15 +592,23 @@ def main():
        losses = torch.cat(losses)
        losses = losses[: len(eval_dataset)]
        try:
-            perplexity = math.exp(torch.mean(losses))
+            eval_loss = torch.mean(losses)
+            perplexity = math.exp(eval_loss)
        except OverflowError:
            perplexity = float("inf")

-        logger.info(f"epoch {epoch}: perplexity: {perplexity}")
+        logger.info(f"epoch {epoch}: perplexity: {perplexity} eval_loss: {eval_loss}")

        if args.with_tracking:
            accelerator.log(
-                {"perplexity": perplexity, "train_loss": total_loss, "epoch": epoch, "step": completed_steps},
+                {
+                    "perplexity": perplexity,
+                    "eval_loss": eval_loss,
+                    "train_loss": total_loss.item() / len(train_dataloader),
+                    "epoch": epoch,
+                    "step": completed_steps,
+                },
+                step=completed_steps,
            )

        if args.push_to_hub and epoch < args.num_train_epochs - 1:
--- a/examples/pytorch/language-modeling/run_mlm_no_trainer.py
+++ b/examples/pytorch/language-modeling/run_mlm_no_trainer.py
@@ -45,7 +45,6 @@ from huggingface_hub import Repository
 from transformers import (
    CONFIG_MAPPING,
    MODEL_MAPPING,
-    AdamW,
    AutoConfig,
    AutoModelForMaskedLM,
    AutoTokenizer,
@@ -97,7 +96,7 @@ def parse_args():
        "--model_name_or_path",
        type=str,
        help="Path to pretrained model or model identifier from huggingface.co/models.",
-        required=True,
+        required=False,
    )
    parser.add_argument(
        "--config_name",
@@ -213,7 +212,17 @@ def parse_args():
    parser.add_argument(
        "--with_tracking",
        action="store_true",
-        help="Whether to load in all available experiment trackers from the environment and use them for logging.",
+        help="Whether to enable experiment trackers for logging.",
+    )
+    parser.add_argument(
+        "--report_to",
+        type=str,
+        default="all",
+        help=(
+            'The integration to report the results and logs to. Supported platforms are `"tensorboard"`,'
+            ' `"wandb"` and `"comet_ml"`. Use `"all"` (default) to report to all integrations.'
+            "Only applicable when `--with_tracking` is passed."
+        ),
    )
    args = parser.parse_args()

@@ -240,8 +249,11 @@ def main():
    args = parse_args()

    # Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
-    # If we're using tracking, we also need to initialize it here and it will pick up all supported trackers in the environment
-    accelerator = Accelerator(log_with="all", logging_dir=args.output_dir) if args.with_tracking else Accelerator()
+    # If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers
+    # in the environment
+    accelerator = (
+        Accelerator(log_with=args.report_to, logging_dir=args.output_dir) if args.with_tracking else Accelerator()
+    )
    # Make one log on every process with the configuration for debugging.
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
@@ -492,7 +504,7 @@ def main():
            "weight_decay": 0.0,
        },
    ]
-    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
+    optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=args.learning_rate)

    # On TPU, the tie weights in our model have been disconnected, so we need to restore the ties.
    if accelerator.distributed_type == DistributedType.TPU:
@@ -532,12 +544,15 @@ def main():
    else:
        checkpointing_steps = None

-    # We need to initialize the trackers we use, and also store our configuration
+    # We need to initialize the trackers we use, and also store our configuration.
+    # We initialize the trackers only on main process because `accelerator.log`
+    # only logs on main process and we don't want empty logs/runs on other processes.
    if args.with_tracking:
-        experiment_config = vars(args)
-        # TensorBoard cannot log Enums, need the raw value
-        experiment_config["lr_scheduler_type"] = experiment_config["lr_scheduler_type"].value
-        accelerator.init_trackers("mlm_no_trainer", experiment_config)
+        if accelerator.is_main_process:
+            experiment_config = vars(args)
+            # TensorBoard cannot log Enums, need the raw value
+            experiment_config["lr_scheduler_type"] = experiment_config["lr_scheduler_type"].value
+            accelerator.init_trackers("mlm_no_trainer", experiment_config)

    # Train!
    total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
@@ -622,7 +637,8 @@ def main():
        losses = torch.cat(losses)
        losses = losses[: len(eval_dataset)]
        try:
-            perplexity = math.exp(torch.mean(losses))
+            eval_loss = torch.mean(losses)
+            perplexity = math.exp(eval_loss)
        except OverflowError:
            perplexity = float("inf")

@@ -630,7 +646,14 @@ def main():

        if args.with_tracking:
            accelerator.log(
-                {"perplexity": perplexity, "train_loss": total_loss, "epoch": epoch, "step": completed_steps},
+                {
+                    "perplexity": perplexity,
+                    "eval_loss": eval_loss,
+                    "train_loss": total_loss.item() / len(train_dataloader),
+                    "epoch": epoch,
+                    "step": completed_steps,
+                },
+                step=completed_steps,
            )

        if args.push_to_hub and epoch < args.num_train_epochs - 1: