From 1198ba8fbac8c103950f58f19f76c18b37ee99b5 Mon Sep 17 00:00:00 2001 From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Date: Fri, 18 Dec 2020 15:10:39 -0500 Subject: [PATCH] Add timing inside Trainer (#9196) * Add timing inside Trainer * Fix tests * Add n_objs for train * Sort logs --- examples/seq2seq/finetune_trainer.py | 41 +++++----------------------- examples/test_examples.py | 4 +-- src/transformers/trainer.py | 17 ++++++++++-- src/transformers/trainer_utils.py | 22 +++++++++++++++ src/transformers/training_args.py | 16 ++++++++--- tests/test_trainer.py | 25 +++++++++++++---- 6 files changed, 76 insertions(+), 49 deletions(-) diff --git a/examples/seq2seq/finetune_trainer.py b/examples/seq2seq/finetune_trainer.py index 578e32045c..5159b82ef3 100755 --- a/examples/seq2seq/finetune_trainer.py +++ b/examples/seq2seq/finetune_trainer.py @@ -16,7 +16,6 @@ import logging import os import sys -import time from dataclasses import dataclass, field from typing import Optional @@ -120,30 +119,6 @@ class DataTrainingArguments: ) -def speed_metrics(split, start_time, num_samples): - """ - Measure and return speed performance metrics. - - This function requires a time snapshot `start_time` before the operation to be measured starts and this - function should be run immediately after the operation to be measured has completed. - - Args: - - split: one of train, val, test - - start_time: operation start time - - num_samples: number of samples processed - - """ - runtime = time.time() - start_time - result = {} - - samples_per_second = 1 / (runtime / num_samples) - result[f"{split}_samples_per_second"] = round(samples_per_second, 3) - result[f"{split}_runtime"] = round(runtime, 4) - - result[f"{split}_n_ojbs"] = num_samples - return result - - def handle_metrics(split, metrics, output_dir): """ Log and save metrics @@ -155,8 +130,8 @@ def handle_metrics(split, metrics, output_dir): """ logger.info(f"***** {split} metrics *****") - for key, value in metrics.items(): - logger.info(f" {key} = {value}") + for key in sorted(metrics.keys()): + logger.info(f" {key} = {metrics[key]}") save_json(metrics, os.path.join(output_dir, f"{split}_results.json")) @@ -311,11 +286,11 @@ def main(): if training_args.do_train: logger.info("*** Train ***") - start_time = time.time() - trainer.train( + train_result = trainer.train( model_path=model_args.model_name_or_path if os.path.isdir(model_args.model_name_or_path) else None ) - metrics = speed_metrics("train", start_time, data_args.n_train) + metrics = train_result.metrics + metrics["train_n_objs"] = data_args.n_train trainer.save_model() # this also saves the tokenizer @@ -334,9 +309,8 @@ def main(): if training_args.do_eval: logger.info("*** Evaluate ***") - start_time = time.time() metrics = trainer.evaluate(metric_key_prefix="val") - metrics.update(speed_metrics("val", start_time, data_args.n_val)) + metrics["val_n_objs"] = data_args.n_val metrics["val_loss"] = round(metrics["val_loss"], 4) if trainer.is_world_process_zero(): @@ -347,10 +321,9 @@ def main(): if training_args.do_predict: logger.info("*** Predict ***") - start_time = time.time() test_output = trainer.predict(test_dataset=test_dataset, metric_key_prefix="test") metrics = test_output.metrics - metrics.update(speed_metrics("test", start_time, data_args.n_test)) + metrics["test_n_objs"] = data_args.n_test if trainer.is_world_process_zero(): metrics["test_loss"] = round(metrics["test_loss"], 4) diff --git a/examples/test_examples.py b/examples/test_examples.py index e4ef9da6ae..eb0809d0bc 100644 --- a/examples/test_examples.py +++ b/examples/test_examples.py @@ -97,9 +97,7 @@ class ExamplesTests(TestCasePlus): with patch.object(sys, "argv", testargs): result = run_glue.main() - del result["eval_loss"] - for value in result.values(): - self.assertGreaterEqual(value, 0.75) + self.assertGreaterEqual(result["eval_accuracy"], 0.75) @require_torch_non_multi_gpu_but_fix_me def test_run_clm(self): diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index ee918e2674..8cbc8ea299 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -22,6 +22,7 @@ import math import os import re import shutil +import time import warnings from pathlib import Path from typing import Any, Callable, Dict, List, Optional, Tuple, Union @@ -89,6 +90,7 @@ from .trainer_utils import ( default_compute_objective, default_hp_space, set_seed, + speed_metrics, ) from .training_args import TrainingArguments from .utils import logging @@ -707,6 +709,7 @@ class Trainer: logger.info(f" Total optimization steps = {max_steps}") self.state.epoch = 0 + start_time = time.time() epochs_trained = 0 steps_trained_in_current_epoch = 0 @@ -870,15 +873,17 @@ class Trainer: state_dict = torch.load(os.path.join(self.state.best_model_checkpoint, WEIGHTS_NAME)) self.model.load_state_dict(state_dict) + metrics = speed_metrics("train", start_time, self.state.max_steps) if self._total_flos is not None: self.store_flos() - self.log({"total_flos": self.state.total_flos}) + metrics["total_flos"] = self.state.total_flos + self.log(metrics) self.control = self.callback_handler.on_train_end(self.args, self.state, self.control) # add remaining tr_loss self._total_loss_scalar += tr_loss.item() - return TrainOutput(self.state.global_step, self._total_loss_scalar / self.state.global_step) + return TrainOutput(self.state.global_step, self._total_loss_scalar / self.state.global_step, metrics) def _maybe_log_save_evaluate(self, tr_loss, model, trial, epoch): if self.control.should_log: @@ -1317,6 +1322,7 @@ class Trainer: raise ValueError("eval_dataset must implement __len__") eval_dataloader = self.get_eval_dataloader(eval_dataset) + start_time = time.time() output = self.prediction_loop( eval_dataloader, @@ -1328,6 +1334,8 @@ class Trainer: metric_key_prefix=metric_key_prefix, ) + n_samples = len(eval_dataset if eval_dataset is not None else self.eval_dataset) + output.metrics.update(speed_metrics(metric_key_prefix, start_time, n_samples)) self.log(output.metrics) if self.args.tpu_metrics_debug or self.args.debug: @@ -1374,10 +1382,13 @@ class Trainer: raise ValueError("test_dataset must implement __len__") test_dataloader = self.get_test_dataloader(test_dataset) + start_time = time.time() - return self.prediction_loop( + output = self.prediction_loop( test_dataloader, description="Prediction", ignore_keys=ignore_keys, metric_key_prefix=metric_key_prefix ) + output.metrics.update(speed_metrics(metric_key_prefix, start_time, len(test_dataset))) + return output def prediction_loop( self, diff --git a/src/transformers/trainer_utils.py b/src/transformers/trainer_utils.py index e2697907b9..a63959f48c 100644 --- a/src/transformers/trainer_utils.py +++ b/src/transformers/trainer_utils.py @@ -18,6 +18,7 @@ Utilities for the Trainer and TFTrainer class. Should be independent from PyTorc import copy import random +import time from typing import Any, Dict, NamedTuple, Optional, Tuple, Union import numpy as np @@ -70,6 +71,7 @@ class PredictionOutput(NamedTuple): class TrainOutput(NamedTuple): global_step: int training_loss: float + metrics: Dict[str, float] PREFIX_CHECKPOINT_DIR = "checkpoint" @@ -179,3 +181,23 @@ def total_processes_number(local_rank): return torch.distributed.get_world_size() return 1 + + +def speed_metrics(split, start_time, num_samples=None): + """ + Measure and return speed performance metrics. + + This function requires a time snapshot `start_time` before the operation to be measured starts and this function + should be run immediately after the operation to be measured has completed. + + Args: + - split: name to prefix metric (like train, eval, test...) + - start_time: operation start time + - num_samples: number of samples processed + """ + runtime = time.time() - start_time + result = {f"{split}_runtime": round(runtime, 4)} + if num_samples is not None: + samples_per_second = 1 / (runtime / num_samples) + result[f"{split}_samples_per_second"] = round(samples_per_second, 3) + return result diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py index 0a02bd2232..50a214d58e 100644 --- a/src/transformers/training_args.py +++ b/src/transformers/training_args.py @@ -12,10 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -import dataclasses import json import os -from dataclasses import dataclass, field +from dataclasses import asdict, dataclass, field from enum import Enum from typing import Any, Dict, List, Optional, Tuple @@ -411,7 +410,16 @@ class TrainingArguments: self.run_name = self.output_dir if is_torch_available() and self.device.type != "cuda" and self.fp16: - raise ValueError("AMP (`--fp16`) can only be used on CUDA devices.") + raise ValueError("Mixed precision training with AMP or APEX (`--fp16`) can only be used on CUDA devices.") + + def __repr__(self): + # We override the default repr to remove deprecated arguments from the repr. This method should be removed once + # those deprecated arguments are removed form TrainingArguments. (TODO: v5) + self_as_dict = asdict(self) + del self_as_dict["per_gpu_train_batch_size"] + del self_as_dict["per_gpu_eval_batch_size"] + attrs_as_str = [f"{k}={v}" for k, v in self_as_dict.items()] + return f"{self.__class__.__name__}({', '.join(attrs_as_str)})" @property def train_batch_size(self) -> int: @@ -523,7 +531,7 @@ class TrainingArguments: """ Serializes this instance while replace `Enum` by their values (for JSON serialization support). """ - d = dataclasses.asdict(self) + d = asdict(self) for k, v in d.items(): if isinstance(v, Enum): d[k] = v.value diff --git a/tests/test_trainer.py b/tests/test_trainer.py index 29a57ab07f..b95a21c653 100644 --- a/tests/test_trainer.py +++ b/tests/test_trainer.py @@ -265,6 +265,21 @@ class TrainerIntegrationTest(unittest.TestCase): metrics = trainer.evaluate() self.assertEqual(metrics[metric], best_value) + def check_trainer_state_are_the_same(self, trainer_state, trainer_state1): + # We'll pop things so operate on copies. + state = trainer_state.copy() + state1 = trainer_state1.copy() + # Log history main contain different logs for the time metrics (after resuming a training). + log_history = state.pop("log_history", None) + log_history1 = state1.pop("log_history", None) + self.assertEqual(state, state1) + for log, log1 in zip(log_history, log_history1): + _ = log.pop("train_runtime", None) + _ = log1.pop("train_runtime", None) + _ = log.pop("train_samples_per_second", None) + _ = log1.pop("train_samples_per_second", None) + self.assertEqual(log, log1) + def test_trainer_works_with_dict(self): # Edge case because Apex with mode O2 will change our models to return dicts. This test checks it doesn't break # anything. @@ -552,7 +567,7 @@ class TrainerIntegrationTest(unittest.TestCase): state1 = dataclasses.asdict(trainer.state) self.assertEqual(a, a1) self.assertEqual(b, b1) - self.assertEqual(state, state1) + self.check_trainer_state_are_the_same(state, state1) # Now check with a later checkpoint that it also works when we span over one epoch checkpoint = os.path.join(tmpdir, "checkpoint-15") @@ -566,7 +581,7 @@ class TrainerIntegrationTest(unittest.TestCase): state1 = dataclasses.asdict(trainer.state) self.assertEqual(a, a1) self.assertEqual(b, b1) - self.assertEqual(state, state1) + self.check_trainer_state_are_the_same(state, state1) # With a regular model that is not a PreTrainedModel with tempfile.TemporaryDirectory() as tmpdir: @@ -590,7 +605,7 @@ class TrainerIntegrationTest(unittest.TestCase): state1 = dataclasses.asdict(trainer.state) self.assertEqual(a, a1) self.assertEqual(b, b1) - self.assertEqual(state, state1) + self.check_trainer_state_are_the_same(state, state1) # Now check with a later checkpoint that it also works when we span over one epoch checkpoint = os.path.join(tmpdir, "checkpoint-15") @@ -606,7 +621,7 @@ class TrainerIntegrationTest(unittest.TestCase): state1 = dataclasses.asdict(trainer.state) self.assertEqual(a, a1) self.assertEqual(b, b1) - self.assertEqual(state, state1) + self.check_trainer_state_are_the_same(state, state1) def test_resume_training_with_gradient_accumulation(self): if torch.cuda.device_count() > 2: @@ -638,7 +653,7 @@ class TrainerIntegrationTest(unittest.TestCase): state1 = dataclasses.asdict(trainer.state) self.assertEqual(a, a1) self.assertEqual(b, b1) - self.assertEqual(state, state1) + self.check_trainer_state_are_the_same(state, state1) def test_load_best_model_at_end(self): total = int(self.n_epochs * 64 / self.batch_size)