From 6f90c29eaaba898919b7689ab7e2cfce1604cdb8 Mon Sep 17 00:00:00 2001 From: Philipp Schmid <32632186+philschmid@users.noreply.github.com> Date: Fri, 9 Apr 2021 21:18:00 +0200 Subject: [PATCH] added json dump and extraction of train run time (#11167) * added json dump and extraction of train run time * make style happy --- .../test_multi_node_data_parallel.py | 24 ++++++++++++------- .../test_multi_node_model_parallel.py | 23 +++++++++++------- tests/sagemaker/test_single_node_gpu.py | 20 ++++++++++------ 3 files changed, 43 insertions(+), 24 deletions(-) diff --git a/tests/sagemaker/test_multi_node_data_parallel.py b/tests/sagemaker/test_multi_node_data_parallel.py index 67d8dcd70d..0a826f4b15 100644 --- a/tests/sagemaker/test_multi_node_data_parallel.py +++ b/tests/sagemaker/test_multi_node_data_parallel.py @@ -1,3 +1,4 @@ +import json import os import subprocess import unittest @@ -11,7 +12,7 @@ from . import is_sagemaker_available if is_sagemaker_available(): - from sagemaker import TrainingJobAnalytics + from sagemaker import Session, TrainingJobAnalytics from sagemaker.huggingface import HuggingFace @@ -27,21 +28,21 @@ if is_sagemaker_available(): "script": "run_glue.py", "model_name_or_path": "distilbert-base-cased", "instance_type": "ml.p3dn.24xlarge", - "results": {"train_runtime": 300, "eval_accuracy": 0.7, "eval_loss": 0.6}, + "results": {"train_runtime": 650, "eval_accuracy": 0.7, "eval_loss": 0.6}, }, { "framework": "pytorch", "script": "run_ddp.py", "model_name_or_path": "distilbert-base-cased", "instance_type": "ml.p3dn.24xlarge", - "results": {"train_runtime": 300, "eval_accuracy": 0.7, "eval_loss": 0.6}, + "results": {"train_runtime": 600, "eval_accuracy": 0.7, "eval_loss": 0.6}, }, { "framework": "tensorflow", "script": "run_tf_dist.py", "model_name_or_path": "distilbert-base-cased", "instance_type": "ml.p3dn.24xlarge", - "results": {"train_runtime": 500, "eval_accuracy": 0.6, "eval_loss": 0.7}, + "results": {"train_runtime": 600, "eval_accuracy": 0.6, "eval_loss": 0.7}, }, ] ) @@ -88,17 +89,22 @@ class MultiNodeTest(unittest.TestCase): # run training estimator.fit() - # save csv - self.save_results_as_csv(estimator.latest_training_job.name) # result dataframe result_metrics_df = TrainingJobAnalytics(estimator.latest_training_job.name).dataframe() # extract kpis - train_runtime = list(result_metrics_df[result_metrics_df.metric_name == "train_runtime"]["value"]) eval_accuracy = list(result_metrics_df[result_metrics_df.metric_name == "eval_accuracy"]["value"]) eval_loss = list(result_metrics_df[result_metrics_df.metric_name == "eval_loss"]["value"]) + # get train time from SageMaker job, this includes starting, preprocessing, stopping + train_runtime = ( + Session().describe_training_job(estimator.latest_training_job.name).get("TrainingTimeInSeconds", 999999) + ) # assert kpis - assert all(t <= self.results["train_runtime"] for t in train_runtime) - assert any(t >= self.results["eval_accuracy"] for t in eval_accuracy) + assert train_runtime <= self.results["train_runtime"] + assert all(t >= self.results["eval_accuracy"] for t in eval_accuracy) assert all(t <= self.results["eval_loss"] for t in eval_loss) + + # dump tests result into json file to share in PR + with open(f"{estimator.latest_training_job.name}.json", "w") as outfile: + json.dump({"train_time": train_runtime, "eval_accuracy": eval_accuracy, "eval_loss": eval_loss}, outfile) diff --git a/tests/sagemaker/test_multi_node_model_parallel.py b/tests/sagemaker/test_multi_node_model_parallel.py index 3135573653..a59c207fb0 100644 --- a/tests/sagemaker/test_multi_node_model_parallel.py +++ b/tests/sagemaker/test_multi_node_model_parallel.py @@ -1,3 +1,4 @@ +import json import os import subprocess import unittest @@ -11,7 +12,7 @@ from . import is_sagemaker_available if is_sagemaker_available(): - from sagemaker import TrainingJobAnalytics + from sagemaker import Session, TrainingJobAnalytics from sagemaker.huggingface import HuggingFace @@ -27,14 +28,14 @@ if is_sagemaker_available(): "script": "run_glue_model_parallelism.py", "model_name_or_path": "roberta-large", "instance_type": "ml.p3dn.24xlarge", - "results": {"train_runtime": 700, "eval_accuracy": 0.3, "eval_loss": 1.2}, + "results": {"train_runtime": 1500, "eval_accuracy": 0.3, "eval_loss": 1.2}, }, { "framework": "pytorch", "script": "run_glue.py", "model_name_or_path": "roberta-large", "instance_type": "ml.p3dn.24xlarge", - "results": {"train_runtime": 700, "eval_accuracy": 0.3, "eval_loss": 1.2}, + "results": {"train_runtime": 1500, "eval_accuracy": 0.3, "eval_loss": 1.2}, }, ] ) @@ -69,13 +70,14 @@ class MultiNodeTest(unittest.TestCase): distribution = {"smdistributed": {"modelparallel": smp_options}, "mpi": mpi_options} + name_extension = "trainer" if self.script == "run_glue.py" else "smtrainer" # creates estimator return HuggingFace( entry_point=self.script, source_dir=self.env.test_path, role=self.env.role, image_uri=self.env.image_uri, - base_job_name=f"{self.env.base_job_name}-{instance_count}-smp", + base_job_name=f"{self.env.base_job_name}-{instance_count}-smp-{name_extension}", instance_count=instance_count, instance_type=self.instance_type, debugger_hook_config=False, @@ -101,17 +103,22 @@ class MultiNodeTest(unittest.TestCase): # run training estimator.fit() - # save csv - self.save_results_as_csv(estimator.latest_training_job.name) # result dataframe result_metrics_df = TrainingJobAnalytics(estimator.latest_training_job.name).dataframe() # extract kpis - train_runtime = list(result_metrics_df[result_metrics_df.metric_name == "train_runtime"]["value"]) eval_accuracy = list(result_metrics_df[result_metrics_df.metric_name == "eval_accuracy"]["value"]) eval_loss = list(result_metrics_df[result_metrics_df.metric_name == "eval_loss"]["value"]) + # get train time from SageMaker job, this includes starting, preprocessing, stopping + train_runtime = ( + Session().describe_training_job(estimator.latest_training_job.name).get("TrainingTimeInSeconds", 999999) + ) # assert kpis - assert all(t <= self.results["train_runtime"] for t in train_runtime) + assert train_runtime <= self.results["train_runtime"] assert all(t >= self.results["eval_accuracy"] for t in eval_accuracy) assert all(t <= self.results["eval_loss"] for t in eval_loss) + + # dump tests result into json file to share in PR + with open(f"{estimator.latest_training_job.name}.json", "w") as outfile: + json.dump({"train_time": train_runtime, "eval_accuracy": eval_accuracy, "eval_loss": eval_loss}, outfile) diff --git a/tests/sagemaker/test_single_node_gpu.py b/tests/sagemaker/test_single_node_gpu.py index aa08bd0641..71bf9d0928 100644 --- a/tests/sagemaker/test_single_node_gpu.py +++ b/tests/sagemaker/test_single_node_gpu.py @@ -1,3 +1,4 @@ +import json import os import subprocess import unittest @@ -11,7 +12,7 @@ from . import is_sagemaker_available if is_sagemaker_available(): - from sagemaker import TrainingJobAnalytics + from sagemaker import Session, TrainingJobAnalytics from sagemaker.huggingface import HuggingFace @@ -27,14 +28,14 @@ if is_sagemaker_available(): "script": "run_glue.py", "model_name_or_path": "distilbert-base-cased", "instance_type": "ml.g4dn.xlarge", - "results": {"train_runtime": 200, "eval_accuracy": 0.6, "eval_loss": 0.9}, + "results": {"train_runtime": 650, "eval_accuracy": 0.6, "eval_loss": 0.9}, }, { "framework": "tensorflow", "script": "run_tf.py", "model_name_or_path": "distilbert-base-cased", "instance_type": "ml.g4dn.xlarge", - "results": {"train_runtime": 350, "eval_accuracy": 0.3, "eval_loss": 0.9}, + "results": {"train_runtime": 600, "eval_accuracy": 0.3, "eval_loss": 0.9}, }, ] ) @@ -74,17 +75,22 @@ class SingleNodeTest(unittest.TestCase): # run training estimator.fit() - # save csv - self.save_results_as_csv(estimator.latest_training_job.name) # result dataframe result_metrics_df = TrainingJobAnalytics(estimator.latest_training_job.name).dataframe() # extract kpis - train_runtime = list(result_metrics_df[result_metrics_df.metric_name == "train_runtime"]["value"]) eval_accuracy = list(result_metrics_df[result_metrics_df.metric_name == "eval_accuracy"]["value"]) eval_loss = list(result_metrics_df[result_metrics_df.metric_name == "eval_loss"]["value"]) + # get train time from SageMaker job, this includes starting, preprocessing, stopping + train_runtime = ( + Session().describe_training_job(estimator.latest_training_job.name).get("TrainingTimeInSeconds", 999999) + ) # assert kpis - assert all(t <= self.results["train_runtime"] for t in train_runtime) + assert train_runtime <= self.results["train_runtime"] assert all(t >= self.results["eval_accuracy"] for t in eval_accuracy) assert all(t <= self.results["eval_loss"] for t in eval_loss) + + # dump tests result into json file to share in PR + with open(f"{estimator.latest_training_job.name}.json", "w") as outfile: + json.dump({"train_time": train_runtime, "eval_accuracy": eval_accuracy, "eval_loss": eval_loss}, outfile)