added json dump and extraction of train run time (#11167)

* added json dump and extraction of train run time

* make style happy
This commit is contained in:
Philipp Schmid
2021-04-09 21:18:00 +02:00
committed by GitHub
parent 07f0bb691d
commit 6f90c29eaa
3 changed files with 43 additions and 24 deletions

View File

@@ -1,3 +1,4 @@
import json
import os
import subprocess
import unittest
@@ -11,7 +12,7 @@ from . import is_sagemaker_available
if is_sagemaker_available():
from sagemaker import TrainingJobAnalytics
from sagemaker import Session, TrainingJobAnalytics
from sagemaker.huggingface import HuggingFace
@@ -27,21 +28,21 @@ if is_sagemaker_available():
"script": "run_glue.py",
"model_name_or_path": "distilbert-base-cased",
"instance_type": "ml.p3dn.24xlarge",
"results": {"train_runtime": 300, "eval_accuracy": 0.7, "eval_loss": 0.6},
"results": {"train_runtime": 650, "eval_accuracy": 0.7, "eval_loss": 0.6},
},
{
"framework": "pytorch",
"script": "run_ddp.py",
"model_name_or_path": "distilbert-base-cased",
"instance_type": "ml.p3dn.24xlarge",
"results": {"train_runtime": 300, "eval_accuracy": 0.7, "eval_loss": 0.6},
"results": {"train_runtime": 600, "eval_accuracy": 0.7, "eval_loss": 0.6},
},
{
"framework": "tensorflow",
"script": "run_tf_dist.py",
"model_name_or_path": "distilbert-base-cased",
"instance_type": "ml.p3dn.24xlarge",
"results": {"train_runtime": 500, "eval_accuracy": 0.6, "eval_loss": 0.7},
"results": {"train_runtime": 600, "eval_accuracy": 0.6, "eval_loss": 0.7},
},
]
)
@@ -88,17 +89,22 @@ class MultiNodeTest(unittest.TestCase):
# run training
estimator.fit()
# save csv
self.save_results_as_csv(estimator.latest_training_job.name)
# result dataframe
result_metrics_df = TrainingJobAnalytics(estimator.latest_training_job.name).dataframe()
# extract kpis
train_runtime = list(result_metrics_df[result_metrics_df.metric_name == "train_runtime"]["value"])
eval_accuracy = list(result_metrics_df[result_metrics_df.metric_name == "eval_accuracy"]["value"])
eval_loss = list(result_metrics_df[result_metrics_df.metric_name == "eval_loss"]["value"])
# get train time from SageMaker job, this includes starting, preprocessing, stopping
train_runtime = (
Session().describe_training_job(estimator.latest_training_job.name).get("TrainingTimeInSeconds", 999999)
)
# assert kpis
assert all(t <= self.results["train_runtime"] for t in train_runtime)
assert any(t >= self.results["eval_accuracy"] for t in eval_accuracy)
assert train_runtime <= self.results["train_runtime"]
assert all(t >= self.results["eval_accuracy"] for t in eval_accuracy)
assert all(t <= self.results["eval_loss"] for t in eval_loss)
# dump tests result into json file to share in PR
with open(f"{estimator.latest_training_job.name}.json", "w") as outfile:
json.dump({"train_time": train_runtime, "eval_accuracy": eval_accuracy, "eval_loss": eval_loss}, outfile)