added json dump and extraction of train run time (#11167)
* added json dump and extraction of train run time
* make style happy
This commit is contained in:
@@ -1,3 +1,4 @@
|
|||||||
|
import json
|
||||||
import os
|
import os
|
||||||
import subprocess
|
import subprocess
|
||||||
import unittest
|
import unittest
|
||||||
@@ -11,7 +12,7 @@ from . import is_sagemaker_available
|
|||||||
|
|
||||||
|
|
||||||
if is_sagemaker_available():
|
if is_sagemaker_available():
|
||||||
from sagemaker import TrainingJobAnalytics
|
from sagemaker import Session, TrainingJobAnalytics
|
||||||
from sagemaker.huggingface import HuggingFace
|
from sagemaker.huggingface import HuggingFace
|
||||||
|
|
||||||
|
|
||||||
@@ -27,21 +28,21 @@ if is_sagemaker_available():
|
|||||||
"script": "run_glue.py",
|
"script": "run_glue.py",
|
||||||
"model_name_or_path": "distilbert-base-cased",
|
"model_name_or_path": "distilbert-base-cased",
|
||||||
"instance_type": "ml.p3dn.24xlarge",
|
"instance_type": "ml.p3dn.24xlarge",
|
||||||
"results": {"train_runtime": 300, "eval_accuracy": 0.7, "eval_loss": 0.6},
|
"results": {"train_runtime": 650, "eval_accuracy": 0.7, "eval_loss": 0.6},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"framework": "pytorch",
|
"framework": "pytorch",
|
||||||
"script": "run_ddp.py",
|
"script": "run_ddp.py",
|
||||||
"model_name_or_path": "distilbert-base-cased",
|
"model_name_or_path": "distilbert-base-cased",
|
||||||
"instance_type": "ml.p3dn.24xlarge",
|
"instance_type": "ml.p3dn.24xlarge",
|
||||||
"results": {"train_runtime": 300, "eval_accuracy": 0.7, "eval_loss": 0.6},
|
"results": {"train_runtime": 600, "eval_accuracy": 0.7, "eval_loss": 0.6},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"framework": "tensorflow",
|
"framework": "tensorflow",
|
||||||
"script": "run_tf_dist.py",
|
"script": "run_tf_dist.py",
|
||||||
"model_name_or_path": "distilbert-base-cased",
|
"model_name_or_path": "distilbert-base-cased",
|
||||||
"instance_type": "ml.p3dn.24xlarge",
|
"instance_type": "ml.p3dn.24xlarge",
|
||||||
"results": {"train_runtime": 500, "eval_accuracy": 0.6, "eval_loss": 0.7},
|
"results": {"train_runtime": 600, "eval_accuracy": 0.6, "eval_loss": 0.7},
|
||||||
},
|
},
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
@@ -88,17 +89,22 @@ class MultiNodeTest(unittest.TestCase):
|
|||||||
# run training
|
# run training
|
||||||
estimator.fit()
|
estimator.fit()
|
||||||
|
|
||||||
# save csv
|
|
||||||
self.save_results_as_csv(estimator.latest_training_job.name)
|
|
||||||
# result dataframe
|
# result dataframe
|
||||||
result_metrics_df = TrainingJobAnalytics(estimator.latest_training_job.name).dataframe()
|
result_metrics_df = TrainingJobAnalytics(estimator.latest_training_job.name).dataframe()
|
||||||
|
|
||||||
# extract kpis
|
# extract kpis
|
||||||
train_runtime = list(result_metrics_df[result_metrics_df.metric_name == "train_runtime"]["value"])
|
|
||||||
eval_accuracy = list(result_metrics_df[result_metrics_df.metric_name == "eval_accuracy"]["value"])
|
eval_accuracy = list(result_metrics_df[result_metrics_df.metric_name == "eval_accuracy"]["value"])
|
||||||
eval_loss = list(result_metrics_df[result_metrics_df.metric_name == "eval_loss"]["value"])
|
eval_loss = list(result_metrics_df[result_metrics_df.metric_name == "eval_loss"]["value"])
|
||||||
|
# get train time from SageMaker job, this includes starting, preprocessing, stopping
|
||||||
|
train_runtime = (
|
||||||
|
Session().describe_training_job(estimator.latest_training_job.name).get("TrainingTimeInSeconds", 999999)
|
||||||
|
)
|
||||||
|
|
||||||
# assert kpis
|
# assert kpis
|
||||||
assert all(t <= self.results["train_runtime"] for t in train_runtime)
|
assert train_runtime <= self.results["train_runtime"]
|
||||||
assert any(t >= self.results["eval_accuracy"] for t in eval_accuracy)
|
assert all(t >= self.results["eval_accuracy"] for t in eval_accuracy)
|
||||||
assert all(t <= self.results["eval_loss"] for t in eval_loss)
|
assert all(t <= self.results["eval_loss"] for t in eval_loss)
|
||||||
|
|
||||||
|
# dump tests result into json file to share in PR
|
||||||
|
with open(f"{estimator.latest_training_job.name}.json", "w") as outfile:
|
||||||
|
json.dump({"train_time": train_runtime, "eval_accuracy": eval_accuracy, "eval_loss": eval_loss}, outfile)
|
||||||
|
|||||||
@@ -1,3 +1,4 @@
|
|||||||
|
import json
|
||||||
import os
|
import os
|
||||||
import subprocess
|
import subprocess
|
||||||
import unittest
|
import unittest
|
||||||
@@ -11,7 +12,7 @@ from . import is_sagemaker_available
|
|||||||
|
|
||||||
|
|
||||||
if is_sagemaker_available():
|
if is_sagemaker_available():
|
||||||
from sagemaker import TrainingJobAnalytics
|
from sagemaker import Session, TrainingJobAnalytics
|
||||||
from sagemaker.huggingface import HuggingFace
|
from sagemaker.huggingface import HuggingFace
|
||||||
|
|
||||||
|
|
||||||
@@ -27,14 +28,14 @@ if is_sagemaker_available():
|
|||||||
"script": "run_glue_model_parallelism.py",
|
"script": "run_glue_model_parallelism.py",
|
||||||
"model_name_or_path": "roberta-large",
|
"model_name_or_path": "roberta-large",
|
||||||
"instance_type": "ml.p3dn.24xlarge",
|
"instance_type": "ml.p3dn.24xlarge",
|
||||||
"results": {"train_runtime": 700, "eval_accuracy": 0.3, "eval_loss": 1.2},
|
"results": {"train_runtime": 1500, "eval_accuracy": 0.3, "eval_loss": 1.2},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"framework": "pytorch",
|
"framework": "pytorch",
|
||||||
"script": "run_glue.py",
|
"script": "run_glue.py",
|
||||||
"model_name_or_path": "roberta-large",
|
"model_name_or_path": "roberta-large",
|
||||||
"instance_type": "ml.p3dn.24xlarge",
|
"instance_type": "ml.p3dn.24xlarge",
|
||||||
"results": {"train_runtime": 700, "eval_accuracy": 0.3, "eval_loss": 1.2},
|
"results": {"train_runtime": 1500, "eval_accuracy": 0.3, "eval_loss": 1.2},
|
||||||
},
|
},
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
@@ -69,13 +70,14 @@ class MultiNodeTest(unittest.TestCase):
|
|||||||
|
|
||||||
distribution = {"smdistributed": {"modelparallel": smp_options}, "mpi": mpi_options}
|
distribution = {"smdistributed": {"modelparallel": smp_options}, "mpi": mpi_options}
|
||||||
|
|
||||||
|
name_extension = "trainer" if self.script == "run_glue.py" else "smtrainer"
|
||||||
# creates estimator
|
# creates estimator
|
||||||
return HuggingFace(
|
return HuggingFace(
|
||||||
entry_point=self.script,
|
entry_point=self.script,
|
||||||
source_dir=self.env.test_path,
|
source_dir=self.env.test_path,
|
||||||
role=self.env.role,
|
role=self.env.role,
|
||||||
image_uri=self.env.image_uri,
|
image_uri=self.env.image_uri,
|
||||||
base_job_name=f"{self.env.base_job_name}-{instance_count}-smp",
|
base_job_name=f"{self.env.base_job_name}-{instance_count}-smp-{name_extension}",
|
||||||
instance_count=instance_count,
|
instance_count=instance_count,
|
||||||
instance_type=self.instance_type,
|
instance_type=self.instance_type,
|
||||||
debugger_hook_config=False,
|
debugger_hook_config=False,
|
||||||
@@ -101,17 +103,22 @@ class MultiNodeTest(unittest.TestCase):
|
|||||||
# run training
|
# run training
|
||||||
estimator.fit()
|
estimator.fit()
|
||||||
|
|
||||||
# save csv
|
|
||||||
self.save_results_as_csv(estimator.latest_training_job.name)
|
|
||||||
# result dataframe
|
# result dataframe
|
||||||
result_metrics_df = TrainingJobAnalytics(estimator.latest_training_job.name).dataframe()
|
result_metrics_df = TrainingJobAnalytics(estimator.latest_training_job.name).dataframe()
|
||||||
|
|
||||||
# extract kpis
|
# extract kpis
|
||||||
train_runtime = list(result_metrics_df[result_metrics_df.metric_name == "train_runtime"]["value"])
|
|
||||||
eval_accuracy = list(result_metrics_df[result_metrics_df.metric_name == "eval_accuracy"]["value"])
|
eval_accuracy = list(result_metrics_df[result_metrics_df.metric_name == "eval_accuracy"]["value"])
|
||||||
eval_loss = list(result_metrics_df[result_metrics_df.metric_name == "eval_loss"]["value"])
|
eval_loss = list(result_metrics_df[result_metrics_df.metric_name == "eval_loss"]["value"])
|
||||||
|
# get train time from SageMaker job, this includes starting, preprocessing, stopping
|
||||||
|
train_runtime = (
|
||||||
|
Session().describe_training_job(estimator.latest_training_job.name).get("TrainingTimeInSeconds", 999999)
|
||||||
|
)
|
||||||
|
|
||||||
# assert kpis
|
# assert kpis
|
||||||
assert all(t <= self.results["train_runtime"] for t in train_runtime)
|
assert train_runtime <= self.results["train_runtime"]
|
||||||
assert all(t >= self.results["eval_accuracy"] for t in eval_accuracy)
|
assert all(t >= self.results["eval_accuracy"] for t in eval_accuracy)
|
||||||
assert all(t <= self.results["eval_loss"] for t in eval_loss)
|
assert all(t <= self.results["eval_loss"] for t in eval_loss)
|
||||||
|
|
||||||
|
# dump tests result into json file to share in PR
|
||||||
|
with open(f"{estimator.latest_training_job.name}.json", "w") as outfile:
|
||||||
|
json.dump({"train_time": train_runtime, "eval_accuracy": eval_accuracy, "eval_loss": eval_loss}, outfile)
|
||||||
|
|||||||
@@ -1,3 +1,4 @@
|
|||||||
|
import json
|
||||||
import os
|
import os
|
||||||
import subprocess
|
import subprocess
|
||||||
import unittest
|
import unittest
|
||||||
@@ -11,7 +12,7 @@ from . import is_sagemaker_available
|
|||||||
|
|
||||||
|
|
||||||
if is_sagemaker_available():
|
if is_sagemaker_available():
|
||||||
from sagemaker import TrainingJobAnalytics
|
from sagemaker import Session, TrainingJobAnalytics
|
||||||
from sagemaker.huggingface import HuggingFace
|
from sagemaker.huggingface import HuggingFace
|
||||||
|
|
||||||
|
|
||||||
@@ -27,14 +28,14 @@ if is_sagemaker_available():
|
|||||||
"script": "run_glue.py",
|
"script": "run_glue.py",
|
||||||
"model_name_or_path": "distilbert-base-cased",
|
"model_name_or_path": "distilbert-base-cased",
|
||||||
"instance_type": "ml.g4dn.xlarge",
|
"instance_type": "ml.g4dn.xlarge",
|
||||||
"results": {"train_runtime": 200, "eval_accuracy": 0.6, "eval_loss": 0.9},
|
"results": {"train_runtime": 650, "eval_accuracy": 0.6, "eval_loss": 0.9},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"framework": "tensorflow",
|
"framework": "tensorflow",
|
||||||
"script": "run_tf.py",
|
"script": "run_tf.py",
|
||||||
"model_name_or_path": "distilbert-base-cased",
|
"model_name_or_path": "distilbert-base-cased",
|
||||||
"instance_type": "ml.g4dn.xlarge",
|
"instance_type": "ml.g4dn.xlarge",
|
||||||
"results": {"train_runtime": 350, "eval_accuracy": 0.3, "eval_loss": 0.9},
|
"results": {"train_runtime": 600, "eval_accuracy": 0.3, "eval_loss": 0.9},
|
||||||
},
|
},
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
@@ -74,17 +75,22 @@ class SingleNodeTest(unittest.TestCase):
|
|||||||
# run training
|
# run training
|
||||||
estimator.fit()
|
estimator.fit()
|
||||||
|
|
||||||
# save csv
|
|
||||||
self.save_results_as_csv(estimator.latest_training_job.name)
|
|
||||||
# result dataframe
|
# result dataframe
|
||||||
result_metrics_df = TrainingJobAnalytics(estimator.latest_training_job.name).dataframe()
|
result_metrics_df = TrainingJobAnalytics(estimator.latest_training_job.name).dataframe()
|
||||||
|
|
||||||
# extract kpis
|
# extract kpis
|
||||||
train_runtime = list(result_metrics_df[result_metrics_df.metric_name == "train_runtime"]["value"])
|
|
||||||
eval_accuracy = list(result_metrics_df[result_metrics_df.metric_name == "eval_accuracy"]["value"])
|
eval_accuracy = list(result_metrics_df[result_metrics_df.metric_name == "eval_accuracy"]["value"])
|
||||||
eval_loss = list(result_metrics_df[result_metrics_df.metric_name == "eval_loss"]["value"])
|
eval_loss = list(result_metrics_df[result_metrics_df.metric_name == "eval_loss"]["value"])
|
||||||
|
# get train time from SageMaker job, this includes starting, preprocessing, stopping
|
||||||
|
train_runtime = (
|
||||||
|
Session().describe_training_job(estimator.latest_training_job.name).get("TrainingTimeInSeconds", 999999)
|
||||||
|
)
|
||||||
|
|
||||||
# assert kpis
|
# assert kpis
|
||||||
assert all(t <= self.results["train_runtime"] for t in train_runtime)
|
assert train_runtime <= self.results["train_runtime"]
|
||||||
assert all(t >= self.results["eval_accuracy"] for t in eval_accuracy)
|
assert all(t >= self.results["eval_accuracy"] for t in eval_accuracy)
|
||||||
assert all(t <= self.results["eval_loss"] for t in eval_loss)
|
assert all(t <= self.results["eval_loss"] for t in eval_loss)
|
||||||
|
|
||||||
|
# dump tests result into json file to share in PR
|
||||||
|
with open(f"{estimator.latest_training_job.name}.json", "w") as outfile:
|
||||||
|
json.dump({"train_time": train_runtime, "eval_accuracy": eval_accuracy, "eval_loss": eval_loss}, outfile)
|
||||||
|
|||||||
Reference in New Issue
Block a user