SageMaker: Fix sagemaker DDP & metric logs (#13181)

* Barrier -> barrier

* added logger for metrics

* removed stream handler in trainer

* moved handler

* removed streamhandler from trainer

* updated test image and instance type added datasets version to test

* Update tests/sagemaker/scripts/pytorch/requirements.txt

Co-authored-by: Stas Bekman <stas00@users.noreply.github.com>

Co-authored-by: Stas Bekman <stas00@users.noreply.github.com>
This commit is contained in:
Philipp Schmid
2021-08-23 10:18:07 +02:00
committed by GitHub
parent 8679bd7144
commit f689743e74
6 changed files with 21 additions and 16 deletions

View File

@@ -17,8 +17,8 @@ class SageMakerTestEnvironment:
role = "arn:aws:iam::558105141721:role/sagemaker_execution_role"
hyperparameters = {
"task_name": "mnli",
"per_device_train_batch_size": 32,
"per_device_eval_batch_size": 32,
"per_device_train_batch_size": 16,
"per_device_eval_batch_size": 16,
"do_train": True,
"do_eval": True,
"do_predict": True,
@@ -55,9 +55,9 @@ class SageMakerTestEnvironment:
@property
def image_uri(self) -> str:
if self.framework == "pytorch":
return "763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-training:1.6.0-transformers4.4.2-gpu-py36-cu110-ubuntu18.04"
return "763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-training:1.7.1-transformers4.6.1-gpu-py36-cu110-ubuntu18.04"
else:
return "763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-tensorflow-training:2.4.1-transformers4.4.2-gpu-py37-cu110-ubuntu18.04"
return "763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-tensorflow-training:2.4.1-transformers4.6.1-gpu-py37-cu110-ubuntu18.04"
@pytest.fixture(scope="class")

View File

@@ -1 +1,2 @@
git+https://github.com/huggingface/transformers.git@master # install master or adjust ist with vX.X.X for installing version specific transforms
git+https://github.com/huggingface/transformers.git@master # install master or adjust it with vX.X.X for installing version specific transforms
datasets==1.8.0

View File

@@ -27,21 +27,21 @@ if is_sagemaker_available():
"framework": "pytorch",
"script": "run_glue.py",
"model_name_or_path": "distilbert-base-cased",
"instance_type": "ml.p3dn.24xlarge",
"instance_type": "ml.p3.16xlarge",
"results": {"train_runtime": 650, "eval_accuracy": 0.7, "eval_loss": 0.6},
},
{
"framework": "pytorch",
"script": "run_ddp.py",
"model_name_or_path": "distilbert-base-cased",
"instance_type": "ml.p3dn.24xlarge",
"instance_type": "ml.p3.16xlarge",
"results": {"train_runtime": 600, "eval_accuracy": 0.7, "eval_loss": 0.6},
},
{
"framework": "tensorflow",
"script": "run_tf_dist.py",
"model_name_or_path": "distilbert-base-cased",
"instance_type": "ml.p3dn.24xlarge",
"instance_type": "ml.p3.16xlarge",
"results": {"train_runtime": 600, "eval_accuracy": 0.6, "eval_loss": 0.7},
},
]