SageMaker: Fix sagemaker DDP & metric logs (#13181)
* Barrier -> barrier
* added logger for metrics
* removed stream handler in trainer
* moved handler
* removed streamhandler from trainer
* updated test image and instance type; added datasets version to test
* Update tests/sagemaker/scripts/pytorch/requirements.txt

Co-authored-by: Stas Bekman <stas00@users.noreply.github.com>
Co-authored-by: Stas Bekman <stas00@users.noreply.github.com>
This commit is contained in:
@@ -17,8 +17,8 @@ class SageMakerTestEnvironment:
|
||||
role = "arn:aws:iam::558105141721:role/sagemaker_execution_role"
|
||||
hyperparameters = {
|
||||
"task_name": "mnli",
|
||||
"per_device_train_batch_size": 32,
|
||||
"per_device_eval_batch_size": 32,
|
||||
"per_device_train_batch_size": 16,
|
||||
"per_device_eval_batch_size": 16,
|
||||
"do_train": True,
|
||||
"do_eval": True,
|
||||
"do_predict": True,
|
||||
@@ -55,9 +55,9 @@ class SageMakerTestEnvironment:
|
||||
@property
|
||||
def image_uri(self) -> str:
|
||||
if self.framework == "pytorch":
|
||||
return "763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-training:1.6.0-transformers4.4.2-gpu-py36-cu110-ubuntu18.04"
|
||||
return "763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-training:1.7.1-transformers4.6.1-gpu-py36-cu110-ubuntu18.04"
|
||||
else:
|
||||
return "763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-tensorflow-training:2.4.1-transformers4.4.2-gpu-py37-cu110-ubuntu18.04"
|
||||
return "763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-tensorflow-training:2.4.1-transformers4.6.1-gpu-py37-cu110-ubuntu18.04"
|
||||
|
||||
|
||||
@pytest.fixture(scope="class")
|
||||
|
||||
Reference in New Issue
Block a user