From f689743e7454b93f6cab4343026de03fa530bfb9 Mon Sep 17 00:00:00 2001 From: Philipp Schmid <32632186+philschmid@users.noreply.github.com> Date: Mon, 23 Aug 2021 10:18:07 +0200 Subject: [PATCH] SageMaker: Fix sagemaker DDP & metric logs (#13181) * Barrier -> barrier * added logger for metrics * removed stream handler in trainer * moved handler * removed streamhandler from trainer * updated test image and instance type added datasets version to test * Update tests/sagemaker/scripts/pytorch/requirements.txt Co-authored-by: Stas Bekman Co-authored-by: Stas Bekman --- src/transformers/trainer.py | 5 ----- src/transformers/trainer_pt_utils.py | 11 ++++++++++- src/transformers/training_args.py | 4 ++-- tests/sagemaker/conftest.py | 8 ++++---- tests/sagemaker/scripts/pytorch/requirements.txt | 3 ++- tests/sagemaker/test_multi_node_data_parallel.py | 6 +++--- 6 files changed, 21 insertions(+), 16 deletions(-) diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index f4ac212a16..f7bb615a94 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -26,7 +26,6 @@ import shutil import sys import time import warnings -from logging import StreamHandler from pathlib import Path from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union @@ -68,7 +67,6 @@ from .file_utils import ( is_sagemaker_dp_enabled, is_sagemaker_mp_enabled, is_torch_tpu_available, - is_training_run_on_sagemaker, ) from .modelcard import TrainingSummary from .modeling_utils import PreTrainedModel, unwrap_model @@ -173,9 +171,6 @@ if is_sagemaker_mp_enabled(): from .trainer_pt_utils import smp_forward_backward, smp_forward_only, smp_gather, smp_nested_concat -if is_training_run_on_sagemaker(): - logging.add_handler(StreamHandler(sys.stdout)) - if TYPE_CHECKING: import optuna diff --git a/src/transformers/trainer_pt_utils.py b/src/transformers/trainer_pt_utils.py index 2751f7be50..ba0a492975 100644 --- a/src/transformers/trainer_pt_utils.py +++ b/src/transformers/trainer_pt_utils.py @@ -20,9 +20,11 @@ import datetime import json import math import os +import sys import warnings from contextlib import contextmanager from dataclasses import dataclass +from logging import StreamHandler from typing import Dict, Iterator, List, Optional, Union import numpy as np @@ -32,7 +34,12 @@ from torch import nn from torch.utils.data import Dataset, IterableDataset, RandomSampler, Sampler from torch.utils.data.distributed import DistributedSampler -from .file_utils import is_sagemaker_dp_enabled, is_sagemaker_mp_enabled, is_torch_tpu_available +from .file_utils import ( + is_sagemaker_dp_enabled, + is_sagemaker_mp_enabled, + is_torch_tpu_available, + is_training_run_on_sagemaker, +) from .tokenization_utils_base import BatchEncoding from .utils import logging @@ -42,6 +49,8 @@ if is_sagemaker_dp_enabled(): else: import torch.distributed as dist +if is_training_run_on_sagemaker(): + logging.add_handler(StreamHandler(sys.stdout)) if is_torch_tpu_available(): import torch_xla.core.xla_model as xm diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py index 404d92a221..5f6b877bb0 100644 --- a/src/transformers/training_args.py +++ b/src/transformers/training_args.py @@ -1053,7 +1053,7 @@ class TrainingArguments: if is_torch_tpu_available(): xm.rendezvous(desc) elif is_sagemaker_dp_enabled(): - sm_dist.Barrier() + sm_dist.barrier() else: torch.distributed.barrier() yield @@ -1064,7 +1064,7 @@ class TrainingArguments: if is_torch_tpu_available(): xm.rendezvous(desc) elif is_sagemaker_dp_enabled(): - sm_dist.Barrier() + sm_dist.barrier() else: torch.distributed.barrier() else: diff --git a/tests/sagemaker/conftest.py b/tests/sagemaker/conftest.py index 076e06784b..8e7c0bbf1d 100644 --- a/tests/sagemaker/conftest.py +++ b/tests/sagemaker/conftest.py @@ -17,8 +17,8 @@ class SageMakerTestEnvironment: role = "arn:aws:iam::558105141721:role/sagemaker_execution_role" hyperparameters = { "task_name": "mnli", - "per_device_train_batch_size": 32, - "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 16, + "per_device_eval_batch_size": 16, "do_train": True, "do_eval": True, "do_predict": True, @@ -55,9 +55,9 @@ class SageMakerTestEnvironment: @property def image_uri(self) -> str: if self.framework == "pytorch": - return "763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-training:1.6.0-transformers4.4.2-gpu-py36-cu110-ubuntu18.04" + return "763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-training:1.7.1-transformers4.6.1-gpu-py36-cu110-ubuntu18.04" else: - return "763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-tensorflow-training:2.4.1-transformers4.4.2-gpu-py37-cu110-ubuntu18.04" + return "763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-tensorflow-training:2.4.1-transformers4.6.1-gpu-py37-cu110-ubuntu18.04" @pytest.fixture(scope="class") diff --git a/tests/sagemaker/scripts/pytorch/requirements.txt b/tests/sagemaker/scripts/pytorch/requirements.txt index 0194b67c40..4b628c8c94 100644 --- a/tests/sagemaker/scripts/pytorch/requirements.txt +++ b/tests/sagemaker/scripts/pytorch/requirements.txt @@ -1 +1,2 @@ -git+https://github.com/huggingface/transformers.git@master # install master or adjust ist with vX.X.X for installing version specific transforms \ No newline at end of file +git+https://github.com/huggingface/transformers.git@master # install master or adjust it with vX.X.X for installing version specific transforms +datasets==1.8.0 \ No newline at end of file diff --git a/tests/sagemaker/test_multi_node_data_parallel.py b/tests/sagemaker/test_multi_node_data_parallel.py index 0488e4fcf8..8fb60d64a6 100644 --- a/tests/sagemaker/test_multi_node_data_parallel.py +++ b/tests/sagemaker/test_multi_node_data_parallel.py @@ -27,21 +27,21 @@ if is_sagemaker_available(): "framework": "pytorch", "script": "run_glue.py", "model_name_or_path": "distilbert-base-cased", - "instance_type": "ml.p3dn.24xlarge", + "instance_type": "ml.p3.16xlarge", "results": {"train_runtime": 650, "eval_accuracy": 0.7, "eval_loss": 0.6}, }, { "framework": "pytorch", "script": "run_ddp.py", "model_name_or_path": "distilbert-base-cased", - "instance_type": "ml.p3dn.24xlarge", + "instance_type": "ml.p3.16xlarge", "results": {"train_runtime": 600, "eval_accuracy": 0.7, "eval_loss": 0.6}, }, { "framework": "tensorflow", "script": "run_tf_dist.py", "model_name_or_path": "distilbert-base-cased", - "instance_type": "ml.p3dn.24xlarge", + "instance_type": "ml.p3.16xlarge", "results": {"train_runtime": 600, "eval_accuracy": 0.6, "eval_loss": 0.7}, }, ]