From 21a772426dee10003fb0111abec514c9dcefda35 Mon Sep 17 00:00:00 2001 From: Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com> Date: Fri, 17 Jun 2022 23:29:35 +0530 Subject: [PATCH] Migrate HFDeepSpeedConfig from trfrs to accelerate (#17623) * Migrate HFDeepSpeedConfig from trfrs to accelerate * add `accelerate` to testing dep * addressing comments * addressing comments Using `_shared_state` and avoiding object creation. This is necessary as `notebook_launcher` in `launchers.py` checks `len(AcceleratorState._shared_state)>0` to throw an error. * resolving comments 1. Use simple API from accelerate to manage the deepspeed config integration 2. Update the related documentation * reverting changes and addressing comments * docstring correction * addressing nits * addressing nits * addressing nits 3 * bumping up the accelerate version to 0.10.0 * resolving import * update setup.py to include deepspeed dependencies * Update dependency_versions_table.py * fixing imports * reverting changes to CI dependencies for "run_tests_pipelines_tf*" tests These changes didn't help with resolving the failures and I believe this needs to be addressed in another PR. * removing `accelerate` as hard dependency Resolves issues related to CI Tests * adding `accelerate` as dependency for building docs resolves failure in Build PR Documentation test * adding `accelerate` as dependency in "dev" to resolve doc build issue * resolving comments 1. adding `accelerate` to extras["all"] 2. 
Including check for accelerate too before import HFDeepSpeedConfig from there Co-Authored-By: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * resolving comments Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- setup.py | 36 +++--- src/transformers/deepspeed.py | 117 ++---------------- src/transformers/dependency_versions_table.py | 2 +- src/transformers/training_args.py | 3 + 4 files changed, 35 insertions(+), 123 deletions(-) diff --git a/setup.py b/setup.py index 668ec30024..e73fca64d1 100644 --- a/setup.py +++ b/setup.py @@ -97,7 +97,7 @@ if stale_egg_info.exists(): # 2. once modified, run: `make deps_table_update` to update src/transformers/dependency_versions_table.py _deps = [ "Pillow", - "accelerate>=0.9.0", + "accelerate>=0.10.0", "black~=22.0,>=22.3", "codecarbon==1.2.0", "cookiecutter==1.7.3", @@ -242,6 +242,7 @@ extras["tf"] = deps_list("tensorflow", "onnxconverter-common", "tf2onnx") extras["tf-cpu"] = deps_list("tensorflow-cpu", "onnxconverter-common", "tf2onnx") extras["torch"] = deps_list("torch") +extras["accelerate"] = deps_list("accelerate") if os.name == "nt": # windows extras["retrieval"] = deps_list("datasets") # faiss is not supported on windows @@ -257,7 +258,7 @@ extras["onnx"] = deps_list("onnxconverter-common", "tf2onnx") + extras["onnxrunt extras["modelcreation"] = deps_list("cookiecutter") extras["sagemaker"] = deps_list("sagemaker") -extras["deepspeed"] = deps_list("deepspeed") +extras["deepspeed"] = deps_list("deepspeed") + extras["accelerate"] extras["fairscale"] = deps_list("fairscale") extras["optuna"] = deps_list("optuna") extras["ray"] = deps_list("ray[tune]") @@ -293,9 +294,9 @@ extras["testing"] = ( "nltk", "GitPython", "hf-doc-builder", - "protobuf", # Can be removed once we can unpin protobuf + "protobuf", # Can be removed once we can unpin protobuf "sacremoses", - "rjieba" + "rjieba", ) + extras["retrieval"] + extras["modelcreation"] @@ -316,6 +317,7 @@ extras["all"] = ( + 
extras["integrations"] + extras["timm"] + extras["codecarbon"] + + extras["accelerate"] ) # Might need to add doc-builder and some specific deps in the future @@ -325,8 +327,8 @@ extras["docs_specific"] = ["hf-doc-builder"] extras["docs"] = extras["all"] + extras["docs_specific"] extras["dev-torch"] = ( - extras['testing'] - + extras['torch'] + extras["testing"] + + extras["torch"] + extras["sentencepiece"] + extras["tokenizers"] + extras["torch-speech"] @@ -342,17 +344,17 @@ extras["dev-torch"] = ( + extras["onnxruntime"] ) extras["dev-tensorflow"] = ( - extras['testing'] - + extras['tf'] - + extras["sentencepiece"] - + extras["tokenizers"] - + extras["vision"] - + extras["quality"] - + extras["docs_specific"] - + extras["sklearn"] - + extras["modelcreation"] - + extras["onnx"] - + extras["tf-speech"] + extras["testing"] + + extras["tf"] + + extras["sentencepiece"] + + extras["tokenizers"] + + extras["vision"] + + extras["quality"] + + extras["docs_specific"] + + extras["sklearn"] + + extras["modelcreation"] + + extras["onnx"] + + extras["tf-speech"] ) extras["dev"] = ( extras["all"] diff --git a/src/transformers/deepspeed.py b/src/transformers/deepspeed.py index 9fa22d4629..9465307f5f 100644 --- a/src/transformers/deepspeed.py +++ b/src/transformers/deepspeed.py @@ -16,14 +16,12 @@ Integration with Deepspeed """ import importlib.util -import io -import json import weakref from copy import deepcopy from functools import partialmethod from .dependency_versions_check import dep_version_check -from .utils import is_torch_available, logging +from .utils import is_accelerate_available, is_torch_available, logging if is_torch_available(): @@ -36,7 +34,15 @@ def is_deepspeed_available(): return importlib.util.find_spec("deepspeed") is not None -class HfDeepSpeedConfig: +if is_accelerate_available() and is_deepspeed_available(): + from accelerate.utils.deepspeed import HfDeepSpeedConfig as DeepSpeedConfig +else: + # Inherits from a dummy `object` if accelerate is not 
available, so that python succeeds to import this file. + # Deepspeed glue code will never inherit this dummy object as it checks if accelerate is available. + from builtins import object as DeepSpeedConfig + + +class HfDeepSpeedConfig(DeepSpeedConfig): """ This object contains a DeepSpeed configuration dictionary and can be quickly queried for things like zero stage. @@ -56,108 +62,9 @@ class HfDeepSpeedConfig: def __init__(self, config_file_or_dict): # set global weakref object set_hf_deepspeed_config(self) - + dep_version_check("accelerate") dep_version_check("deepspeed") - - if isinstance(config_file_or_dict, dict): - # Don't modify user's data should they want to reuse it (e.g. in tests), because once we - # modified it, it will not be accepted here again, since `auto` values would have been overridden - config = deepcopy(config_file_or_dict) - elif isinstance(config_file_or_dict, str): - with io.open(config_file_or_dict, "r", encoding="utf-8") as f: - config = json.load(f) - else: - raise ValueError("expecting either a path to a DeepSpeed config file or a pre-populated dict") - self.config = config - - # zero stage - this is done as early as possible, before model is created, to allow - # ``is_deepspeed_zero3_enabled`` query and getting to the early deepspeed config object - # during ``zero.Init()`` which needs to know the dtype, and some other hparams. 
- self._stage = self.get_value("zero_optimization.stage", -1) - - # offload - self._offload = False - if self.is_zero2() or self.is_zero3(): - offload_devices_valid = set(["cpu", "nvme"]) - offload_devices = set( - [ - self.get_value("zero_optimization.offload_optimizer.device"), - self.get_value("zero_optimization.offload_param.device"), - ] - ) - if len(offload_devices & offload_devices_valid) > 0: - self._offload = True - - def find_config_node(self, ds_key_long): - config = self.config - - # find the config node of interest if it exists - nodes = ds_key_long.split(".") - ds_key = nodes.pop() - for node in nodes: - config = config.get(node) - if config is None: - return None, ds_key - - return config, ds_key - - def get_value(self, ds_key_long, default=None): - """ - Returns the set value or `default` if no value is set - """ - config, ds_key = self.find_config_node(ds_key_long) - if config is None: - return default - return config.get(ds_key, default) - - def del_config_sub_tree(self, ds_key_long, must_exist=False): - """ - Deletes a sub-section of the config file if it's found. - - Unless `must_exist` is `True` the section doesn't have to exist. - """ - config = self.config - - # find the config node of interest if it exists - nodes = ds_key_long.split(".") - for node in nodes: - parent_config = config - config = config.get(node) - if config is None: - if must_exist: - raise ValueError(f"Can't find {ds_key_long} entry in the config: {self.config}") - else: - return - - # if found remove it - if parent_config is not None: - parent_config.pop(node) - - def is_true(self, ds_key_long): - """ - Returns `True`/``False` only if the value is set, always `False` otherwise. So use this method to ask the very - specific question of whether the value is set to `True` (and it's not set to `False`` or isn't set). 
- - """ - value = self.get_value(ds_key_long) - return False if value is None else bool(value) - - def is_false(self, ds_key_long): - """ - Returns `True`/``False` only if the value is set, always `False` otherwise. So use this method to ask the very - specific question of whether the value is set to `False` (and it's not set to `True`` or isn't set). - """ - value = self.get_value(ds_key_long) - return False if value is None else not bool(value) - - def is_zero2(self): - return self._stage == 2 - - def is_zero3(self): - return self._stage == 3 - - def is_offload(self): - return self._offload + super().__init__(config_file_or_dict) class HfTrainerDeepSpeedConfig(HfDeepSpeedConfig): diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py index 0fcc39dbea..e0da1a303f 100644 --- a/src/transformers/dependency_versions_table.py +++ b/src/transformers/dependency_versions_table.py @@ -3,7 +3,7 @@ # 2. run `make deps_table_update`` deps = { "Pillow": "Pillow", - "accelerate": "accelerate>=0.9.0", + "accelerate": "accelerate>=0.10.0", "black": "black~=22.0,>=22.3", "codecarbon": "codecarbon==1.2.0", "cookiecutter": "cookiecutter==1.7.3", diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py index 7d7f68dfc8..d80bc6737c 100644 --- a/src/transformers/training_args.py +++ b/src/transformers/training_args.py @@ -35,6 +35,7 @@ from .utils import ( ExplicitEnum, cached_property, get_full_repo_name, + is_accelerate_available, is_sagemaker_dp_enabled, is_sagemaker_mp_enabled, is_torch_available, @@ -1163,6 +1164,8 @@ class TrainingArguments: if self.deepspeed: # - must be run very last in arg parsing, since it will use a lot of these settings. # - must be run before the model is created. 
+ if not is_accelerate_available(): + raise ValueError("--deepspeed requires Accelerate to be installed: `pip install accelerate`.") from transformers.deepspeed import HfTrainerDeepSpeedConfig # will be used later by the Trainer