Migrate HFDeepSpeedConfig from trfrs to accelerate (#17623)
* Migrate HFDeepSpeedConfig from trfrs to accelerate * add `accelerate` to testing dep * addressing comments * addressing comments Using `_shared_state` and avoiding object creation. This is necessary as `notebook_launcher` in `launchers.py` checks `len(AcceleratorState._shared_state)>0` to throw an error. * resolving comments 1. Use simple API from accelerate to manage the deepspeed config integration 2. Update the related documentation * reverting changes and addressing comments * docstring correction * addressing nits * addressing nits * addressing nits 3 * bumping up the accelerate version to 0.10.0 * resolving import * update setup.py to include deepspeed dependencies * Update dependency_versions_table.py * fixing imports * reverting changes to CI dependencies for "run_tests_pipelines_tf*" tests These changes didn't help with resolving the failures and I believe this needs to be addressed in another PR. * removing `accelerate` as hard dependency Resolves issues related to CI Tests * adding `accelerate` as dependency for building docs resolves failure in Build PR Documentation test * adding `accelerate` as dependency in "dev" to resolve doc build issue * resolving comments 1. adding `accelerate` to extras["all"] 2. Including check for accelerate too before importing HFDeepSpeedConfig from there Co-Authored-By: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * resolving comments Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
This commit is contained in:
committed by
GitHub
parent
e44a569fef
commit
21a772426d
36
setup.py
36
setup.py
@@ -97,7 +97,7 @@ if stale_egg_info.exists():
|
|||||||
# 2. once modified, run: `make deps_table_update` to update src/transformers/dependency_versions_table.py
|
# 2. once modified, run: `make deps_table_update` to update src/transformers/dependency_versions_table.py
|
||||||
_deps = [
|
_deps = [
|
||||||
"Pillow",
|
"Pillow",
|
||||||
"accelerate>=0.9.0",
|
"accelerate>=0.10.0",
|
||||||
"black~=22.0,>=22.3",
|
"black~=22.0,>=22.3",
|
||||||
"codecarbon==1.2.0",
|
"codecarbon==1.2.0",
|
||||||
"cookiecutter==1.7.3",
|
"cookiecutter==1.7.3",
|
||||||
@@ -242,6 +242,7 @@ extras["tf"] = deps_list("tensorflow", "onnxconverter-common", "tf2onnx")
|
|||||||
extras["tf-cpu"] = deps_list("tensorflow-cpu", "onnxconverter-common", "tf2onnx")
|
extras["tf-cpu"] = deps_list("tensorflow-cpu", "onnxconverter-common", "tf2onnx")
|
||||||
|
|
||||||
extras["torch"] = deps_list("torch")
|
extras["torch"] = deps_list("torch")
|
||||||
|
extras["accelerate"] = deps_list("accelerate")
|
||||||
|
|
||||||
if os.name == "nt": # windows
|
if os.name == "nt": # windows
|
||||||
extras["retrieval"] = deps_list("datasets") # faiss is not supported on windows
|
extras["retrieval"] = deps_list("datasets") # faiss is not supported on windows
|
||||||
@@ -257,7 +258,7 @@ extras["onnx"] = deps_list("onnxconverter-common", "tf2onnx") + extras["onnxrunt
|
|||||||
extras["modelcreation"] = deps_list("cookiecutter")
|
extras["modelcreation"] = deps_list("cookiecutter")
|
||||||
|
|
||||||
extras["sagemaker"] = deps_list("sagemaker")
|
extras["sagemaker"] = deps_list("sagemaker")
|
||||||
extras["deepspeed"] = deps_list("deepspeed")
|
extras["deepspeed"] = deps_list("deepspeed") + extras["accelerate"]
|
||||||
extras["fairscale"] = deps_list("fairscale")
|
extras["fairscale"] = deps_list("fairscale")
|
||||||
extras["optuna"] = deps_list("optuna")
|
extras["optuna"] = deps_list("optuna")
|
||||||
extras["ray"] = deps_list("ray[tune]")
|
extras["ray"] = deps_list("ray[tune]")
|
||||||
@@ -293,9 +294,9 @@ extras["testing"] = (
|
|||||||
"nltk",
|
"nltk",
|
||||||
"GitPython",
|
"GitPython",
|
||||||
"hf-doc-builder",
|
"hf-doc-builder",
|
||||||
"protobuf", # Can be removed once we can unpin protobuf
|
"protobuf", # Can be removed once we can unpin protobuf
|
||||||
"sacremoses",
|
"sacremoses",
|
||||||
"rjieba"
|
"rjieba",
|
||||||
)
|
)
|
||||||
+ extras["retrieval"]
|
+ extras["retrieval"]
|
||||||
+ extras["modelcreation"]
|
+ extras["modelcreation"]
|
||||||
@@ -316,6 +317,7 @@ extras["all"] = (
|
|||||||
+ extras["integrations"]
|
+ extras["integrations"]
|
||||||
+ extras["timm"]
|
+ extras["timm"]
|
||||||
+ extras["codecarbon"]
|
+ extras["codecarbon"]
|
||||||
|
+ extras["accelerate"]
|
||||||
)
|
)
|
||||||
|
|
||||||
# Might need to add doc-builder and some specific deps in the future
|
# Might need to add doc-builder and some specific deps in the future
|
||||||
@@ -325,8 +327,8 @@ extras["docs_specific"] = ["hf-doc-builder"]
|
|||||||
extras["docs"] = extras["all"] + extras["docs_specific"]
|
extras["docs"] = extras["all"] + extras["docs_specific"]
|
||||||
|
|
||||||
extras["dev-torch"] = (
|
extras["dev-torch"] = (
|
||||||
extras['testing']
|
extras["testing"]
|
||||||
+ extras['torch']
|
+ extras["torch"]
|
||||||
+ extras["sentencepiece"]
|
+ extras["sentencepiece"]
|
||||||
+ extras["tokenizers"]
|
+ extras["tokenizers"]
|
||||||
+ extras["torch-speech"]
|
+ extras["torch-speech"]
|
||||||
@@ -342,17 +344,17 @@ extras["dev-torch"] = (
|
|||||||
+ extras["onnxruntime"]
|
+ extras["onnxruntime"]
|
||||||
)
|
)
|
||||||
extras["dev-tensorflow"] = (
|
extras["dev-tensorflow"] = (
|
||||||
extras['testing']
|
extras["testing"]
|
||||||
+ extras['tf']
|
+ extras["tf"]
|
||||||
+ extras["sentencepiece"]
|
+ extras["sentencepiece"]
|
||||||
+ extras["tokenizers"]
|
+ extras["tokenizers"]
|
||||||
+ extras["vision"]
|
+ extras["vision"]
|
||||||
+ extras["quality"]
|
+ extras["quality"]
|
||||||
+ extras["docs_specific"]
|
+ extras["docs_specific"]
|
||||||
+ extras["sklearn"]
|
+ extras["sklearn"]
|
||||||
+ extras["modelcreation"]
|
+ extras["modelcreation"]
|
||||||
+ extras["onnx"]
|
+ extras["onnx"]
|
||||||
+ extras["tf-speech"]
|
+ extras["tf-speech"]
|
||||||
)
|
)
|
||||||
extras["dev"] = (
|
extras["dev"] = (
|
||||||
extras["all"]
|
extras["all"]
|
||||||
|
|||||||
@@ -16,14 +16,12 @@ Integration with Deepspeed
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
import importlib.util
|
import importlib.util
|
||||||
import io
|
|
||||||
import json
|
|
||||||
import weakref
|
import weakref
|
||||||
from copy import deepcopy
|
from copy import deepcopy
|
||||||
from functools import partialmethod
|
from functools import partialmethod
|
||||||
|
|
||||||
from .dependency_versions_check import dep_version_check
|
from .dependency_versions_check import dep_version_check
|
||||||
from .utils import is_torch_available, logging
|
from .utils import is_accelerate_available, is_torch_available, logging
|
||||||
|
|
||||||
|
|
||||||
if is_torch_available():
|
if is_torch_available():
|
||||||
@@ -36,7 +34,15 @@ def is_deepspeed_available():
|
|||||||
return importlib.util.find_spec("deepspeed") is not None
|
return importlib.util.find_spec("deepspeed") is not None
|
||||||
|
|
||||||
|
|
||||||
class HfDeepSpeedConfig:
|
if is_accelerate_available() and is_deepspeed_available():
|
||||||
|
from accelerate.utils.deepspeed import HfDeepSpeedConfig as DeepSpeedConfig
|
||||||
|
else:
|
||||||
|
# Inherits from a dummy `object` if accelerate is not available, so that python succeeds to import this file.
|
||||||
|
# Deepspeed glue code will never inherit this dummy object as it checks if accelerate is available.
|
||||||
|
from builtins import object as DeepSpeedConfig
|
||||||
|
|
||||||
|
|
||||||
|
class HfDeepSpeedConfig(DeepSpeedConfig):
|
||||||
"""
|
"""
|
||||||
This object contains a DeepSpeed configuration dictionary and can be quickly queried for things like zero stage.
|
This object contains a DeepSpeed configuration dictionary and can be quickly queried for things like zero stage.
|
||||||
|
|
||||||
@@ -56,108 +62,9 @@ class HfDeepSpeedConfig:
|
|||||||
def __init__(self, config_file_or_dict):
|
def __init__(self, config_file_or_dict):
|
||||||
# set global weakref object
|
# set global weakref object
|
||||||
set_hf_deepspeed_config(self)
|
set_hf_deepspeed_config(self)
|
||||||
|
dep_version_check("accelerate")
|
||||||
dep_version_check("deepspeed")
|
dep_version_check("deepspeed")
|
||||||
|
super().__init__(config_file_or_dict)
|
||||||
if isinstance(config_file_or_dict, dict):
|
|
||||||
# Don't modify user's data should they want to reuse it (e.g. in tests), because once we
|
|
||||||
# modified it, it will not be accepted here again, since `auto` values would have been overridden
|
|
||||||
config = deepcopy(config_file_or_dict)
|
|
||||||
elif isinstance(config_file_or_dict, str):
|
|
||||||
with io.open(config_file_or_dict, "r", encoding="utf-8") as f:
|
|
||||||
config = json.load(f)
|
|
||||||
else:
|
|
||||||
raise ValueError("expecting either a path to a DeepSpeed config file or a pre-populated dict")
|
|
||||||
self.config = config
|
|
||||||
|
|
||||||
# zero stage - this is done as early as possible, before model is created, to allow
|
|
||||||
# ``is_deepspeed_zero3_enabled`` query and getting to the early deepspeed config object
|
|
||||||
# during ``zero.Init()`` which needs to know the dtype, and some other hparams.
|
|
||||||
self._stage = self.get_value("zero_optimization.stage", -1)
|
|
||||||
|
|
||||||
# offload
|
|
||||||
self._offload = False
|
|
||||||
if self.is_zero2() or self.is_zero3():
|
|
||||||
offload_devices_valid = set(["cpu", "nvme"])
|
|
||||||
offload_devices = set(
|
|
||||||
[
|
|
||||||
self.get_value("zero_optimization.offload_optimizer.device"),
|
|
||||||
self.get_value("zero_optimization.offload_param.device"),
|
|
||||||
]
|
|
||||||
)
|
|
||||||
if len(offload_devices & offload_devices_valid) > 0:
|
|
||||||
self._offload = True
|
|
||||||
|
|
||||||
def find_config_node(self, ds_key_long):
|
|
||||||
config = self.config
|
|
||||||
|
|
||||||
# find the config node of interest if it exists
|
|
||||||
nodes = ds_key_long.split(".")
|
|
||||||
ds_key = nodes.pop()
|
|
||||||
for node in nodes:
|
|
||||||
config = config.get(node)
|
|
||||||
if config is None:
|
|
||||||
return None, ds_key
|
|
||||||
|
|
||||||
return config, ds_key
|
|
||||||
|
|
||||||
def get_value(self, ds_key_long, default=None):
|
|
||||||
"""
|
|
||||||
Returns the set value or `default` if no value is set
|
|
||||||
"""
|
|
||||||
config, ds_key = self.find_config_node(ds_key_long)
|
|
||||||
if config is None:
|
|
||||||
return default
|
|
||||||
return config.get(ds_key, default)
|
|
||||||
|
|
||||||
def del_config_sub_tree(self, ds_key_long, must_exist=False):
|
|
||||||
"""
|
|
||||||
Deletes a sub-section of the config file if it's found.
|
|
||||||
|
|
||||||
Unless `must_exist` is `True` the section doesn't have to exist.
|
|
||||||
"""
|
|
||||||
config = self.config
|
|
||||||
|
|
||||||
# find the config node of interest if it exists
|
|
||||||
nodes = ds_key_long.split(".")
|
|
||||||
for node in nodes:
|
|
||||||
parent_config = config
|
|
||||||
config = config.get(node)
|
|
||||||
if config is None:
|
|
||||||
if must_exist:
|
|
||||||
raise ValueError(f"Can't find {ds_key_long} entry in the config: {self.config}")
|
|
||||||
else:
|
|
||||||
return
|
|
||||||
|
|
||||||
# if found remove it
|
|
||||||
if parent_config is not None:
|
|
||||||
parent_config.pop(node)
|
|
||||||
|
|
||||||
def is_true(self, ds_key_long):
|
|
||||||
"""
|
|
||||||
Returns `True`/``False` only if the value is set, always `False` otherwise. So use this method to ask the very
|
|
||||||
specific question of whether the value is set to `True` (and it's not set to `False`` or isn't set).
|
|
||||||
|
|
||||||
"""
|
|
||||||
value = self.get_value(ds_key_long)
|
|
||||||
return False if value is None else bool(value)
|
|
||||||
|
|
||||||
def is_false(self, ds_key_long):
|
|
||||||
"""
|
|
||||||
Returns `True`/``False` only if the value is set, always `False` otherwise. So use this method to ask the very
|
|
||||||
specific question of whether the value is set to `False` (and it's not set to `True`` or isn't set).
|
|
||||||
"""
|
|
||||||
value = self.get_value(ds_key_long)
|
|
||||||
return False if value is None else not bool(value)
|
|
||||||
|
|
||||||
def is_zero2(self):
|
|
||||||
return self._stage == 2
|
|
||||||
|
|
||||||
def is_zero3(self):
|
|
||||||
return self._stage == 3
|
|
||||||
|
|
||||||
def is_offload(self):
|
|
||||||
return self._offload
|
|
||||||
|
|
||||||
|
|
||||||
class HfTrainerDeepSpeedConfig(HfDeepSpeedConfig):
|
class HfTrainerDeepSpeedConfig(HfDeepSpeedConfig):
|
||||||
|
|||||||
@@ -3,7 +3,7 @@
|
|||||||
# 2. run `make deps_table_update``
|
# 2. run `make deps_table_update``
|
||||||
deps = {
|
deps = {
|
||||||
"Pillow": "Pillow",
|
"Pillow": "Pillow",
|
||||||
"accelerate": "accelerate>=0.9.0",
|
"accelerate": "accelerate>=0.10.0",
|
||||||
"black": "black~=22.0,>=22.3",
|
"black": "black~=22.0,>=22.3",
|
||||||
"codecarbon": "codecarbon==1.2.0",
|
"codecarbon": "codecarbon==1.2.0",
|
||||||
"cookiecutter": "cookiecutter==1.7.3",
|
"cookiecutter": "cookiecutter==1.7.3",
|
||||||
|
|||||||
@@ -35,6 +35,7 @@ from .utils import (
|
|||||||
ExplicitEnum,
|
ExplicitEnum,
|
||||||
cached_property,
|
cached_property,
|
||||||
get_full_repo_name,
|
get_full_repo_name,
|
||||||
|
is_accelerate_available,
|
||||||
is_sagemaker_dp_enabled,
|
is_sagemaker_dp_enabled,
|
||||||
is_sagemaker_mp_enabled,
|
is_sagemaker_mp_enabled,
|
||||||
is_torch_available,
|
is_torch_available,
|
||||||
@@ -1163,6 +1164,8 @@ class TrainingArguments:
|
|||||||
if self.deepspeed:
|
if self.deepspeed:
|
||||||
# - must be run very last in arg parsing, since it will use a lot of these settings.
|
# - must be run very last in arg parsing, since it will use a lot of these settings.
|
||||||
# - must be run before the model is created.
|
# - must be run before the model is created.
|
||||||
|
if not is_accelerate_available():
|
||||||
|
raise ValueError("--deepspeed requires Accelerate to be installed: `pip install accelerate`.")
|
||||||
from transformers.deepspeed import HfTrainerDeepSpeedConfig
|
from transformers.deepspeed import HfTrainerDeepSpeedConfig
|
||||||
|
|
||||||
# will be used later by the Trainer
|
# will be used later by the Trainer
|
||||||
|
|||||||
Reference in New Issue
Block a user