diff --git a/docs/source/main_classes/deepspeed.rst b/docs/source/main_classes/deepspeed.rst index 4677d0e1d2..d3cc4b92fa 100644 --- a/docs/source/main_classes/deepspeed.rst +++ b/docs/source/main_classes/deepspeed.rst @@ -10,9 +10,1565 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -HfDeepSpeedConfig + +DeepSpeed Integration ----------------------------------------------------------------------------------------------------------------------- + +`DeepSpeed `__ implements everything described in the `ZeRO paper +`__. Currently it provides full support for: + +1. Optimizer state partitioning (ZeRO stage 1) +2. Gradient partitioning (ZeRO stage 2) +3. Parameter partitioning (ZeRO stage 3) +4. Custom mixed precision training handling +5. A range of fast CUDA-extension-based optimizers +6. ZeRO-Offload to CPU and NVMe + +ZeRO-Offload has its own dedicated paper: `ZeRO-Offload: Democratizing Billion-Scale Model Training +`__. And NVMe-support is described in the paper `ZeRO-Infinity: Breaking the GPU +Memory Wall for Extreme Scale Deep Learning `__. + +DeepSpeed ZeRO-2 is primarily used only for training, as its features are of no use to inference. + +DeepSpeed ZeRO-3 can be used for inference as well, since it allows huge models to be loaded on multiple GPUs, which +won't be possible on a single GPU. + + + +🤗 Transformers integrates `DeepSpeed `__ via 2 options: + +1. Integration of the core DeepSpeed features via :class:`~transformers.Trainer`. This is everything done for you type + of integration - just supply your custom config file or use our template and you have nothing else to do. Most of + this document is focused on this feature. +2. 
If you don't use :class:`~transformers.Trainer` and want to use your own Trainer where you integrated DeepSpeed + yourself, core functionality functions like ``from_pretrained`` and ``from_config`` include integration of essential + parts of DeepSpeed like ``zero.Init`` for ZeRO stage 3 and higher. To tap into this feature read the docs on + :ref:`deepspeed-non-trainer-integration`. + + + + +.. _deepspeed-trainer-integration: + + +Trainer Deepspeed Integration +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + + +.. _deepspeed-installation: + +Installation +======================================================================================================================= + +Install the library via pypi: + +.. code-block:: bash + + pip install deepspeed + +or via ``transformers``' ``extras``: + +.. code-block:: bash + + pip install transformers[deepspeed] + +(will become available starting from ``transformers==4.6.0``) + +or find more details on `the DeepSpeed's GitHub page `__ and +`advanced install `__. + +If you're still struggling with the build, first make sure to read :ref:`zero-install-notes`. + +If you don't prebuild the extensions and rely on them to be built at run time and you tried all of the above solutions +to no avail, the next thing to try is to pre-build the modules before installing them. + +To make a local build for DeepSpeed: + +.. code-block:: bash + + git clone https://github.com/microsoft/DeepSpeed/ + cd DeepSpeed + rm -rf build + TORCH_CUDA_ARCH_LIST="6.1;8.6" DS_BUILD_OPS=1 pip install . \ + --global-option="build_ext" --global-option="-j8" --no-cache -v \ + --disable-pip-version-check 2>&1 | tee build.log + +Edit ``TORCH_CUDA_ARCH_LIST`` to insert the code for the architectures of the GPU cards you intend to use. + +Or if you need to use the same setup on multiple machines, make a binary wheel: + +.. 
code-block:: bash + + git clone https://github.com/microsoft/DeepSpeed/ + cd DeepSpeed + rm -rf build + TORCH_CUDA_ARCH_LIST="6.1;8.6" DS_BUILD_OPS=1 \ + python setup.py build_ext -j8 bdist_wheel + +It will generate something like ``dist/deepspeed-0.3.13+8cd046f-cp38-cp38-linux_x86_64.whl`` which now you can install +as ``pip install deepspeed-0.3.13+8cd046f-cp38-cp38-linux_x86_64.whl`` locally or on any other machine. + +Again, remember to adjust ``TORCH_CUDA_ARCH_LIST`` to the target architectures. + +You can find the complete list of NVIDIA GPUs and their corresponding **Compute Capabilities** (same as arch in this +context) `here `__. + +You can check the archs pytorch was built with using: + +.. code-block:: bash + + python -c "import torch; print(torch.cuda.get_arch_list())" + +Here is how to find out the arch for one of the installed GPUs. For example, for GPU 0: + +.. code-block:: bash + + CUDA_VISIBLE_DEVICES=0 python -c "import torch; \ + print(torch.cuda.get_device_properties(torch.device('cuda')))" + +If the output is: + +.. code-block:: bash + + _CudaDeviceProperties(name='GeForce RTX 3090', major=8, minor=6, total_memory=24268MB, multi_processor_count=82) + +then you know that this card's arch is ``8.6``. + +You can also leave ``TORCH_CUDA_ARCH_LIST`` out completely and then the build program will automatically query the +architecture of the GPUs the build is made on. This may or may not match the GPUs on the target machines, that's why +it's best to specify the desired archs explicitly. + +If after trying everything suggested you still encounter build issues, please, proceed with the GitHub Issue of +`Deepspeed `__. + + + +.. _deepspeed-multi-gpu: + +Deployment with multiple GPUs +======================================================================================================================= + +To deploy this feature with multiple GPUs adjust the :class:`~transformers.Trainer` command line arguments as +following: + +1. 
replace ``python -m torch.distributed.launch`` with ``deepspeed``. +2. add a new argument ``--deepspeed ds_config.json``, where ``ds_config.json`` is the DeepSpeed configuration file as + documented `here `__. The file naming is up to you. + +Therefore, if your original command line looked as following: + +.. code-block:: bash + + python -m torch.distributed.launch --nproc_per_node=2 your_program.py + +Now it should be: + +.. code-block:: bash + + deepspeed --num_gpus=2 your_program.py --deepspeed ds_config.json + +Unlike, ``torch.distributed.launch`` where you have to specify how many GPUs to use with ``--nproc_per_node``, with the +``deepspeed`` launcher you don't have to use the corresponding ``--num_gpus`` if you want all of your GPUs used. The +full details on how to configure various nodes and GPUs can be found `here +`__. + +In fact, you can continue using ``-m torch.distributed.launch`` with DeepSpeed as long as you don't need to use +``deepspeed`` launcher-specific arguments. Typically if you don't need a multi-node setup you're not required to use +the ``deepspeed`` launcher. But since in the DeepSpeed documentation it'll be used everywhere, for consistency we will +use it here as well. + +Here is an example of running ``run_translation.py`` under DeepSpeed deploying all available GPUs: + +.. code-block:: bash + + deepspeed examples/pytorch/translation/run_translation.py \ + --deepspeed tests/deepspeed/ds_config_zero3.json \ + --model_name_or_path t5-small --per_device_train_batch_size 1 \ + --output_dir output_dir --overwrite_output_dir --fp16 \ + --do_train --max_train_samples 500 --num_train_epochs 1 \ + --dataset_name wmt16 --dataset_config "ro-en" \ + --source_lang en --target_lang ro + + +Note that in the DeepSpeed documentation you are likely to see ``--deepspeed --deepspeed_config ds_config.json`` - i.e. 
+two DeepSpeed-related arguments, but for the sake of simplicity, and since there are already so many arguments to deal +with, we combined the two into a single argument. + +For some practical usage examples, please, see this `post +`__. + + + +.. _deepspeed-one-gpu: + +Deployment with one GPU +======================================================================================================================= + +To deploy DeepSpeed with one GPU adjust the :class:`~transformers.Trainer` command line arguments as following: + +.. code-block:: bash + + deepspeed --num_gpus=1 examples/pytorch/translation/run_translation.py \ + --deepspeed tests/deepspeed/ds_config_zero2.json \ + --model_name_or_path t5-small --per_device_train_batch_size 1 \ + --output_dir output_dir --overwrite_output_dir --fp16 \ + --do_train --max_train_samples 500 --num_train_epochs 1 \ + --dataset_name wmt16 --dataset_config "ro-en" \ + --source_lang en --target_lang ro + +This is almost the same as with multiple-GPUs, but here we tell DeepSpeed explicitly to use just one GPU via +``--num_gpus=1``. By default, DeepSpeed deploys all GPUs it can see on the given node. If you have only 1 GPU to start +with, then you don't need this argument. The following `documentation +`__ discusses the launcher options. + +Why would you want to use DeepSpeed with just one GPU? + +1. It has a ZeRO-offload feature which can delegate some computations and memory to the host's CPU and RAM, and thus + leave more GPU resources for model's needs - e.g. larger batch size, or enabling a fitting of a very big model which + normally won't fit. +2. It provides a smart GPU memory management system, that minimizes memory fragmentation, which again allows you to fit + bigger models and data batches. + +While we are going to discuss the configuration in details next, the key to getting a huge improvement on a single GPU +with DeepSpeed is to have at least the following configuration in the configuration file: + +.. 
code-block:: json + + { + "zero_optimization": { + "stage": 2, + "allgather_partitions": true, + "allgather_bucket_size": 2e8, + "reduce_scatter": true, + "reduce_bucket_size": 2e8, + "overlap_comm": true, + "contiguous_gradients": true, + "cpu_offload": true + } + } + +which enables ``cpu_offload`` and some other important features. You may experiment with the buffer sizes, you will +find more details in the discussion below. + +For a practical usage example of this type of deployment, please, see this `post +`__. + +You may also try the ZeRO-3 with CPU and NVMe offload as explained further in this document. + + + +Notes: + +- if you need to run on a specific GPU, which is different from GPU 0, you can't use ``CUDA_VISIBLE_DEVICES`` to limit + the visible scope of available GPUs. Instead, you have to use the following syntax: + + .. code-block:: bash + + deepspeed --include localhost:1 examples/pytorch/translation/run_translation.py ... + + In this example, we tell DeepSpeed to use GPU 1 (second gpu). + + + +.. _deepspeed-notebook: + +Deployment in Notebooks +======================================================================================================================= + +The problem with running notebook cells as a script is that there is no normal ``deepspeed`` launcher to rely on, so +under certain setups we have to emulate it. + +If you're using only 1 GPU, here is how you'd have to adjust your training code in the notebook to use DeepSpeed. + +.. code-block:: python + + # DeepSpeed requires a distributed environment even when only one process is used. 
+ # This emulates a launcher in the notebook + import os + os.environ['MASTER_ADDR'] = 'localhost' + os.environ['MASTER_PORT'] = '9994' # modify if RuntimeError: Address already in use + os.environ['RANK'] = "0" + os.environ['LOCAL_RANK'] = "0" + os.environ['WORLD_SIZE'] = "1" + + # Now proceed as normal, plus pass the deepspeed config file + training_args = TrainingArguments(..., deepspeed="ds_config_zero3.json") + trainer = Trainer(...) + trainer.train() + +Note: ``...`` stands for the normal arguments that you'd pass to the functions. + +If you want to use more than 1 GPU, you must use a multi-process environment for DeepSpeed to work. That is, you have +to use the launcher for that purpose and this cannot be accomplished by emulating the distributed environment presented +at the beginning of this section. + +If you want to create the config file on the fly in the notebook in the current directory, you could have a dedicated +cell with: + +.. code-block:: python + + %%bash + cat <<'EOT' > ds_config_zero3.json + { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + + "optimizer": { + "type": "AdamW", + "params": { + "lr": "auto", + "betas": "auto", + "eps": "auto", + "weight_decay": "auto" + } + }, + + "scheduler": { + "type": "WarmupLR", + "params": { + "warmup_min_lr": "auto", + "warmup_max_lr": "auto", + "warmup_num_steps": "auto" + } + }, + + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "cpu", + "pin_memory": true + }, + "offload_param": { + "device": "cpu", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1e14, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1e9, + "stage3_max_reuse_distance": 1e9, + "stage3_gather_fp16_weights_on_model_save": true + }, + + 
"gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + } + EOT + + +If the training script is in a normal file and not in the notebook cells, you can launch ``deepspeed`` normally via +shell from a cell. For example, to use ``run_translation.py`` you would launch it with: + +.. code-block:: + + !git clone https://github.com/huggingface/transformers + !cd transformers; deepspeed examples/pytorch/translation/run_translation.py ... + +or with ``%%bash`` magic, where you can write a multi-line code for the shell program to run: + +.. code-block:: + + %%bash + + git clone https://github.com/huggingface/transformers + cd transformers + deepspeed examples/pytorch/translation/run_translation.py ... + +In such case you don't need any of the code presented at the beginning of this section. + +Note: While ``%%bash`` magic is neat, but currently it buffers the output so you won't see the logs until the process +completes. + + + + +.. _deepspeed-config: + +Configuration +======================================================================================================================= + +For the complete guide to the DeepSpeed configuration options that can be used in its configuration file please refer +to the `following documentation `__. + +You can find dozens of DeepSpeed configuration examples that address various practical needs in `the DeepSpeedExamples +repo `__: + +.. code-block:: bash + + git clone https://github.com/microsoft/DeepSpeedExamples + cd DeepSpeedExamples + find . -name '*json' + +Continuing the code from above, let's say you're looking to configure the Lamb optimizer. So you can search through the +example ``.json`` files with: + +.. code-block:: bash + + grep -i Lamb $(find . -name '*json') + +Some more examples are to be found in the `main repo `__ as well. 
+ +When using DeepSpeed you always need to supply a DeepSpeed configuration file, yet some configuration parameters have +to be configured via the command line. You will find the nuances in the rest of this guide. + +To get an idea of what a DeepSpeed configuration file looks like, here is one that activates ZeRO stage 2 features, +including optimizer states cpu offload, uses ``AdamW`` optimizer and ``WarmupLR`` scheduler and will enable mixed +precision training if ``--fp16`` is passed: + +.. code-block:: json + + { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + + "optimizer": { + "type": "AdamW", + "params": { + "lr": "auto", + "betas": "auto", + "eps": "auto", + "weight_decay": "auto" + } + }, + + "scheduler": { + "type": "WarmupLR", + "params": { + "warmup_min_lr": "auto", + "warmup_max_lr": "auto", + "warmup_num_steps": "auto" + } + }, + + "zero_optimization": { + "stage": 2, + "allgather_partitions": true, + "allgather_bucket_size": 2e8, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 2e8, + "contiguous_gradients": true, + "cpu_offload": true + }, + + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto" + } + +When you execute the program, DeepSpeed will log the configuration it received from the :class:`~transformers.Trainer` +to the console, so you can see exactly what was the final configuration passed to it. + + + +.. 
_deepspeed-config-passing: + +Passing Configuration +======================================================================================================================= + +As discussed in this document normally the DeepSpeed configuration is passed as a path to a json file, but if you're +not using the command line interface to configure the training, and instead instantiate the +:class:`~transformers.Trainer` via :class:`~transformers.TrainingArguments` then for the ``deepspeed`` argument you can +pass a nested ``dict``. This allows you to create the configuration on the fly and doesn't require you to write it to +the file system before passing it to :class:`~transformers.TrainingArguments`. + +To summarize you can do: + +.. code-block:: python + + TrainingArguments(..., deepspeed="/path/to/ds_config.json") + +or: + +.. code-block:: python + + ds_config_dict=dict(scheduler=scheduler_params, optimizer=optimizer_params) + TrainingArguments(..., deepspeed=ds_config_dict) + + + +.. _deepspeed-config-shared: + +Shared Configuration +======================================================================================================================= + + +.. warning:: + + This section is a must-read + +Some configuration values are required by both the :class:`~transformers.Trainer` and DeepSpeed to function correctly, +therefore, to prevent conflicting definitions, which could lead to hard to detect errors, we chose to configure those +via the :class:`~transformers.Trainer` command line arguments. + +Additionally, some configuration values are derived automatically based on the model's configuration, so instead of +remembering to manually adjust multiple values, it's best to let the :class:`~transformers.Trainer` do the majority +of configuration for you. + +Therefore, in the rest of this guide you will find a special configuration value: ``auto``, which when set will be +automatically replaced with the correct or most efficient value. 
Please feel free to choose to ignore this +recommendation and set the values explicitly, in which case be very careful that your +:class:`~transformers.Trainer` arguments and DeepSpeed configurations agree. For example, are you using the same +learning rate, or batch size, or gradient accumulation settings? If these mismatch the training may fail in very +difficult to detect ways. You have been warned. + +There are multiple other values that are specific to DeepSpeed-only and those you will have to set manually to suit +your needs. + + + +.. _deepspeed-zero: + +ZeRO +======================================================================================================================= + +`Zero Redundancy Optimizer (ZeRO) `__ is the workhorse of DeepSpeed. It +supports 3 different levels (stages) of optimization. The first one is not quite interesting for scalability purposes, +therefore this document focuses on stages 2 and 3. Stage 3 is further improved by the latest addition of ZeRO-Infinity. +You will find more in-depth information in the DeepSpeed documentation. + +The ``zero_optimization`` section of the configuration file is the most important part (`docs +`__), since that is where you define +which ZeRO stages you want to enable and how to configure them. You will find the explanation for each parameter in the +DeepSpeed docs. + +This section has to be configured exclusively via DeepSpeed configuration - the :class:`~transformers.Trainer` provides +no equivalent command line arguments. + +Note: currently DeepSpeed doesn't validate parameter names, so if you misspell any, it'll use the default setting for +the parameter that got misspelled. You can watch the DeepSpeed engine start up log messages to see what values it is +going to use. + + + +.. 
_deepspeed-zero2-config: + +ZeRO-2 Config ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +The following is an example configuration for ZeRO stage 2: + +.. code-block:: json + + { + "zero_optimization": { + "stage": 2, + "allgather_partitions": true, + "allgather_bucket_size": 5e8, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 5e8, + "contiguous_gradients": true, + "cpu_offload": true + } + } + +**Performance tuning:** + +- enabling ``cpu_offload`` should reduce GPU RAM usage (it requires ``"stage": 2``) +- ``"overlap_comm": true`` trades off increased GPU RAM usage to lower all-reduce latency. ``overlap_comm`` uses 4.5x + the ``allgather_bucket_size`` and ``reduce_bucket_size`` values. So if they are set to 5e8, this requires a 9GB + footprint (``5e8 x 2Bytes x 2 x 4.5``). Therefore, if you have a GPU with 8GB or less RAM, to avoid getting + OOM-errors you will need to reduce those parameters to about ``2e8``, which would require 3.6GB. You will want to do + the same on larger capacity GPU as well, if you're starting to hit OOM. +- when reducing these buffers you're trading communication speed to avail more GPU RAM. The smaller the buffer size, + the slower the communication, and the more GPU RAM will be available to other tasks. So if a bigger batch size is + important, getting a slightly slower training time could be a good trade. + + + +.. _deepspeed-zero3-config: + +ZeRO-3 Config ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +The following is an example configuration for ZeRO stage 3: + +.. 
code-block:: json + + { + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "cpu", + "pin_memory": true + }, + "offload_param": { + "device": "cpu", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1e14, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1e9, + "stage3_max_reuse_distance": 1e9, + "stage3_gather_fp16_weights_on_model_save": true + } + } + +If you are getting OOMs, because your model or activations don't fit into the GPU memory and you have unutilized CPU +memory, offloading the optimizer states and parameters to CPU memory with ``"device": "cpu"`` may solve this limitation. +If you don't want to offload to CPU memory, use ``none`` instead of ``cpu`` for the ``device`` entry. Offloading to +NVMe is discussed further down. + +Pinned memory is enabled with ``pin_memory`` set to ``true``. This feature can improve the throughput at the cost of +making less memory available to other processes. Pinned memory is set aside to the specific process that requested it +and it's typically accessed much faster than normal CPU memory. + +**Performance tuning:** + +- ``sub_group_size``: ``1e14`` +- ``stage3_max_live_parameters``: ``1e9`` +- ``stage3_max_reuse_distance``: ``1e9`` + +If hitting OOM reduce ``stage3_max_live_parameters`` and ``stage3_max_reuse_distance``. They should have minimal impact +on performance unless you are doing activation checkpointing. ``1e9`` would consume ~2GB. The memory is shared by +``stage3_max_live_parameters`` and ``stage3_max_reuse_distance``, so it's not additive, it's just 2GB total. + +``stage3_max_live_parameters`` is the upper limit on how many full parameters you want to keep on the GPU at any given +time. 
"reuse distance" is a metric we are using to figure out when will a parameter be used again in the future, and we +use the ``stage3_max_reuse_distance`` to decide whether to throw away the parameter or to keep it. If a parameter is +going to be used again in near future (less than ``stage3_max_reuse_distance``) then we keep it to reduce communication +overhead. This is super helpful when you have activation checkpointing enabled, where we do a forward recompute and +backward passes at a single layer granularity and want to keep the parameter in the forward recompute till the backward +pass. + +The following configuration values depend on the model's hidden size: + +- ``reduce_bucket_size``: ``hidden_size*hidden_size`` +- ``stage3_prefetch_bucket_size``: ``0.9 * hidden_size * hidden_size`` +- ``stage3_param_persistence_threshold``: ``10 * hidden_size`` + +therefore set these values to ``auto`` and the :class:`~transformers.Trainer` will automatically assign the recommended +values. But, of course, feel free to set these explicitly as well. + +``stage3_gather_fp16_weights_on_model_save`` enables model fp16 weights consolidation when model gets saved. With large +models and multiple GPUs this is an expensive operation both in terms of memory and speed. It's currently required if +you plan to resume the training. Watch out for future updates that will remove this limitation and make things more +flexible. + +If you're migrating from ZeRO-2 configuration note that ``allgather_partitions``, ``allgather_bucket_size`` and +``reduce_scatter`` configuration parameters are not used in ZeRO-3. If you keep these in the config file they will just +be ignored. Make sure to remove ``cpu_offload`` though, since it has been deprecated in ZeRO-3. + + + +.. 
_deepspeed-nvme: + +NVMe Support +======================================================================================================================= + +ZeRO-Infinity allows for training incredibly large models by extending GPU and CPU memory with NVMe memory. Thanks to +smart partitioning and tiling algorithms each GPU needs to send and receive very small amounts of data during +offloading so modern NVMe proved to be fit to allow for an even larger total memory pool available to your training +process. ZeRO-Infinity requires ZeRO-3 enabled. + +The following configuration example enables NVMe to offload both optimizer states and the params: + +.. code-block:: json + + { + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "nvme", + "nvme_path": "/local_nvme", + "pin_memory": true, + "buffer_count": 4, + "fast_init": false + }, + "offload_param": { + "device": "nvme", + "nvme_path": "/local_nvme", + "pin_memory": true, + "buffer_count": 5, + "buffer_size": 1e8, + "max_in_cpu": 1e9 + }, + "aio": { + "block_size": 262144, + "queue_depth": 32, + "thread_count": 1, + "single_submit": false, + "overlap_events": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1e14, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1e9, + "stage3_max_reuse_distance": 1e9, + "stage3_gather_fp16_weights_on_model_save": true + } + } + +You can choose to offload both optimizer states and params to NVMe, or just one of them or none. For example, if you +have copious amounts of CPU memory available, by all means offload to CPU memory only as it'd be faster (hint: +``"device": "cpu"``). + +Here is the full documentation for offloading `optimizer states +`__ and `parameters +`__. + +Make sure that your ``nvme_path`` is actually an NVMe, since it will work with the normal hard drive or SSD, but it'll +be much much slower. 
The fast scalable training was designed with modern NVMe transfer speeds in mind (as of this +writing one can have ~3.5GB/s read, ~3GB/s write peak speeds). + +In order to figure out the optimal ``aio`` configuration block you must run a benchmark on your target setup, as +`explained here `__. + + + +.. _deepspeed-zero2-zero3-performance: + +ZeRO-2 vs ZeRO-3 Performance ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +ZeRO-3 is likely to be slower than ZeRO-2 if everything else is configured the same because the former has to gather +model weights in addition to what ZeRO-2 does. If ZeRO-2 meets your needs and you don't need to scale beyond a few GPUs +then you may choose to stick to it. It's important to understand that ZeRO-3 enables a much higher scalability capacity +at a cost of speed. + +It's possible to adjust ZeRO-3 configuration to make it perform closer to ZeRO-2: + +- set ``stage3_param_persistence_threshold`` to a very large number - larger than the largest parameter, e.g., ``6 * + hidden_size * hidden_size``. This will keep the parameters on the GPUs. +- turn off ``cpu_offload_params`` since ZeRO-2 doesn't have that option. + +The performance will likely improve significantly with just ``cpu_offload_params`` turned off, even if you don't change +``stage3_param_persistence_threshold``. Of course, these changes will impact the size of the model you can train. So +these help you to trade scalability for speed depending on your needs. + + + +.. _deepspeed-zero2-example: + +ZeRO-2 Example ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +Here is a full ZeRO-2 auto-configuration file ``ds_config_zero2.json``: + +.. 
code-block:: json + + { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + + "optimizer": { + "type": "AdamW", + "params": { + "lr": "auto", + "betas": "auto", + "eps": "auto", + "weight_decay": "auto" + } + }, + + "scheduler": { + "type": "WarmupLR", + "params": { + "warmup_min_lr": "auto", + "warmup_max_lr": "auto", + "warmup_num_steps": "auto" + } + }, + + "zero_optimization": { + "stage": 2, + "allgather_partitions": true, + "allgather_bucket_size": 2e8, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 2e8, + "contiguous_gradients": true, + "cpu_offload": true + }, + + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + } + + +Here is a full ZeRO-2 all-enabled manually set configuration file. It is here mainly for you to see what the typical +values look like, but we highly recommend using the one with multiple ``auto`` settings in it. + +.. code-block:: json + + { + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + + "optimizer": { + "type": "AdamW", + "params": { + "lr": 3e-5, + "betas": [0.8, 0.999], + "eps": 1e-8, + "weight_decay": 3e-7 + } + }, + + "scheduler": { + "type": "WarmupLR", + "params": { + "warmup_min_lr": 0, + "warmup_max_lr": 3e-5, + "warmup_num_steps": 500 + } + }, + + "zero_optimization": { + "stage": 2, + "allgather_partitions": true, + "allgather_bucket_size": 2e8, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 2e8, + "contiguous_gradients": true, + "cpu_offload": true + }, + + "steps_per_print": 2000, + "wall_clock_breakdown": false + } + + + +.. 
_deepspeed-zero3-example: + +ZeRO-3 Example ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +Here is a full ZeRO-3 auto-configuration file ``ds_config_zero3.json``: + + +.. code-block:: json + + { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + + "optimizer": { + "type": "AdamW", + "params": { + "lr": "auto", + "betas": "auto", + "eps": "auto", + "weight_decay": "auto" + } + }, + + "scheduler": { + "type": "WarmupLR", + "params": { + "warmup_min_lr": "auto", + "warmup_max_lr": "auto", + "warmup_num_steps": "auto" + } + }, + + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "cpu", + "pin_memory": true + }, + "offload_param": { + "device": "cpu", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1e14, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1e9, + "stage3_max_reuse_distance": 1e9, + "stage3_gather_fp16_weights_on_model_save": true + }, + + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + } + +Here is a full ZeRO-3 all-enabled manually set configuration file. It is here mainly for you to see what the typical +values look like, but we highly recommend using the one with multiple ``auto`` settings in it. + +.. 
code-block:: json + + { + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + + "optimizer": { + "type": "AdamW", + "params": { + "lr": 3e-5, + "betas": [0.8, 0.999], + "eps": 1e-8, + "weight_decay": 3e-7 + } + }, + + "scheduler": { + "type": "WarmupLR", + "params": { + "warmup_min_lr": 0, + "warmup_max_lr": 3e-5, + "warmup_num_steps": 500 + } + }, + + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "cpu", + "pin_memory": true + }, + "offload_param": { + "device": "cpu", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1e14, + "reduce_bucket_size": 1e6, + "stage3_prefetch_bucket_size": 0.94e6, + "stage3_param_persistence_threshold": 1e4, + "stage3_max_live_parameters": 1e9, + "stage3_max_reuse_distance": 1e9, + "stage3_gather_fp16_weights_on_model_save": true + }, + + "steps_per_print": 2000, + "wall_clock_breakdown": false + } + + +Optimizer and Scheduler +======================================================================================================================= + +As long as you don't enable ``cpu_offload`` you can mix and match DeepSpeed and HuggingFace schedulers and optimizers, +with the exception of using the combination of HuggingFace scheduler and DeepSpeed optimizer: + ++--------------+--------------+--------------+ +| Combos | HF Scheduler | DS Scheduler | ++--------------+--------------+--------------+ +| HF Optimizer | Yes | Yes | ++--------------+--------------+--------------+ +| DS Optimizer | No | Yes | ++--------------+--------------+--------------+ + +If ``cpu_offload`` is enabled you must use both DeepSpeed scheduler and DeepSpeed optimizer. + + + + +.. 
_deepspeed-optimizer: + +Optimizer ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + + +DeepSpeed's main optimizers are Adam, AdamW, OneBitAdam, and Lamb. These have been thoroughly tested with ZeRO and are +thus recommended to be used. It, however, can import other optimizers from ``torch``. The full documentation is `here +`__. + +If you don't configure the ``optimizer`` entry in the configuration file, the :class:`~transformers.Trainer` will +automatically set it to ``AdamW`` and will use the supplied values or the defaults for the following command line +arguments: ``--learning_rate``, ``--adam_beta1``, ``--adam_beta2``, ``--adam_epsilon`` and ``--weight_decay``. + +Here is an example of the auto-configured ``optimizer`` entry for ``AdamW``: + +.. code-block:: json + + { + "optimizer": { + "type": "AdamW", + "params": { + "lr": "auto", + "betas": "auto", + "eps": "auto", + "weight_decay": "auto" + } + } + } + + +Note that the command line arguments will set the values in the configuration file. This is so that there is one +definitive source of the values and to avoid hard to find errors when for example, the learning rate is set to +different values in different places. Command line rules. The values that get overridden are: + +- ``lr`` with the value of ``--learning_rate`` +- ``betas`` with the value of ``--adam_beta1 --adam_beta2`` +- ``eps`` with the value of ``--adam_epsilon`` +- ``weight_decay`` with the value of ``--weight_decay`` + +Therefore please remember to tune the shared hyperparameters on the command line. + +You can also set the values explicitly: + +.. code-block:: json + + { + "optimizer": { + "type": "AdamW", + "params": { + "lr": 0.001, + "betas": [0.8, 0.999], + "eps": 1e-8, + "weight_decay": 3e-7 + } + } + } + +But then you're on your own synchronizing the :class:`~transformers.Trainer` command line arguments and the DeepSpeed +configuration. 
+ +If you want to use another optimizer which is not listed above, you will have to add to the top level configuration. + +.. code-block:: json + + { + "zero_allow_untested_optimizer": true + } + +Similarly to ``AdamW``, you can configure other officially supported optimizers. Just remember that they may have +different config values. e.g. for Adam you will want ``weight_decay`` around ``0.01``. + + + +.. _deepspeed-scheduler: + +Scheduler ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +DeepSpeed supports ``LRRangeTest``, ``OneCycle``, ``WarmupLR`` and ``WarmupDecayLR`` learning rate schedulers. The full +documentation is `here `__. + +Here is where the schedulers overlap between 🤗 Transformers and DeepSpeed: + +* ``WarmupLR`` via ``--lr_scheduler_type constant_with_warmup`` +* ``WarmupDecayLR`` via ``--lr_scheduler_type linear``. This is also the default value for ``--lr_scheduler_type``, + therefore, if you don't configure the scheduler this is the scheduler that will get configured by default. + +If you don't configure the ``scheduler`` entry in the configuration file, the :class:`~transformers.Trainer` will use +the values of ``--lr_scheduler_type``, ``--learning_rate`` and ``--warmup_steps`` to configure a 🤗 Transformers version +of it. + +Here is an example of the auto-configured ``scheduler`` entry for ``WarmupLR``: + +.. code-block:: json + + { + "scheduler": { + "type": "WarmupLR", + "params": { + "warmup_min_lr": "auto", + "warmup_max_lr": "auto", + "warmup_num_steps": "auto" + } + } + } + +Since ``"auto"`` is used the :class:`~transformers.Trainer` arguments will set the correct values in the configuration +file. This is so that there is one definitive source of the values and to avoid hard to find errors when, for example, +the learning rate is set to different values in different places. Command line rules. 
The values that get set are: + +- ``warmup_min_lr`` with the value of ``0`` +- ``warmup_max_lr`` with the value of ``--learning_rate`` +- ``warmup_num_steps`` with the value of ``--warmup_steps`` +- ``total_num_steps`` with either the value of ``--max_steps`` or if it is not provided, derived automatically at run + time based on the environment and the size of the dataset and other command line arguments (needed for + ``WarmupDecayLR``). + +You can, of course, take over any or all of the configuration values and set those yourself: + +.. code-block:: json + + { + "scheduler": { + "type": "WarmupLR", + "params": { + "warmup_min_lr": 0, + "warmup_max_lr": 0.001, + "warmup_num_steps": 1000 + } + } + } + +But then you're on your own synchronizing the :class:`~transformers.Trainer` command line arguments and the DeepSpeed +configuration. + +For example, for ``WarmupDecayLR``, you can use the following entry: + +.. code-block:: json + + { + "scheduler": { + "type": "WarmupDecayLR", + "params": { + "last_batch_iteration": -1, + "total_num_steps": "auto", + "warmup_min_lr": "auto", + "warmup_max_lr": "auto", + "warmup_num_steps": "auto" + } + } + } + +and ``warmup_min_lr``, ``warmup_max_lr``, ``warmup_num_steps`` and ``total_num_steps`` will be set at loading time. + + + + +.. _deepspeed-fp32: + +fp32 Precision +======================================================================================================================= + +DeepSpeed supports the full fp32 and the fp16 mixed precision. + +Because of the much reduced memory needs and faster speed one gets with the fp16 mixed precision, the only time you +will want to not use it is when the model you're using doesn't behave well under this training mode. Typically this +happens when the model wasn't pretrained in the fp16 mixed precision (e.g. often this happens with bf16-pretrained +models). Such models may overflow or underflow leading to ``NaN`` loss. 
If this is your case then you will want to use +the full fp32 mode, by explicitly disabling the otherwise default fp16 mixed precision mode with: + +.. code-block:: json + + { + "fp16": { + "enabled": false + } + } + +If you're using the Ampere-architecture based GPU, pytorch version 1.7 and higher will automatically switch to using +the much more efficient tf32 format for some operations, but the results will still be in fp32. For details and +benchmarks, please, see `TensorFloat-32(TF32) on Ampere devices +`__. The document includes +instructions on how to disable this automatic conversion if for some reason you prefer not to use it. + + + + +.. _deepspeed-amp: + +Automatic Mixed Precision +======================================================================================================================= + +You can use automatic mixed precision with either a pytorch-like AMP way or the apex-like way: + +To configure pytorch AMP-like mode set: + +.. code-block:: json + + { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + } + } + +and the :class:`~transformers.Trainer` will automatically enable or disable it based on the value of +``args.fp16_backend``. The rest of the config values are up to you. + +This mode gets enabled when ``--fp16 --fp16_backend amp`` command line args are passed. + +You can also enable/disable this mode explicitly: + +.. code-block:: json + + { + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + } + } + +But then you're on your own synchronizing the :class:`~transformers.Trainer` command line arguments and the DeepSpeed +configuration. + +Here is the `documentation `__. + +To configure apex AMP-like mode set: + +.. 
code-block:: json + + "amp": { + "enabled": "auto", + "opt_level": "auto" + } + +and the :class:`~transformers.Trainer` will automatically configure it based on the values of ``args.fp16_backend`` and +``args.fp16_opt_level``. + +This mode gets enabled when ``--fp16 --fp16_backend apex --fp16_opt_level O1`` command line args are passed. + +You can also configure this mode explicitly: + +.. code-block:: json + + { + "amp": { + "enabled": true, + "opt_level": "O1" + } + } + +But then you're on your own synchronizing the :class:`~transformers.Trainer` command line arguments and the DeepSpeed +configuration. + +Here is the `documentation +`__. + + + +.. _deepspeed-bs: + +Batch Size +======================================================================================================================= + +To configure batch size, use: + +.. code-block:: json + + { + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto" + } + +and the :class:`~transformers.Trainer` will automatically set ``train_micro_batch_size_per_gpu`` to the value of +``args.per_device_train_batch_size`` and ``train_batch_size`` to ``args.world_size * args.per_device_train_batch_size * +args.gradient_accumulation_steps``. + +You can also set the values explicitly: + +.. code-block:: json + + { + "train_batch_size": 12, + "train_micro_batch_size_per_gpu": 4 + } + +But then you're on your own synchronizing the :class:`~transformers.Trainer` command line arguments and the DeepSpeed +configuration. + + + +.. _deepspeed-grad-acc: + +Gradient Accumulation +======================================================================================================================= + +To configure gradient accumulation set: + +.. code-block:: json + + { + "gradient_accumulation_steps": "auto" + } + +and the :class:`~transformers.Trainer` will automatically set it to the value of ``args.gradient_accumulation_steps``. + +You can also set the value explicitly: + +.. 
code-block:: json + + { + "gradient_accumulation_steps": 3 + } + +But then you're on your own synchronizing the :class:`~transformers.Trainer` command line arguments and the DeepSpeed +configuration. + + + +.. _deepspeed-grad-clip: + +Gradient Clipping +======================================================================================================================= + +To configure gradient clipping set: + +.. code-block:: json + + { + "gradient_clipping": "auto" + } + +and the :class:`~transformers.Trainer` will automatically set it to the value of ``args.max_grad_norm``. + +You can also set the value explicitly: + +.. code-block:: json + + { + "gradient_clipping": 1.0 + } + +But then you're on your own synchronizing the :class:`~transformers.Trainer` command line arguments and the DeepSpeed +configuration. + + + +.. _deepspeed-weight-extraction: + +Getting The Model Weights Out +======================================================================================================================= + +As long as you continue training and resuming using DeepSpeed you don't need to worry about anything. DeepSpeed stores +fp32 master weights in its custom checkpoint optimizer files, which are ``global_step*/*optim_states.pt`` (this is a +glob pattern), and are saved under the normal checkpoint. + +**FP16 Weights:** + +When a model is saved under ZeRO-2, you end up having the normal ``pytorch_model.bin`` file with the model weights, but +they are only the fp16 version of the weights. + +Under ZeRO-3, things are much more complicated, since the model weights are partitioned out over multiple GPUs, +therefore ``"stage3_gather_fp16_weights_on_model_save": true`` is required to get the ``Trainer`` to save the fp16 +version of the weights. If this setting is ``False`` ``pytorch_model.bin`` won't be created. This is because by default +DeepSpeed's ``state_dict`` contains a placeholder and not the real weights. 
If we were to save this ``state_dict`` it +won't be possible to load it back. + + +.. code-block:: json + + { + "zero_optimization": { + "stage3_gather_fp16_weights_on_model_save": true + } + } + + +**FP32 Weights:** + +While the fp16 weights are fine for resuming training, if you finished finetuning your model and want to upload it to +the `models hub `__ or pass it to someone else you most likely will want to get the fp32 +weights. This cannot be done during training since this is a process that requires a lot of memory, and therefore this +is performed offline. + +DeepSpeed creates a special conversion script ``zero_to_fp32.py`` which it places in the top-level of the checkpoint +folder. Using this script you can extract the weights at any point. The script is standalone and you no longer need to +have the configuration file or a ``Trainer`` to do the extraction. + +Let's say your checkpoint folder looks like this: + +.. code-block:: bash + + $ ls -l output_dir/checkpoint-1/ + -rw-rw-r-- 1 stas stas 1.4K Mar 27 20:42 config.json + drwxrwxr-x 2 stas stas 4.0K Mar 25 19:52 global_step1/ + -rw-rw-r-- 1 stas stas 12 Mar 27 13:16 latest + -rw-rw-r-- 1 stas stas 827K Mar 27 20:42 optimizer.pt + -rw-rw-r-- 1 stas stas 231M Mar 27 20:42 pytorch_model.bin + -rw-rw-r-- 1 stas stas 623 Mar 27 20:42 scheduler.pt + -rw-rw-r-- 1 stas stas 1.8K Mar 27 20:42 special_tokens_map.json + -rw-rw-r-- 1 stas stas 774K Mar 27 20:42 spiece.model + -rw-rw-r-- 1 stas stas 1.9K Mar 27 20:42 tokenizer_config.json + -rw-rw-r-- 1 stas stas 339 Mar 27 20:42 trainer_state.json + -rw-rw-r-- 1 stas stas 2.3K Mar 27 20:42 training_args.bin + -rwxrw-r-- 1 stas stas 5.5K Mar 27 13:16 zero_to_fp32.py* + +In this example there is just one DeepSpeed checkpoint sub-folder `global_step1`. Therefore to reconstruct the fp32 +weights just run: + +.. code-block:: bash + + python zero_to_fp32.py global_step1 pytorch_model.bin + +The script will automatically handle either ZeRO-2 or ZeRO-3 checkpoint. 
+ +``python zero_to_fp32.py -h`` will give you usage details. + +If you have multiple DeepSpeed checkpoint sub-folders, pick the one you know to have the desired weights. + +This is it. ``pytorch_model.bin`` will now contain the full fp32 model weights consolidated from multiple GPUs. + +Note: currently the script requires 2x general RAM of the final fp32 model weights. + + +ZeRO-3 and Infinity Nuances +======================================================================================================================= + +ZeRO-3 is quite different from ZeRO-2 because of its param sharding feature. + +ZeRO-Infinity further extends ZeRO-3 to support NVMe memory and multiple other speed and scalability improvements. + +While all the efforts were made for things to just work without needing any special changes to your models, in certain +circumstances you may find the following information to be needed. + + + +Constructing Massive Models ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +DeepSpeed/ZeRO-3 can handle models with Trillions of parameters which may not fit onto the existing RAM. In such cases, +but also if you want the initialization to happen much faster, initialize the model using ``deepspeed.zero.Init()`` +context manager (which is also a function decorator), like so: + +.. code-block:: python + + from transformers import T5ForConditionalGeneration, T5Config + import deepspeed + with deepspeed.zero.Init(): + config = T5Config.from_pretrained("t5-small") + model = T5ForConditionalGeneration(config) + +As you can see this gives you a randomly initialized model. + +If you want to use a pretrained model, ``model_class.from_pretrained`` will activate this feature as long as +``is_deepspeed_zero3_enabled()`` returns ``True``, which currently is set up by the +:class:`~transformers.TrainingArguments` object if the passed DeepSpeed configuration file contains ZeRO-3 config +section. 
Thus you must create the :class:`~transformers.TrainingArguments` object **before** calling +``from_pretrained``. Here is an example of a possible sequence: + +.. code-block:: python + + from transformers import AutoModel, Trainer, TrainingArguments + training_args = TrainingArguments(..., deepspeed=ds_config) + model = AutoModel.from_pretrained("t5-small") + trainer = Trainer(model=model, args=training_args, ...) + +If you're using the official example scripts and your command line arguments include ``--deepspeed ds_config.json`` +with ZeRO-3 config enabled, then everything is already done for you, since this is how example scripts are written. + +Note: If the fp16 weights of the model can't fit onto the memory of a single GPU this feature must be used. + +For full details on this method and other related features please refer to `Constructing Massive Models +`__. + + + +Gathering Parameters ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +Under ZeRO-3 on multiple GPUs no single GPU has all the parameters unless it's the parameters for the currently +executing layer. So if you need to access all parameters from all layers at once there is a specific method to do it. +Most likely you won't need it, but if you do please refer to `Gathering Parameters +`__ + +We do however use it internally in several places, one such example is when loading pretrained model weights in +``from_pretrained``. We load one layer at a time and immediately partition it to all participating GPUs, as for very +large models it won't be possible to load it on one GPU and then spread it out to multiple GPUs, due to memory +limitations. + +Also under ZeRO-3, if you write your own code and run into a model parameter weight that looks like: + +.. 
code-block:: python + + tensor([1.], device='cuda:0', dtype=torch.float16, requires_grad=True) + +stress on ``tensor([1.])``, or if you get an error where it says the parameter is of size ``1``, instead of some much +larger multi-dimensional shape, this means that the parameter is partitioned and what you see is a ZeRO-3 placeholder. + + +Troubleshooting +======================================================================================================================= + +* ``deepspeed`` process gets killed at startup without a traceback + +If the ``deepspeed`` process gets killed at launch time without a traceback, that usually means that the program tried +to allocate more CPU memory than your system has or your process is allowed to allocate and the OS kernel killed that +process. This is because your configuration file most likely has either ``offload_optimizer`` or ``offload_param`` or +both configured to offload to ``cpu`` (or under ZeRO-2 ``cpu_offload`` is enabled). If you have NVMe, experiment with +offloading to NVMe if you're running under ZeRO-3. + +Work is being done to enable estimating how much memory is needed for a specific model: `PR +`__. + + + + + + +Notes +======================================================================================================================= + +* DeepSpeed works with the PyTorch :class:`~transformers.Trainer` but not TF :class:`~transformers.TFTrainer`. +* While DeepSpeed has a pip installable PyPI package, it is highly recommended that it gets installed from `source + `__ to best match your hardware and also if you need to enable + certain features, like 1-bit Adam, which aren't available in the pypi distribution. +* You don't have to use the :class:`~transformers.Trainer` to use DeepSpeed with 🤗 Transformers - you can use any model + with your own trainer, and you will have to adapt the latter according to `the DeepSpeed integration instructions + `__. + + + + +.. 
_deepspeed-non-trainer-integration: + +Non-Trainer Deepspeed Integration +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + The :class:`~transformers.integrations.HfDeepSpeedConfig` is used to integrate Deepspeed into the 🤗 Transformer core functionality, when :class:`~transformers.Trainer` is not used. @@ -25,7 +1581,7 @@ For example for a pretrained model: .. code-block:: python - from transformers.integrations import HfDeepSpeedConfig + from transformers.deepspeed import HfDeepSpeedConfig from transformers import AugoModel ds_config = { ... } # deepspeed config object or path to the file @@ -38,7 +1594,7 @@ or for non-pretrained model: .. code-block:: python - from transformers.integrations import HfDeepSpeedConfig + from transformers.deepspeed import HfDeepSpeedConfig from transformers import AugoModel, AutoConfig ds_config = { ... } # deepspeed config object or path to the file @@ -50,7 +1606,27 @@ or for non-pretrained model: HfDeepSpeedConfig -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. autoclass:: transformers.integrations.HfDeepSpeedConfig +.. 
autoclass:: transformers.deepspeed.HfDeepSpeedConfig :members: + + + +Main DeepSpeed Resources +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +- `Project's github `__ +- `Usage docs `__ +- `API docs `__ +- `Blog posts `__ + +Papers: + +- `ZeRO: Memory Optimizations Toward Training Trillion Parameter Models `__ +- `ZeRO-Offload: Democratizing Billion-Scale Model Training `__ +- `ZeRO-Infinity: Breaking the GPU Memory Wall for Extreme Scale Deep Learning `__ + +Finally, please, remember that, HuggingFace :class:`~transformers.Trainer` only integrates DeepSpeed, therefore if you +have any problems or questions with regards to DeepSpeed usage, please, file an issue with `DeepSpeed GitHub +`__. diff --git a/docs/source/main_classes/trainer.rst b/docs/source/main_classes/trainer.rst index 674f2ce617..d702605f2e 100644 --- a/docs/source/main_classes/trainer.rst +++ b/docs/source/main_classes/trainer.rst @@ -150,7 +150,7 @@ This provided support is new and experimental as of this writing. .. _zero-install-notes: -Installation Notes +CUDA Extension Installation Notes ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ As of this writing, both FairScale and Deepspeed require compilation of CUDA C++ code, before they can be used. @@ -411,1496 +411,131 @@ Known caveats: DeepSpeed ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -`DeepSpeed `__ implements everything described in the `ZeRO paper -`__. Currently it provides full support for: - -1. Optimizer state partitioning (ZeRO stage 1) -2. Gradient partitioning (ZeRO stage 2) -3. Parameter partitioning (ZeRO stage 3) -4. Custom mixed precision training handling -5. A range of fast CUDA-extension-based optimizers -6. 
ZeRO-Offload to CPU and NVMe - -ZeRO-Offload has its own dedicated paper: `ZeRO-Offload: Democratizing Billion-Scale Model Training -`__. And NVMe-support is described in the paper `ZeRO-Infinity: Breaking the GPU -Memory Wall for Extreme Scale Deep Learning `__. - -DeepSpeed ZeRO-2 is primarily used only for training, as its features are of no use to inference. - -DeepSpeed ZeRO-3 can be used for inference as well, since it allows huge models to be loaded on multiple GPUs, which -won't be possible on a single GPU. +Moved to :ref:`deepspeed-trainer-integration`. Installation ======================================================================================================================= -Install the library via pypi: - -.. code-block:: bash - - pip install deepspeed - -or via ``transformers``' ``extras``: - -.. code-block:: bash - - pip install transformers[deepspeed] - -(will become available starting from ``transformers==4.6.0``) - -or find more details on `the DeepSpeed's GitHub page `__ and -`advanced install `__. - -If you're still struggling with the build, first make sure to read :ref:`zero-install-notes`. - -If you don't prebuild the extensions and rely on them to be built at run time and you tried all of the above solutions -to no avail, the next thing to try is to pre-build the modules before installing them. - -To make a local build for DeepSpeed: - -.. code-block:: bash - - git clone https://github.com/microsoft/DeepSpeed/ - cd DeepSpeed - rm -rf build - TORCH_CUDA_ARCH_LIST="6.1;8.6" DS_BUILD_OPS=1 pip install . \ - --global-option="build_ext" --global-option="-j8" --no-cache -v \ - --disable-pip-version-check 2>&1 | tee build.log - -Edit ``TORCH_CUDA_ARCH_LIST`` to insert the code for the architectures of the GPU cards you intend to use. - -Or if you need to use the same setup on multiple machines, make a binary wheel: - -.. 
code-block:: bash - - git clone https://github.com/microsoft/DeepSpeed/ - cd DeepSpeed - rm -rf build - TORCH_CUDA_ARCH_LIST="6.1;8.6" DS_BUILD_OPS=1 \ - python setup.py build_ext -j8 bdist_wheel - -it will generate something like ``dist/deepspeed-0.3.13+8cd046f-cp38-cp38-linux_x86_64.whl`` which now you can install -as ``pip install deepspeed-0.3.13+8cd046f-cp38-cp38-linux_x86_64.whl`` locally or on any other machine. - -Again, remember to ensure to adjust ``TORCH_CUDA_ARCH_LIST`` to the target architectures. - -You can find the complete list of NVIDIA GPUs and their corresponding **Compute Capabilities** (same as arch in this -context) `here `__. - -You can check the archs pytorch was built with using: - -.. code-block:: bash - - python -c "import torch; print(torch.cuda.get_arch_list())" - -Here is how to find out the arch for one of the installed GPU. For example, for GPU 0: - -.. code-block:: bash - - CUDA_VISIBLE_DEVICES=0 python -c "import torch; \ - print(torch.cuda.get_device_properties(torch.device('cuda')))" - -If the output is: - -.. code-block:: bash - - _CudaDeviceProperties(name='GeForce RTX 3090', major=8, minor=6, total_memory=24268MB, multi_processor_count=82) - -then you know that this card's arch is ``8.6``. - -You can also leave ``TORCH_CUDA_ARCH_LIST`` out completely and then the build program will automatically query the -architecture of the GPUs the build is made on. This may or may not match the GPUs on the target machines, that's why -it's best to specify the desired archs explicitly. - -If after trying everything suggested you still encounter build issues, please, proceed with the GitHub Issue of -`Deepspeed `__, - +Moved to :ref:`deepspeed-installation`. Deployment with multiple GPUs ======================================================================================================================= -To deploy this feature with multiple GPUs adjust the :class:`~transformers.Trainer` command line arguments as -following: - -1. 
replace ``python -m torch.distributed.launch`` with ``deepspeed``. -2. add a new argument ``--deepspeed ds_config.json``, where ``ds_config.json`` is the DeepSpeed configuration file as - documented `here `__. The file naming is up to you. - -Therefore, if your original command line looked as following: - -.. code-block:: bash - - python -m torch.distributed.launch --nproc_per_node=2 your_program.py - -Now it should be: - -.. code-block:: bash - - deepspeed --num_gpus=2 your_program.py --deepspeed ds_config.json - -Unlike, ``torch.distributed.launch`` where you have to specify how many GPUs to use with ``--nproc_per_node``, with the -``deepspeed`` launcher you don't have to use the corresponding ``--num_gpus`` if you want all of your GPUs used. The -full details on how to configure various nodes and GPUs can be found `here -`__. - -In fact, you can continue using ``-m torch.distributed.launch`` with DeepSpeed as long as you don't need to use -``deepspeed`` launcher-specific arguments. Typically if you don't need a multi-node setup you're not required to use -the ``deepspeed`` launcher. But since in the DeepSpeed documentation it'll be used everywhere, for consistency we will -use it here as well. - -Here is an example of running ``run_translation.py`` under DeepSpeed deploying all available GPUs: - -.. code-block:: bash - - deepspeed examples/pytorch/translation/run_translation.py \ - --deepspeed tests/deepspeed/ds_config_zero3.json \ - --model_name_or_path t5-small --per_device_train_batch_size 1 \ - --output_dir output_dir --overwrite_output_dir --fp16 \ - --do_train --max_train_samples 500 --num_train_epochs 1 \ - --dataset_name wmt16 --dataset_config "ro-en" \ - --source_lang en --target_lang ro - - -Note that in the DeepSpeed documentation you are likely to see ``--deepspeed --deepspeed_config ds_config.json`` - i.e. 
-two DeepSpeed-related arguments, but for the sake of simplicity, and since there are already so many arguments to deal -with, we combined the two into a single argument. - -For some practical usage examples, please, see this `post -`__. - +Moved to :ref:`deepspeed-multi-gpu`. Deployment with one GPU ======================================================================================================================= -To deploy DeepSpeed with one GPU adjust the :class:`~transformers.Trainer` command line arguments as following: - -.. code-block:: bash - - deepspeed --num_gpus=1 examples/pytorch/translation/run_translation.py \ - --deepspeed tests/deepspeed/ds_config_zero2.json \ - --model_name_or_path t5-small --per_device_train_batch_size 1 \ - --output_dir output_dir --overwrite_output_dir --fp16 \ - --do_train --max_train_samples 500 --num_train_epochs 1 \ - --dataset_name wmt16 --dataset_config "ro-en" \ - --source_lang en --target_lang ro - -This is almost the same as with multiple-GPUs, but here we tell DeepSpeed explicitly to use just one GPU via -``--num_gpus=1``. By default, DeepSpeed deploys all GPUs it can see on the given node. If you have only 1 GPU to start -with, then you don't need this argument. The following `documentation -`__ discusses the launcher options. - -Why would you want to use DeepSpeed with just one GPU? - -1. It has a ZeRO-offload feature which can delegate some computations and memory to the host's CPU and RAM, and thus - leave more GPU resources for model's needs - e.g. larger batch size, or enabling a fitting of a very big model which - normally won't fit. -2. It provides a smart GPU memory management system, that minimizes memory fragmentation, which again allows you to fit - bigger models and data batches. - -While we are going to discuss the configuration in details next, the key to getting a huge improvement on a single GPU -with DeepSpeed is to have at least the following configuration in the configuration file: - -.. 
code-block:: json - - { - "zero_optimization": { - "stage": 2, - "allgather_partitions": true, - "allgather_bucket_size": 2e8, - "reduce_scatter": true, - "reduce_bucket_size": 2e8, - "overlap_comm": true, - "contiguous_gradients": true, - "cpu_offload": true - } - } - -which enables ``cpu_offload`` and some other important features. You may experiment with the buffer sizes, you will -find more details in the discussion below. - -For a practical usage example of this type of deployment, please, see this `post -`__. - -You may also try the ZeRO-3 with CPU and NVMe offload as explained further in this document. - - - -Notes: - -- if you need to run on a specific GPU, which is different from GPU 0, you can't use ``CUDA_VISIBLE_DEVICES`` to limit - the visible scope of available GPUs. Instead, you have to use the following syntax: - - .. code-block:: bash - - deepspeed --include localhost:1 examples/pytorch/translation/run_translation.py ... - - In this example, we tell DeepSpeed to use GPU 1 (second gpu). - +Moved to :ref:`deepspeed-one-gpu`. Deployment in Notebooks ======================================================================================================================= -The problem with running notebook cells as a script is that there is no normal ``deepspeed`` launcher to rely on, so -under certain setups we have to emulate it. - -If you're using only 1 GPU, here is how you'd have to adjust your training code in the notebook to use DeepSpeed. - -.. code-block:: python - - # DeepSpeed requires a distributed environment even when only one process is used. 
- # This emulates a launcher in the notebook - import os - os.environ['MASTER_ADDR'] = 'localhost' - os.environ['MASTER_PORT'] = '9994' # modify if RuntimeError: Address already in use - os.environ['RANK'] = "0" - os.environ['LOCAL_RANK'] = "0" - os.environ['WORLD_SIZE'] = "1" - - # Now proceed as normal, plus pass the deepspeed config file - training_args = TrainingArguments(..., deepspeed="ds_config_zero3.json") - trainer = Trainer(...) - trainer.train() - -Note: ``...`` stands for the normal arguments that you'd pass to the functions. - -If you want to use more than 1 GPU, you must use a multi-process environment for DeepSpeed to work. That is, you have -to use the launcher for that purpose and this cannot be accomplished by emulating the distributed environment presented -at the beginning of this section. - -If you want to create the config file on the fly in the notebook in the current directory, you could have a dedicated -cell with: - -.. code-block:: python - - %%bash - cat <<'EOT' > ds_config_zero3.json - { - "fp16": { - "enabled": "auto", - "loss_scale": 0, - "loss_scale_window": 1000, - "initial_scale_power": 16, - "hysteresis": 2, - "min_loss_scale": 1 - }, - - "optimizer": { - "type": "AdamW", - "params": { - "lr": "auto", - "betas": "auto", - "eps": "auto", - "weight_decay": "auto" - } - }, - - "scheduler": { - "type": "WarmupLR", - "params": { - "warmup_min_lr": "auto", - "warmup_max_lr": "auto", - "warmup_num_steps": "auto" - } - }, - - "zero_optimization": { - "stage": 3, - "offload_optimizer": { - "device": "cpu", - "pin_memory": true - }, - "offload_param": { - "device": "cpu", - "pin_memory": true - }, - "overlap_comm": true, - "contiguous_gradients": true, - "sub_group_size": 1e14, - "reduce_bucket_size": "auto", - "stage3_prefetch_bucket_size": "auto", - "stage3_param_persistence_threshold": "auto", - "stage3_max_live_parameters": 1e9, - "stage3_max_reuse_distance": 1e9, - "stage3_gather_fp16_weights_on_model_save": true - }, - - 
"gradient_accumulation_steps": "auto", - "gradient_clipping": "auto", - "steps_per_print": 2000, - "train_batch_size": "auto", - "train_micro_batch_size_per_gpu": "auto", - "wall_clock_breakdown": false - } - EOT - - -If the training script is in a normal file and not in the notebook cells, you can launch ``deepspeed`` normally via -shell from a cell. For example, to use ``run_translation.py`` you would launch it with: - -.. code-block:: - - !git clone https://github.com/huggingface/transformers - !cd transformers; deepspeed examples/pytorch/translation/run_translation.py ... - -or with ``%%bash`` magic, where you can write a multi-line code for the shell program to run: - -.. code-block:: - - %%bash - - git clone https://github.com/huggingface/transformers - cd transformers - deepspeed examples/pytorch/translation/run_translation.py ... - -In such case you don't need any of the code presented at the beginning of this section. - -Note: While ``%%bash`` magic is neat, but currently it buffers the output so you won't see the logs until the process -completes. - - - +Moved to :ref:`deepspeed-notebook`. Configuration ======================================================================================================================= -For the complete guide to the DeepSpeed configuration options that can be used in its configuration file please refer -to the `following documentation `__. - -You can find dozens of DeepSpeed configuration examples that address various practical needs in `the DeepSpeedExamples -repo `__: - -.. code-block:: bash - - git clone https://github.com/microsoft/DeepSpeedExamples - cd DeepSpeedExamples - find . -name '*json' - -Continuing the code from above, let's say you're looking to configure the Lamb optimizer. So you can search through the -example ``.json`` files with: - -.. code-block:: bash - - grep -i Lamb $(find . -name '*json') - -Some more examples are to be found in the `main repo `__ as well. 
- -When using DeepSpeed you always need to supply a DeepSpeed configuration file, yet some configuration parameters have -to be configured via the command line. You will find the nuances in the rest of this guide. - -To get an idea of what DeepSpeed configuration file looks like, here is one that activates ZeRO stage 2 features, -including optimizer states cpu offload, uses ``AdamW`` optimizer and ``WarmupLR`` scheduler and will enable mixed -precision training if ``--fp16`` is passed: - -.. code-block:: json - - { - "fp16": { - "enabled": "auto", - "loss_scale": 0, - "loss_scale_window": 1000, - "initial_scale_power": 16, - "hysteresis": 2, - "min_loss_scale": 1 - }, - - "optimizer": { - "type": "AdamW", - "params": { - "lr": "auto", - "betas": "auto", - "eps": "auto", - "weight_decay": "auto" - } - }, - - "scheduler": { - "type": "WarmupLR", - "params": { - "warmup_min_lr": "auto", - "warmup_max_lr": "auto", - "warmup_num_steps": "auto" - } - }, - - "zero_optimization": { - "stage": 2, - "allgather_partitions": true, - "allgather_bucket_size": 2e8, - "overlap_comm": true, - "reduce_scatter": true, - "reduce_bucket_size": 2e8, - "contiguous_gradients": true, - "cpu_offload": true - }, - - "gradient_accumulation_steps": "auto", - "gradient_clipping": "auto", - "train_batch_size": "auto", - "train_micro_batch_size_per_gpu": "auto", - } - -When you execute the program, DeepSpeed will log the configuration it received from the :class:`~transformers.Trainer` -to the console, so you can see exactly what was the final configuration passed to it. +Moved to :ref:`deepspeed-config`. 
Passing Configuration ======================================================================================================================= -As discussed in this document normally the DeepSpeed configuration is passed as a path to a json file, but if you're -not using the command line interface to configure the training, and instead instantiate the -:class:`~transformers.Trainer` via :class:`~transformers.TrainingArguments` then for the ``deepspeed`` argument you can -pass a nested ``dict``. This allows you to create the configuration on the fly and doesn't require you to write it to -the file system before passing it to :class:`~transformers.TrainingArguments`. - -To summarize you can do: - -.. code-block:: python - - TrainingArguments(..., deepspeed="/path/to/ds_config.json") - -or: - -.. code-block:: python - - ds_config_dict=dict(scheduler=scheduler_params, optimizer=optimizer_params) - TrainingArguments(..., deepspeed=ds_config_dict) - +Moved to :ref:`deepspeed-config-passing`. Shared Configuration ======================================================================================================================= - -.. warning:: - - This section is a must-read - -Some configuration values are required by both the :class:`~transformers.Trainer` and DeepSpeed to function correctly, -therefore, to prevent conflicting definitions, which could lead to hard to detect errors, we chose to configure those -via the :class:`~transformers.Trainer` command line arguments. - -Additionally, some configuration values are derived automatically based on the model's configuration, so instead of -remembering to manually adjust multiple values, it's best to let the :class:`~transformers.Trainer` do the majority -of configuration for you. - -Therefore, in the rest of this guide you will find a special configuration value: ``auto``, which when set will be -automatically replaced with the correct or most efficient value.
Please feel free to choose to ignore this -recommendation and set the values explicitly, in which case be very careful that the -:class:`~transformers.Trainer` arguments and DeepSpeed configurations agree. For example, are you using the same -learning rate, or batch size, or gradient accumulation settings? If these mismatch, the training may fail in very -difficult to detect ways. You have been warned. - -There are multiple other values that are specific to DeepSpeed-only and those you will have to set manually to suit -your needs. - - +Moved to :ref:`deepspeed-config-shared`. ZeRO ======================================================================================================================= -`Zero Redundancy Optimizer (ZeRO) `__ is the workhorse of DeepSpeed. It -supports 3 different levels (stages) of optimization. The first one is not quite interesting for scalability purposes, -therefore this document focuses on stages 2 and 3. Stage 3 is further improved by the latest addition of ZeRO-Infinity. -You will find more in-depth information in the DeepSpeed documentation. - -The ``zero_optimization`` section of the configuration file is the most important part (`docs -`__), since that is where you define -which ZeRO stages you want to enable and how to configure them. You will find the explanation for each parameter in the -DeepSpeed docs. - -This section has to be configured exclusively via DeepSpeed configuration - the :class:`~transformers.Trainer` provides -no equivalent command line arguments. - -Note: currently DeepSpeed doesn't validate parameter names, so if you misspell any, it'll use the default setting for -the parameter that got misspelled. You can watch the DeepSpeed engine start up log messages to see what values it is -going to use. - +Moved to :ref:`deepspeed-zero`.
ZeRO-2 Config +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ -The following is an example configuration for ZeRO stage 2: - -.. code-block:: json - - { - "zero_optimization": { - "stage": 2, - "allgather_partitions": true, - "allgather_bucket_size": 5e8, - "overlap_comm": true, - "reduce_scatter": true, - "reduce_bucket_size": 5e8, - "contiguous_gradients": true, - "cpu_offload": true - } - } - -**Performance tuning:** - -- enabling ``cpu_offload`` should reduce GPU RAM usage (it requires ``"stage": 2``) -- ``"overlap_comm": true`` trades off increased GPU RAM usage to lower all-reduce latency. ``overlap_comm`` uses 4.5x - the ``allgather_bucket_size`` and ``reduce_bucket_size`` values. So if they are set to 5e8, this requires a 9GB - footprint (``5e8 x 2Bytes x 2 x 4.5``). Therefore, if you have a GPU with 8GB or less RAM, to avoid getting - OOM-errors you will need to reduce those parameters to about ``2e8``, which would require 3.6GB. You will want to do - the same on larger capacity GPU as well, if you're starting to hit OOM. -- when reducing these buffers you're trading communication speed to avail more GPU RAM. The smaller the buffer size, - the slower the communication, and the more GPU RAM will be available to other tasks. So if a bigger batch size is - important, getting a slightly slower training time could be a good trade. - +Moved to :ref:`deepspeed-zero2-config`. ZeRO-3 Config +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ -The following is an example configuration for ZeRO stage 3: - -.. 
code-block:: json - - { - "zero_optimization": { - "stage": 3, - "offload_optimizer": { - "device": "cpu", - "pin_memory": true - }, - "offload_param": { - "device": "cpu", - "pin_memory": true - }, - "overlap_comm": true, - "contiguous_gradients": true, - "sub_group_size": 1e14, - "reduce_bucket_size": "auto", - "stage3_prefetch_bucket_size": "auto", - "stage3_param_persistence_threshold": "auto", - "stage3_max_live_parameters": 1e9, - "stage3_max_reuse_distance": 1e9, - "stage3_gather_fp16_weights_on_model_save": true - } - } - -If you are getting OOMs, because your model or activations don't fit into the GPU memory and you have unutilized CPU -memory offloading the optimizer states and parameters to CPU memory with ``"device": "cpu"`` may solve this limitation. -If you don't want to offload to CPU memory, use ``none`` instead of ``cpu`` for the ``device`` entry. Offloading to -NVMe is discussed further down. - -Pinned memory is enabled with ``pin_memory`` set to ``true``. This feature can improve the throughput at the cost of -making less memory available to other processes. Pinned memory is set aside to the specific process that requested it -and its typically accessed much faster than normal CPU memory. - -**Performance tuning:** - -- ``sub_group_size``: ``1e14`` -- ``stage3_max_live_parameters``: ``1e9`` -- ``stage3_max_reuse_distance``: ``1e9`` - -If hitting OOM reduce ``stage3_max_live_parameters`` and ``stage3_max_reuse_distance``. They should have minimal impact -on performance unless you are doing activation checkpointing. ``1e9`` would consume ~2GB. The memory is shared by -``stage3_max_live_parameters`` and ``stage3_max_reuse_distance``, so its not additive, its just 2GB total. - -``stage3_max_live_parameters`` is the upper limit on how many full parameters you want to keep on the GPU at any given -time. 
"reuse distance" is a metric we are using to figure out when will a parameter be used again in the future, and we -use the ``stage3_max_reuse_distance`` to decide whether to throw away the parameter or to keep it. If a parameter is -going to be used again in near future (less than ``stage3_max_reuse_distance``) then we keep it to reduce communication -overhead. This is super helpful when you have activation checkpointing enabled, where we do a forward recompute and -backward passes a a single layer granularity and want to keep the parameter in the forward recompute till the backward - -The following configuration values depend on the model's hidden size: - -- ``reduce_bucket_size``: ``hidden_size*hidden_size`` -- ``stage3_prefetch_bucket_size``: ``0.9 * hidden_size * hidden_size`` -- ``stage3_param_persistence_threshold``: ``10 * hidden_size`` - -therefore set these values to ``auto`` and the :class:`~transformers.Trainer` will automatically assign the recommended -values. But, of course, feel free to set these explicitly as well. - -``stage3_gather_fp16_weights_on_model_save`` enables model fp16 weights consolidation when model gets saved. With large -models and multiple GPUs this is an expensive operation both in terms of memory and speed. It's currently required if -you plan to resume the training. Watch out for future updates that will remove this limitation and make things more -flexible. - -If you're migrating from ZeRO-2 configuration note that ``allgather_partitions``, ``allgather_bucket_size`` and -``reduce_scatter`` configuration parameters are not used in ZeRO-3. If you keep these in the config file they will just -be ignored. Make sure to remove ``cpu_offload`` though, since it has been deprecated in ZeRO-3. - - +Moved to :ref:`deepspeed-zero3-config`. 
NVMe Support ======================================================================================================================= -ZeRO-Infinity allows for training incredibly large models by extending GPU and CPU memory with NVMe memory. Thanks to -smart partitioning and tiling algorithms each GPU needs to send and receive very small amounts of data during -offloading so modern NVMe proved to be fit to allow for an even larger total memory pool available to your training -process. ZeRO-Infinity requires ZeRO-3 enabled. - -The following configuration example enables NVMe to offload both optimizer states and the params: - -.. code-block:: json - - { - "zero_optimization": { - "stage": 3, - "offload_optimizer": { - "device": "nvme", - "nvme_path": "/local_nvme", - "pin_memory": true, - "buffer_count": 4, - "fast_init": false - }, - "offload_param": { - "device": "nvme", - "nvme_path": "/local_nvme", - "pin_memory": true, - "buffer_count": 5, - "buffer_size": 1e8, - "max_in_cpu": 1e9 - } - "aio": { - "block_size": 262144, - "queue_depth": 32, - "thread_count": 1, - "single_submit": false, - "overlap_events": true - } - "overlap_comm": true, - "contiguous_gradients": true, - "sub_group_size": 1e14, - "reduce_bucket_size": "auto", - "stage3_prefetch_bucket_size": "auto", - "stage3_param_persistence_threshold": "auto", - "stage3_max_live_parameters": 1e9, - "stage3_max_reuse_distance": 1e9, - "stage3_gather_fp16_weights_on_model_save": true - }, - } - -You can choose to offload both optimizer states and params to NVMe, or just one of them or none. For example, if you -have copious amounts of CPU memory available, by all means offload to CPU memory only as it'd be faster (hint: -`"device": "cpu"`). - -Here is the full documentation for offloading `optimizer states -`__ and `parameters -`__. - -Make sure that your ``nvme_path`` is actually an NVMe, since it will work with the normal hard drive or SSD, but it'll -be much much slower. 
The fast scalable training was designed with modern NVMe transfer speeds in mind (as of this -writing one can have ~3.5GB/s read, ~3GB/s write peak speeds). - -In order to figure out the optimal ``aio`` configuration block you must run a benchmark on your target setup, as -`explained here `__. - - +Moved to :ref:`deepspeed-nvme`. ZeRO-2 vs ZeRO-3 Performance +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ -ZeRO-3 is likely to be slower than ZeRO-2 if everything else is configured the same because the former has to gather -model weights in addition to what ZeRO-2 does. If ZeRO-2 meets your needs and you don't need to scale beyond a few GPUs -then you may choose to stick to it. It's important to understand that ZeRO-3 enables a much higher scalability capacity -at a cost of speed. - -It's possible to adjust ZeRO-3 configuration to make it perform closer to ZeRO-2: - -- set ``stage3_param_persistence_threshold`` to a very large number - larger than the largest parameter, e.g., ``6 * - hidden_size * hidden_size``. This will keep the parameters on the GPUs. -- turn off ``cpu_offload_params`` since ZeRO-2 doesn't have that option. - -The performance will likely improve significantly with just ``cpu_offload_params`` turned off, even if you don't change -``stage3_param_persistence_threshold``. Of course, these changes will impact the size of the model you can train. So -these help you to trade scalability for speed depending on your needs. - - +Moved to :ref:`deepspeed-zero2-zero3-performance`. ZeRO-2 Example +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ -Here is a full ZeRO-2 auto-configuration file ``ds_config_zero2.json``: - -.. 
code-block:: json - - { - "fp16": { - "enabled": "auto", - "loss_scale": 0, - "loss_scale_window": 1000, - "initial_scale_power": 16, - "hysteresis": 2, - "min_loss_scale": 1 - }, - - "optimizer": { - "type": "AdamW", - "params": { - "lr": "auto", - "betas": "auto", - "eps": "auto", - "weight_decay": "auto" - } - }, - - "scheduler": { - "type": "WarmupLR", - "params": { - "warmup_min_lr": "auto", - "warmup_max_lr": "auto", - "warmup_num_steps": "auto" - } - }, - - "zero_optimization": { - "stage": 2, - "allgather_partitions": true, - "allgather_bucket_size": 2e8, - "overlap_comm": true, - "reduce_scatter": true, - "reduce_bucket_size": 2e8, - "contiguous_gradients": true, - "cpu_offload": true - }, - - "gradient_accumulation_steps": "auto", - "gradient_clipping": "auto", - "steps_per_print": 2000, - "train_batch_size": "auto", - "train_micro_batch_size_per_gpu": "auto", - "wall_clock_breakdown": false - } - - -Here is a full ZeRO-2 all-enabled manually set configuration file. It is here mainly for you to see what the typical -values look like, but we highly recommend using the one with multiple ``auto`` settings in it. - -.. code-block:: json - - { - "fp16": { - "enabled": true, - "loss_scale": 0, - "loss_scale_window": 1000, - "initial_scale_power": 16, - "hysteresis": 2, - "min_loss_scale": 1 - }, - - "optimizer": { - "type": "AdamW", - "params": { - "lr": 3e-5, - "betas": [0.8, 0.999], - "eps": 1e-8, - "weight_decay": 3e-7 - } - }, - - "scheduler": { - "type": "WarmupLR", - "params": { - "warmup_min_lr": 0, - "warmup_max_lr": 3e-5, - "warmup_num_steps": 500 - } - }, - - "zero_optimization": { - "stage": 2, - "allgather_partitions": true, - "allgather_bucket_size": 2e8, - "overlap_comm": true, - "reduce_scatter": true, - "reduce_bucket_size": 2e8, - "contiguous_gradients": true, - "cpu_offload": true - }, - - "steps_per_print": 2000, - "wall_clock_breakdown": false - } - - +Moved to :ref:`deepspeed-zero2-example`. 
ZeRO-3 Example +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ -Here is a full ZeRO-3 auto-configuration file ``ds_config_zero3.json``: - - -.. code-block:: json - - { - "fp16": { - "enabled": "auto", - "loss_scale": 0, - "loss_scale_window": 1000, - "initial_scale_power": 16, - "hysteresis": 2, - "min_loss_scale": 1 - }, - - "optimizer": { - "type": "AdamW", - "params": { - "lr": "auto", - "betas": "auto", - "eps": "auto", - "weight_decay": "auto" - } - }, - - "scheduler": { - "type": "WarmupLR", - "params": { - "warmup_min_lr": "auto", - "warmup_max_lr": "auto", - "warmup_num_steps": "auto" - } - }, - - "zero_optimization": { - "stage": 3, - "offload_optimizer": { - "device": "cpu", - "pin_memory": true - }, - "offload_param": { - "device": "cpu", - "pin_memory": true - }, - "overlap_comm": true, - "contiguous_gradients": true, - "sub_group_size": 1e14, - "reduce_bucket_size": "auto", - "stage3_prefetch_bucket_size": "auto", - "stage3_param_persistence_threshold": "auto", - "stage3_max_live_parameters": 1e9, - "stage3_max_reuse_distance": 1e9, - "stage3_gather_fp16_weights_on_model_save": true - }, - - "gradient_accumulation_steps": "auto", - "gradient_clipping": "auto", - "steps_per_print": 2000, - "train_batch_size": "auto", - "train_micro_batch_size_per_gpu": "auto", - "wall_clock_breakdown": false - } - -Here is a full ZeRO-3 all-enabled manually set configuration file. It is here mainly for you to see what the typical -values look like, but we highly recommend using the one with multiple ``auto`` settings in it. - -.. 
code-block:: json - - { - "fp16": { - "enabled": true, - "loss_scale": 0, - "loss_scale_window": 1000, - "initial_scale_power": 16, - "hysteresis": 2, - "min_loss_scale": 1 - }, - - "optimizer": { - "type": "AdamW", - "params": { - "lr": 3e-5, - "betas": [0.8, 0.999], - "eps": 1e-8, - "weight_decay": 3e-7 - } - }, - - "scheduler": { - "type": "WarmupLR", - "params": { - "warmup_min_lr": 0, - "warmup_max_lr": 3e-5, - "warmup_num_steps": 500 - } - }, - - "zero_optimization": { - "stage": 3, - "offload_optimizer": { - "device": "cpu", - "pin_memory": true - }, - "offload_param": { - "device": "cpu", - "pin_memory": true - }, - "overlap_comm": true, - "contiguous_gradients": true, - "sub_group_size": 1e14, - "reduce_bucket_size": 1e6, - "stage3_prefetch_bucket_size": 0.94e6, - "stage3_param_persistence_threshold": 1e4, - "stage3_max_live_parameters": 1e9, - "stage3_max_reuse_distance": 1e9, - "stage3_gather_fp16_weights_on_model_save": true - }, - - "steps_per_print": 2000, - "wall_clock_breakdown": false - } - +Moved to :ref:`deepspeed-zero3-example`. Optimizer and Scheduler ======================================================================================================================= -As long as you don't enable ``cpu_offload`` you can mix and match DeepSpeed and HuggingFace schedulers and optimizers, -with the exception of using the combination of HuggingFace scheduler and DeepSpeed optimizer: - -+--------------+--------------+--------------+ -| Combos | HF Scheduler | DS Scheduler | -+--------------+--------------+--------------+ -| HF Optimizer | Yes | Yes | -+--------------+--------------+--------------+ -| DS Optimizer | No | Yes | -+--------------+--------------+--------------+ - -If ``cpu_offload`` is enabled you must use both DeepSpeed scheduler and DeepSpeed optimizer. 
- Optimizer +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ - -DeepSpeed's main optimizers are Adam, AdamW, OneBitAdam, and Lamb. These have been thoroughly tested with ZeRO and are -thus recommended to be used. It, however, can import other optimizers from ``torch``. The full documentation is `here -`__. - -If you don't configure the ``optimizer`` entry in the configuration file, the :class:`~transformers.Trainer` will -automatically set it to ``AdamW`` and will use the supplied values or the defaults for the following command line -arguments: ``--learning_rate``, ``--adam_beta1``, ``--adam_beta2``, ``--adam_epsilon`` and ``--weight_decay``. - -Here is an example of the auto-configured ``optimizer`` entry for ``AdamW``: - -.. code-block:: json - - { - "optimizer": { - "type": "AdamW", - "params": { - "lr": "auto", - "betas": "auto", - "eps": "auto", - "weight_decay": "auto" - } - } - } - - -Note that the command line arguments will set the values in the configuration file. This is so that there is one -definitive source of the values and to avoid hard to find errors when for example, the learning rate is set to -different values in different places. Command line rules. The values that get overridden are: - -- ``lr`` with the value of ``--learning_rate`` -- ``betas`` with the value of ``--adam_beta1 --adam_beta2`` -- ``eps`` with the value of ``--adam_epsilon`` -- ``weight_decay`` with the value of ``--weight_decay`` - -Therefore please remember to tune the shared hyperparameters on the command line. - -You can also set the values explicitly: - -.. code-block:: json - - { - "optimizer": { - "type": "AdamW", - "params": { - "lr": 0.001, - "betas": [0.8, 0.999], - "eps": 1e-8, - "weight_decay": 3e-7 - } - } - } - -But then you're on your own synchronizing the :class:`~transformers.Trainer` command line arguments and the DeepSpeed -configuration. 
- -If you want to use another optimizer which is not listed above, you will have to add to the top level configuration. - -.. code-block:: json - - { - "zero_allow_untested_optimizer": true - } - -Similarly to ``AdamW``, you can configure other officially supported optimizers. Just remember that may have different -config values. e.g. for Adam you will want ``weight_decay`` around ``0.01``. +Moved to :ref:`deepspeed-optimizer`. Scheduler +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ -DeepSpeed supports ``LRRangeTest``, ``OneCycle``, ``WarmupLR`` and ``WarmupDecayLR`` learning rate schedulers. The full -documentation is `here `__. - -Here is where the schedulers overlap between 🤗 Transformers and DeepSpeed: - -* ``WarmupLR`` via ``--lr_scheduler_type constant_with_warmup`` -* ``WarmupDecayLR`` via ``--lr_scheduler_type linear``. This is also the default value for ``--lr_scheduler_type``, - therefore, if you don't configure the scheduler this is scheduler that will get configured by default. - -If you don't configure the ``scheduler`` entry in the configuration file, the :class:`~transformers.Trainer` will use -the values of ``--lr_scheduler_type``, ``--learning_rate`` and ``--warmup_steps`` to configure a 🤗 Transformers version -of it. - -Here is an example of the auto-configured ``scheduler`` entry for ``WarmupLR``: - -.. code-block:: json - - { - "scheduler": { - "type": "WarmupLR", - "params": { - "warmup_min_lr": "auto", - "warmup_max_lr": "auto", - "warmup_num_steps": "auto" - } - } - } - -Since `"auto"` is used the :class:`~transformers.Trainer` arguments will set the correct values in the configuration -file. This is so that there is one definitive source of the values and to avoid hard to find errors when, for example, -the learning rate is set to different values in different places. Command line rules. 
The values that get set are: - -- ``warmup_min_lr`` with the value of ``0`` -- ``warmup_max_lr`` with the value of ``--learning_rate`` -- ``warmup_num_steps`` with the value of ``--warmup_steps`` -- ``total_num_steps`` with either the value of ``--max_steps`` or if it is not provided, derived automatically at run - time based on the environment and the size of the dataset and other command line arguments (needed for - ``WarmupDecayLR``). - -You can, of course, take over any or all of the configuration values and set those yourself: - -.. code-block:: json - - { - "scheduler": { - "type": "WarmupLR", - "params": { - "warmup_min_lr": 0, - "warmup_max_lr": 0.001, - "warmup_num_steps": 1000 - } - } - } - -But then you're on your own synchronizing the :class:`~transformers.Trainer` command line arguments and the DeepSpeed -configuration. - -For example, for ``WarmupDecayLR``, you can use the following entry: - -.. code-block:: json - - { - "scheduler": { - "type": "WarmupDecayLR", - "params": { - "last_batch_iteration": -1, - "total_num_steps": "auto", - "warmup_min_lr": "auto", - "warmup_max_lr": "auto", - "warmup_num_steps": "auto" - } - } - } - -and ``warmup_min_lr``, ``warmup_max_lr``, ``warmup_num_steps`` and ``total_num_steps`` will be set at loading time. - - - +Moved to :ref:`deepspeed-scheduler`. fp32 Precision ======================================================================================================================= -DeepSpeed supports the full fp32 and the fp16 mixed precision. - -Because of the much reduced memory needs and faster speed one gets with the fp16 mixed precision, the only time you -will want to not use it is when the model you're using doesn't behave well under this training mode. Typically this -happens when the model wasn't pretrained in the fp16 mixed precision (e.g. often this happens with bf16-pretrained -models). Such models may overflow or underflow leading to ``NaN`` loss.
If this is your case then you will want to use -the full fp32 mode, by explicitly disabling the otherwise default fp16 mixed precision mode with: - -.. code-block:: json - - { - "fp16": { - "enabled": "false", - } - } - -If you're using the Ampere-architecture based GPU, pytorch version 1.7 and higher will automatically switch to using -the much more efficient tf32 format for some operations, but the results will still be in fp32. For details and -benchmarks, please, see `TensorFloat-32(TF32) on Ampere devices -`__. The document includes -instructions on how to disable this automatic conversion if for some reason you prefer not to use it. - - - +Moved to :ref:`deepspeed-fp32`. Automatic Mixed Precision ======================================================================================================================= -You can use automatic mixed precision with either a pytorch-like AMP way or the apex-like way: - -To configure pytorch AMP-like mode set: - -.. code-block:: json - - { - "fp16": { - "enabled": "auto", - "loss_scale": 0, - "loss_scale_window": 1000, - "initial_scale_power": 16, - "hysteresis": 2, - "min_loss_scale": 1 - } - } - -and the :class:`~transformers.Trainer` will automatically enable or disable it based on the value of -``args.fp16_backend``. The rest of config values are up to you. - -This mode gets enabled when ``--fp16 --fp16_backend amp`` command line args are passed. - -You can also enable/disable this mode explicitly: - -.. code-block:: json - - { - "fp16": { - "enabled": true, - "loss_scale": 0, - "loss_scale_window": 1000, - "initial_scale_power": 16, - "hysteresis": 2, - "min_loss_scale": 1 - } - } - -But then you're on your own synchronizing the :class:`~transformers.Trainer` command line arguments and the DeepSpeed -configuration. - -Here is the `documentation `__. - -To configure apex AMP-like mode set: - -.. 
code-block:: json - - "amp": { - "enabled": "auto", - "opt_level": "auto" - } - -and the :class:`~transformers.Trainer` will automatically configure it based on the values of ``args.fp16_backend`` and -``args.fp16_opt_level``. - -This mode gets enabled when ``--fp16 --fp16_backend apex --fp16_opt_level 01`` command line args are passed. - -You can also configure this mode explicitly: - -.. code-block:: json - - { - "amp": { - "enabled": true, - "opt_level": "O1" - } - } - -But then you're on your own synchronizing the :class:`~transformers.Trainer` command line arguments and the DeepSpeed -configuration. - -Here is the `documentation -`__. - +Moved to :ref:`deepspeed-amp`. Batch Size ======================================================================================================================= -To configure batch size, use: - -.. code-block:: json - - { - "train_batch_size": "auto", - "train_micro_batch_size_per_gpu": "auto" - } - -and the :class:`~transformers.Trainer` will automatically set ``train_micro_batch_size_per_gpu`` to the value of -``args.per_device_train_batch_size`` and ``train_batch_size`` to ``args.world_size * args.per_device_train_batch_size * -args.gradient_accumulation_steps``. - -You can also set the values explicitly: - -.. code-block:: json - - { - "train_batch_size": 12, - "train_micro_batch_size_per_gpu": 4 - } - -But then you're on your own synchronizing the :class:`~transformers.Trainer` command line arguments and the DeepSpeed -configuration. +Moved to :ref:`deepspeed-bs`. Gradient Accumulation ======================================================================================================================= -To configure gradient accumulation set: - -.. code-block:: json - - { - "gradient_accumulation_steps": "auto" - } - -and the :class:`~transformers.Trainer` will automatically set it to the value of ``args.gradient_accumulation_steps``. - -You can also set the value explicitly: - -.. 
code-block:: json - - { - "gradient_accumulation_steps": 3 - } - -But then you're on your own synchronizing the :class:`~transformers.Trainer` command line arguments and the DeepSpeed -configuration. +Moved to :ref:`deepspeed-grad-acc`. Gradient Clipping ======================================================================================================================= -To configure gradient gradient clipping set: - -.. code-block:: json - - { - "gradient_clipping": "auto" - } - -and the :class:`~transformers.Trainer` will automatically set it to the value of ``args.max_grad_norm``. - -You can also set the value explicitly: - -.. code-block:: json - - { - "gradient_clipping": 1.0 - } - -But then you're on your own synchronizing the :class:`~transformers.Trainer` command line arguments and the DeepSpeed -configuration. - +Moved to :ref:`deepspeed-grad-clip`. Getting The Model Weights Out ======================================================================================================================= -As long as you continue training and resuming using DeepSpeed you don't need to worry about anything. DeepSpeed stores -fp32 master weights in its custom checkpoint optimizer files, which are ``global_step*/*optim_states.pt`` (this is glob -pattern), and are saved under the normal checkpoint. - -**FP16 Weights:** - -When a model is saved under ZeRO-2, you end up having the normal ``pytorch_model.bin`` file with the model weights, but -they are only the fp16 version of the weights. - -Under ZeRO-3, things are much more complicated, since the model weights are partitioned out over multiple GPUs, -therefore ``"stage3_gather_fp16_weights_on_model_save": true`` is required to get the ``Trainer`` to save the fp16 -version of the weights. If this setting is ``False`` ``pytorch_model.bin`` won't be created. This is because by default -DeepSpeed's ``state_dict`` contains a placeholder and not the real weights. 
If we were to save this ``state_dict`` it -won't be possible to load it back. - - -.. code-block:: json - - { - "zero_optimization": { - "stage3_gather_fp16_weights_on_model_save": true - } - } - - -**FP32 Weights:** - -While the fp16 weights are fine for resuming training, if you finished finetuning your model and want to upload it to -the `models hub `__ or pass it to someone else you most likely will want to get the fp32 -weights. This cannot be done during training since this is a process that requires a lot of memory, and therefore this -is performed offline. - -DeepSpeed creates a special conversion script ``zero_to_fp32.py`` which it places in the top-level of the checkpoint -folder. Using this script you can extract the weights at any point. The script is standalone and you no longer need to -have the configuration file or a ``Trainer`` to do the extraction. - -Let's say your checkpoint folder looks like this: - -.. code-block:: bash - - $ ls -l output_dir/checkpoint-1/ - -rw-rw-r-- 1 stas stas 1.4K Mar 27 20:42 config.json - drwxrwxr-x 2 stas stas 4.0K Mar 25 19:52 global_step1/ - -rw-rw-r-- 1 stas stas 12 Mar 27 13:16 latest - -rw-rw-r-- 1 stas stas 827K Mar 27 20:42 optimizer.pt - -rw-rw-r-- 1 stas stas 231M Mar 27 20:42 pytorch_model.bin - -rw-rw-r-- 1 stas stas 623 Mar 27 20:42 scheduler.pt - -rw-rw-r-- 1 stas stas 1.8K Mar 27 20:42 special_tokens_map.json - -rw-rw-r-- 1 stas stas 774K Mar 27 20:42 spiece.model - -rw-rw-r-- 1 stas stas 1.9K Mar 27 20:42 tokenizer_config.json - -rw-rw-r-- 1 stas stas 339 Mar 27 20:42 trainer_state.json - -rw-rw-r-- 1 stas stas 2.3K Mar 27 20:42 training_args.bin - -rwxrw-r-- 1 stas stas 5.5K Mar 27 13:16 zero_to_fp32.py* - -In this example there is just one DeepSpeed checkpoint sub-folder `global_step1`. Therefore to reconstruct the fp32 -weights just run: - -.. code-block:: bash - - python zero_to_fp32.py global_step1 pytorch_model.bin - -The script will automatically handle either ZeRO-2 or ZeRO-3 checkpoint. 
- -``python zero_to_fp32.py -h`` will give you usage details. - -If you have multiple DeepSpeed checkpoint sub-folders, pick the one you know to have the desired weights. - -This is it. ``pytorch_model.bin`` will now contain the full fp32 model weights consolidated from multiple GPUs. - -Note: currently the script requires 2x general RAM of the final fp32 model weights. - - -ZeRO-3 and Infinity Nuances -======================================================================================================================= - -ZeRO-3 is quite different from ZeRO-2 because of its param sharding feature. - -ZeRO-Infinity further extends ZeRO-3 to support NVMe memory and multiple other speed and scalability improvements. - -While all the efforts were made for things to just work without needing any special changes to your models, in certain -circumstances you may find the following information to be needed. - - - -Constructing Massive Models -+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ - -DeepSpeed/ZeRO-3 can handle models with Trillions of parameters which may not fit onto the existing RAM. In such cases, -but also if you want the initialization to happen much faster, initialize the model using `deepspeed.zero.Init()` -context manager (which is also a function decorator), like so: - -.. code-block:: python - - from transformers import T5ForConditionalGeneration, T5Config - import deepspeed - with deepspeed.zero.Init(): - config = T5Config.from_pretrained("t5-small") - model = T5ForConditionalGeneration(config) - -As you can see this gives you a randomly initialized model. - -If you want to use a pretrained model, ``model_class.from_pretrained`` will activate this feature as long as -``is_deepspeed_zero3_enabled()`` returns ``True``, which currently is setup by the -class:`~transformers.TrainingArguments` object if the passed DeepSpeed configuration file contains ZeRO-3 config -section. 
Thus you must create the :class:`~transformers.TrainingArguments` object **before** calling -``from_pretrained``. Here is an example of a possible sequence: - -.. code-block:: python - - from transformers import AutoModel, Trainer, TrainingArguments - training_args = TrainingArguments(..., deepspeed=ds_config) - model = AutoModel.from_pretrained("t5-small") - trainer = Trainer(model=model, args=training_args, ...) - -If you're using the official example scripts and your command line arguments include ``--deepspeed ds_config.json`` -with ZeRO-3 config enabled, then everything is already done for you, since this is how example scripts are written. - -Note: If the fp16 weights of the model can't fit onto the memory of a single GPU this feature must be used. - -For full details on this method and other related features please refer to `Constructing Massive Models -`__. - - - -Gathering Parameters -+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ - -Under ZeRO-3 on multiple GPUs no single GPU has all the parameters unless it's the parameters for the currently -executing layer. So if you need to access all parameters from all layers at once there is a specific method to do it. -Most likely you won't need it, but if you do please refer to `Gathering Parameters -`__ - -We do however use it internally in several places, one such example is when loading pretrained model weights in -``from_pretrained``. We load one layer at a time and immediately partition it to all participating GPUs, as for very -large models it won't be possible to load it on one GPU and then spread it out to multiple GPUs, due to memory -limitations. - -Also under ZeRO-3, if you write your own code and run into a model parameter weight that looks like: - -.. 
code-block:: python - - tensor([1.], device='cuda:0', dtype=torch.float16, requires_grad=True) - -stress on ``tensor([1.])``, or if you get an error where it says the parameter is of size ``1``, instead of some much -larger multi-dimensional shape, this means that the parameter is partitioned and what you see is a ZeRO-3 placeholder. - - -Troubleshooting -======================================================================================================================= - -* ``deepspeed`` process gets killed at startup without a traceback - -If the ``deepspeed`` process gets killed at launch time without a traceback, that usually means that the program tried -to allocate more CPU memory than your system has or your process is allowed to allocate and the OS kernel killed that -process. This is because your configuration file most likely has either ``offload_optimizer`` or ``offload_param`` or -both configured to offload to ``cpu`` (or under ZeRO-2 ``cpu_offload`` is enabled). If you have NVMe, experiment with -offloading to NVMe if you're running under ZeRO-3. - -Work is being done to enable estimating how much memory is needed for a specific model: `PR -`__. - - - - - - -Notes -======================================================================================================================= - -* DeepSpeed works with the PyTorch :class:`~transformers.Trainer` but not TF :class:`~transformers.TFTrainer`. -* While DeepSpeed has a pip installable PyPI package, it is highly recommended that it gets installed from `source - `__ to best match your hardware and also if you need to enable - certain features, like 1-bit Adam, which aren't available in the pypi distribution. -* You don't have to use the :class:`~transformers.Trainer` to use DeepSpeed with 🤗 Transformers - you can use any model - with your own trainer, and you will have to adapt the latter according to `the DeepSpeed integration instructions - `__. 
- - -Main DeepSpeed Resources -======================================================================================================================= - -- `Project's github `__ -- `Usage docs `__ -- `API docs `__ -- `Blog posts `__ - -Papers: - -- `ZeRO: Memory Optimizations Toward Training Trillion Parameter Models `__ -- `ZeRO-Offload: Democratizing Billion-Scale Model Training `__ -- `ZeRO-Infinity: Breaking the GPU Memory Wall for Extreme Scale Deep Learning `__ - -Finally, please, remember that, HuggingFace :class:`~transformers.Trainer` only integrates DeepSpeed, therefore if you -have any problems or questions with regards to DeepSpeed usage, please, file an issue with `DeepSpeed GitHub -`__. +Moved to :ref:`deepspeed-weight-extraction`. diff --git a/src/transformers/deepspeed.py b/src/transformers/deepspeed.py new file mode 100644 index 0000000000..63185562c9 --- /dev/null +++ b/src/transformers/deepspeed.py @@ -0,0 +1,318 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" +Integration with Deepspeed +""" + +import importlib.util +import io +import json +import weakref +from copy import deepcopy + +from .dependency_versions_check import dep_version_check +from .utils import logging + + +logger = logging.get_logger(__name__) + + +def is_deepspeed_available(): + return importlib.util.find_spec("deepspeed") is not None + + +def _is_true(config, key): + if config is None: + return False + return bool(config.get(key)) + + +def _set_if_auto(config, key, val): + if config is None: + return + if config.get(key) == "auto": + config[key] = val + + +class HfDeepSpeedConfig: + """ + This object contains a DeepSpeed configuration dictionary and can be quickly queried for things like zero stage. + + A ``weakref`` of this object is stored in the module's globals to be able to access the config from areas where + things like the Trainer object is not available (e.g. ``from_pretrained`` and ``_get_resized_embeddings``). + Therefore it's important that this object remains alive while the program is still running. + + :class:`~transformers.Trainer` uses the ``HfTrainerDeepSpeedConfig`` subclass instead. That subclass has logic to + sync the configuration with values of :class:`~transformers.TrainingArguments` by replacing special placeholder + values: ``"auto"``. Without this special logic the DeepSpeed configuration is not modified in any way. + + Args: + config_file_or_dict (:obj:`Union[str, Dict]`) - path to DeepSpeed config file or dict. + + """ + + def __init__(self, config_file_or_dict): + # set global weakref object + set_hf_deepspeed_config(self) + + dep_version_check("deepspeed") + + if isinstance(config_file_or_dict, dict): + # Don't modify user's data should they want to reuse it (e.g. 
in tests), because once we + # modified it, it will not be accepted here again, since `auto` values would have been overridden + config = deepcopy(config_file_or_dict) + elif isinstance(config_file_or_dict, str): + with io.open(config_file_or_dict, "r", encoding="utf-8") as f: + config = json.load(f) + else: + raise ValueError("expecting either a path to a DeepSpeed config file or a pre-populated dict") + self.config = config + + # zero stage - this is done as early as possible, before model is created, to allow + # ``is_deepspeed_zero3_enabled`` query and getting to the early deepspeed config object + # during ``zero.Init()`` which needs whether fp16 is enabled, dtype, etc. + config_zero = config.get("zero_optimization", {}) + self.stage = config_zero.get("stage", 0) + + # offload + self.offload = False + config_zero = config.get("zero_optimization", {}) + if self.is_zero2(): + self.offload = _is_true(config_zero, "cpu_offload") + elif self.is_zero3(): + offload_devices = ["cpu", "nvme"] + if config_zero.get("offload_optimizer", {}).get("device") in offload_devices: + self.offload = True + if config_zero.get("offload_param", {}).get("device") in offload_devices: + self.offload = True + + def is_zero2(self): + return self.stage == 2 + + def is_zero3(self): + return self.stage == 3 + + def is_offload(self): + return self.offload + + +class HfTrainerDeepSpeedConfig(HfDeepSpeedConfig): + """ + The ``HfTrainerDeepSpeedConfig`` object is meant to be created during ``TrainingArguments`` object creation and has + the same lifespan as the latter. + + """ + + def __init__(self, config_file_or_dict): + super().__init__(config_file_or_dict) + + def trainer_config_process(self, args): + """ + Adjust the config with ``TrainingArguments`` values. This stage is run during ``TrainingArguments`` object + creation.
+ """ + config = self.config + + # DeepSpeed does: + # train_batch_size = world_size * train_micro_batch_size_per_gpu * gradient_accumulation_steps + train_batch_size = args.world_size * args.per_device_train_batch_size * args.gradient_accumulation_steps + _set_if_auto(config, "train_micro_batch_size_per_gpu", args.per_device_train_batch_size) + _set_if_auto(config, "gradient_accumulation_steps", args.gradient_accumulation_steps) + _set_if_auto(config, "train_batch_size", train_batch_size) + _set_if_auto(config, "gradient_clipping", args.max_grad_norm) + + config_optim = config.get("optimizer", {}) + if config_optim != {}: + config_optim_params = config_optim.get("params") + _set_if_auto(config_optim_params, "lr", args.learning_rate) + _set_if_auto(config_optim_params, "betas", [args.adam_beta1, args.adam_beta2]) + _set_if_auto(config_optim_params, "eps", args.adam_epsilon) + _set_if_auto(config_optim_params, "weight_decay", args.weight_decay) + + config_sched = config.get("scheduler", {}) + if config_sched != {}: + config_sched_params = config_sched.get("params") + _set_if_auto(config_sched_params, "warmup_min_lr", 0) + _set_if_auto(config_sched_params, "warmup_max_lr", args.learning_rate) + _set_if_auto(config_sched_params, "warmup_num_steps", args.warmup_steps) + # total_num_steps - will get set in trainer_config_finalize + + # fp16 + if args.fp16: + fp16_backend = "apex" if args.fp16_backend == "apex" else "amp" + else: + fp16_backend = None + + # amp: similar to the pytorch native amp - it has a bunch of optional params but we won't set + # any here unless the user did the work + config_fp16 = config.get("fp16") + _set_if_auto(config_fp16, "enabled", fp16_backend == "amp") + + # apex: delegates amp work to apex (which needs to be available), but it cannot be used with any + # ZeRO features + config_amp = config.get("amp") + _set_if_auto(config_amp, "enabled", fp16_backend == "apex") + _set_if_auto(config_amp, "opt_level", args.fp16_opt_level) + + def 
trainer_config_finalize(self, args, model, num_training_steps): + """ + This stage is run after we have the model and know num_training_steps. + + Now we can complete the configuration process. + """ + config = self.config + + # zero + config_zero = config.get("zero_optimization", {}) + if self.is_zero3(): + # automatically assign the optimal config values based on model config + hidden_size = model.config.hidden_size + _set_if_auto(config_zero, "reduce_bucket_size", hidden_size * hidden_size) + _set_if_auto(config_zero, "stage3_prefetch_bucket_size", 0.9 * hidden_size * hidden_size) + _set_if_auto(config_zero, "stage3_param_persistence_threshold", 10 * hidden_size) + + # scheduler + config_sched = config.get("scheduler", {}) + config_sched_params = config_sched.get("params", {}) + _set_if_auto(config_sched_params, "total_num_steps", num_training_steps) + + +# keep the config object global to be able to access it anywhere during TrainingArguments life-cycle +_hf_deepspeed_config_weak_ref = None + + +def set_hf_deepspeed_config(hf_deepspeed_config_obj): + # this is a special weakref global object to allow us to get to Deepspeed config from APIs + # that don't have an easy way to get to the Deepspeed config outside of the Trainer domain.
+ global _hf_deepspeed_config_weak_ref + # will go away automatically when HfDeepSpeedConfig is destroyed (when TrainingArguments is destroyed) + _hf_deepspeed_config_weak_ref = weakref.ref(hf_deepspeed_config_obj) + + +def is_deepspeed_zero3_enabled(): + if _hf_deepspeed_config_weak_ref is not None and _hf_deepspeed_config_weak_ref() is not None: + return _hf_deepspeed_config_weak_ref().is_zero3() + else: + return False + + +def deepspeed_config(): + if _hf_deepspeed_config_weak_ref is not None and _hf_deepspeed_config_weak_ref() is not None: + return _hf_deepspeed_config_weak_ref().config + else: + return None + + +def deepspeed_init(trainer, num_training_steps, resume_from_checkpoint=None): + """ + Init DeepSpeed, after updating the DeepSpeed configuration with any relevant Trainer's args. + + If ``resume_from_checkpoint`` was passed then an attempt to resume from a previously saved checkpoint will be made. + + Args: + trainer: Trainer object + num_training_steps: per single gpu + resume_from_checkpoint: path to a checkpoint if to resume from after normal DeepSpeedEngine load + + Returns: model, optimizer, lr_scheduler + + """ + import deepspeed + + model = trainer.model + + hf_deepspeed_config = trainer.args.hf_deepspeed_config + hf_deepspeed_config.trainer_config_finalize(trainer.args, model, num_training_steps) + + # resume config update - some bits like `model` and `num_training_steps` only become available during train + config = hf_deepspeed_config.config + + # Optimizer + Scheduler + # Currently supported combos: + # 1. DS scheduler + DS optimizer: Yes + # 2. HF scheduler + HF optimizer: Yes + # 3. DS scheduler + HF optimizer: Yes + # 4. HF scheduler + DS optimizer: No + # + # Unless Offload is enabled in which case it's: + # 1. DS scheduler + DS optimizer: Yes + # 2. HF scheduler + HF optimizer: No + # 3. DS scheduler + HF optimizer: No + # 4. 
HF scheduler + DS optimizer: No + + optimizer = None + if "optimizer" not in config: + if hf_deepspeed_config.is_offload(): + raise ValueError("ZeRO Offload can only work with DeepSpeed optimizers") + + # ds supports Adam, OneBitAdam, and Lamb optimizers and can import other optimizers from torch. + # But trainer uses AdamW by default. + trainer.create_optimizer() + optimizer = trainer.optimizer + # To use other optimizers requires voiding warranty with: `zero_allow_untested_optimizer` + config["zero_allow_untested_optimizer"] = True + + # DS schedulers (deepspeed/runtime/lr_schedules.py): + # + # DS name | --lr_scheduler_type | HF func | Notes + # -------------| ---------------------|-----------------------------------|-------------------- + # LRRangeTest | na | na | LRRT + # OneCycle | na | na | 1CLR + # WarmupLR | constant_with_warmup | get_constant_schedule_with_warmup | w/ warmup_min_lr=0 + # WarmupDecayLR| linear | get_linear_schedule_with_warmup | + lr_scheduler = None + if "scheduler" not in config: + if "optimizer" in config: + # to make this option work, we need to init DS optimizer first, then init HF scheduler, + # then pass the HF scheduler to DS init, which is not possible at the moment + raise ValueError("At the moment HF scheduler + DeepSpeed optimizer combination is not possible") + else: + trainer.create_scheduler(num_training_steps=num_training_steps) + lr_scheduler = trainer.lr_scheduler + + # keep for quick debug: + # from pprint import pprint; pprint(config) + + model_parameters = filter(lambda p: p.requires_grad, model.parameters()) + + model, optimizer, _, lr_scheduler = deepspeed.initialize( + model=model, + model_parameters=model_parameters, + config_params=config, + optimizer=optimizer, + lr_scheduler=lr_scheduler, + ) + + if resume_from_checkpoint is not None: + + # it's possible that the user is trying to resume from model_path, which doesn't necessarily + # contain a deepspeed checkpoint. e.g.
examples just check if the dir exists and assume it's + # a resume from a checkpoint and not just a local pretrained weight. So we check here if the + # path contains what looks like a deepspeed checkpoint + import glob + + deepspeed_checkpoint_dirs = sorted(glob.glob(f"{resume_from_checkpoint}/global_step*")) + + if len(deepspeed_checkpoint_dirs) > 0: + logger.info(f"Attempting to resume from {resume_from_checkpoint}") + # this magically updates self.optimizer and self.lr_scheduler + load_path, _ = model.load_checkpoint( + resume_from_checkpoint, load_optimizer_states=True, load_lr_scheduler_states=True + ) + if load_path is None: + raise ValueError(f"[deepspeed] failed to resume from checkpoint {resume_from_checkpoint}") + else: + logger.info(f"{resume_from_checkpoint} doesn't have deepspeed checkpoints, doing nothing") + + return model, optimizer, lr_scheduler diff --git a/src/transformers/integrations.py b/src/transformers/integrations.py index 7629d21b65..aac705b475 100644 --- a/src/transformers/integrations.py +++ b/src/transformers/integrations.py @@ -15,16 +15,11 @@ Integrations with other Python libraries. 
""" import importlib.util -import io -import json import numbers import os import tempfile -import weakref -from copy import deepcopy from pathlib import Path -from .dependency_versions_check import dep_version_check from .utils import logging @@ -101,10 +96,6 @@ def is_fairscale_available(): return importlib.util.find_spec("fairscale") is not None -def is_deepspeed_available(): - return importlib.util.find_spec("deepspeed") is not None - - def is_neptune_available(): return importlib.util.find_spec("neptune") is not None @@ -273,292 +264,6 @@ def rewrite_logs(d): return new_d -def _is_true(config, key): - if config is None: - return False - return bool(config.get(key)) - - -def _set_if_auto(config, key, val): - if config is None: - return - if config.get(key) == "auto": - config[key] = val - - -class HfDeepSpeedConfig: - """ - This object contains a DeepSpeed configuration dictionary and can be quickly queried for things like zero stage. - - A ``weakref`` of this object is stored in the module's globals to be able to access the config from areas where - things like the Trainer object is not available (e.g. ``from_pretrained`` and ``_get_resized_embeddings``). - Therefore it's important that this object remains alive while the program is still running. - - :class:`~transformers.Trainer` uses the ``HfTrainerDeepSpeedConfig`` subclass instead. That subclass has logic to - sync the configuration with values of :class:`~transformers.TrainingArguments` by replacing special placeholder - values: ``"auto"``. Without this special logic the DeepSpeed configuration is not modified in any way. - - Args: - config_file_or_dict (:obj:`Union[str, Dict]`) - path to DeepSpeed config file or dict. - - """ - - def __init__(self, config_file_or_dict): - # set global weakref object - set_hf_deepspeed_config(self) - - dep_version_check("deepspeed") - - if isinstance(config_file_or_dict, dict): - # Don't modify user's data should they want to reuse it (e.g. 
in tests), because once we - # modified it, it will not be accepted here again, since `auto` values would have been overriden - config = deepcopy(config_file_or_dict) - elif isinstance(config_file_or_dict, str): - with io.open(config_file_or_dict, "r", encoding="utf-8") as f: - config = json.load(f) - else: - raise ValueError("expecting either a path to a DeepSpeed config file or a pre-populated dict") - self.config = config - - # zero stage - this is done as early as possible, before model is created, to allow - # ``is_deepspeed_zero3_enabled`` query and getting to the early deepspeed config object - # during ``zero.Init()`` which needs whether fp16 is enabled, dtype, etc. - config_zero = config.get("zero_optimization", {}) - self.stage = config_zero.get("stage", 0) - - # offload - self.offload = False - config_zero = config.get("zero_optimization", {}) - if self.is_zero2(): - self.offload = _is_true(config_zero, "cpu_offload") - elif self.is_zero3(): - offload_devices = ["cpu", "nvme"] - if config_zero.get("offload_optimizer", {}).get("device") in offload_devices: - self.offload = True - if config_zero.get("offload_param", {}).get("device") in offload_devices: - self.offload = True - - def is_zero2(self): - return self.stage == 2 - - def is_zero3(self): - return self.stage == 3 - - def is_offload(self): - return self.offload - - -class HfTrainerDeepSpeedConfig(HfDeepSpeedConfig): - """ - The ``HfTrainerDeepSpeedConfig`` object is meant to be created during ``TrainingArguments`` object creation and has - the same lifespan as the latter. - - """ - - def __init__(self, config_file_or_dict): - super().__init__(config_file_or_dict) - - def trainer_config_process(self, args): - """ - Adjust the config with ``TrainingArguments`` values. This stage is run during ``TrainingArguments`` object - creation. 
- """ - config = self.config - - # DeepSpeed does: - # train_batch_size = world_size * train_micro_batch_size_per_gpu * gradient_accumulation_steps - train_batch_size = args.world_size * args.per_device_train_batch_size * args.gradient_accumulation_steps - _set_if_auto(config, "train_micro_batch_size_per_gpu", args.per_device_train_batch_size) - _set_if_auto(config, "gradient_accumulation_steps", args.gradient_accumulation_steps) - _set_if_auto(config, "train_batch_size", train_batch_size) - _set_if_auto(config, "gradient_clipping", args.max_grad_norm) - - config_optim = config.get("optimizer", {}) - if config_optim != {}: - config_optim_params = config_optim.get("params") - _set_if_auto(config_optim_params, "lr", args.learning_rate) - _set_if_auto(config_optim_params, "betas", [args.adam_beta1, args.adam_beta2]) - _set_if_auto(config_optim_params, "eps", args.adam_epsilon) - _set_if_auto(config_optim_params, "weight_decay", args.weight_decay) - - config_sched = config.get("scheduler", {}) - if config_sched != {}: - config_sched_params = config_sched.get("params") - _set_if_auto(config_sched_params, "warmup_min_lr", 0) - _set_if_auto(config_sched_params, "warmup_max_lr", args.learning_rate) - _set_if_auto(config_sched_params, "warmup_num_steps", args.warmup_steps) - # total_num_steps - will get set in trainer_config_finalize - - # fp16 - if args.fp16: - fp16_backend = "apex" if args.fp16_backend == "apex" else "amp" - else: - fp16_backend = None - - # amp: similar to the pytorch native amp - it has a bunch of optional params but we won't set - # any here unless the user did the work - config_fp16 = config.get("fp16") - _set_if_auto(config_fp16, "enabled", fp16_backend == "amp") - - # apex: delegates amp work to apex (which needs to be available), but it cannot be used with any - # ZeRO features - config_amp = config.get("amp") - _set_if_auto(config_amp, "enabled", fp16_backend == "apex") - _set_if_auto(config_amp, "opt_level", args.fp16_opt_level) - - def 
trainer_config_finalize(self, args, model, num_training_steps): - """ - This stage is run after we have the model and know num_training_steps. - - Now we we can complete the configuration process. - """ - config = self.config - - # zero - config_zero = config.get("zero_optimization", {}) - if self.is_zero3(): - # automatically assign the optimal config values based on model config - hidden_size = model.config.hidden_size - _set_if_auto(config_zero, "reduce_bucket_size", hidden_size * hidden_size) - _set_if_auto(config_zero, "stage3_prefetch_bucket_size", 0.9 * hidden_size * hidden_size) - _set_if_auto(config_zero, "stage3_param_persistence_threshold", 10 * hidden_size) - - # scheduler - config_sched = config.get("scheduler", {}) - config_sched_params = config_sched.get("params", {}) - _set_if_auto(config_sched_params, "total_num_steps", num_training_steps) - - -# keep the config object global to be able to access it anywhere during TrainingArguments life-cycle -_hf_deepspeed_config_weak_ref = None - - -def set_hf_deepspeed_config(hf_deepspeed_config_obj): - # this is a special weakref global object to allow us to get to Deepspeed config from APIs - # that don't have an easy way to get to the Deepspeed config outside of the Trainer domain. 
- global _hf_deepspeed_config_weak_ref - # will go away automatically when HfDeepSpeedConfig is destroyed (when TrainingArguments is destroyed) - _hf_deepspeed_config_weak_ref = weakref.ref(hf_deepspeed_config_obj) - - -def is_deepspeed_zero3_enabled(): - if _hf_deepspeed_config_weak_ref is not None and _hf_deepspeed_config_weak_ref() is not None: - return _hf_deepspeed_config_weak_ref().is_zero3() - else: - return False - - -def deepspeed_config(): - if _hf_deepspeed_config_weak_ref is not None and _hf_deepspeed_config_weak_ref() is not None: - return _hf_deepspeed_config_weak_ref().config - else: - return None - - -def deepspeed_init(trainer, num_training_steps, resume_from_checkpoint=None): - """ - Init DeepSpeed, after updating the DeepSpeed configuration with any relevant Trainer's args. - - If ``resume_from_checkpoint`` was passed then an attempt to resume from a previously saved checkpoint will be made. - - Args: - trainer: Trainer object - num_training_steps: per single gpu - resume_from_checkpoint: path to a checkpoint if to resume from after normal DeepSpeedEngine load - - Returns: model, optimizer, lr_scheduler - - """ - import deepspeed - - model = trainer.model - - hf_deepspeed_config = trainer.args.hf_deepspeed_config - hf_deepspeed_config.trainer_config_finalize(trainer.args, model, num_training_steps) - - # resume config update - some bits like `model` and `num_training_steps` only become available during train - config = hf_deepspeed_config.config - - # Optimizer + Scheduler - # Currently supported combos: - # 1. DS scheduler + DS optimizer: Yes - # 2. HF scheduler + HF optimizer: Yes - # 3. DS scheduler + HF optimizer: Yes - # 4. HF scheduler + DS optimizer: No - # - # Unless Offload is enabled in which case it's: - # 1. DS scheduler + DS optimizer: Yes - # 2. HF scheduler + HF optimizer: No - # 3. DS scheduler + HF optimizer: No - # 4. 
HF scheduler + DS optimizer: No - - optimizer = None - if "optimizer" not in config: - if hf_deepspeed_config.is_offload(): - raise ValueError("ZeRO Offload can only work with DeepSpeed optimizers") - - # ds supports Adam, OneBitAdam, and Lamb optimizers and can import other optimizers from torch. - # But trainer uses AdamW by default. - trainer.create_optimizer() - optimizer = trainer.optimizer - # To use other optimizers requires voiding warranty with: `zero_allow_untested_optimizer` - config["zero_allow_untested_optimizer"] = True - - # DS schedulers (deepspeed/runtime/lr_schedules.py): - # - # DS name | --lr_scheduler_type | HF func | Notes - # -------------| ---------------------|-----------------------------------|-------------------- - # LRRangeTest | na | na | LRRT - # OneCycle | na | na | 1CLR - # WarmupLR | constant_with_warmup | get_constant_schedule_with_warmup | w/ warmup_min_lr=0 - # WarmupDecayLR| linear | get_linear_schedule_with_warmup | - lr_scheduler = None - if "scheduler" not in config: - if "optimizer" in config: - # to make this option work, we need to init DS optimizer first, then init HS scheduler, - # then pass the HS scheduler to DS init, which is not possible at the moment - raise ValueError("At the moment HF scheduler + DeepSpeed optimizer combination is not possible") - else: - trainer.create_scheduler(num_training_steps=num_training_steps) - lr_scheduler = trainer.lr_scheduler - - # keep for quick debug: - # from pprint import pprint; pprint(config) - - model_parameters = filter(lambda p: p.requires_grad, model.parameters()) - - model, optimizer, _, lr_scheduler = deepspeed.initialize( - model=model, - model_parameters=model_parameters, - config_params=config, - optimizer=optimizer, - lr_scheduler=lr_scheduler, - ) - - if resume_from_checkpoint is not None: - - # it's possible that the user is trying to resume from model_path, which doesn't necessarily - # contain a deepspeed checkpoint. e.g. 
examples just check if the dir exists and assume it's - # a resume from a checkpoint and not just a local pretrained weight. So we check here if the - # path contains what looks like a deepspeed checkpoint - import glob - - deepspeed_checkpoint_dirs = sorted(glob.glob(f"{resume_from_checkpoint}/global_step*")) - - if len(deepspeed_checkpoint_dirs) > 0: - logger.info(f"Attempting to resume from {resume_from_checkpoint}") - # this magically updates self.optimizer and self.lr_scheduler - load_path, _ = model.load_checkpoint( - resume_from_checkpoint, load_optimizer_states=True, load_lr_scheduler_states=True - ) - if load_path is None: - raise ValueError(f"[deepspeed] failed to resume from checkpoint {resume_from_checkpoint}") - else: - logger.info(f"{resume_from_checkpoint} doesn't have deepspeed checkpoints, doing nothing") - - return model, optimizer, lr_scheduler - - class TensorBoardCallback(TrainerCallback): """ A :class:`~transformers.TrainerCallback` that sends the logs to `TensorBoard diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index 9ab8824067..109561e26d 100644 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -29,6 +29,7 @@ from torch.nn import functional as F from .activations import get_activation from .configuration_utils import PretrainedConfig +from .deepspeed import deepspeed_config, is_deepspeed_zero3_enabled from .file_utils import ( CONFIG_NAME, DUMMY_INPUTS, @@ -45,7 +46,6 @@ from .file_utils import ( replace_return_docstrings, ) from .generation_utils import GenerationMixin -from .integrations import deepspeed_config, is_deepspeed_zero3_enabled from .utils import logging diff --git a/src/transformers/models/auto/auto_factory.py b/src/transformers/models/auto/auto_factory.py index 86f50376e4..0d82184be5 100644 --- a/src/transformers/models/auto/auto_factory.py +++ b/src/transformers/models/auto/auto_factory.py @@ -17,8 +17,8 @@ import types from ...configuration_utils 
import PretrainedConfig +from ...deepspeed import deepspeed_config, is_deepspeed_zero3_enabled from ...file_utils import copy_func -from ...integrations import deepspeed_config, is_deepspeed_zero3_enabled from ...utils import logging from .configuration_auto import AutoConfig, replace_list_option_in_docstrings diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index 879a9c66d8..69fb09b998 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -44,8 +44,6 @@ from .integrations import ( # isort: split is_ray_tune_available, run_hp_search_optuna, run_hp_search_ray, - deepspeed_init, - is_deepspeed_zero3_enabled, ) import numpy as np @@ -61,6 +59,7 @@ from . import __version__ from .configuration_utils import PretrainedConfig from .data.data_collator import DataCollator, DataCollatorWithPadding, default_data_collator from .debug_utils import DebugOption, DebugUnderflowOverflow +from .deepspeed import deepspeed_init, is_deepspeed_zero3_enabled from .dependency_versions_check import dep_version_check from .file_utils import ( CONFIG_NAME, @@ -863,7 +862,7 @@ class Trainer: logger.info("Trial:", trial.params) if self.args.deepspeed: # Rebuild the deepspeed config to reflect the updated training parameters - from transformers.integrations import HfDeepSpeedConfig + from transformers.deepspeed import HfDeepSpeedConfig self.args.hf_deepspeed_config = HfDeepSpeedConfig(self.args) diff --git a/src/transformers/trainer_seq2seq.py b/src/transformers/trainer_seq2seq.py index 92d9958fa0..c008ce40b9 100644 --- a/src/transformers/trainer_seq2seq.py +++ b/src/transformers/trainer_seq2seq.py @@ -19,7 +19,7 @@ from packaging import version from torch import nn from torch.utils.data.dataset import Dataset -from .integrations import is_deepspeed_zero3_enabled +from .deepspeed import is_deepspeed_zero3_enabled from .trainer import Trainer from .trainer_utils import PredictionOutput from .utils import logging diff --git 
a/src/transformers/training_args.py b/src/transformers/training_args.py index b00bbdf581..91e9b6f57d 100644 --- a/src/transformers/training_args.py +++ b/src/transformers/training_args.py @@ -671,7 +671,7 @@ class TrainingArguments: if self.deepspeed: # - must be run very last in arg parsing, since it will use a lot of these settings. # - must be run before the model is created. - from transformers.integrations import HfTrainerDeepSpeedConfig + from transformers.deepspeed import HfTrainerDeepSpeedConfig # will be used later by the Trainer # note: leave self.deepspeed unmodified in case a user relies on it not to be modified) @@ -739,7 +739,7 @@ class TrainingArguments: # deepspeed ./program.py # rather than: # python -m torch.distributed.launch --nproc_per_node=2 ./program.py - from .integrations import is_deepspeed_available + from .deepspeed import is_deepspeed_available if not is_deepspeed_available(): raise ImportError("--deepspeed requires deepspeed: `pip install deepspeed`.") diff --git a/tests/deepspeed/test_deepspeed.py b/tests/deepspeed/test_deepspeed.py index 3cdc85f44e..98dc185888 100644 --- a/tests/deepspeed/test_deepspeed.py +++ b/tests/deepspeed/test_deepspeed.py @@ -21,8 +21,8 @@ from copy import deepcopy from parameterized import parameterized from transformers import AutoModel, TrainingArguments, is_torch_available, logging +from transformers.deepspeed import HfDeepSpeedConfig, is_deepspeed_available from transformers.file_utils import WEIGHTS_NAME -from transformers.integrations import HfDeepSpeedConfig, is_deepspeed_available from transformers.testing_utils import ( CaptureLogger, CaptureStderr, @@ -71,7 +71,7 @@ def require_deepspeed(test_case): if is_deepspeed_available(): from deepspeed.utils import logger as deepspeed_logger # noqa - from transformers.integrations import deepspeed_config, is_deepspeed_zero3_enabled # noqa + from transformers.deepspeed import deepspeed_config, is_deepspeed_zero3_enabled # noqa ZERO2 = "zero2" ZERO3 = "zero3"