From 8ce133063120683018b214fe10d1449e4c2401da Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Fri, 4 Feb 2022 13:51:02 -0800 Subject: [PATCH] [deepspeed docs] DeepSpeed ZeRO Inference (#15486) * [deepspeed docs] DeepSpeed ZeRO Inference * Apply suggestions from code review Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * tweak * deal with black * extra cleanup, better comments Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- docs/source/main_classes/deepspeed.mdx | 164 +++++++++++++++++++++++++ 1 file changed, 164 insertions(+) diff --git a/docs/source/main_classes/deepspeed.mdx b/docs/source/main_classes/deepspeed.mdx index 3fdf694629..a558cf5390 100644 --- a/docs/source/main_classes/deepspeed.mdx +++ b/docs/source/main_classes/deepspeed.mdx @@ -1805,6 +1805,170 @@ Please note that if you're not using the [`Trainer`] integration, you're complet [[autodoc]] deepspeed.HfDeepSpeedConfig - all +### DeepSpeed ZeRO Inference + +Here is an example of how one could do DeepSpeed ZeRO Inference without using [`Trainer`] when one can't fit a model onto a single GPU. The solution includes using additional GPUs and/or offloading GPU memory to CPU memory. + +The important nuance to understand here is that, because of the way ZeRO is designed, you can process different inputs on different GPUs in parallel. + +The example has copious notes and is self-documenting. + +Make sure to: + +1. disable CPU offload if you have enough GPU memory (since it slows things down) +2. enable bf16 if you own an Ampere or a newer GPU to make things faster. If you don't have that hardware, you may enable fp16 as long as you don't use any model that was pre-trained in bf16 mixed precision (such as most t5 models). These usually overflow in fp16 and you will see garbage as output. + +```python +#!/usr/bin/env python + +# This script demonstrates how to use Deepspeed ZeRO in an inference mode when one can't fit a model +# into a single GPU +# +# 1. 
Use 1 GPU with CPU offload +# 2. Or use multiple GPUs instead +# +# First you need to install deepspeed: pip install deepspeed +# +# Here we use a 3B "bigscience/T0_3B" model which needs about 15GB GPU RAM - so 1 largish or 2 +# small GPUs can handle it, or 1 small GPU and a lot of CPU memory. +# +# To use a larger model like "bigscience/T0" which needs about 50GB, unless you have an 80GB GPU - +# you will need 2-4 gpus. And then you can adapt the script to handle more gpus if you want to +# process multiple inputs at once. +# +# The provided deepspeed config also activates CPU memory offloading, so chances are that if you +# have a lot of available CPU memory and you don't mind a slowdown you should be able to load a +# model that doesn't normally fit into a single GPU. If you have enough GPU memory the program will +# run faster without offload to CPU - so disable that section in that case. +# +# To deploy on 1 gpu: +# +# deepspeed --num_gpus 1 t0.py +# or: +# python -m torch.distributed.run --nproc_per_node=1 t0.py +# +# To deploy on 2 gpus: +# +# deepspeed --num_gpus 2 t0.py +# or: +# python -m torch.distributed.run --nproc_per_node=2 t0.py + + +from transformers import AutoTokenizer, AutoConfig, AutoModelForSeq2SeqLM +from transformers.deepspeed import HfDeepSpeedConfig +import deepspeed +import os +import torch + +os.environ["TOKENIZERS_PARALLELISM"] = "false" # To avoid warnings about parallelism in tokenizers + +# distributed setup +local_rank = int(os.getenv("LOCAL_RANK", "0")) +world_size = int(os.getenv("WORLD_SIZE", "1")) +torch.cuda.set_device(local_rank) +deepspeed.init_distributed() + +model_name = "bigscience/T0_3B" + +config = AutoConfig.from_pretrained(model_name) +model_hidden_size = config.d_model + +# batch size has to be divisible by world_size, but can be bigger than world_size +train_batch_size = 1 * world_size + +# ds_config notes +# +# - enable bf16 if you use an Ampere or newer GPU - this will run in mixed precision and will be +# faster. 
+# +# - for older GPUs you can enable fp16, but it'll only work for non-bf16 pretrained models - e.g. +# all official t5 models are bf16-pretrained +# +# - set offload_param.device to "none" or completely remove the `offload_param` section if you don't +# want CPU offload +# +# - if using `offload_param` you can manually fine-tune stage3_param_persistence_threshold to control +# which params should remain on gpus - the larger the value the smaller the offload size +# +# For in-depth info on Deepspeed config see +# https://huggingface.co/docs/transformers/master/main_classes/deepspeed + +# keeping the same format as json for consistency, except that json uses lower case for true/false +# fmt: off +ds_config = { + "fp16": { + "enabled": False + }, + "bf16": { + "enabled": False + }, + "zero_optimization": { + "stage": 3, + "offload_param": { + "device": "cpu", + "pin_memory": True + }, + "overlap_comm": True, + "contiguous_gradients": True, + "reduce_bucket_size": model_hidden_size * model_hidden_size, + "stage3_prefetch_bucket_size": 0.9 * model_hidden_size * model_hidden_size, + "stage3_param_persistence_threshold": 10 * model_hidden_size + }, + "steps_per_print": 2000, + "train_batch_size": train_batch_size, + "train_micro_batch_size_per_gpu": 1, + "wall_clock_breakdown": False +} +# fmt: on + +# next line instructs transformers to partition the model directly over multiple gpus using +# deepspeed.zero.Init when the model's `from_pretrained` method is called. +# +# **it has to be run before loading the model AutoModelForSeq2SeqLM.from_pretrained(model_name)** +# +# otherwise the model will first be loaded normally and only partitioned at forward time which is +# less efficient and may fail when there is little CPU RAM +dschf = HfDeepSpeedConfig(ds_config) # keep this object alive + +# now a model can be loaded. 
+model = AutoModelForSeq2SeqLM.from_pretrained(model_name) + +# initialise Deepspeed ZeRO and store only the engine object +ds_engine = deepspeed.initialize(model=model, config_params=ds_config)[0] +ds_engine.module.eval() # inference + +# Deepspeed ZeRO can process unrelated inputs on each GPU. So for 2 gpus you process 2 inputs at once. +# If you use more than 2 GPUs, add an input for each extra rank - the code below only handles ranks 0 and 1. +# And of course if you have just one input to process, you then need to pass the same string to both gpus. +# If you use only one GPU, then you will have only rank 0. +rank = torch.distributed.get_rank() +if rank == 0: + text_in = "Is this review positive or negative? Review: this is the best cast iron skillet you will ever buy" +elif rank == 1: + text_in = "Is this review positive or negative? Review: this is the worst restaurant ever" + +tokenizer = AutoTokenizer.from_pretrained(model_name) +inputs = tokenizer.encode(text_in, return_tensors="pt").to(device=local_rank) +with torch.no_grad(): + outputs = ds_engine.module.generate(inputs, synced_gpus=True) +text_out = tokenizer.decode(outputs[0], skip_special_tokens=True) +print(f"rank{rank}:\n in={text_in}\n out={text_out}") +``` + +Let's save it as `t0.py` and run it: +``` +$ deepspeed --num_gpus 2 t0.py +rank0: + in=Is this review positive or negative? Review: this is the best cast iron skillet you will ever buy + out=Positive +rank1: + in=Is this review positive or negative? Review: this is the worst restaurant ever + out=negative +``` + +This was a very basic example and you will want to adapt it to your needs. + + ## Main DeepSpeed Resources - [Project's github](https://github.com/microsoft/deepspeed)