From 5b49376202863d3798d2ff8a8ba61590542a1141 Mon Sep 17 00:00:00 2001 From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Date: Mon, 6 Feb 2023 19:39:13 -0500 Subject: [PATCH] Deprecate parallelize API (#21448) * Deprecate parallelize API * Add documentation * Fix copies --- docs/source/en/model_doc/gpt2.mdx | 4 -- docs/source/en/model_doc/t5.mdx | 6 --- src/transformers/models/gpt2/modeling_gpt2.py | 34 ++++++++++++++ src/transformers/models/gptj/modeling_gptj.py | 23 ++++++++++ src/transformers/models/mt5/modeling_mt5.py | 44 +++++++++++++++++++ src/transformers/models/t5/modeling_t5.py | 44 +++++++++++++++++++ 6 files changed, 145 insertions(+), 10 deletions(-) diff --git a/docs/source/en/model_doc/gpt2.mdx b/docs/source/en/model_doc/gpt2.mdx index caa23c337f..d2640b61d2 100644 --- a/docs/source/en/model_doc/gpt2.mdx +++ b/docs/source/en/model_doc/gpt2.mdx @@ -89,15 +89,11 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h [[autodoc]] GPT2Model - forward - - parallelize - - deparallelize ## GPT2LMHeadModel [[autodoc]] GPT2LMHeadModel - forward - - parallelize - - deparallelize ## GPT2DoubleHeadsModel diff --git a/docs/source/en/model_doc/t5.mdx b/docs/source/en/model_doc/t5.mdx index 995816061c..aed8436816 100644 --- a/docs/source/en/model_doc/t5.mdx +++ b/docs/source/en/model_doc/t5.mdx @@ -360,22 +360,16 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h [[autodoc]] T5Model - forward - - parallelize - - deparallelize ## T5ForConditionalGeneration [[autodoc]] T5ForConditionalGeneration - forward - - parallelize - - deparallelize ## T5EncoderModel [[autodoc]] T5EncoderModel - forward - - parallelize - - deparallelize ## TFT5Model diff --git a/src/transformers/models/gpt2/modeling_gpt2.py b/src/transformers/models/gpt2/modeling_gpt2.py index 9197a1f56d..00d2bd7f11 100644 --- a/src/transformers/models/gpt2/modeling_gpt2.py +++ b/src/transformers/models/gpt2/modeling_gpt2.py @@ -17,6 +17,7 @@ import math import os +import warnings from dataclasses import dataclass from typing import Optional, Tuple, Union @@ -689,6 +690,13 @@ class GPT2Model(GPT2PreTrainedModel): @add_start_docstrings(PARALLELIZE_DOCSTRING) def parallelize(self, device_map=None): # Check validity of device_map + warnings.warn( + "`GPT2Model.parallelize` is deprecated and will be removed in v5 of Transformers, you should load your" + " model with `device_map='balanced'` in the call to `from_pretrained`. You can also provide your own" + " `device_map` but it needs to be a dictionary module_name to device, so for instance {'h.0': 0, 'h.1': 1," + " ...}", + FutureWarning, + ) self.device_map = ( get_device_map(len(self.h), range(torch.cuda.device_count())) if device_map is None else device_map ) @@ -708,6 +716,10 @@ class GPT2Model(GPT2PreTrainedModel): @add_start_docstrings(DEPARALLELIZE_DOCSTRING) def deparallelize(self): + warnings.warn( + "Like `parallelize`, `deparallelize` is deprecated and will be removed in v5 of Transformers.", + FutureWarning, + ) self.model_parallel = False self.device_map = None self.first_device = "cpu" @@ -955,6 +967,13 @@ class GPT2LMHeadModel(GPT2PreTrainedModel): @add_start_docstrings(PARALLELIZE_DOCSTRING) def parallelize(self, device_map=None): + warnings.warn( + "`GPT2LMHeadModel.parallelize` is deprecated and will be removed in v5 of Transformers, you should load" + " your model with `device_map='balanced'` in the call to `from_pretrained`. You can also provide your own" + " `device_map` but it needs to be a dictionary module_name to device, so for instance {'transformer.h.0':" + " 0, 'transformer.h.1': 1, ...}", + FutureWarning, + ) self.device_map = ( get_device_map(len(self.transformer.h), range(torch.cuda.device_count())) if device_map is None @@ -967,6 +986,10 @@ class GPT2LMHeadModel(GPT2PreTrainedModel): @add_start_docstrings(DEPARALLELIZE_DOCSTRING) def deparallelize(self): + warnings.warn( + "Like `parallelize`, `deparallelize` is deprecated and will be removed in v5 of Transformers.", + FutureWarning, + ) self.transformer.deparallelize() self.transformer = self.transformer.to("cpu") self.lm_head = self.lm_head.to("cpu") @@ -1134,6 +1157,13 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel): @add_start_docstrings(PARALLELIZE_DOCSTRING) def parallelize(self, device_map=None): + warnings.warn( + "`GPT2DoubleHeadsModel.parallelize` is deprecated and will be removed in v5 of Transformers, you should" + " load your model with `device_map='balanced'` in the call to `from_pretrained`. You can also provide your" + " own `device_map` but it needs to be a dictionary module_name to device, so for instance" + " {'transformer.h.0': 0, 'transformer.h.1': 1, ...}", + FutureWarning, + ) self.device_map = ( get_device_map(len(self.transformer.h), range(torch.cuda.device_count())) if device_map is None @@ -1147,6 +1177,10 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel): @add_start_docstrings(DEPARALLELIZE_DOCSTRING) def deparallelize(self): + warnings.warn( + "Like `parallelize`, `deparallelize` is deprecated and will be removed in v5 of Transformers.", + FutureWarning, + ) self.transformer.deparallelize() self.transformer = self.transformer.to("cpu") self.lm_head = self.lm_head.to("cpu") diff --git a/src/transformers/models/gptj/modeling_gptj.py b/src/transformers/models/gptj/modeling_gptj.py index 3459a93b5d..bb19df5e7f 100755 --- a/src/transformers/models/gptj/modeling_gptj.py +++ b/src/transformers/models/gptj/modeling_gptj.py @@ -14,6 +14,7 @@ # limitations under the License. """ PyTorch GPT-J model.""" +import warnings from typing import Optional, Tuple, Union import torch @@ -489,6 +490,13 @@ class GPTJModel(GPTJPreTrainedModel): @add_start_docstrings(PARALLELIZE_DOCSTRING) def parallelize(self, device_map=None): + warnings.warn( + "`GPTJModel.parallelize` is deprecated and will be removed in v5 of Transformers, you should load your" + " model with `device_map='balanced'` in the call to `from_pretrained`. You can also provide your own" + " `device_map` but it needs to be a dictionary module_name to device, so for instance {'h.0': 0, 'h.1': 1," + " ...}", + FutureWarning, + ) # Check validity of device_map self.device_map = ( get_device_map(len(self.h), range(torch.cuda.device_count())) if device_map is None else device_map @@ -508,6 +516,10 @@ class GPTJModel(GPTJPreTrainedModel): @add_start_docstrings(DEPARALLELIZE_DOCSTRING) def deparallelize(self): + warnings.warn( + "Like `parallelize`, `deparallelize` is deprecated and will be removed in v5 of Transformers.", + FutureWarning, + ) self.model_parallel = False self.device_map = None self.first_device = "cpu" @@ -724,6 +736,13 @@ class GPTJForCausalLM(GPTJPreTrainedModel): @add_start_docstrings(PARALLELIZE_DOCSTRING) def parallelize(self, device_map=None): + warnings.warn( + "`GPTJForCausalLM.parallelize` is deprecated and will be removed in v5 of Transformers, you should load" + " your model with `device_map='balanced'` in the call to `from_pretrained`. You can also provide your own" + " `device_map` but it needs to be a dictionary module_name to device, so for instance {'transformer.h.0':" + " 0, 'transformer.h.1': 1, ...}", + FutureWarning, + ) self.device_map = ( get_device_map(len(self.transformer.h), range(torch.cuda.device_count())) if device_map is None @@ -736,6 +755,10 @@ class GPTJForCausalLM(GPTJPreTrainedModel): @add_start_docstrings(DEPARALLELIZE_DOCSTRING) def deparallelize(self): + warnings.warn( + "Like `parallelize`, `deparallelize` is deprecated and will be removed in v5 of Transformers.", + FutureWarning, + ) self.transformer.deparallelize() self.transformer = self.transformer.to("cpu") self.lm_head = self.lm_head.to("cpu") diff --git a/src/transformers/models/mt5/modeling_mt5.py b/src/transformers/models/mt5/modeling_mt5.py index 07fab1ce54..dd047fd7c6 100644 --- a/src/transformers/models/mt5/modeling_mt5.py +++ b/src/transformers/models/mt5/modeling_mt5.py @@ -843,6 +843,13 @@ class MT5Stack(MT5PreTrainedModel): @add_start_docstrings(PARALLELIZE_DOCSTRING) def parallelize(self, device_map=None): + warnings.warn( + "`MT5Stack.parallelize` is deprecated and will be removed in v5 of Transformers, you should load your model" + " with `device_map='balanced'` in the call to `from_pretrained`. You can also provide your own" + " `device_map` but it needs to be a dictionary module_name to device, so for instance {'block.0': 0," + " 'block.1': 1, ...}", + FutureWarning, + ) # Check validity of device_map self.device_map = ( get_device_map(len(self.block), range(torch.cuda.device_count())) if device_map is None else device_map @@ -864,6 +871,10 @@ class MT5Stack(MT5PreTrainedModel): @add_start_docstrings(DEPARALLELIZE_DOCSTRING) def deparallelize(self): + warnings.warn( + "Like `parallelize`, `deparallelize` is deprecated and will be removed in v5 of Transformers.", + FutureWarning, + ) self.model_parallel = False self.device_map = None self.first_device = "cpu" @@ -1314,6 +1325,13 @@ class MT5Model(MT5PreTrainedModel): @add_start_docstrings(PARALLELIZE_DOCSTRING) # Copied from transformers.models.t5.modeling_t5.T5Model.parallelize def parallelize(self, device_map=None): + warnings.warn( + "`T5Model.parallelize` is deprecated and will be removed in v5 of Transformers, you should load your model" + " with `device_map='balanced'` in the call to `from_pretrained`. You can also provide your own" + " `device_map` but it needs to be a dictionary module_name to device, so for instance {'encoder.block.0':" + " 0, 'encoder.block.1': 1, ...}", + FutureWarning, + ) self.device_map = ( get_device_map(len(self.encoder.block), range(torch.cuda.device_count())) if device_map is None @@ -1327,6 +1345,10 @@ class MT5Model(MT5PreTrainedModel): @add_start_docstrings(DEPARALLELIZE_DOCSTRING) # Copied from transformers.models.t5.modeling_t5.T5Model.deparallelize def deparallelize(self): + warnings.warn( + "Like `parallelize`, `deparallelize` is deprecated and will be removed in v5 of Transformers.", + FutureWarning, + ) self.encoder.deparallelize() self.decoder.deparallelize() self.encoder = self.encoder.to("cpu") @@ -1539,6 +1561,13 @@ class MT5ForConditionalGeneration(MT5PreTrainedModel): @add_start_docstrings(PARALLELIZE_DOCSTRING) # Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.parallelize def parallelize(self, device_map=None): + warnings.warn( + "`T5ForConditionalGeneration.parallelize` is deprecated and will be removed in v5 of Transformers, you" + " should load your model with `device_map='balanced'` in the call to `from_pretrained`. You can also" + " provide your own `device_map` but it needs to be a dictionary module_name to device, so for instance" + " {'encoder.block.0': 0, 'encoder.block.1': 1, ...}", + FutureWarning, + ) self.device_map = ( get_device_map(len(self.encoder.block), range(torch.cuda.device_count())) if device_map is None @@ -1553,6 +1582,10 @@ class MT5ForConditionalGeneration(MT5PreTrainedModel): @add_start_docstrings(DEPARALLELIZE_DOCSTRING) # Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.deparallelize def deparallelize(self): + warnings.warn( + "Like `parallelize`, `deparallelize` is deprecated and will be removed in v5 of Transformers.", + FutureWarning, + ) self.encoder.deparallelize() self.decoder.deparallelize() self.encoder = self.encoder.to("cpu") @@ -1849,6 +1882,13 @@ class MT5EncoderModel(MT5PreTrainedModel): @add_start_docstrings(PARALLELIZE_DOCSTRING) # Copied from transformers.models.t5.modeling_t5.T5EncoderModel.parallelize def parallelize(self, device_map=None): + warnings.warn( + "`T5EncoderModel.parallelize` is deprecated and will be removed in v5 of Transformers, you should load" + " your model with `device_map='balanced'` in the call to `from_pretrained`. You can also provide your own" + " `device_map` but it needs to be a dictionary module_name to device, so for instance {'block.0': 0," + " 'block.1': 1, ...}", + FutureWarning, + ) self.device_map = ( get_device_map(len(self.encoder.block), range(torch.cuda.device_count())) if device_map is None @@ -1861,6 +1901,10 @@ class MT5EncoderModel(MT5PreTrainedModel): @add_start_docstrings(DEPARALLELIZE_DOCSTRING) # Copied from transformers.models.t5.modeling_t5.T5EncoderModel.deparallelize def deparallelize(self): + warnings.warn( + "Like `parallelize`, `deparallelize` is deprecated and will be removed in v5 of Transformers.", + FutureWarning, + ) self.encoder.deparallelize() self.encoder = self.encoder.to("cpu") self.model_parallel = False diff --git a/src/transformers/models/t5/modeling_t5.py b/src/transformers/models/t5/modeling_t5.py index 49adf4c421..0e93f51e70 100644 --- a/src/transformers/models/t5/modeling_t5.py +++ b/src/transformers/models/t5/modeling_t5.py @@ -872,6 +872,13 @@ class T5Stack(T5PreTrainedModel): @add_start_docstrings(PARALLELIZE_DOCSTRING) def parallelize(self, device_map=None): + warnings.warn( + "`T5Stack.parallelize` is deprecated and will be removed in v5 of Transformers, you should load your model" + " with `device_map='balanced'` in the call to `from_pretrained`. You can also provide your own" + " `device_map` but it needs to be a dictionary module_name to device, so for instance {'block.0': 0," + " 'block.1': 1, ...}", + FutureWarning, + ) # Check validity of device_map self.device_map = ( get_device_map(len(self.block), range(torch.cuda.device_count())) if device_map is None else device_map @@ -893,6 +900,10 @@ class T5Stack(T5PreTrainedModel): @add_start_docstrings(DEPARALLELIZE_DOCSTRING) def deparallelize(self): + warnings.warn( + "Like `parallelize`, `deparallelize` is deprecated and will be removed in v5 of Transformers.", + FutureWarning, + ) self.model_parallel = False self.device_map = None self.first_device = "cpu" @@ -1318,6 +1329,13 @@ class T5Model(T5PreTrainedModel): @add_start_docstrings(PARALLELIZE_DOCSTRING) def parallelize(self, device_map=None): + warnings.warn( + "`T5Model.parallelize` is deprecated and will be removed in v5 of Transformers, you should load your model" + " with `device_map='balanced'` in the call to `from_pretrained`. You can also provide your own" + " `device_map` but it needs to be a dictionary module_name to device, so for instance {'encoder.block.0':" + " 0, 'encoder.block.1': 1, ...}", + FutureWarning, + ) self.device_map = ( get_device_map(len(self.encoder.block), range(torch.cuda.device_count())) if device_map is None @@ -1330,6 +1348,10 @@ class T5Model(T5PreTrainedModel): @add_start_docstrings(DEPARALLELIZE_DOCSTRING) def deparallelize(self): + warnings.warn( + "Like `parallelize`, `deparallelize` is deprecated and will be removed in v5 of Transformers.", + FutureWarning, + ) self.encoder.deparallelize() self.decoder.deparallelize() self.encoder = self.encoder.to("cpu") @@ -1515,6 +1537,13 @@ class T5ForConditionalGeneration(T5PreTrainedModel): @add_start_docstrings(PARALLELIZE_DOCSTRING) def parallelize(self, device_map=None): + warnings.warn( + "`T5ForConditionalGeneration.parallelize` is deprecated and will be removed in v5 of Transformers, you" + " should load your model with `device_map='balanced'` in the call to `from_pretrained`. You can also" + " provide your own `device_map` but it needs to be a dictionary module_name to device, so for instance" + " {'encoder.block.0': 0, 'encoder.block.1': 1, ...}", + FutureWarning, + ) self.device_map = ( get_device_map(len(self.encoder.block), range(torch.cuda.device_count())) if device_map is None @@ -1528,6 +1557,10 @@ class T5ForConditionalGeneration(T5PreTrainedModel): @add_start_docstrings(DEPARALLELIZE_DOCSTRING) def deparallelize(self): + warnings.warn( + "Like `parallelize`, `deparallelize` is deprecated and will be removed in v5 of Transformers.", + FutureWarning, + ) self.encoder.deparallelize() self.decoder.deparallelize() self.encoder = self.encoder.to("cpu") @@ -1790,6 +1823,13 @@ class T5EncoderModel(T5PreTrainedModel): @add_start_docstrings(PARALLELIZE_DOCSTRING) def parallelize(self, device_map=None): + warnings.warn( + "`T5EncoderModel.parallelize` is deprecated and will be removed in v5 of Transformers, you should load" + " your model with `device_map='balanced'` in the call to `from_pretrained`. You can also provide your own" + " `device_map` but it needs to be a dictionary module_name to device, so for instance {'block.0': 0," + " 'block.1': 1, ...}", + FutureWarning, + ) self.device_map = ( get_device_map(len(self.encoder.block), range(torch.cuda.device_count())) if device_map is None @@ -1801,6 +1841,10 @@ class T5EncoderModel(T5PreTrainedModel): @add_start_docstrings(DEPARALLELIZE_DOCSTRING) def deparallelize(self): + warnings.warn( + "Like `parallelize`, `deparallelize` is deprecated and will be removed in v5 of Transformers.", + FutureWarning, + ) self.encoder.deparallelize() self.encoder = self.encoder.to("cpu") self.model_parallel = False