From 02f48b9bfc9d7a1f04cce20b2df4c4b478971e6b Mon Sep 17 00:00:00 2001 From: Lysandre Debut Date: Mon, 23 Nov 2020 20:14:48 -0500 Subject: [PATCH] Model parallel documentation (#8741) * Add parallelize methods to the .rst files * Correct format --- docs/source/model_doc/gpt2.rst | 4 ++-- docs/source/model_doc/t5.rst | 4 ++-- src/transformers/models/gpt2/modeling_gpt2.py | 6 ++++-- src/transformers/models/t5/modeling_t5.py | 6 ++++-- 4 files changed, 12 insertions(+), 8 deletions(-) diff --git a/docs/source/model_doc/gpt2.rst b/docs/source/model_doc/gpt2.rst index 5572e08784..feedffe62c 100644 --- a/docs/source/model_doc/gpt2.rst +++ b/docs/source/model_doc/gpt2.rst @@ -71,14 +71,14 @@ GPT2Model ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autoclass:: transformers.GPT2Model - :members: forward + :members: forward, parallelize, deparallelize GPT2LMHeadModel ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autoclass:: transformers.GPT2LMHeadModel - :members: forward + :members: forward, parallelize, deparallelize GPT2DoubleHeadsModel diff --git a/docs/source/model_doc/t5.rst b/docs/source/model_doc/t5.rst index e065daf1b4..2799028d72 100644 --- a/docs/source/model_doc/t5.rst +++ b/docs/source/model_doc/t5.rst @@ -99,14 +99,14 @@ T5Model ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autoclass:: transformers.T5Model - :members: forward + :members: forward, parallelize, deparallelize T5ForConditionalGeneration ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autoclass:: transformers.T5ForConditionalGeneration - :members: forward + :members: forward, parallelize, deparallelize TFT5Model diff --git a/src/transformers/models/gpt2/modeling_gpt2.py b/src/transformers/models/gpt2/modeling_gpt2.py index 12c9d14369..1d03c98b61 100644 --- a/src/transformers/models/gpt2/modeling_gpt2.py +++ b/src/transformers/models/gpt2/modeling_gpt2.py @@ -492,7 +492,8 @@ PARALLELIZE_DOCSTRING = r""" - gpt2-xl: 48 Example:: - Here is an example of a device map on a machine with 4 GPUs using gpt2-xl, which has a total of 48 attention modules: + + # Here is an example of a device map on a machine with 4 GPUs using gpt2-xl, which has a total of 48 attention modules: model = GPT2LMHeadModel.from_pretrained('gpt2-xl') device_map = {0: [0, 1, 2, 3, 4, 5, 6, 7, 8], @@ -505,7 +506,8 @@ DEPARALLELIZE_DOCSTRING = r""" Moves the model to cpu from a model parallel state. Example:: - On a 4 GPU machine with gpt2-large: + + # On a 4 GPU machine with gpt2-large: model = GPT2LMHeadModel.from_pretrained('gpt2-large') device_map = {0: [0, 1, 2, 3, 4, 5, 6, 7], diff --git a/src/transformers/models/t5/modeling_t5.py b/src/transformers/models/t5/modeling_t5.py index adba0b79fc..c35439372e 100644 --- a/src/transformers/models/t5/modeling_t5.py +++ b/src/transformers/models/t5/modeling_t5.py @@ -196,7 +196,8 @@ PARALLELIZE_DOCSTRING = r""" - t5-11b: 24 Example:: - Here is an example of a device map on a machine with 4 GPUs using t5-3b, which has a total of 24 attention modules: + + # Here is an example of a device map on a machine with 4 GPUs using t5-3b, which has a total of 24 attention modules: model = T5ForConditionalGeneration.from_pretrained('t5-3b') device_map = {0: [0, 1, 2], @@ -209,7 +210,8 @@ DEPARALLELIZE_DOCSTRING = r""" Moves the model to cpu from a model parallel state. Example:: - On a 4 GPU machine with t5-3b: + + # On a 4 GPU machine with t5-3b: model = T5ForConditionalGeneration.from_pretrained('t5-3b') device_map = {0: [0, 1, 2],