From 78f5fe1416e74a2225e162e349cb8a53f1d39212 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Tue, 13 Jul 2021 12:07:32 -0700 Subject: [PATCH] [Deepspeed] adapt multiple models, add zero_to_fp32 tests (#12477) * zero_to_fp32 tests * args change * remove unnecessary work * use transformers.trainer_utils.get_last_checkpoint * document the new features * cleanup * wip * fix fsmt * add bert * cleanup * add xlm-roberta * electra works * cleanup * sync * split off the model zoo tests * cleanup * cleanup * cleanup * cleanup * reformat * cleanup * casing * deepspeed>=0.4.3 * adjust distilbert * Update docs/source/main_classes/deepspeed.rst Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * style Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- docs/source/main_classes/deepspeed.rst | 63 ++++- setup.py | 2 +- src/transformers/dependency_versions_table.py | 2 +- src/transformers/modeling_utils.py | 52 +++- .../models/distilbert/modeling_distilbert.py | 22 +- src/transformers/models/fsmt/modeling_fsmt.py | 30 +- .../models/wav2vec2/modeling_wav2vec2.py | 15 +- src/transformers/training_args.py | 5 +- tests/deepspeed/test_deepspeed.py | 74 +++-- tests/deepspeed/test_model_zoo.py | 259 ++++++++++++++++++ 10 files changed, 444 insertions(+), 80 deletions(-) create mode 100644 tests/deepspeed/test_model_zoo.py diff --git a/docs/source/main_classes/deepspeed.rst b/docs/source/main_classes/deepspeed.rst index 619dfd4b8a..05a86b0801 100644 --- a/docs/source/main_classes/deepspeed.rst +++ b/docs/source/main_classes/deepspeed.rst @@ -1456,8 +1456,56 @@ won't be possible to load it back. While the fp16 weights are fine for resuming training, if you finished finetuning your model and want to upload it to the `models hub `__ or pass it to someone else you most likely will want to get the fp32 -weights. 
This cannot be done during training since this is a process that requires a lot of memory, and therefore this -is performed offline. +weights. This ideally shouldn't be done during training since this is a process that requires a lot of memory, and +therefore best to be performed offline after the training is complete. But if desired and you have plenty of free CPU +memory it can be done in the same training script. The following sections will discuss both approaches. + + +**Live FP32 Weights Recovery:** + +This approach may not work if your model is large and you have little free CPU memory left at the end of the training. + +If you have saved at least one checkpoint, and you want to use the latest one, you can do the following: + +.. code-block:: python + + from transformers.trainer_utils import get_last_checkpoint + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + checkpoint_dir = get_last_checkpoint(trainer.args.output_dir) + fp32_model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + +If you're using the ``--load_best_model_at_end`` :class:`~transformers.TrainingArguments` argument (to track the best +checkpoint), then you can finish the training by first saving the final model explicitly and then do the same as above: + +.. code-block:: python + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + checkpoint_dir = os.path.join(trainer.args.output_dir, "checkpoint-final") + trainer.deepspeed.save_checkpoint(checkpoint_dir) + fp32_model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + +.. note:: + + Note, that once ``load_state_dict_from_zero_checkpoint`` was run, the ``model`` will no longer be useable in the + DeepSpeed context of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the DeepSpeed magic from it. So do this only at the very end + of the training. 
+ +Of course, you don't have to use :class:`~transformers.Trainer` and you can adjust the examples above to your own +trainer. + +If for some reason you want more refinement, you can also extract the fp32 ``state_dict`` of the weights and apply +these yourself as is shown in the following example: + +.. code-block:: python + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() + model.load_state_dict(state_dict) + + +**Offline FP32 Weights Recovery:** DeepSpeed creates a special conversion script ``zero_to_fp32.py`` which it places in the top-level of the checkpoint folder. Using this script you can extract the weights at any point. The script is standalone and you no longer need to @@ -1486,15 +1534,16 @@ weights just run: .. code-block:: bash - python zero_to_fp32.py global_step1 pytorch_model.bin + python zero_to_fp32.py . pytorch_model.bin -The script will automatically handle either ZeRO-2 or ZeRO-3 checkpoint. +This is it. ``pytorch_model.bin`` will now contain the full fp32 model weights consolidated from multiple GPUs. + +The script will automatically be able to handle either a ZeRO-2 or ZeRO-3 checkpoint. ``python zero_to_fp32.py -h`` will give you usage details. -If you have multiple DeepSpeed checkpoint sub-folders, pick the one you know to have the desired weights. - -This is it. ``pytorch_model.bin`` will now contain the full fp32 model weights consolidated from multiple GPUs. +The script will auto-discover the deepspeed sub-folder using the contents of the file ``latest``, which in the current +example will contain ``global_step1``. Note: currently the script requires 2x general RAM of the final fp32 model weights. 
diff --git a/setup.py b/setup.py index 7be7700a56..953e651aff 100644 --- a/setup.py +++ b/setup.py @@ -91,7 +91,7 @@ _deps = [ "cookiecutter==1.7.2", "dataclasses", "datasets", - "deepspeed>=0.4.0", + "deepspeed>=0.4.3", "docutils==0.16.0", "fairscale>0.3", "faiss-cpu", diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py index 942a3faf99..f078623674 100644 --- a/src/transformers/dependency_versions_table.py +++ b/src/transformers/dependency_versions_table.py @@ -8,7 +8,7 @@ deps = { "cookiecutter": "cookiecutter==1.7.2", "dataclasses": "dataclasses", "datasets": "datasets", - "deepspeed": "deepspeed>=0.4.0", + "deepspeed": "deepspeed>=0.4.3", "docutils": "docutils==0.16.0", "fairscale": "fairscale>0.3", "faiss-cpu": "faiss-cpu", diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index 20102c51a5..815c242f3e 100644 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -819,9 +819,17 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix if new_num_tokens is None: return old_lm_head - old_num_tokens, old_lm_head_dim = ( - old_lm_head.weight.size() if not transposed else old_lm_head.weight.t().size() - ) + if is_deepspeed_zero3_enabled(): + import deepspeed + + with deepspeed.zero.GatheredParameters(old_lm_head.weight, modifier_rank=None): + old_num_tokens, old_lm_head_dim = ( + old_lm_head.weight.size() if not transposed else old_lm_head.weight.t().size() + ) + else: + old_num_tokens, old_lm_head_dim = ( + old_lm_head.weight.size() if not transposed else old_lm_head.weight.t().size() + ) if old_num_tokens == new_num_tokens: return old_lm_head @@ -829,7 +837,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix if not isinstance(old_lm_head, nn.Linear): raise TypeError( f"Old language model head is of type {type(old_lm_head)}, which is not an instance of {nn.Linear}." 
- f"You should either use a different resize function or make sure that `old_embeddings` are an instance of {nn.Linear}." + f"You should either use a different resize function or make sure that `old_lm_head` are an instance of {nn.Linear}." ) # Build new lm head @@ -842,15 +850,35 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix num_tokens_to_copy = min(old_num_tokens, new_num_tokens) - # Copy old lm head weights to new lm head - if not transposed: - new_lm_head.weight.data[:num_tokens_to_copy, :] = old_lm_head.weight.data[:num_tokens_to_copy, :] - else: - new_lm_head.weight.data[:, :num_tokens_to_copy] = old_lm_head.weight.data[:, :num_tokens_to_copy] + # XXX: put the long block of code in a wrapper + if is_deepspeed_zero3_enabled(): + import deepspeed - # Copy bias weights to new lm head - if has_new_lm_head_bias: - new_lm_head.bias.data[:num_tokens_to_copy] = old_lm_head.bias.data[:num_tokens_to_copy] + with deepspeed.zero.GatheredParameters(old_lm_head.weight, modifier_rank=0): + if torch.distributed.get_rank() == 0: + # Copy old lm head weights to new lm head + if not transposed: + new_lm_head.weight.data[:num_tokens_to_copy, :] = old_lm_head.weight.data[ + :num_tokens_to_copy, : + ] + else: + new_lm_head.weight.data[:, :num_tokens_to_copy] = old_lm_head.weight.data[ + :, :num_tokens_to_copy + ] + + # Copy bias weights to new lm head + if has_new_lm_head_bias: + new_lm_head.bias.data[:num_tokens_to_copy] = old_lm_head.bias.data[:num_tokens_to_copy] + else: + # Copy old lm head weights to new lm head + if not transposed: + new_lm_head.weight.data[:num_tokens_to_copy, :] = old_lm_head.weight.data[:num_tokens_to_copy, :] + else: + new_lm_head.weight.data[:, :num_tokens_to_copy] = old_lm_head.weight.data[:, :num_tokens_to_copy] + + # Copy bias weights to new lm head + if has_new_lm_head_bias: + new_lm_head.bias.data[:num_tokens_to_copy] = old_lm_head.bias.data[:num_tokens_to_copy] return new_lm_head diff --git 
a/src/transformers/models/distilbert/modeling_distilbert.py b/src/transformers/models/distilbert/modeling_distilbert.py index 1c232cd7e1..5d6deb1385 100755 --- a/src/transformers/models/distilbert/modeling_distilbert.py +++ b/src/transformers/models/distilbert/modeling_distilbert.py @@ -18,7 +18,6 @@ """ -import copy import math import numpy as np @@ -27,6 +26,7 @@ from torch import nn from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from ...activations import gelu +from ...deepspeed import is_deepspeed_zero3_enabled from ...file_utils import ( add_code_sample_docstrings, add_start_docstrings, @@ -85,9 +85,19 @@ class Embeddings(nn.Module): self.word_embeddings = nn.Embedding(config.vocab_size, config.dim, padding_idx=config.pad_token_id) self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.dim) if config.sinusoidal_pos_embds: - create_sinusoidal_embeddings( - n_pos=config.max_position_embeddings, dim=config.dim, out=self.position_embeddings.weight - ) + + if is_deepspeed_zero3_enabled(): + import deepspeed + + with deepspeed.zero.GatheredParameters(self.position_embeddings.weight, modifier_rank=0): + if torch.distributed.get_rank() == 0: + create_sinusoidal_embeddings( + n_pos=config.max_position_embeddings, dim=config.dim, out=self.position_embeddings.weight + ) + else: + create_sinusoidal_embeddings( + n_pos=config.max_position_embeddings, dim=config.dim, out=self.position_embeddings.weight + ) self.LayerNorm = nn.LayerNorm(config.dim, eps=1e-12) self.dropout = nn.Dropout(config.dropout) @@ -274,9 +284,7 @@ class Transformer(nn.Module): def __init__(self, config): super().__init__() self.n_layers = config.n_layers - - layer = TransformerBlock(config) - self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.n_layers)]) + self.layer = nn.ModuleList([TransformerBlock(config) for _ in range(config.n_layers)]) def forward( self, x, attn_mask=None, head_mask=None, output_attentions=False, 
output_hidden_states=False, return_dict=None diff --git a/src/transformers/models/fsmt/modeling_fsmt.py b/src/transformers/models/fsmt/modeling_fsmt.py index 1f352a1cc6..83bd917844 100644 --- a/src/transformers/models/fsmt/modeling_fsmt.py +++ b/src/transformers/models/fsmt/modeling_fsmt.py @@ -36,6 +36,7 @@ from torch import Tensor, nn from torch.nn import CrossEntropyLoss, LayerNorm from ...activations import ACT2FN +from ...deepspeed import is_deepspeed_zero3_enabled from ...file_utils import ( add_code_sample_docstrings, add_end_docstrings, @@ -658,11 +659,14 @@ class FSMTDecoder(nn.Module): [DecoderLayer(config) for _ in range(config.decoder_layers)] ) # type: List[DecoderLayer] - self.output_projection = nn.Linear( - self.embed_tokens.weight.shape[1], - self.embed_tokens.weight.shape[0], - bias=False, - ) + if is_deepspeed_zero3_enabled(): + import deepspeed + + with deepspeed.zero.GatheredParameters(self.embed_tokens.weight, modifier_rank=None): + embed_tokens_weight_shape = self.embed_tokens.weight.shape + else: + embed_tokens_weight_shape = self.embed_tokens.weight.shape + self.output_projection = nn.Linear(embed_tokens_weight_shape[1], embed_tokens_weight_shape[0], bias=False) self.output_projection.weight = self.embed_tokens.weight def forward( @@ -1127,19 +1131,6 @@ class FSMTForConditionalGeneration(PretrainedFSMTModel): base_model = FSMTModel(config) self.model = base_model - def resize_token_embeddings(self, new_num_tokens: int) -> nn.Embedding: - new_embeddings = super().resize_token_embeddings(new_num_tokens) - self.model.encoder.embed_tokens = new_embeddings - - new_embeddings = super().resize_token_embeddings(new_num_tokens) - self.model.decoder.embed_tokens = new_embeddings - - # XXX: this is not quite correct, as we have 2 different `new_embeddings`, and - # only one return value is expected. 
Needs to be redesigned in the core to support dual dicts - raise NotImplementedError("this method needs re-thinking for models with 2 separate dictionaries") - - return new_embeddings - @add_start_docstrings_to_model_forward(FSMT_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC) @add_end_docstrings(FSMT_GENERATION_EXAMPLE) @@ -1257,6 +1248,9 @@ class FSMTForConditionalGeneration(PretrainedFSMTModel): def get_output_embeddings(self): return self.model.decoder.embed_tokens + def set_output_embeddings(self, value): + self.model.decoder.embed_tokens = value + class SinusoidalPositionalEmbedding(nn.Embedding): """ diff --git a/src/transformers/models/wav2vec2/modeling_wav2vec2.py b/src/transformers/models/wav2vec2/modeling_wav2vec2.py index 2f1b4ed991..c51f19fae5 100755 --- a/src/transformers/models/wav2vec2/modeling_wav2vec2.py +++ b/src/transformers/models/wav2vec2/modeling_wav2vec2.py @@ -23,9 +23,8 @@ import torch import torch.utils.checkpoint from torch import nn -from transformers.deepspeed import is_deepspeed_zero3_enabled - from ...activations import ACT2FN +from ...deepspeed import is_deepspeed_zero3_enabled from ...file_utils import ( ModelOutput, add_start_docstrings, @@ -853,17 +852,7 @@ class Wav2Vec2PreTrainedModel(PreTrainedModel): module.bias.data.zero_() module.weight.data.fill_(1.0) elif isinstance(module, nn.Conv1d): - if is_deepspeed_zero3_enabled(): - import deepspeed - - if hasattr(module, "weight_v") and hasattr(module, "weight_g"): - with deepspeed.zero.GatheredParameters([module.weight_v, module.weight_g], modifier_rank=0): - nn.init.kaiming_normal_(module.weight.data) - else: - with deepspeed.zero.GatheredParameters(module.weight, modifier_rank=0): - nn.init.kaiming_normal_(module.weight.data) - else: - nn.init.kaiming_normal_(module.weight.data) + nn.init.kaiming_normal_(module.weight.data) if isinstance(module, (nn.Linear, nn.Conv1d)) and module.bias is not None: module.bias.data.zero_() 
diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py index 487e178b2b..dcb3aa6d0b 100644 --- a/src/transformers/training_args.py +++ b/src/transformers/training_args.py @@ -799,10 +799,7 @@ class TrainingArguments: device = torch.device("cuda", self.local_rank) self._n_gpu = 1 elif self.deepspeed: - # deepspeed performs its own DDP internally, and requires the program to be started with: - # deepspeed ./program.py - # rather than: - # python -m torch.distributed.launch --nproc_per_node=2 ./program.py + # deepspeed inits torch.distributed internally from .deepspeed import is_deepspeed_available if not is_deepspeed_available(): diff --git a/tests/deepspeed/test_deepspeed.py b/tests/deepspeed/test_deepspeed.py index e699b110f0..6c5fe60c47 100644 --- a/tests/deepspeed/test_deepspeed.py +++ b/tests/deepspeed/test_deepspeed.py @@ -37,11 +37,12 @@ from transformers.testing_utils import ( require_torch_multi_gpu, slow, ) -from transformers.trainer_utils import set_seed +from transformers.trainer_utils import get_last_checkpoint, set_seed -bindir = os.path.abspath(os.path.dirname(__file__)) -with ExtendSysPath(f"{bindir}/.."): +tests_dir = os.path.abspath(os.path.dirname(os.path.dirname(__file__))) +root_dir = os.path.dirname(tests_dir) +with ExtendSysPath(tests_dir): from test_trainer import TrainerIntegrationCommon # noqa if is_torch_available(): @@ -49,9 +50,10 @@ with ExtendSysPath(f"{bindir}/.."): set_seed(42) -MBART_TINY = "sshleifer/tiny-mbart" + T5_SMALL = "t5-small" T5_TINY = "patrickvonplaten/t5-tiny-random" +GPT2_TINY = "sshleifer/tiny-gpt2" def load_json(path): @@ -77,8 +79,19 @@ def require_deepspeed_aio(test_case): if is_deepspeed_available(): from deepspeed.utils import logger as deepspeed_logger # noqa + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint from transformers.deepspeed import deepspeed_config, is_deepspeed_zero3_enabled # noqa + +def get_launcher(distributed=False): + # 1. 
explicitly set --num_nodes=1 just in case these tests end up run on a multi-node setup + # - it won't be able to handle that + # 2. for now testing with just 2 gpus max (since some quality tests may give different + # results with mode gpus because we use very little data) + num_gpus = min(2, get_gpu_count()) if distributed else 1 + return f"deepspeed --num_nodes 1 --num_gpus {num_gpus}".split() + + ZERO2 = "zero2" ZERO3 = "zero3" stages = [ZERO2, ZERO3] @@ -568,6 +581,41 @@ class TrainerIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon): self.assertEqual(b, b1) self.check_trainer_state_are_the_same(state, state1) + @parameterized.expand(stages) + def test_load_state_dict_from_zero_checkpoint(self, stage): + # test that we can load fp32 weights directly from the zero checkpoint into the current model + + output_dir = self.get_auto_remove_tmp_dir() # "./xxx", after=False, before=False) + + ds_config_dict = self.get_config_dict(stage) + + kwargs = dict( + output_dir=output_dir, + train_len=4, + per_device_train_batch_size=4, + num_train_epochs=1, + save_strategy="steps", + save_steps=1, + learning_rate=0.1, + fp16=True, + deepspeed=ds_config_dict, + ) + + with mockenv_context(**self.dist_env_1_gpu): + trainer = get_regression_trainer(**kwargs) + trainer.train() + (a, b) = trainer.model.a.item(), trainer.model.b.item() + state = dataclasses.asdict(trainer.state) + + checkpoint_dir = get_last_checkpoint(output_dir) + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + + (a1, b1) = model.a.item(), model.b.item() + state1 = dataclasses.asdict(trainer.state) + self.assertEqual(a, a1) + self.assertEqual(b, b1) + self.check_trainer_state_are_the_same(state, state1) + def test_config_object(self): # test that we can switch from zero2 to zero3 in the same process for example # test is_zero, etc. 
@@ -809,7 +857,7 @@ class TestDeepSpeedWithLauncher(TestCasePlus): ds_args = f"--deepspeed {self.test_file_dir_str}/ds_config_{stage}.json".split() script = [f"{self.examples_dir_str}/pytorch/translation/run_translation.py"] - launcher = self.get_launcher(distributed) + launcher = get_launcher(distributed) cmd = launcher + script + args + ds_args # keep for quick debug @@ -826,7 +874,7 @@ class TestDeepSpeedWithLauncher(TestCasePlus): data_dir = self.tests_dir / "fixtures" output_dir = self.get_auto_remove_tmp_dir() args = f""" - --model_name_or_path sshleifer/tiny-gpt2 + --model_name_or_path {GPT2_TINY} --train_file {data_dir}/sample_text.txt --validation_file {data_dir}/sample_text.txt --output_dir {output_dir} @@ -846,7 +894,7 @@ class TestDeepSpeedWithLauncher(TestCasePlus): ds_args = f"--deepspeed {self.test_file_dir_str}/ds_config_{stage}.json".split() script = [f"{self.examples_dir_str}/pytorch/language-modeling/run_clm.py"] - launcher = self.get_launcher(distributed=True) + launcher = get_launcher(distributed=True) cmd = launcher + script + args + ds_args # keep for quick debug @@ -860,7 +908,7 @@ class TestDeepSpeedWithLauncher(TestCasePlus): output_dir = self.get_auto_remove_tmp_dir() args = f""" --model_type gpt2 - --tokenizer_name sshleifer/tiny-gpt2 + --tokenizer_name {GPT2_TINY} --train_file {data_dir}/sample_text.txt --validation_file {data_dir}/sample_text.txt --output_dir {output_dir} @@ -877,7 +925,7 @@ class TestDeepSpeedWithLauncher(TestCasePlus): ds_args = f"--deepspeed {self.test_file_dir_str}/ds_config_zero3.json".split() script = [f"{self.examples_dir_str}/pytorch/language-modeling/run_clm.py"] - launcher = self.get_launcher(distributed=True) + launcher = get_launcher(distributed=True) cmd = launcher + script + args + ds_args # keep for quick debug @@ -885,11 +933,3 @@ class TestDeepSpeedWithLauncher(TestCasePlus): with CaptureStderr() as cs: execute_subprocess_async(cmd, env=self.get_env()) assert "Detected DeepSpeed ZeRO-3" in cs.err - - 
def get_launcher(self, distributed=False): - # 1. explicitly set --num_nodes=1 just in case these tests end up run on a multi-node setup - # - it won't be able to handle that - # 2. for now testing with just 2 gpus max (since some quality tests may give different - # results with mode gpus because we use very little data) - num_gpus = min(2, get_gpu_count()) if distributed else 1 - return f"deepspeed --num_nodes 1 --num_gpus {num_gpus}".split() diff --git a/tests/deepspeed/test_model_zoo.py b/tests/deepspeed/test_model_zoo.py new file mode 100644 index 0000000000..89548b7acc --- /dev/null +++ b/tests/deepspeed/test_model_zoo.py @@ -0,0 +1,259 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import itertools +import os +import subprocess + +from parameterized import parameterized +from transformers import is_torch_available +from transformers.testing_utils import ( + ExtendSysPath, + TestCasePlus, + execute_subprocess_async, + get_gpu_count, + require_deepspeed, + require_torch_gpu, + slow, +) +from transformers.trainer_utils import set_seed + + +tests_dir = os.path.abspath(os.path.dirname(os.path.dirname(__file__))) +root_dir = os.path.dirname(tests_dir) +with ExtendSysPath(tests_dir): + from test_trainer import TrainerIntegrationCommon # noqa + + if is_torch_available(): + from test_trainer import RegressionModelConfig, RegressionPreTrainedModel, get_regression_trainer # noqa + + +set_seed(42) + +# translation +FSMT_TINY = "stas/tiny-wmt19-en-de" +BART_TINY = "sshleifer/bart-tiny-random" +T5_SMALL = "t5-small" +T5_TINY = "patrickvonplaten/t5-tiny-random" +MBART_TINY = "sshleifer/tiny-mbart" +MARIAN_TINY = "sshleifer/tiny-marian-en-de" + +# summarization +PEGASUS_TINY = "stas/pegasus-cnn_dailymail-tiny-random" + +# causal lm +GPT2_TINY = "sshleifer/tiny-gpt2" +XLM_ROBERTA_TINY = "hf-internal-testing/tiny-xlm-roberta" + +# question-answering +ROBERTA_TINY = "sshleifer/tiny-distilroberta-base" + +# masked lm +DISTILBERT_TINY = "sshleifer/tiny-distilbert-base-cased" +ELECTRA_TINY = "hf-internal-testing/tiny-electra" + +# classification +XLNET_TINY = "sshleifer/tiny-xlnet-base-cased" +BERT_TINY = "hf-internal-testing/tiny-bert" + + +# TODO: to add: +# albert +# deberta +# funnel +# longformer +# dpr +# gpt_neo +# camembert +# deberta-v2 +# m2m_100 +# tapas +# vit +# big_bird + + +def get_launcher(distributed=False): + # 1. explicitly set --num_nodes=1 just in case these tests end up run on a multi-node setup + # - it won't be able to handle that + # 2. 
for now testing with just 2 gpus max (since some quality tests may give different + # results with mode gpus because we use very little data) + num_gpus = min(2, get_gpu_count()) if distributed else 1 + return f"deepspeed --num_nodes 1 --num_gpus {num_gpus}".split() + + +def make_task_cmds(): + data_dir_fixtures = f"{tests_dir}/fixtures" + data_dir_samples = f"{data_dir_fixtures}/tests_samples" + data_dir_wmt = f"{data_dir_samples}/wmt_en_ro" + data_dir_xsum = f"{data_dir_samples}/xsum" + args_main = """ + --do_train + --max_train_samples 4 + --per_device_train_batch_size 2 + --num_train_epochs 1 + --fp16 + --report_to none + --overwrite_output_dir + """.split() + + # XXX: try to cover as many models as possible once (it's enough to run on one task per model) + # but need a tiny model for each + # + # should have T5_TINY, etc. global var defined + tasks2models = dict( + trans=[ + "bart", + "fsmt", + "marian", + "mbart", + "t5", + ], + sum=[ + "pegasus", + ], + clm=[ + "gpt2", + "xlm-roberta", + ], + mlm=[ + "electra", + "distilbert", + ], + qa=[ + "roberta", + ], + clas=[ + "bert", + "xlnet", + ], + ) + + scripts_dir = f"{root_dir}/examples/pytorch" + + tasks = dict( + trans=f""" + {scripts_dir}/translation/run_translation.py + --train_file {data_dir_wmt}/train.json + --source_lang en + --target_lang ro + """, + sum=f""" + {scripts_dir}/summarization/run_summarization.py + --train_file {data_dir_xsum}/sample.json + --max_source_length 12 + --max_target_length 12 + """, + clm=f""" + {scripts_dir}/language-modeling/run_clm.py + --train_file {data_dir_fixtures}/sample_text.txt + --block_size 8 + """, + mlm=f""" + {scripts_dir}/language-modeling/run_mlm.py + --train_file {data_dir_fixtures}/sample_text.txt + """, + qa=f""" + {scripts_dir}/question-answering/run_qa.py + --train_file {data_dir_samples}/SQUAD/sample.json + """, + clas=f""" + {scripts_dir}/text-classification/run_glue.py + --train_file {data_dir_samples}/MRPC/train.csv + --max_seq_length 12 + --task_name 
MRPC + """, + ) + + launcher = get_launcher(distributed=True) + + cmds = {} + for task, args in tasks.items(): + args = args.split() + for model in tasks2models[task]: + model_name = globals()[f"{model.upper().replace('-', '_')}_TINY"] + args_model = f"--model_name_or_path {model_name}".split() + cmds[f"{task}_{model}"] = launcher + args + args_model + args_main + + # # generation special case + # if task == "gen": + # launcher = f"deepspeed --num_nodes 1 --num_gpus 1".split() + # args_model += f"--model_type {model}".split() + # cmds[f"{task}_{model}"] = launcher + args + args_model + # else: + + return cmds + + +task_cmds = make_task_cmds() + +ZERO2 = "zero2" +ZERO3 = "zero3" +stages = [ZERO2, ZERO3] + + +def parameterized_custom_name_func(func, param_num, param): + # customize the test name generator function as we want both params to appear in the sub-test + # name, as by default it shows only the first param + param_based_name = parameterized.to_safe_name("_".join(str(x) for x in param.args)) + return f"{func.__name__}_{param_based_name}" + + +# Cartesian-product of zero stages with models to test +params = list(itertools.product(stages, task_cmds.keys())) + + +@slow +@require_deepspeed +@require_torch_gpu +class TestDeepSpeedModelZoo(TestCasePlus): + """This class is for testing via an external script - can do multiple gpus""" + + def get_task_cmd(self, task, stage): + # return a ready to run train cmd + if task not in task_cmds: + raise ValueError(f"don't know of task {task}, have {task_cmds.keys()}") + + cmd = task_cmds[task] + args_ds = f"--deepspeed {self.test_file_dir_str}/ds_config_{stage}.json".split() + + output_dir = self.get_auto_remove_tmp_dir() + args_out = f"--output_dir {output_dir}".split() + + cmd += args_ds + args_out + + return cmd, output_dir + + @parameterized.expand(params, name_func=parameterized_custom_name_func) + def test_zero_to_fp32(self, stage, task): + # testing the ability to do a run followed by recovery of full fp32 weights + + 
cmd, output_dir = self.get_task_cmd(task, stage) + + # 1. generate the checkpoint + cmd += "--save_steps 1".split() + # keep for quick debug + # print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] + cmd)); die + execute_subprocess_async(cmd, env=self.get_env()) + + # 2. test that the fp32 weights get reconsolidated + chkpt_dir = f"{output_dir}/checkpoint-1" + recovered_model_path = f"{chkpt_dir}/out.bin" + cmd = f"{chkpt_dir}/zero_to_fp32.py {chkpt_dir} {recovered_model_path}" + # keep for quick debug + # print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die + subprocess.check_call(cmd, shell=True) + assert os.path.exists(recovered_model_path), f"{recovered_model_path} was not found" + + # possibly could also test that the resulting saved model is usable but given that we use + # random models we won't know if it's any good