From 78f5fe1416e74a2225e162e349cb8a53f1d39212 Mon Sep 17 00:00:00 2001
From: Stas Bekman <stas00@users.noreply.github.com>
Date: Tue, 13 Jul 2021 12:07:32 -0700
Subject: [PATCH] [Deepspeed] adapt multiple models, add zero_to_fp32 tests
 (#12477)

* zero_to_fp32 tests

* args change

* remove unnecessary work

* use transformers.trainer_utils.get_last_checkpoint

* document the new features

* cleanup

* wip

* fix fsmt

* add bert

* cleanup

* add xlm-roberta

* electra works

* cleanup

* sync

* split off the model zoo tests

* cleanup

* cleanup

* cleanup

* cleanup

* reformat

* cleanup

* casing

* deepspeed>=0.4.3

* adjust distilbert

* Update docs/source/main_classes/deepspeed.rst

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* style

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
---
 docs/source/main_classes/deepspeed.rst        |  63 ++++-
 setup.py                                      |   2 +-
 src/transformers/dependency_versions_table.py |   2 +-
 src/transformers/modeling_utils.py            |  52 +++-
 .../models/distilbert/modeling_distilbert.py  |  22 +-
 src/transformers/models/fsmt/modeling_fsmt.py |  30 +-
 .../models/wav2vec2/modeling_wav2vec2.py      |  15 +-
 src/transformers/training_args.py             |   5 +-
 tests/deepspeed/test_deepspeed.py             |  74 +++--
 tests/deepspeed/test_model_zoo.py             | 259 ++++++++++++++++++
 10 files changed, 444 insertions(+), 80 deletions(-)
 create mode 100644 tests/deepspeed/test_model_zoo.py

diff --git a/docs/source/main_classes/deepspeed.rst b/docs/source/main_classes/deepspeed.rst
index 619dfd4b8a..05a86b0801 100644
--- a/docs/source/main_classes/deepspeed.rst
+++ b/docs/source/main_classes/deepspeed.rst
@@ -1456,8 +1456,56 @@ won't be possible to load it back.
 
 While the fp16 weights are fine for resuming training, if you finished finetuning your model and want to upload it to
 the `models hub <https://huggingface.co/models>`__ or pass it to someone else you most likely will want to get the fp32
-weights. This cannot be done during training since this is a process that requires a lot of memory, and therefore this
-is performed offline.
+weights. This ideally shouldn't be done during training since this is a process that requires a lot of memory, and
+therefore is best performed offline after the training is complete. But if desired and you have plenty of free CPU
+memory it can be done in the same training script. The following sections will discuss both approaches.
+
+
+**Live FP32 Weights Recovery:**
+
+This approach may not work if your model is large and you have little free CPU memory left at the end of the training.
+
+If you have saved at least one checkpoint, and you want to use the latest one, you can do the following:
+
+.. code-block:: python
+
+    from transformers.trainer_utils import get_last_checkpoint
+    from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+    checkpoint_dir = get_last_checkpoint(trainer.args.output_dir)
+    fp32_model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
+
+If you're using the ``--load_best_model_at_end`` :class:`~transformers.TrainingArguments` argument (to track the best
+checkpoint), then you can finish the training by first saving the final model explicitly and then do the same as above:
+
+.. code-block:: python
+
+    from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+    checkpoint_dir = os.path.join(trainer.args.output_dir, "checkpoint-final")
+    trainer.deepspeed.save_checkpoint(checkpoint_dir)
+    fp32_model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
+
+.. note::
+
+    Note, that once ``load_state_dict_from_zero_checkpoint`` was run, the ``model`` will no longer be useable in the
+    DeepSpeed context of the same application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the DeepSpeed magic from it. So do this only at the very end
+    of the training.
+
+Of course, you don't have to use :class:`~transformers.Trainer` and you can adjust the examples above to your own
+trainer.
+
+If for some reason you want more refinement, you can also extract the fp32 ``state_dict`` of the weights and apply
+these yourself as is shown in the following example:
+
+.. code-block:: python
+
+    from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
+    model = model.cpu()
+    model.load_state_dict(state_dict)
+
+
+**Offline FP32 Weights Recovery:**
 
 DeepSpeed creates a special conversion script ``zero_to_fp32.py`` which it places in the top-level of the checkpoint
 folder. Using this script you can extract the weights at any point. The script is standalone and you no longer need to
@@ -1486,15 +1534,16 @@ weights just run:
 
 .. code-block:: bash
 
-    python zero_to_fp32.py global_step1 pytorch_model.bin
+    python zero_to_fp32.py . pytorch_model.bin
 
-The script will automatically handle either ZeRO-2 or ZeRO-3 checkpoint.
+This is it. ``pytorch_model.bin`` will now contain the full fp32 model weights consolidated from multiple GPUs.
+
+The script will automatically be able to handle either a ZeRO-2 or ZeRO-3 checkpoint.
 
 ``python zero_to_fp32.py -h`` will give you usage details.
 
-If you have multiple DeepSpeed checkpoint sub-folders, pick the one you know to have the desired weights.
-
-This is it. ``pytorch_model.bin`` will now contain the full fp32 model weights consolidated from multiple GPUs.
+The script will auto-discover the deepspeed sub-folder using the contents of the file ``latest``, which in the current
+example will contain ``global_step1``.
 
 Note: currently the script requires 2x general RAM of the final fp32 model weights.
 
diff --git a/setup.py b/setup.py
index 7be7700a56..953e651aff 100644
--- a/setup.py
+++ b/setup.py
@@ -91,7 +91,7 @@ _deps = [
     "cookiecutter==1.7.2",
     "dataclasses",
     "datasets",
-    "deepspeed>=0.4.0",
+    "deepspeed>=0.4.3",
     "docutils==0.16.0",
     "fairscale>0.3",
     "faiss-cpu",
diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py
index 942a3faf99..f078623674 100644
--- a/src/transformers/dependency_versions_table.py
+++ b/src/transformers/dependency_versions_table.py
@@ -8,7 +8,7 @@ deps = {
     "cookiecutter": "cookiecutter==1.7.2",
     "dataclasses": "dataclasses",
     "datasets": "datasets",
-    "deepspeed": "deepspeed>=0.4.0",
+    "deepspeed": "deepspeed>=0.4.3",
     "docutils": "docutils==0.16.0",
     "fairscale": "fairscale>0.3",
     "faiss-cpu": "faiss-cpu",
diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py
index 20102c51a5..815c242f3e 100644
--- a/src/transformers/modeling_utils.py
+++ b/src/transformers/modeling_utils.py
@@ -819,9 +819,17 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
         if new_num_tokens is None:
             return old_lm_head
 
-        old_num_tokens, old_lm_head_dim = (
-            old_lm_head.weight.size() if not transposed else old_lm_head.weight.t().size()
-        )
+        if is_deepspeed_zero3_enabled():
+            import deepspeed
+
+            with deepspeed.zero.GatheredParameters(old_lm_head.weight, modifier_rank=None):
+                old_num_tokens, old_lm_head_dim = (
+                    old_lm_head.weight.size() if not transposed else old_lm_head.weight.t().size()
+                )
+        else:
+            old_num_tokens, old_lm_head_dim = (
+                old_lm_head.weight.size() if not transposed else old_lm_head.weight.t().size()
+            )
 
         if old_num_tokens == new_num_tokens:
             return old_lm_head
@@ -829,7 +837,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
         if not isinstance(old_lm_head, nn.Linear):
             raise TypeError(
                 f"Old language model head is of type {type(old_lm_head)}, which is not an instance of {nn.Linear}."
-                f"You should either use a different resize function or make sure that `old_embeddings` are an instance of {nn.Linear}."
+                f"You should either use a different resize function or make sure that `old_lm_head` are an instance of {nn.Linear}."
             )
 
         # Build new lm head
@@ -842,15 +850,35 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
 
         num_tokens_to_copy = min(old_num_tokens, new_num_tokens)
 
-        # Copy old lm head weights to new lm head
-        if not transposed:
-            new_lm_head.weight.data[:num_tokens_to_copy, :] = old_lm_head.weight.data[:num_tokens_to_copy, :]
-        else:
-            new_lm_head.weight.data[:, :num_tokens_to_copy] = old_lm_head.weight.data[:, :num_tokens_to_copy]
+        # XXX: put the long block of code in a wrapper
+        if is_deepspeed_zero3_enabled():
+            import deepspeed
 
-        # Copy bias weights to new lm head
-        if has_new_lm_head_bias:
-            new_lm_head.bias.data[:num_tokens_to_copy] = old_lm_head.bias.data[:num_tokens_to_copy]
+            with deepspeed.zero.GatheredParameters(old_lm_head.weight, modifier_rank=0):
+                if torch.distributed.get_rank() == 0:
+                    # Copy old lm head weights to new lm head
+                    if not transposed:
+                        new_lm_head.weight.data[:num_tokens_to_copy, :] = old_lm_head.weight.data[
+                            :num_tokens_to_copy, :
+                        ]
+                    else:
+                        new_lm_head.weight.data[:, :num_tokens_to_copy] = old_lm_head.weight.data[
+                            :, :num_tokens_to_copy
+                        ]
+
+                    # Copy bias weights to new lm head
+                    if has_new_lm_head_bias:
+                        new_lm_head.bias.data[:num_tokens_to_copy] = old_lm_head.bias.data[:num_tokens_to_copy]
+        else:
+            # Copy old lm head weights to new lm head
+            if not transposed:
+                new_lm_head.weight.data[:num_tokens_to_copy, :] = old_lm_head.weight.data[:num_tokens_to_copy, :]
+            else:
+                new_lm_head.weight.data[:, :num_tokens_to_copy] = old_lm_head.weight.data[:, :num_tokens_to_copy]
+
+            # Copy bias weights to new lm head
+            if has_new_lm_head_bias:
+                new_lm_head.bias.data[:num_tokens_to_copy] = old_lm_head.bias.data[:num_tokens_to_copy]
 
         return new_lm_head
 
diff --git a/src/transformers/models/distilbert/modeling_distilbert.py b/src/transformers/models/distilbert/modeling_distilbert.py
index 1c232cd7e1..5d6deb1385 100755
--- a/src/transformers/models/distilbert/modeling_distilbert.py
+++ b/src/transformers/models/distilbert/modeling_distilbert.py
@@ -18,7 +18,6 @@
 """
 
 
-import copy
 import math
 
 import numpy as np
@@ -27,6 +26,7 @@ from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 
 from ...activations import gelu
+from ...deepspeed import is_deepspeed_zero3_enabled
 from ...file_utils import (
     add_code_sample_docstrings,
     add_start_docstrings,
@@ -85,9 +85,19 @@ class Embeddings(nn.Module):
         self.word_embeddings = nn.Embedding(config.vocab_size, config.dim, padding_idx=config.pad_token_id)
         self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.dim)
         if config.sinusoidal_pos_embds:
-            create_sinusoidal_embeddings(
-                n_pos=config.max_position_embeddings, dim=config.dim, out=self.position_embeddings.weight
-            )
+
+            if is_deepspeed_zero3_enabled():
+                import deepspeed
+
+                with deepspeed.zero.GatheredParameters(self.position_embeddings.weight, modifier_rank=0):
+                    if torch.distributed.get_rank() == 0:
+                        create_sinusoidal_embeddings(
+                            n_pos=config.max_position_embeddings, dim=config.dim, out=self.position_embeddings.weight
+                        )
+            else:
+                create_sinusoidal_embeddings(
+                    n_pos=config.max_position_embeddings, dim=config.dim, out=self.position_embeddings.weight
+                )
 
         self.LayerNorm = nn.LayerNorm(config.dim, eps=1e-12)
         self.dropout = nn.Dropout(config.dropout)
@@ -274,9 +284,7 @@ class Transformer(nn.Module):
     def __init__(self, config):
         super().__init__()
         self.n_layers = config.n_layers
-
-        layer = TransformerBlock(config)
-        self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.n_layers)])
+        self.layer = nn.ModuleList([TransformerBlock(config) for _ in range(config.n_layers)])
 
     def forward(
         self, x, attn_mask=None, head_mask=None, output_attentions=False, output_hidden_states=False, return_dict=None
diff --git a/src/transformers/models/fsmt/modeling_fsmt.py b/src/transformers/models/fsmt/modeling_fsmt.py
index 1f352a1cc6..83bd917844 100644
--- a/src/transformers/models/fsmt/modeling_fsmt.py
+++ b/src/transformers/models/fsmt/modeling_fsmt.py
@@ -36,6 +36,7 @@ from torch import Tensor, nn
 from torch.nn import CrossEntropyLoss, LayerNorm
 
 from ...activations import ACT2FN
+from ...deepspeed import is_deepspeed_zero3_enabled
 from ...file_utils import (
     add_code_sample_docstrings,
     add_end_docstrings,
@@ -658,11 +659,14 @@ class FSMTDecoder(nn.Module):
             [DecoderLayer(config) for _ in range(config.decoder_layers)]
         )  # type: List[DecoderLayer]
 
-        self.output_projection = nn.Linear(
-            self.embed_tokens.weight.shape[1],
-            self.embed_tokens.weight.shape[0],
-            bias=False,
-        )
+        if is_deepspeed_zero3_enabled():
+            import deepspeed
+
+            with deepspeed.zero.GatheredParameters(self.embed_tokens.weight, modifier_rank=None):
+                embed_tokens_weight_shape = self.embed_tokens.weight.shape
+        else:
+            embed_tokens_weight_shape = self.embed_tokens.weight.shape
+        self.output_projection = nn.Linear(embed_tokens_weight_shape[1], embed_tokens_weight_shape[0], bias=False)
         self.output_projection.weight = self.embed_tokens.weight
 
     def forward(
@@ -1127,19 +1131,6 @@ class FSMTForConditionalGeneration(PretrainedFSMTModel):
         base_model = FSMTModel(config)
         self.model = base_model
 
-    def resize_token_embeddings(self, new_num_tokens: int) -> nn.Embedding:
-        new_embeddings = super().resize_token_embeddings(new_num_tokens)
-        self.model.encoder.embed_tokens = new_embeddings
-
-        new_embeddings = super().resize_token_embeddings(new_num_tokens)
-        self.model.decoder.embed_tokens = new_embeddings
-
-        # XXX: this is not quite correct, as we have 2 different `new_embeddings`, and
-        # only one return value is expected. Needs to be redesigned in the core to support dual dicts
-        raise NotImplementedError("this method needs re-thinking for models with 2 separate dictionaries")
-
-        return new_embeddings
-
     @add_start_docstrings_to_model_forward(FSMT_INPUTS_DOCSTRING)
     @replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
     @add_end_docstrings(FSMT_GENERATION_EXAMPLE)
@@ -1257,6 +1248,9 @@ class FSMTForConditionalGeneration(PretrainedFSMTModel):
     def get_output_embeddings(self):
         return self.model.decoder.embed_tokens
 
+    def set_output_embeddings(self, value):
+        self.model.decoder.embed_tokens = value
+
 
 class SinusoidalPositionalEmbedding(nn.Embedding):
     """
diff --git a/src/transformers/models/wav2vec2/modeling_wav2vec2.py b/src/transformers/models/wav2vec2/modeling_wav2vec2.py
index 2f1b4ed991..c51f19fae5 100755
--- a/src/transformers/models/wav2vec2/modeling_wav2vec2.py
+++ b/src/transformers/models/wav2vec2/modeling_wav2vec2.py
@@ -23,9 +23,8 @@ import torch
 import torch.utils.checkpoint
 from torch import nn
 
-from transformers.deepspeed import is_deepspeed_zero3_enabled
-
 from ...activations import ACT2FN
+from ...deepspeed import is_deepspeed_zero3_enabled
 from ...file_utils import (
     ModelOutput,
     add_start_docstrings,
@@ -853,17 +852,7 @@ class Wav2Vec2PreTrainedModel(PreTrainedModel):
             module.bias.data.zero_()
             module.weight.data.fill_(1.0)
         elif isinstance(module, nn.Conv1d):
-            if is_deepspeed_zero3_enabled():
-                import deepspeed
-
-                if hasattr(module, "weight_v") and hasattr(module, "weight_g"):
-                    with deepspeed.zero.GatheredParameters([module.weight_v, module.weight_g], modifier_rank=0):
-                        nn.init.kaiming_normal_(module.weight.data)
-                else:
-                    with deepspeed.zero.GatheredParameters(module.weight, modifier_rank=0):
-                        nn.init.kaiming_normal_(module.weight.data)
-            else:
-                nn.init.kaiming_normal_(module.weight.data)
+            nn.init.kaiming_normal_(module.weight.data)
 
         if isinstance(module, (nn.Linear, nn.Conv1d)) and module.bias is not None:
             module.bias.data.zero_()
diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py
index 487e178b2b..dcb3aa6d0b 100644
--- a/src/transformers/training_args.py
+++ b/src/transformers/training_args.py
@@ -799,10 +799,7 @@ class TrainingArguments:
             device = torch.device("cuda", self.local_rank)
             self._n_gpu = 1
         elif self.deepspeed:
-            # deepspeed performs its own DDP internally, and requires the program to be started with:
-            # deepspeed  ./program.py
-            # rather than:
-            # python -m torch.distributed.launch --nproc_per_node=2 ./program.py
+            # deepspeed inits torch.distributed internally
             from .deepspeed import is_deepspeed_available
 
             if not is_deepspeed_available():
diff --git a/tests/deepspeed/test_deepspeed.py b/tests/deepspeed/test_deepspeed.py
index e699b110f0..6c5fe60c47 100644
--- a/tests/deepspeed/test_deepspeed.py
+++ b/tests/deepspeed/test_deepspeed.py
@@ -37,11 +37,12 @@ from transformers.testing_utils import (
     require_torch_multi_gpu,
     slow,
 )
-from transformers.trainer_utils import set_seed
+from transformers.trainer_utils import get_last_checkpoint, set_seed
 
 
-bindir = os.path.abspath(os.path.dirname(__file__))
-with ExtendSysPath(f"{bindir}/.."):
+tests_dir = os.path.abspath(os.path.dirname(os.path.dirname(__file__)))
+root_dir = os.path.dirname(tests_dir)
+with ExtendSysPath(tests_dir):
     from test_trainer import TrainerIntegrationCommon  # noqa
 
     if is_torch_available():
@@ -49,9 +50,10 @@ with ExtendSysPath(f"{bindir}/.."):
 
 
 set_seed(42)
-MBART_TINY = "sshleifer/tiny-mbart"
+
 T5_SMALL = "t5-small"
 T5_TINY = "patrickvonplaten/t5-tiny-random"
+GPT2_TINY = "sshleifer/tiny-gpt2"
 
 
 def load_json(path):
@@ -77,8 +79,19 @@ def require_deepspeed_aio(test_case):
 
 if is_deepspeed_available():
     from deepspeed.utils import logger as deepspeed_logger  # noqa
+    from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
     from transformers.deepspeed import deepspeed_config, is_deepspeed_zero3_enabled  # noqa
 
+
+def get_launcher(distributed=False):
+    # 1. explicitly set --num_nodes=1 just in case these tests end up run on a multi-node setup
+    # - it won't be able to handle that
+    # 2. for now testing with just 2 gpus max (since some quality tests may give different
+    # results with more gpus because we use very little data)
+    num_gpus = min(2, get_gpu_count()) if distributed else 1
+    return f"deepspeed --num_nodes 1 --num_gpus {num_gpus}".split()
+
+
 ZERO2 = "zero2"
 ZERO3 = "zero3"
 stages = [ZERO2, ZERO3]
@@ -568,6 +581,41 @@ class TrainerIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon):
             self.assertEqual(b, b1)
             self.check_trainer_state_are_the_same(state, state1)
 
+    @parameterized.expand(stages)
+    def test_load_state_dict_from_zero_checkpoint(self, stage):
+        # test that we can load fp32 weights directly from the zero checkpoint into the current model
+
+        output_dir = self.get_auto_remove_tmp_dir()  # "./xxx", after=False, before=False)
+
+        ds_config_dict = self.get_config_dict(stage)
+
+        kwargs = dict(
+            output_dir=output_dir,
+            train_len=4,
+            per_device_train_batch_size=4,
+            num_train_epochs=1,
+            save_strategy="steps",
+            save_steps=1,
+            learning_rate=0.1,
+            fp16=True,
+            deepspeed=ds_config_dict,
+        )
+
+        with mockenv_context(**self.dist_env_1_gpu):
+            trainer = get_regression_trainer(**kwargs)
+            trainer.train()
+            (a, b) = trainer.model.a.item(), trainer.model.b.item()
+            state = dataclasses.asdict(trainer.state)
+
+            checkpoint_dir = get_last_checkpoint(output_dir)
+            model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
+
+            (a1, b1) = model.a.item(), model.b.item()
+            state1 = dataclasses.asdict(trainer.state)
+            self.assertEqual(a, a1)
+            self.assertEqual(b, b1)
+            self.check_trainer_state_are_the_same(state, state1)
+
     def test_config_object(self):
         # test that we can switch from zero2 to zero3 in the same process for example
         # test is_zero, etc.
@@ -809,7 +857,7 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
 
         ds_args = f"--deepspeed {self.test_file_dir_str}/ds_config_{stage}.json".split()
         script = [f"{self.examples_dir_str}/pytorch/translation/run_translation.py"]
-        launcher = self.get_launcher(distributed)
+        launcher = get_launcher(distributed)
 
         cmd = launcher + script + args + ds_args
         # keep for quick debug
@@ -826,7 +874,7 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
         data_dir = self.tests_dir / "fixtures"
         output_dir = self.get_auto_remove_tmp_dir()
         args = f"""
-            --model_name_or_path sshleifer/tiny-gpt2
+            --model_name_or_path {GPT2_TINY}
             --train_file {data_dir}/sample_text.txt
             --validation_file {data_dir}/sample_text.txt
             --output_dir {output_dir}
@@ -846,7 +894,7 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
 
         ds_args = f"--deepspeed {self.test_file_dir_str}/ds_config_{stage}.json".split()
         script = [f"{self.examples_dir_str}/pytorch/language-modeling/run_clm.py"]
-        launcher = self.get_launcher(distributed=True)
+        launcher = get_launcher(distributed=True)
 
         cmd = launcher + script + args + ds_args
         # keep for quick debug
@@ -860,7 +908,7 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
         output_dir = self.get_auto_remove_tmp_dir()
         args = f"""
             --model_type gpt2
-            --tokenizer_name sshleifer/tiny-gpt2
+            --tokenizer_name {GPT2_TINY}
             --train_file {data_dir}/sample_text.txt
             --validation_file {data_dir}/sample_text.txt
             --output_dir {output_dir}
@@ -877,7 +925,7 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
 
         ds_args = f"--deepspeed {self.test_file_dir_str}/ds_config_zero3.json".split()
         script = [f"{self.examples_dir_str}/pytorch/language-modeling/run_clm.py"]
-        launcher = self.get_launcher(distributed=True)
+        launcher = get_launcher(distributed=True)
 
         cmd = launcher + script + args + ds_args
         # keep for quick debug
@@ -885,11 +933,3 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
         with CaptureStderr() as cs:
             execute_subprocess_async(cmd, env=self.get_env())
         assert "Detected DeepSpeed ZeRO-3" in cs.err
-
-    def get_launcher(self, distributed=False):
-        # 1. explicitly set --num_nodes=1 just in case these tests end up run on a multi-node setup
-        # - it won't be able to handle that
-        # 2. for now testing with just 2 gpus max (since some quality tests may give different
-        # results with mode gpus because we use very little data)
-        num_gpus = min(2, get_gpu_count()) if distributed else 1
-        return f"deepspeed --num_nodes 1 --num_gpus {num_gpus}".split()
diff --git a/tests/deepspeed/test_model_zoo.py b/tests/deepspeed/test_model_zoo.py
new file mode 100644
index 0000000000..89548b7acc
--- /dev/null
+++ b/tests/deepspeed/test_model_zoo.py
@@ -0,0 +1,259 @@
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import itertools
+import os
+import subprocess
+
+from parameterized import parameterized
+from transformers import is_torch_available
+from transformers.testing_utils import (
+    ExtendSysPath,
+    TestCasePlus,
+    execute_subprocess_async,
+    get_gpu_count,
+    require_deepspeed,
+    require_torch_gpu,
+    slow,
+)
+from transformers.trainer_utils import set_seed
+
+
+tests_dir = os.path.abspath(os.path.dirname(os.path.dirname(__file__)))
+root_dir = os.path.dirname(tests_dir)
+with ExtendSysPath(tests_dir):
+    from test_trainer import TrainerIntegrationCommon  # noqa
+
+    if is_torch_available():
+        from test_trainer import RegressionModelConfig, RegressionPreTrainedModel, get_regression_trainer  # noqa
+
+
+set_seed(42)
+
+# translation
+FSMT_TINY = "stas/tiny-wmt19-en-de"
+BART_TINY = "sshleifer/bart-tiny-random"
+T5_SMALL = "t5-small"
+T5_TINY = "patrickvonplaten/t5-tiny-random"
+MBART_TINY = "sshleifer/tiny-mbart"
+MARIAN_TINY = "sshleifer/tiny-marian-en-de"
+
+# summarization
+PEGASUS_TINY = "stas/pegasus-cnn_dailymail-tiny-random"
+
+# causal lm
+GPT2_TINY = "sshleifer/tiny-gpt2"
+XLM_ROBERTA_TINY = "hf-internal-testing/tiny-xlm-roberta"
+
+# question-answering
+ROBERTA_TINY = "sshleifer/tiny-distilroberta-base"
+
+# masked lm
+DISTILBERT_TINY = "sshleifer/tiny-distilbert-base-cased"
+ELECTRA_TINY = "hf-internal-testing/tiny-electra"
+
+# classification
+XLNET_TINY = "sshleifer/tiny-xlnet-base-cased"
+BERT_TINY = "hf-internal-testing/tiny-bert"
+
+
+# TODO: to add:
+# albert
+# deberta
+# funnel
+# longformer
+# dpr
+# gpt_neo
+# camembert
+# deberta-v2
+# m2m_100
+# tapas
+# vit
+# big_bird
+
+
+def get_launcher(distributed=False):
+    # 1. explicitly set --num_nodes=1 just in case these tests end up run on a multi-node setup
+    # - it won't be able to handle that
+    # 2. for now testing with just 2 gpus max (since some quality tests may give different
+    # results with more gpus because we use very little data)
+    num_gpus = min(2, get_gpu_count()) if distributed else 1
+    return f"deepspeed --num_nodes 1 --num_gpus {num_gpus}".split()
+
+
+def make_task_cmds():
+    data_dir_fixtures = f"{tests_dir}/fixtures"
+    data_dir_samples = f"{data_dir_fixtures}/tests_samples"
+    data_dir_wmt = f"{data_dir_samples}/wmt_en_ro"
+    data_dir_xsum = f"{data_dir_samples}/xsum"
+    args_main = """
+        --do_train
+        --max_train_samples 4
+        --per_device_train_batch_size 2
+        --num_train_epochs 1
+        --fp16
+        --report_to none
+        --overwrite_output_dir
+        """.split()
+
+    # XXX: try to cover as many models as possible once (it's enough to run on one task per model)
+    # but need a tiny model for each
+    #
+    # should have T5_TINY, etc. global var defined
+    tasks2models = dict(
+        trans=[
+            "bart",
+            "fsmt",
+            "marian",
+            "mbart",
+            "t5",
+        ],
+        sum=[
+            "pegasus",
+        ],
+        clm=[
+            "gpt2",
+            "xlm-roberta",
+        ],
+        mlm=[
+            "electra",
+            "distilbert",
+        ],
+        qa=[
+            "roberta",
+        ],
+        clas=[
+            "bert",
+            "xlnet",
+        ],
+    )
+
+    scripts_dir = f"{root_dir}/examples/pytorch"
+
+    tasks = dict(
+        trans=f"""
+        {scripts_dir}/translation/run_translation.py
+        --train_file {data_dir_wmt}/train.json
+        --source_lang en
+        --target_lang ro
+        """,
+        sum=f"""
+        {scripts_dir}/summarization/run_summarization.py
+        --train_file {data_dir_xsum}/sample.json
+        --max_source_length 12
+        --max_target_length 12
+        """,
+        clm=f"""
+        {scripts_dir}/language-modeling/run_clm.py
+        --train_file {data_dir_fixtures}/sample_text.txt
+        --block_size 8
+        """,
+        mlm=f"""
+        {scripts_dir}/language-modeling/run_mlm.py
+        --train_file {data_dir_fixtures}/sample_text.txt
+        """,
+        qa=f"""
+        {scripts_dir}/question-answering/run_qa.py
+        --train_file {data_dir_samples}/SQUAD/sample.json
+        """,
+        clas=f"""
+        {scripts_dir}/text-classification/run_glue.py
+        --train_file {data_dir_samples}/MRPC/train.csv
+        --max_seq_length 12
+        --task_name MRPC
+        """,
+    )
+
+    launcher = get_launcher(distributed=True)
+
+    cmds = {}
+    for task, args in tasks.items():
+        args = args.split()
+        for model in tasks2models[task]:
+            model_name = globals()[f"{model.upper().replace('-', '_')}_TINY"]
+            args_model = f"--model_name_or_path {model_name}".split()
+            cmds[f"{task}_{model}"] = launcher + args + args_model + args_main
+
+            # # generation special case
+            # if task == "gen":
+            #     launcher = f"deepspeed --num_nodes 1 --num_gpus 1".split()
+            #     args_model += f"--model_type {model}".split()
+            #     cmds[f"{task}_{model}"] = launcher + args + args_model
+            # else:
+
+    return cmds
+
+
+task_cmds = make_task_cmds()
+
+ZERO2 = "zero2"
+ZERO3 = "zero3"
+stages = [ZERO2, ZERO3]
+
+
+def parameterized_custom_name_func(func, param_num, param):
+    # customize the test name generator function as we want both params to appear in the sub-test
+    # name, as by default it shows only the first param
+    param_based_name = parameterized.to_safe_name("_".join(str(x) for x in param.args))
+    return f"{func.__name__}_{param_based_name}"
+
+
+# Cartesian-product of zero stages with models to test
+params = list(itertools.product(stages, task_cmds.keys()))
+
+
+@slow
+@require_deepspeed
+@require_torch_gpu
+class TestDeepSpeedModelZoo(TestCasePlus):
+    """This class is for testing via an external script - can do multiple gpus"""
+
+    def get_task_cmd(self, task, stage):
+        # return a ready to run train cmd
+        if task not in task_cmds:
+            raise ValueError(f"don't know of task {task}, have {task_cmds.keys()}")
+
+        cmd = task_cmds[task]
+        args_ds = f"--deepspeed {self.test_file_dir_str}/ds_config_{stage}.json".split()
+
+        output_dir = self.get_auto_remove_tmp_dir()
+        args_out = f"--output_dir {output_dir}".split()
+
+        cmd += args_ds + args_out
+
+        return cmd, output_dir
+
+    @parameterized.expand(params, name_func=parameterized_custom_name_func)
+    def test_zero_to_fp32(self, stage, task):
+        # testing the ability to do a run followed by recovery of full fp32 weights
+
+        cmd, output_dir = self.get_task_cmd(task, stage)
+
+        # 1. generate the checkpoint
+        cmd += "--save_steps 1".split()
+        # keep for quick debug
+        # print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] + cmd)); die
+        execute_subprocess_async(cmd, env=self.get_env())
+
+        # 2. test that the fp32 weights get reconsolidated
+        chkpt_dir = f"{output_dir}/checkpoint-1"
+        recovered_model_path = f"{chkpt_dir}/out.bin"
+        cmd = f"{chkpt_dir}/zero_to_fp32.py {chkpt_dir} {recovered_model_path}"
+        # keep for quick debug
+        # print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die
+        subprocess.check_call(cmd, shell=True)
+        assert os.path.exists(recovered_model_path), f"{recovered_model_path} was not found"
+
+        # possibly could also test that the resulting saved model is usable but given that we use
+        # random models we won't know if it's any good