From 8343901263f5786714c3546724638665ac0d493b Mon Sep 17 00:00:00 2001 From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Date: Fri, 3 Jun 2022 09:59:13 -0400 Subject: [PATCH] Fix all offload and MP tests (#17533) --- src/transformers/modeling_utils.py | 3 ++- tests/models/opt/test_modeling_opt.py | 2 +- tests/models/t5/test_modeling_t5.py | 2 ++ tests/test_modeling_common.py | 23 ++++------------------- 4 files changed, 9 insertions(+), 21 deletions(-) diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index 2eefce0cce..e18854d205 100644 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -574,7 +574,6 @@ def _load_state_dict_into_meta_model( for param_name, param in state_dict.items(): # First part of the test is always true as load_state_dict_keys always contains state_dict keys. if param_name not in loaded_state_dict_keys or param_name not in expected_keys: - print(param_name) continue if param_name.startswith(start_prefix): @@ -2124,6 +2123,8 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix if model._no_split_modules is None: raise ValueError(f"{model.__class__.__name__} does not support `device_map='auto'` yet.") no_split_modules = model._no_split_modules + # Make sure tied weights are tied before creating the device map. + model.tie_weights() device_map = infer_auto_device_map( model, no_split_module_classes=no_split_modules, dtype=torch_dtype, max_memory=max_memory ) diff --git a/tests/models/opt/test_modeling_opt.py b/tests/models/opt/test_modeling_opt.py index 97f93b779b..8018d05f09 100644 --- a/tests/models/opt/test_modeling_opt.py +++ b/tests/models/opt/test_modeling_opt.py @@ -63,7 +63,7 @@ class OPTModelTester: use_labels=False, vocab_size=99, hidden_size=16, - num_hidden_layers=2, + num_hidden_layers=5, num_attention_heads=4, intermediate_size=4, hidden_act="gelu", diff --git a/tests/models/t5/test_modeling_t5.py b/tests/models/t5/test_modeling_t5.py index 4485e65eec..1fc1a0a766 100644 --- a/tests/models/t5/test_modeling_t5.py +++ b/tests/models/t5/test_modeling_t5.py @@ -515,6 +515,8 @@ class T5ModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): test_resize_embeddings = True test_model_parallel = True is_encoder_decoder = True + # The small T5 model needs higher percentages for CPU/MP tests + model_split_percents = [0.8, 0.9] def setUp(self): self.model_tester = T5ModelTester(self) diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index 9690c49358..927eccefd4 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -153,6 +153,7 @@ class ModelTesterMixin: test_model_parallel = False is_encoder_decoder = False has_attentions = True + model_split_percents = [0.5, 0.7, 0.9] def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): inputs_dict = copy.deepcopy(inputs_dict) @@ -2217,12 +2218,7 @@ class ModelTesterMixin: @require_accelerate @require_torch_gpu def test_disk_offload(self): - if all([model_class._no_split_modules is None for model_class in self.all_model_classes]): - return - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - if isinstance(getattr(config, "num_hidden_layers", None), int) and config.num_hidden_layers < 4: - config.num_hidden_layers = 4 for model_class in self.all_model_classes: if model_class._no_split_modules is None: @@ -2234,8 +2230,7 @@ class ModelTesterMixin: base_output = model(**inputs_dict) model_size = compute_module_sizes(model)[""] - # We test several splits of sizes to make sure it works. - max_size = int(0.4 * model_size) + max_size = int(self.model_split_percents[0] * model_size) with tempfile.TemporaryDirectory() as tmp_dir: model.cpu().save_pretrained(tmp_dir) @@ -2256,12 +2251,7 @@ class ModelTesterMixin: @require_accelerate @require_torch_gpu def test_cpu_offload(self): - if all([model_class._no_split_modules is None for model_class in self.all_model_classes]): - return - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - if isinstance(getattr(config, "num_hidden_layers", None), int) and config.num_hidden_layers < 4: - config.num_hidden_layers = 4 for model_class in self.all_model_classes: if model_class._no_split_modules is None: @@ -2274,7 +2264,7 @@ class ModelTesterMixin: model_size = compute_module_sizes(model)[""] # We test several splits of sizes to make sure it works. - max_gpu_sizes = [int(p * model_size) for p in [0.5, 0.7, 0.9]] + max_gpu_sizes = [int(p * model_size) for p in self.model_split_percents] with tempfile.TemporaryDirectory() as tmp_dir: model.cpu().save_pretrained(tmp_dir) @@ -2292,12 +2282,7 @@ class ModelTesterMixin: @require_accelerate @require_torch_multi_gpu def test_model_parallelism(self): - if all([model_class._no_split_modules is None for model_class in self.all_model_classes]): - return - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - if isinstance(getattr(config, "num_hidden_layers", None), int) and config.num_hidden_layers < 4: - config.num_hidden_layers = 4 for model_class in self.all_model_classes: if model_class._no_split_modules is None: @@ -2310,7 +2295,7 @@ class ModelTesterMixin: model_size = compute_module_sizes(model)[""] # We test several splits of sizes to make sure it works. - max_gpu_sizes = [int(p * model_size) for p in [0.5, 0.7, 0.9]] + max_gpu_sizes = [int(p * model_size) for p in self.model_split_percents] with tempfile.TemporaryDirectory() as tmp_dir: model.cpu().save_pretrained(tmp_dir)