From 143289dcf759a663c03317e30167e89ee6d86588 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Mon, 4 Jan 2021 12:09:12 -0800 Subject: [PATCH] [test_model_parallelization] multiple fixes (#9354) --- tests/test_modeling_common.py | 46 ++++++++++++++++++++--------------- 1 file changed, 26 insertions(+), 20 deletions(-) diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index 2b72056653..e33efd34b4 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -14,6 +14,7 @@ # limitations under the License. import copy +import gc import inspect import os.path import random @@ -1081,15 +1082,15 @@ class ModelTesterMixin: if not self.test_model_parallel: return - import subprocess - + # a candidate for testing_utils def get_current_gpu_memory_use(): - run_process = subprocess.Popen( - "nvidia-smi --query-gpu=memory.used --format=csv,nounits,noheader", shell=True, stdout=subprocess.PIPE - ) + """ returns a list of cuda memory allocations per GPU in MBs""" + + per_device_memory = [] + for id in range(torch.cuda.device_count()): + with torch.cuda.device(id): + per_device_memory.append(torch.cuda.memory_allocated() >> 20) - memory_usage = run_process.stdout.read().decode("utf-8").strip() - per_device_memory = [int(memory) for memory in memory_usage.split("\n")] return per_device_memory # Needs a large model to see the difference. @@ -1098,39 +1099,44 @@ class ModelTesterMixin: for model_class in self.all_parallelizable_model_classes: torch.cuda.empty_cache() - # Retrieve initial memory usage (should be close to 0) - initial_memory = get_current_gpu_memory_use() + # 1. single gpu memory load + unload + memory measurements + # Retrieve initial memory usage (can easily be ~0.6-1.5GB if cuda-kernels have been preloaded by previous tests) + memory_at_start = get_current_gpu_memory_use() - # Put model on device - model = model_class(config.from_pretrained("gpt2")) + # Put model on device 0 and take a memory snapshot + model = model_class(config) model.to("cuda:0") - - # Retrieve the memory after the model is put on the device memory_after_model_load = get_current_gpu_memory_use() + # The memory use on device 0 should be higher than it was initially. + self.assertGreater(memory_after_model_load[0], memory_at_start[0]) + del model + gc.collect() torch.cuda.empty_cache() - # The memory use on that device should be higher than it was initially. - self.assertGreater(memory_after_model_load[0], initial_memory[0]) + # 2. MP test + # it's essential to re-calibrate the usage before the next stage + memory_at_start = get_current_gpu_memory_use() # Spread model layers over multiple devices - model = model_class(config.from_pretrained("gpt2")) + model = model_class(config) model.parallelize() memory_after_parallelization = get_current_gpu_memory_use() # Assert that the memory use on all devices is higher than it was when loaded only on CPU for n in range(torch.cuda.device_count()): - self.assertGreater(memory_after_parallelization[n], initial_memory[n]) + self.assertGreater(memory_after_parallelization[n], memory_at_start[n]) - # Assert that the memory use of the first device is lower than it was when the entire model was loaded on it + # Assert that the memory use of device 0 is lower than it was when the entire model was loaded on it self.assertLess(memory_after_parallelization[0], memory_after_model_load[0]) - # Assert that the memory use of the second device is higher than it was when the entire model was loaded - # on the other device. + # Assert that the memory use of device 1 is higher than it was when the entire model was loaded + # on device 0 and device 1 wasn't used at all self.assertGreater(memory_after_parallelization[1], memory_after_model_load[1]) del model + gc.collect() torch.cuda.empty_cache() @require_torch_multi_gpu