From 41b0564b35b2971ba2c0599938a3c4e00f78130f Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Thu, 12 Jan 2023 08:52:54 -0800 Subject: [PATCH] [bnb optim] fixing test (#21030) * [bnb optim] fixing test * force 1 gpu * fix * fix * fix * finalize * improve commentary * fix * cleanup * more fixes --- src/transformers/trainer.py | 4 ++ tests/extended/test_trainer_ext.py | 112 +++++++++++++++-------------- 2 files changed, 63 insertions(+), 53 deletions(-) diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index 93b998fc0c..d498e804d0 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -1044,10 +1044,14 @@ class Trainer: manager = bitsandbytes.optim.GlobalOptimManager.get_instance() + skipped = 0 for module in opt_model.modules(): if isinstance(module, nn.Embedding): + skipped += sum(dict((p.data_ptr(), p.numel()) for p in module.parameters()).values()) + print(f"skipped {module}: {skipped/2**20}M params") manager.register_module_override(module, "weight", {"optim_bits": 32}) logger.debug(f"bitsandbytes: will optimize {module} in fp32") + print(f"skipped: {skipped/2**20}M params") if is_sagemaker_mp_enabled(): self.optimizer = smp.DistributedOptimizer(self.optimizer) diff --git a/tests/extended/test_trainer_ext.py b/tests/extended/test_trainer_ext.py index 64c244ae8e..d0fc558216 100644 --- a/tests/extended/test_trainer_ext.py +++ b/tests/extended/test_trainer_ext.py @@ -17,11 +17,11 @@ import os import re import sys import unittest +from pathlib import Path from typing import Tuple from unittest.mock import patch from parameterized import parameterized -from transformers import AutoModel from transformers.testing_utils import ( CaptureStderr, ExtendSysPath, @@ -207,96 +207,97 @@ class TestTrainerExt(TestCasePlus): from transformers.training_args import OptimizerNames def train_and_return_metrics(optim: str) -> Tuple[int, float]: - from pathlib import Path - - extra_args = ( - f"--skip_memory_metrics 0 --optim {optim} 
--do_eval False --do_predict " - "False --adafactor False --log_level debug" - ) + extra_args = "--skip_memory_metrics 0" output_dir = self.run_trainer( - eval_steps=2, max_len=128, model_name=MARIAN_MODEL, learning_rate=3e-4, num_train_epochs=1, + optim=optim, distributed=True, # force run in a new process extra_args_str=extra_args, do_eval=False, do_predict=False, + n_gpus_to_use=1, # to allow deterministic fixed memory usage ) # Check metrics logs = TrainerState.load_from_json(Path(output_dir, "trainer_state.json")).log_history - gpu_peak_mem = logs[0]["train_mem_gpu_peaked_delta"] - gpu_alloc_mem = logs[0]["train_mem_gpu_alloc_delta"] + gpu_peak_mem_mb = int(logs[0]["train_mem_gpu_peaked_delta"] / 2**20) + gpu_alloc_mem_mb = int(logs[0]["train_mem_gpu_alloc_delta"] / 2**20) loss = logs[0]["train_loss"] - return gpu_peak_mem, gpu_alloc_mem, loss + return gpu_peak_mem_mb, gpu_alloc_mem_mb, loss gpu_peak_mem_orig, gpu_alloc_mem_orig, loss_orig = train_and_return_metrics(OptimizerNames.ADAMW_TORCH.value) gpu_peak_mem_bnb, gpu_alloc_mem_bnb, loss_bnb = train_and_return_metrics(OptimizerNames.ADAMW_BNB.value) - gpu_peak_mem_diff_bytes = gpu_peak_mem_orig - gpu_peak_mem_bnb - gpu_peak_mem_diff_percent = gpu_peak_mem_diff_bytes / gpu_peak_mem_bnb + gpu_alloc_mem_diff = gpu_alloc_mem_orig - gpu_alloc_mem_bnb gpu_total_mem_orig = gpu_peak_mem_orig + gpu_alloc_mem_orig gpu_total_mem_bnb = gpu_peak_mem_bnb + gpu_alloc_mem_bnb + gpu_total_mem_diff = gpu_total_mem_orig - gpu_total_mem_bnb - gpu_total_mem_diff_bytes = gpu_total_mem_orig - gpu_total_mem_bnb - gpu_total_mem_diff_percent = gpu_total_mem_diff_bytes / gpu_total_mem_bnb + # sshleifer/student_marian_en_ro_6_1 has 54M parameters, 29M of which is `nn.Embedding` which + # doesn't get quantized and remains in fp32. 
Therefore we only have 25M parameters quantized + # in 2 bytes and the diff in optim memory usage is derived as so: + # + # - normal 25*8=~200MB (8 bytes per param) + # - bnb 25*2= ~50MB (2 bytes per param) + # + # Thus we should expect ~150MB total memory saved. + # + # Peak memory should be the same - the total should be different by about that same margin + # + # After leaving a small margin to accommodate for differences between gpus let's check + # that we have at least 120MB in savings + expected_savings = 120 - # leave this for now if CI gets very different results - # print(f"{gpu_alloc_mem_orig=:010d} {gpu_peak_mem_orig=:010d} {gpu_alloc_mem_orig+gpu_peak_mem_orig=:010d}" ) - # print(f" {gpu_alloc_mem_bnb=:010d} {gpu_peak_mem_bnb=:010d} {gpu_alloc_mem_bnb+gpu_peak_mem_bnb=:010d}") - # print(f"{gpu_peak_mem_diff_bytes=}, {gpu_peak_mem_diff_percent=}") - # print(f"{gpu_total_mem_orig=}, {gpu_total_mem_bnb=}") - # print(f"{gpu_total_mem_diff_bytes=}, {gpu_total_mem_diff_percent=}") + # uncomment the following if this test starts failing - requires py38 for a new print feature + # gpu_peak_mem_diff = gpu_peak_mem_orig - gpu_peak_mem_bnb + # print(f"{gpu_alloc_mem_orig=}MB {gpu_peak_mem_orig=}MB {gpu_alloc_mem_orig+gpu_peak_mem_orig=}MB") + # print(f" {gpu_alloc_mem_bnb=}MB {gpu_peak_mem_bnb=}MB {gpu_alloc_mem_bnb+gpu_peak_mem_bnb=}MB") + # print(f"{gpu_alloc_mem_diff=}MB") + # print(f"{gpu_peak_mem_diff=}MB") + # print(f"{gpu_total_mem_orig=}MB, {gpu_total_mem_bnb=}MB") + # print(f"{gpu_total_mem_diff=}MB, {gpu_total_mem_diff=}MB") self.assertGreater( - gpu_peak_mem_diff_percent, - 10, # basically a huge difference - got ~30x on my desktop - "should use very little peak gpu memory with BNB, compared to without it" - f"but got gpu_peak_mem_orig={gpu_peak_mem_orig} and gpu_peak_mem_bnb={gpu_peak_mem_bnb}", + gpu_alloc_mem_diff, + expected_savings, + "should use ~150MB less alloc gpu memory with BNB, compared to without it for this model but got" + f" a 
difference of {gpu_alloc_mem_diff}MB, with gpu_alloc_mem_orig={gpu_alloc_mem_orig}MB and" + f" gpu_alloc_mem_bnb={gpu_alloc_mem_bnb}MB", ) self.assertGreater( - gpu_total_mem_diff_percent, - 0.20, # could easily be 0.50, but let's stay on the safe side - "Using BNB should use less total GPU memory than without it" - f"but got gpu_total_mem_orig={gpu_total_mem_orig} and gpu_total_mem_bnb={gpu_total_mem_bnb}", + gpu_total_mem_diff, + expected_savings, + "should use ~150MB less total gpu memory with BNB, compared to without it for this model but got" + f" a difference of {gpu_total_mem_diff}MB, with gpu_total_mem_orig={gpu_total_mem_orig}MB and" + f" gpu_total_mem_bnb={gpu_total_mem_bnb}MB", ) self.assertEqual( loss_orig, loss_bnb, f"loss should be the same, but got loss_orig={loss_orig}, loss_bnb={loss_bnb}" ) - # Additionally let's test that the absolute gpu memory difference is larger or about the - # same as the expected saving coming from BNB (6 bytes per param) - model = AutoModel.from_pretrained(MARIAN_MODEL) - total_numel = sum(dict((p.data_ptr(), p.numel()) for p in model.parameters()).values()) - bnb_saved_bytes = total_numel * 6 # 324MB - - self.assertGreater( - gpu_total_mem_diff_bytes, - bnb_saved_bytes * 0.8, # add a safety margin, if it saved slightly less - f"BNB should have saved about {bnb_saved_bytes} bytes, but the saved bytes were" - f" {gpu_total_mem_diff_bytes}", - ) - def run_trainer( self, - eval_steps: int, max_len: int, model_name: str, num_train_epochs: int, learning_rate: float = 3e-3, + optim: str = "adafactor", distributed: bool = False, extra_args_str: str = None, + eval_steps: int = 0, predict_with_generate: bool = True, do_train: bool = True, do_eval: bool = True, do_predict: bool = True, + n_gpus_to_use: int = None, ): data_dir = self.test_file_dir / "../fixtures/tests_samples/wmt_en_ro" output_dir = self.get_auto_remove_tmp_dir() @@ -320,10 +321,9 @@ class TestTrainerExt(TestCasePlus): --save_steps {str(eval_steps)} 
--group_by_length --label_smoothing_factor 0.1 - --adafactor --target_lang ro_RO --source_lang en_XX - """ + """.split() args_eval = f""" --do_eval @@ -332,13 +332,13 @@ class TestTrainerExt(TestCasePlus): --val_max_target_length {max_len} --evaluation_strategy steps --eval_steps {str(eval_steps)} - """ + """.split() args_predict = """ --do_predict - """ + """.split() - args = "" + args = [] if do_train: args += args_train @@ -349,19 +349,25 @@ class TestTrainerExt(TestCasePlus): args += args_predict if predict_with_generate: - args += "--predict_with_generate" + args += "--predict_with_generate".split() - args = args.split() + if do_train: + if optim == "adafactor": + args += "--adafactor".split() + else: + args += f"--optim {optim}".split() if extra_args_str is not None: - args.extend(extra_args_str.split()) + args += extra_args_str.split() if distributed: - n_gpu = get_gpu_count() + + if n_gpus_to_use is None: + n_gpus_to_use = get_gpu_count() master_port = get_torch_dist_unique_port() distributed_args = f""" -m torch.distributed.launch - --nproc_per_node={n_gpu} + --nproc_per_node={n_gpus_to_use} --master_port={master_port} {self.examples_dir_str}/pytorch/translation/run_translation.py """.split()