From 41b0564b35b2971ba2c0599938a3c4e00f78130f Mon Sep 17 00:00:00 2001
From: Stas Bekman <stas00@users.noreply.github.com>
Date: Thu, 12 Jan 2023 08:52:54 -0800
Subject: [PATCH] [bnb optim] fixing test (#21030)

* [bnb optim] fixing test

* force 1 gpu

* fix

* fix

* fix

* finalize

* improve commentary

* fix

* cleanup

* more fixes
---
 src/transformers/trainer.py        |   4 ++
 tests/extended/test_trainer_ext.py | 112 +++++++++++++++--------------
 2 files changed, 63 insertions(+), 53 deletions(-)

diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py
index 93b998fc0c..d498e804d0 100755
--- a/src/transformers/trainer.py
+++ b/src/transformers/trainer.py
@@ -1044,10 +1044,14 @@ class Trainer:
 
                     manager = bitsandbytes.optim.GlobalOptimManager.get_instance()
 
+                    skipped = 0
                     for module in opt_model.modules():
                         if isinstance(module, nn.Embedding):
+                            skipped += sum(dict((p.data_ptr(), p.numel()) for p in module.parameters()).values())
+                            print(f"skipped {module}: {skipped/2**20}M params")
                             manager.register_module_override(module, "weight", {"optim_bits": 32})
                             logger.debug(f"bitsandbytes: will optimize {module} in fp32")
+                    print(f"skipped: {skipped/2**20}M params")
 
         if is_sagemaker_mp_enabled():
             self.optimizer = smp.DistributedOptimizer(self.optimizer)
diff --git a/tests/extended/test_trainer_ext.py b/tests/extended/test_trainer_ext.py
index 64c244ae8e..d0fc558216 100644
--- a/tests/extended/test_trainer_ext.py
+++ b/tests/extended/test_trainer_ext.py
@@ -17,11 +17,11 @@ import os
 import re
 import sys
 import unittest
+from pathlib import Path
 from typing import Tuple
 from unittest.mock import patch
 
 from parameterized import parameterized
-from transformers import AutoModel
 from transformers.testing_utils import (
     CaptureStderr,
     ExtendSysPath,
@@ -207,96 +207,97 @@ class TestTrainerExt(TestCasePlus):
         from transformers.training_args import OptimizerNames
 
         def train_and_return_metrics(optim: str) -> Tuple[int, float]:
-            from pathlib import Path
-
-            extra_args = (
-                f"--skip_memory_metrics 0 --optim {optim} --do_eval False --do_predict "
-                "False --adafactor False --log_level debug"
-            )
+            extra_args = "--skip_memory_metrics 0"
 
             output_dir = self.run_trainer(
-                eval_steps=2,
                 max_len=128,
                 model_name=MARIAN_MODEL,
                 learning_rate=3e-4,
                 num_train_epochs=1,
+                optim=optim,
                 distributed=True,  # force run in a new process
                 extra_args_str=extra_args,
                 do_eval=False,
                 do_predict=False,
+                n_gpus_to_use=1,  # to allow deterministic fixed memory usage
             )
 
             # Check metrics
             logs = TrainerState.load_from_json(Path(output_dir, "trainer_state.json")).log_history
-            gpu_peak_mem = logs[0]["train_mem_gpu_peaked_delta"]
-            gpu_alloc_mem = logs[0]["train_mem_gpu_alloc_delta"]
+            gpu_peak_mem_mb = int(logs[0]["train_mem_gpu_peaked_delta"] / 2**20)
+            gpu_alloc_mem_mb = int(logs[0]["train_mem_gpu_alloc_delta"] / 2**20)
 
             loss = logs[0]["train_loss"]
-            return gpu_peak_mem, gpu_alloc_mem, loss
+            return gpu_peak_mem_mb, gpu_alloc_mem_mb, loss
 
         gpu_peak_mem_orig, gpu_alloc_mem_orig, loss_orig = train_and_return_metrics(OptimizerNames.ADAMW_TORCH.value)
         gpu_peak_mem_bnb, gpu_alloc_mem_bnb, loss_bnb = train_and_return_metrics(OptimizerNames.ADAMW_BNB.value)
 
-        gpu_peak_mem_diff_bytes = gpu_peak_mem_orig - gpu_peak_mem_bnb
-        gpu_peak_mem_diff_percent = gpu_peak_mem_diff_bytes / gpu_peak_mem_bnb
+        gpu_alloc_mem_diff = gpu_alloc_mem_orig - gpu_alloc_mem_bnb
 
         gpu_total_mem_orig = gpu_peak_mem_orig + gpu_alloc_mem_orig
         gpu_total_mem_bnb = gpu_peak_mem_bnb + gpu_alloc_mem_bnb
+        gpu_total_mem_diff = gpu_total_mem_orig - gpu_total_mem_bnb
 
-        gpu_total_mem_diff_bytes = gpu_total_mem_orig - gpu_total_mem_bnb
-        gpu_total_mem_diff_percent = gpu_total_mem_diff_bytes / gpu_total_mem_bnb
+        # sshleifer/student_marian_en_ro_6_1 has 54M parameters, 29M of which are in `nn.Embedding`, which
+        # doesn't get quantized and remains in fp32. Therefore only 25M parameters are quantized to
+        # 2 bytes each, and the diff in optim memory usage is derived as follows:
+        #
+        # - normal 25*8=~200MB (8 bytes per param)
+        # - bnb    25*2= ~50MB (2 bytes per param)
+        #
+        # Thus we should expect ~150MB total memory saved.
+        #
+        # Peak memory should be the same - the total should be different by about that same margin
+        #
+        # After leaving a small margin to accommodate differences between GPUs, let's check
+        # that we have at least 120MB in savings
+        expected_savings = 120
 
-        # leave this for now if CI gets very different results
-        # print(f"{gpu_alloc_mem_orig=:010d} {gpu_peak_mem_orig=:010d} {gpu_alloc_mem_orig+gpu_peak_mem_orig=:010d}" )
-        # print(f" {gpu_alloc_mem_bnb=:010d}  {gpu_peak_mem_bnb=:010d}   {gpu_alloc_mem_bnb+gpu_peak_mem_bnb=:010d}")
-        # print(f"{gpu_peak_mem_diff_bytes=}, {gpu_peak_mem_diff_percent=}")
-        # print(f"{gpu_total_mem_orig=}, {gpu_total_mem_bnb=}")
-        # print(f"{gpu_total_mem_diff_bytes=}, {gpu_total_mem_diff_percent=}")
+        # uncomment the following if this test starts failing - requires py38+ for the f-string `=` specifier
+        # gpu_peak_mem_diff = gpu_peak_mem_orig - gpu_peak_mem_bnb
+        # print(f"{gpu_alloc_mem_orig=}MB {gpu_peak_mem_orig=}MB {gpu_alloc_mem_orig+gpu_peak_mem_orig=}MB")
+        # print(f" {gpu_alloc_mem_bnb=}MB  {gpu_peak_mem_bnb=}MB  {gpu_alloc_mem_bnb+gpu_peak_mem_bnb=}MB")
+        # print(f"{gpu_alloc_mem_diff=}MB")
+        # print(f"{gpu_peak_mem_diff=}MB")
+        # print(f"{gpu_total_mem_orig=}MB, {gpu_total_mem_bnb=}MB")
+        # print(f"{gpu_total_mem_diff=}MB")
 
         self.assertGreater(
-            gpu_peak_mem_diff_percent,
-            10,  # basically a huge difference - got ~30x on my desktop
-            "should use very little peak gpu memory with BNB, compared to without it"
-            f"but got gpu_peak_mem_orig={gpu_peak_mem_orig} and gpu_peak_mem_bnb={gpu_peak_mem_bnb}",
+            gpu_alloc_mem_diff,
+            expected_savings,
+            "should use ~150MB less alloc gpu memory with BNB, compared to without it for this model but got"
+            f" a difference of {gpu_alloc_mem_diff}MB, with gpu_alloc_mem_orig={gpu_alloc_mem_orig}MB and"
+            f" gpu_alloc_mem_bnb={gpu_alloc_mem_bnb}MB",
         )
 
         self.assertGreater(
-            gpu_total_mem_diff_percent,
-            0.20,  # could easily be 0.50, but let's stay on the safe side
-            "Using BNB should use less total GPU memory than without it"
-            f"but got gpu_total_mem_orig={gpu_total_mem_orig} and gpu_total_mem_bnb={gpu_total_mem_bnb}",
+            gpu_total_mem_diff,
+            expected_savings,
+            "should use ~150MB less total gpu memory with BNB, compared to without it for this model but got"
+            f" a difference of {gpu_total_mem_diff}MB, with gpu_total_mem_orig={gpu_total_mem_orig}MB and"
+            f" gpu_total_mem_bnb={gpu_total_mem_bnb}MB",
         )
 
         self.assertEqual(
             loss_orig, loss_bnb, f"loss should be the same, but got loss_orig={loss_orig}, loss_bnb={loss_bnb}"
         )
 
-        # Additionally let's test that the absolute gpu memory difference is larger or about the
-        # same as the expected saving coming from BNB (6 bytes per param)
-        model = AutoModel.from_pretrained(MARIAN_MODEL)
-        total_numel = sum(dict((p.data_ptr(), p.numel()) for p in model.parameters()).values())
-        bnb_saved_bytes = total_numel * 6  # 324MB
-
-        self.assertGreater(
-            gpu_total_mem_diff_bytes,
-            bnb_saved_bytes * 0.8,  # add a safety margin, if it saved slightly less
-            f"BNB should have saved about {bnb_saved_bytes} bytes, but the saved bytes were"
-            f" {gpu_total_mem_diff_bytes}",
-        )
-
     def run_trainer(
         self,
-        eval_steps: int,
         max_len: int,
         model_name: str,
         num_train_epochs: int,
         learning_rate: float = 3e-3,
+        optim: str = "adafactor",
         distributed: bool = False,
         extra_args_str: str = None,
+        eval_steps: int = 0,
         predict_with_generate: bool = True,
         do_train: bool = True,
         do_eval: bool = True,
         do_predict: bool = True,
+        n_gpus_to_use: int = None,
     ):
         data_dir = self.test_file_dir / "../fixtures/tests_samples/wmt_en_ro"
         output_dir = self.get_auto_remove_tmp_dir()
@@ -320,10 +321,9 @@ class TestTrainerExt(TestCasePlus):
             --save_steps {str(eval_steps)}
             --group_by_length
             --label_smoothing_factor 0.1
-            --adafactor
             --target_lang ro_RO
             --source_lang en_XX
-        """
+        """.split()
 
         args_eval = f"""
             --do_eval
@@ -332,13 +332,13 @@ class TestTrainerExt(TestCasePlus):
             --val_max_target_length {max_len}
             --evaluation_strategy steps
             --eval_steps {str(eval_steps)}
-        """
+        """.split()
 
         args_predict = """
             --do_predict
-        """
+        """.split()
 
-        args = ""
+        args = []
         if do_train:
             args += args_train
 
@@ -349,19 +349,25 @@ class TestTrainerExt(TestCasePlus):
             args += args_predict
 
         if predict_with_generate:
-            args += "--predict_with_generate"
+            args += "--predict_with_generate".split()
 
-        args = args.split()
+        if do_train:
+            if optim == "adafactor":
+                args += "--adafactor".split()
+            else:
+                args += f"--optim {optim}".split()
 
         if extra_args_str is not None:
-            args.extend(extra_args_str.split())
+            args += extra_args_str.split()
 
         if distributed:
-            n_gpu = get_gpu_count()
+
+            if n_gpus_to_use is None:
+                n_gpus_to_use = get_gpu_count()
             master_port = get_torch_dist_unique_port()
             distributed_args = f"""
                 -m torch.distributed.launch
-                --nproc_per_node={n_gpu}
+                --nproc_per_node={n_gpus_to_use}
                 --master_port={master_port}
                 {self.examples_dir_str}/pytorch/translation/run_translation.py
             """.split()