[bnb optim] fixing test (#21030)
* [bnb optim] fixing test * force 1 gpu * fix * fix * fix * finalize * improve commentary * fix * cleanup * more fixes
This commit is contained in:
@@ -1044,10 +1044,14 @@ class Trainer:
|
|||||||
|
|
||||||
manager = bitsandbytes.optim.GlobalOptimManager.get_instance()
|
manager = bitsandbytes.optim.GlobalOptimManager.get_instance()
|
||||||
|
|
||||||
|
skipped = 0
|
||||||
for module in opt_model.modules():
|
for module in opt_model.modules():
|
||||||
if isinstance(module, nn.Embedding):
|
if isinstance(module, nn.Embedding):
|
||||||
|
skipped += sum(dict((p.data_ptr(), p.numel()) for p in module.parameters()).values())
|
||||||
|
print(f"skipped {module}: {skipped/2**20}M params")
|
||||||
manager.register_module_override(module, "weight", {"optim_bits": 32})
|
manager.register_module_override(module, "weight", {"optim_bits": 32})
|
||||||
logger.debug(f"bitsandbytes: will optimize {module} in fp32")
|
logger.debug(f"bitsandbytes: will optimize {module} in fp32")
|
||||||
|
print(f"skipped: {skipped/2**20}M params")
|
||||||
|
|
||||||
if is_sagemaker_mp_enabled():
|
if is_sagemaker_mp_enabled():
|
||||||
self.optimizer = smp.DistributedOptimizer(self.optimizer)
|
self.optimizer = smp.DistributedOptimizer(self.optimizer)
|
||||||
|
|||||||
@@ -17,11 +17,11 @@ import os
|
|||||||
import re
|
import re
|
||||||
import sys
|
import sys
|
||||||
import unittest
|
import unittest
|
||||||
|
from pathlib import Path
|
||||||
from typing import Tuple
|
from typing import Tuple
|
||||||
from unittest.mock import patch
|
from unittest.mock import patch
|
||||||
|
|
||||||
from parameterized import parameterized
|
from parameterized import parameterized
|
||||||
from transformers import AutoModel
|
|
||||||
from transformers.testing_utils import (
|
from transformers.testing_utils import (
|
||||||
CaptureStderr,
|
CaptureStderr,
|
||||||
ExtendSysPath,
|
ExtendSysPath,
|
||||||
@@ -207,96 +207,97 @@ class TestTrainerExt(TestCasePlus):
|
|||||||
from transformers.training_args import OptimizerNames
|
from transformers.training_args import OptimizerNames
|
||||||
|
|
||||||
def train_and_return_metrics(optim: str) -> Tuple[int, float]:
|
def train_and_return_metrics(optim: str) -> Tuple[int, float]:
|
||||||
from pathlib import Path
|
extra_args = "--skip_memory_metrics 0"
|
||||||
|
|
||||||
extra_args = (
|
|
||||||
f"--skip_memory_metrics 0 --optim {optim} --do_eval False --do_predict "
|
|
||||||
"False --adafactor False --log_level debug"
|
|
||||||
)
|
|
||||||
|
|
||||||
output_dir = self.run_trainer(
|
output_dir = self.run_trainer(
|
||||||
eval_steps=2,
|
|
||||||
max_len=128,
|
max_len=128,
|
||||||
model_name=MARIAN_MODEL,
|
model_name=MARIAN_MODEL,
|
||||||
learning_rate=3e-4,
|
learning_rate=3e-4,
|
||||||
num_train_epochs=1,
|
num_train_epochs=1,
|
||||||
|
optim=optim,
|
||||||
distributed=True, # force run in a new process
|
distributed=True, # force run in a new process
|
||||||
extra_args_str=extra_args,
|
extra_args_str=extra_args,
|
||||||
do_eval=False,
|
do_eval=False,
|
||||||
do_predict=False,
|
do_predict=False,
|
||||||
|
n_gpus_to_use=1, # to allow deterministic fixed memory usage
|
||||||
)
|
)
|
||||||
|
|
||||||
# Check metrics
|
# Check metrics
|
||||||
logs = TrainerState.load_from_json(Path(output_dir, "trainer_state.json")).log_history
|
logs = TrainerState.load_from_json(Path(output_dir, "trainer_state.json")).log_history
|
||||||
gpu_peak_mem = logs[0]["train_mem_gpu_peaked_delta"]
|
gpu_peak_mem_mb = int(logs[0]["train_mem_gpu_peaked_delta"] / 2**20)
|
||||||
gpu_alloc_mem = logs[0]["train_mem_gpu_alloc_delta"]
|
gpu_alloc_mem_mb = int(logs[0]["train_mem_gpu_alloc_delta"] / 2**20)
|
||||||
|
|
||||||
loss = logs[0]["train_loss"]
|
loss = logs[0]["train_loss"]
|
||||||
return gpu_peak_mem, gpu_alloc_mem, loss
|
return gpu_peak_mem_mb, gpu_alloc_mem_mb, loss
|
||||||
|
|
||||||
gpu_peak_mem_orig, gpu_alloc_mem_orig, loss_orig = train_and_return_metrics(OptimizerNames.ADAMW_TORCH.value)
|
gpu_peak_mem_orig, gpu_alloc_mem_orig, loss_orig = train_and_return_metrics(OptimizerNames.ADAMW_TORCH.value)
|
||||||
gpu_peak_mem_bnb, gpu_alloc_mem_bnb, loss_bnb = train_and_return_metrics(OptimizerNames.ADAMW_BNB.value)
|
gpu_peak_mem_bnb, gpu_alloc_mem_bnb, loss_bnb = train_and_return_metrics(OptimizerNames.ADAMW_BNB.value)
|
||||||
|
|
||||||
gpu_peak_mem_diff_bytes = gpu_peak_mem_orig - gpu_peak_mem_bnb
|
gpu_alloc_mem_diff = gpu_alloc_mem_orig - gpu_alloc_mem_bnb
|
||||||
gpu_peak_mem_diff_percent = gpu_peak_mem_diff_bytes / gpu_peak_mem_bnb
|
|
||||||
|
|
||||||
gpu_total_mem_orig = gpu_peak_mem_orig + gpu_alloc_mem_orig
|
gpu_total_mem_orig = gpu_peak_mem_orig + gpu_alloc_mem_orig
|
||||||
gpu_total_mem_bnb = gpu_peak_mem_bnb + gpu_alloc_mem_bnb
|
gpu_total_mem_bnb = gpu_peak_mem_bnb + gpu_alloc_mem_bnb
|
||||||
|
gpu_total_mem_diff = gpu_total_mem_orig - gpu_total_mem_bnb
|
||||||
|
|
||||||
gpu_total_mem_diff_bytes = gpu_total_mem_orig - gpu_total_mem_bnb
|
# sshleifer/student_marian_en_ro_6_1 has 54M parameters, 29M of which are in `nn.Embedding`, which
|
||||||
gpu_total_mem_diff_percent = gpu_total_mem_diff_bytes / gpu_total_mem_bnb
|
# doesn't get quantized and remains in fp32. Therefore we only have 25M parameters quantized
|
||||||
|
# in 2 bytes and the diff in optim memory usage is derived as follows:
|
||||||
|
#
|
||||||
|
# - normal 25*8=~200MB (8 bytes per param)
|
||||||
|
# - bnb 25*2= ~50MB (2 bytes per param)
|
||||||
|
#
|
||||||
|
# Thus we should expect ~150MB total memory saved.
|
||||||
|
#
|
||||||
|
# Peak memory should be the same - the total should be different by about that same margin
|
||||||
|
#
|
||||||
|
# After leaving a small margin to accommodate differences between gpus, let's check
|
||||||
|
# that we have at least 120MB in savings
|
||||||
|
expected_savings = 120
|
||||||
|
|
||||||
# leave this for now if CI gets very different results
|
# uncomment the following if this test starts failing - requires py38 for the f-string `=` specifier
|
||||||
# print(f"{gpu_alloc_mem_orig=:010d} {gpu_peak_mem_orig=:010d} {gpu_alloc_mem_orig+gpu_peak_mem_orig=:010d}" )
|
# gpu_peak_mem_diff = gpu_peak_mem_orig - gpu_peak_mem_bnb
|
||||||
# print(f" {gpu_alloc_mem_bnb=:010d} {gpu_peak_mem_bnb=:010d} {gpu_alloc_mem_bnb+gpu_peak_mem_bnb=:010d}")
|
# print(f"{gpu_alloc_mem_orig=}MB {gpu_peak_mem_orig=}MB {gpu_alloc_mem_orig+gpu_peak_mem_orig=}MB")
|
||||||
# print(f"{gpu_peak_mem_diff_bytes=}, {gpu_peak_mem_diff_percent=}")
|
# print(f" {gpu_alloc_mem_bnb=}MB {gpu_peak_mem_bnb=}MB {gpu_alloc_mem_bnb+gpu_peak_mem_bnb=}MB")
|
||||||
# print(f"{gpu_total_mem_orig=}, {gpu_total_mem_bnb=}")
|
# print(f"{gpu_alloc_mem_diff=}MB")
|
||||||
# print(f"{gpu_total_mem_diff_bytes=}, {gpu_total_mem_diff_percent=}")
|
# print(f"{gpu_peak_mem_diff=}MB")
|
||||||
|
# print(f"{gpu_total_mem_orig=}MB, {gpu_total_mem_bnb=}MB")
|
||||||
|
# print(f"{gpu_total_mem_diff=}MB")
|
||||||
|
|
||||||
self.assertGreater(
|
self.assertGreater(
|
||||||
gpu_peak_mem_diff_percent,
|
gpu_alloc_mem_diff,
|
||||||
10, # basically a huge difference - got ~30x on my desktop
|
expected_savings,
|
||||||
"should use very little peak gpu memory with BNB, compared to without it"
|
"should use ~150MB less alloc gpu memory with BNB, compared to without it for this model but got"
|
||||||
f"but got gpu_peak_mem_orig={gpu_peak_mem_orig} and gpu_peak_mem_bnb={gpu_peak_mem_bnb}",
|
f" a difference of {gpu_alloc_mem_diff}MB, with gpu_alloc_mem_orig={gpu_alloc_mem_orig}MB and"
|
||||||
|
f" gpu_alloc_mem_bnb={gpu_alloc_mem_bnb}MB",
|
||||||
)
|
)
|
||||||
|
|
||||||
self.assertGreater(
|
self.assertGreater(
|
||||||
gpu_total_mem_diff_percent,
|
gpu_total_mem_diff,
|
||||||
0.20, # could easily be 0.50, but let's stay on the safe side
|
expected_savings,
|
||||||
"Using BNB should use less total GPU memory than without it"
|
"should use ~150MB less total gpu memory with BNB, compared to without it for this model but got"
|
||||||
f"but got gpu_total_mem_orig={gpu_total_mem_orig} and gpu_total_mem_bnb={gpu_total_mem_bnb}",
|
f" a difference of {gpu_total_mem_diff}MB, with gpu_total_mem_orig={gpu_total_mem_orig}MB and"
|
||||||
|
f" gpu_total_mem_bnb={gpu_total_mem_bnb}MB",
|
||||||
)
|
)
|
||||||
|
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
loss_orig, loss_bnb, f"loss should be the same, but got loss_orig={loss_orig}, loss_bnb={loss_bnb}"
|
loss_orig, loss_bnb, f"loss should be the same, but got loss_orig={loss_orig}, loss_bnb={loss_bnb}"
|
||||||
)
|
)
|
||||||
|
|
||||||
# Additionally let's test that the absolute gpu memory difference is larger or about the
|
|
||||||
# same as the expected saving coming from BNB (6 bytes per param)
|
|
||||||
model = AutoModel.from_pretrained(MARIAN_MODEL)
|
|
||||||
total_numel = sum(dict((p.data_ptr(), p.numel()) for p in model.parameters()).values())
|
|
||||||
bnb_saved_bytes = total_numel * 6 # 324MB
|
|
||||||
|
|
||||||
self.assertGreater(
|
|
||||||
gpu_total_mem_diff_bytes,
|
|
||||||
bnb_saved_bytes * 0.8, # add a safety margin, if it saved slightly less
|
|
||||||
f"BNB should have saved about {bnb_saved_bytes} bytes, but the saved bytes were"
|
|
||||||
f" {gpu_total_mem_diff_bytes}",
|
|
||||||
)
|
|
||||||
|
|
||||||
def run_trainer(
|
def run_trainer(
|
||||||
self,
|
self,
|
||||||
eval_steps: int,
|
|
||||||
max_len: int,
|
max_len: int,
|
||||||
model_name: str,
|
model_name: str,
|
||||||
num_train_epochs: int,
|
num_train_epochs: int,
|
||||||
learning_rate: float = 3e-3,
|
learning_rate: float = 3e-3,
|
||||||
|
optim: str = "adafactor",
|
||||||
distributed: bool = False,
|
distributed: bool = False,
|
||||||
extra_args_str: str = None,
|
extra_args_str: str = None,
|
||||||
|
eval_steps: int = 0,
|
||||||
predict_with_generate: bool = True,
|
predict_with_generate: bool = True,
|
||||||
do_train: bool = True,
|
do_train: bool = True,
|
||||||
do_eval: bool = True,
|
do_eval: bool = True,
|
||||||
do_predict: bool = True,
|
do_predict: bool = True,
|
||||||
|
n_gpus_to_use: int = None,
|
||||||
):
|
):
|
||||||
data_dir = self.test_file_dir / "../fixtures/tests_samples/wmt_en_ro"
|
data_dir = self.test_file_dir / "../fixtures/tests_samples/wmt_en_ro"
|
||||||
output_dir = self.get_auto_remove_tmp_dir()
|
output_dir = self.get_auto_remove_tmp_dir()
|
||||||
@@ -320,10 +321,9 @@ class TestTrainerExt(TestCasePlus):
|
|||||||
--save_steps {str(eval_steps)}
|
--save_steps {str(eval_steps)}
|
||||||
--group_by_length
|
--group_by_length
|
||||||
--label_smoothing_factor 0.1
|
--label_smoothing_factor 0.1
|
||||||
--adafactor
|
|
||||||
--target_lang ro_RO
|
--target_lang ro_RO
|
||||||
--source_lang en_XX
|
--source_lang en_XX
|
||||||
"""
|
""".split()
|
||||||
|
|
||||||
args_eval = f"""
|
args_eval = f"""
|
||||||
--do_eval
|
--do_eval
|
||||||
@@ -332,13 +332,13 @@ class TestTrainerExt(TestCasePlus):
|
|||||||
--val_max_target_length {max_len}
|
--val_max_target_length {max_len}
|
||||||
--evaluation_strategy steps
|
--evaluation_strategy steps
|
||||||
--eval_steps {str(eval_steps)}
|
--eval_steps {str(eval_steps)}
|
||||||
"""
|
""".split()
|
||||||
|
|
||||||
args_predict = """
|
args_predict = """
|
||||||
--do_predict
|
--do_predict
|
||||||
"""
|
""".split()
|
||||||
|
|
||||||
args = ""
|
args = []
|
||||||
if do_train:
|
if do_train:
|
||||||
args += args_train
|
args += args_train
|
||||||
|
|
||||||
@@ -349,19 +349,25 @@ class TestTrainerExt(TestCasePlus):
|
|||||||
args += args_predict
|
args += args_predict
|
||||||
|
|
||||||
if predict_with_generate:
|
if predict_with_generate:
|
||||||
args += "--predict_with_generate"
|
args += "--predict_with_generate".split()
|
||||||
|
|
||||||
args = args.split()
|
if do_train:
|
||||||
|
if optim == "adafactor":
|
||||||
|
args += "--adafactor".split()
|
||||||
|
else:
|
||||||
|
args += f"--optim {optim}".split()
|
||||||
|
|
||||||
if extra_args_str is not None:
|
if extra_args_str is not None:
|
||||||
args.extend(extra_args_str.split())
|
args += extra_args_str.split()
|
||||||
|
|
||||||
if distributed:
|
if distributed:
|
||||||
n_gpu = get_gpu_count()
|
|
||||||
|
if n_gpus_to_use is None:
|
||||||
|
n_gpus_to_use = get_gpu_count()
|
||||||
master_port = get_torch_dist_unique_port()
|
master_port = get_torch_dist_unique_port()
|
||||||
distributed_args = f"""
|
distributed_args = f"""
|
||||||
-m torch.distributed.launch
|
-m torch.distributed.launch
|
||||||
--nproc_per_node={n_gpu}
|
--nproc_per_node={n_gpus_to_use}
|
||||||
--master_port={master_port}
|
--master_port={master_port}
|
||||||
{self.examples_dir_str}/pytorch/translation/run_translation.py
|
{self.examples_dir_str}/pytorch/translation/run_translation.py
|
||||||
""".split()
|
""".split()
|
||||||
|
|||||||
Reference in New Issue
Block a user