[trainer] implement support for full fp16 in evaluation/predict (#10268)
* implement --fp16_full_eval * Apply suggestions from code review Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * style * add test Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
This commit is contained in:
@@ -14,6 +14,7 @@
|
||||
# limitations under the License.
|
||||
|
||||
import dataclasses
|
||||
import gc
|
||||
import os
|
||||
import tempfile
|
||||
import unittest
|
||||
@@ -29,6 +30,7 @@ from transformers.testing_utils import (
|
||||
require_sentencepiece,
|
||||
require_tokenizers,
|
||||
require_torch,
|
||||
require_torch_gpu,
|
||||
require_torch_multi_gpu,
|
||||
slow,
|
||||
)
|
||||
@@ -912,6 +914,62 @@ class TrainerIntegrationTest(unittest.TestCase):
|
||||
trainer = get_regression_trainer(skip_memory_metrics=True)
|
||||
self.check_mem_metrics(trainer, self.assertNotIn)
|
||||
|
||||
@require_torch_gpu
|
||||
def test_fp16_full_eval(self):
|
||||
|
||||
# this is a sensitive test so let's keep debugging printouts in place for quick diagnosis.
|
||||
# it's using pretty large safety margins, but small enough to detect broken functionality.
|
||||
debug = 0
|
||||
|
||||
bs = 8
|
||||
# make the params somewhat big so that there will be enough RAM consumed to be able to
|
||||
# measure things. We should get about 64KB for a+b in fp32
|
||||
a = torch.ones(1000, bs) + 0.001
|
||||
b = torch.ones(1000, bs) - 0.001
|
||||
|
||||
# 1. with mem metrics enabled
|
||||
trainer = get_regression_trainer(a=a, b=b, eval_len=16)
|
||||
metrics = trainer.evaluate()
|
||||
del trainer
|
||||
gc.collect()
|
||||
|
||||
fp32_init = metrics["init_mem_gpu_alloc_delta"]
|
||||
fp32_eval = metrics["eval_mem_gpu_alloc_delta"]
|
||||
|
||||
if debug:
|
||||
print(f"fp32_init {fp32_init}")
|
||||
print(f"fp32_eval {fp32_eval}")
|
||||
|
||||
# here we expect the model to be preloaded in trainer.__init__ and consume around 64K gpu ram.
|
||||
# perfect world: fp32_init == 64<<10
|
||||
self.assertGreater(fp32_init, 59_000)
|
||||
# after eval should be no extra memory allocated - with a small margin (other than the peak
|
||||
# memory consumption for the forward calculation that gets recovered)
|
||||
# perfect world: fp32_eval == close to zero
|
||||
self.assertLess(fp32_eval, 5_000)
|
||||
|
||||
# 2. with mem metrics disabled
|
||||
trainer = get_regression_trainer(a=a, b=b, eval_len=16, fp16_full_eval=True)
|
||||
metrics = trainer.evaluate()
|
||||
fp16_init = metrics["init_mem_gpu_alloc_delta"]
|
||||
fp16_eval = metrics["eval_mem_gpu_alloc_delta"]
|
||||
|
||||
if debug:
|
||||
print(f"fp16_init {fp16_init}")
|
||||
print(f"fp16_eval {fp16_eval}")
|
||||
|
||||
# here we expect the model to not be preloaded in trainer.__init__, so with a small margin it should be close to 0
|
||||
# perfect world: fp16_init == close to zero
|
||||
self.assertLess(fp16_init, 5_000)
|
||||
# here we put the model on device in eval and only `half()` of it, i.e. about 32K,(again we ignore the peak margin which gets returned back)
|
||||
# perfect world: fp32_init == 32<<10
|
||||
self.assertGreater(fp16_eval, 27_000)
|
||||
|
||||
# 3. relative comparison fp32 vs full fp16
|
||||
# should be about half of fp16_init
|
||||
# perfect world: fp32_init/2 == fp16_eval
|
||||
self.assertAlmostEqual(fp16_eval, fp32_init / 2, delta=5_000)
|
||||
|
||||
|
||||
@require_torch
|
||||
@require_optuna
|
||||
|
||||
Reference in New Issue
Block a user