[core/ GC / tests] Stronger GC tests (#27124)

* stronger GC tests

* better tests and skip failing tests

* break down into 3 sub-tests

* break down into 3 sub-tests

* refactor a bit

* more refactor

* fix

* last nit

* credits contrib and suggestions

* credits contrib and suggestions

---------

Co-authored-by: Yih-Dar <2521628+ydshieh@users.noreply.github.com>
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
This commit is contained in:
Younes Belkada
2023-10-30 19:53:46 +01:00
committed by GitHub
parent 5bbf671276
commit f7ea959b96
51 changed files with 1068 additions and 26 deletions

View File

@@ -539,6 +539,44 @@ class ModelTesterMixin:
expected_arg_names = ["input_ids"]
self.assertListEqual(arg_names[:1], expected_arg_names)
def check_training_gradient_checkpointing(self, gradient_checkpointing_kwargs=None):
if not self.model_tester.is_training:
return
for model_class in self.all_model_classes:
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
config.use_cache = False
config.return_dict = True
if (
model_class.__name__
in [*get_values(MODEL_MAPPING_NAMES), *get_values(MODEL_FOR_BACKBONE_MAPPING_NAMES)]
or not model_class.supports_gradient_checkpointing
):
continue
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
model = model_class(config)
model.to(torch_device)
model.gradient_checkpointing_enable(gradient_checkpointing_kwargs=gradient_checkpointing_kwargs)
model.train()
# unfreeze additional layers
for p in model.parameters():
p.requires_grad_(True)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
loss = model(**inputs).loss
loss.backward()
optimizer.step()
for k, v in model.named_parameters():
if v.requires_grad:
self.assertTrue(v.grad is not None, f"{k} in {model_class.__name__} has no gradient!")
def test_training(self):
if not self.model_tester.is_training:
return
@@ -561,34 +599,18 @@ class ModelTesterMixin:
loss.backward()
def test_training_gradient_checkpointing(self):
if not self.model_tester.is_training:
return
# Scenario - 1 default behaviour
self.check_training_gradient_checkpointing()
for model_class in self.all_model_classes:
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
config.use_cache = False
config.return_dict = True
def test_training_gradient_checkpointing_use_reentrant(self):
# Scenario - 2 with `use_reentrant=True` - this is the default value that is used in pytorch's
# torch.utils.checkpoint.checkpoint
self.check_training_gradient_checkpointing(gradient_checkpointing_kwargs={"use_reentrant": True})
if (
model_class.__name__
in [*get_values(MODEL_MAPPING_NAMES), *get_values(MODEL_FOR_BACKBONE_MAPPING_NAMES)]
or not model_class.supports_gradient_checkpointing
):
continue
model = model_class(config)
model.to(torch_device)
model.gradient_checkpointing_enable()
model.train()
inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
loss = model(**inputs).loss
loss.backward()
model.gradient_checkpointing_disable()
model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={"use_reentrant": True})
model.train()
inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
loss = model(**inputs).loss
loss.backward()
def test_training_gradient_checkpointing_use_reentrant_false(self):
# Scenario - 3 with `use_reentrant=False` pytorch suggests users to use this value for
# future releases: https://pytorch.org/docs/stable/checkpoint.html
self.check_training_gradient_checkpointing(gradient_checkpointing_kwargs={"use_reentrant": False})
def test_attention_outputs(self):
if not self.has_attentions: