[generation] bring back tests on vision models (#38603)
* bring back generation tests on VLMs * remove head mask tests overwritten
This commit is contained in:
committed by
GitHub
parent
90c4b90a10
commit
dbfc79c17c
@@ -468,13 +468,6 @@ class BigBirdPegasusModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineT
|
||||
def test_load_save_without_tied_weights(self):
|
||||
pass
|
||||
|
||||
def test_generate_with_head_masking(self):
|
||||
# overwritten to temporarily switch the attention type to `original_full`
|
||||
original_self_attention_type = self.model_tester.attention_type
|
||||
self.model_tester.attention_type = "original_full"
|
||||
super().test_generate_with_head_masking()
|
||||
self.model_tester.attention_type = original_self_attention_type
|
||||
|
||||
|
||||
@require_torch
|
||||
@require_sentencepiece
|
||||
|
||||
@@ -782,7 +782,7 @@ class BlipVQAModelTester:
|
||||
@require_vision
|
||||
class BlipVQAModelTest(ModelTesterMixin, unittest.TestCase):
|
||||
all_model_classes = (BlipForQuestionAnswering,) if is_torch_available() else ()
|
||||
# Doesn't run generation tests. There are interface mismatches when using `generate` -- TODO @gante
|
||||
# Doesn't run generation tests due to custom generation logic -- won't fix
|
||||
all_generative_model_classes = ()
|
||||
fx_compatible = False
|
||||
test_head_masking = False
|
||||
@@ -1091,7 +1091,7 @@ class BlipTextRetrievalModelTest(ModelTesterMixin, unittest.TestCase):
|
||||
@require_torch
|
||||
class BlipTextImageModelTest(ModelTesterMixin, unittest.TestCase):
|
||||
all_model_classes = (BlipForConditionalGeneration,) if is_torch_available() else ()
|
||||
# Doesn't run generation tests. There are interface mismatches when using `generate` -- TODO @gante
|
||||
# Doesn't run generation tests due to custom generation logic -- won't fix
|
||||
all_generative_model_classes = ()
|
||||
fx_compatible = False
|
||||
test_head_masking = False
|
||||
|
||||
@@ -774,6 +774,7 @@ class Blip2TextModelTester:
|
||||
bos_token_id=self.pad_token_id,
|
||||
pad_token_id=self.pad_token_id,
|
||||
decoder_start_token_id=self.decoder_start_token_id,
|
||||
is_encoder_decoder=True,
|
||||
)
|
||||
|
||||
|
||||
@@ -795,6 +796,9 @@ class Blip2ModelTester:
|
||||
self.text_model_tester = Blip2TextModelTester(parent, **text_kwargs)
|
||||
self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test
|
||||
self.seq_length = self.text_model_tester.seq_length # need seq_length for common tests
|
||||
self.encoder_seq_length = (
|
||||
self.text_model_tester.encoder_seq_length + num_query_tokens
|
||||
) # need enc seq_length for gen tests
|
||||
self.is_training = is_training
|
||||
self.num_query_tokens = num_query_tokens
|
||||
|
||||
@@ -859,11 +863,9 @@ class Blip2ModelTester:
|
||||
|
||||
|
||||
@require_torch
|
||||
class Blip2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
|
||||
class Blip2ModelTest(ModelTesterMixin, PipelineTesterMixin, GenerationTesterMixin, unittest.TestCase):
|
||||
all_model_classes = (Blip2ForConditionalGeneration, Blip2Model) if is_torch_available() else ()
|
||||
additional_model_inputs = ["input_ids", "decoder_input_ids"]
|
||||
# Doesn't run generation tests. TODO: fix generation tests for Blip2ForConditionalGeneration
|
||||
all_generative_model_classes = ()
|
||||
pipeline_model_mapping = (
|
||||
{
|
||||
"feature-extraction": Blip2Model,
|
||||
|
||||
@@ -324,10 +324,8 @@ class IdeficsModelTester:
|
||||
|
||||
|
||||
@require_torch
|
||||
class IdeficsModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
|
||||
class IdeficsModelTest(ModelTesterMixin, PipelineTesterMixin, GenerationTesterMixin, unittest.TestCase):
|
||||
all_model_classes = (IdeficsModel, IdeficsForVisionText2Text) if is_torch_available() else ()
|
||||
# Doesn't run generation tests here -- idefics has a dedicated tester for generation tests below
|
||||
all_generative_model_classes = ()
|
||||
pipeline_model_mapping = (
|
||||
{"feature-extraction": IdeficsModel, "image-text-to-text": IdeficsForVisionText2Text}
|
||||
if is_torch_available()
|
||||
@@ -336,6 +334,7 @@ class IdeficsModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase)
|
||||
test_pruning = False
|
||||
test_headmasking = False
|
||||
test_torchscript = False
|
||||
has_attentions = False # only supports SDPA and thus no attention probs returned
|
||||
|
||||
def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
|
||||
inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels)
|
||||
@@ -494,6 +493,31 @@ class IdeficsModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase)
|
||||
def test_retain_grad_hidden_states_attentions(self):
|
||||
return
|
||||
|
||||
@pytest.mark.generate
|
||||
@unittest.skip(reason="""IDEFICS cannot generate with no images provided!""")
|
||||
def test_generate_without_input_ids(self):
|
||||
pass
|
||||
|
||||
@pytest.mark.generate
|
||||
@unittest.skip(reason="""IDEFICS cannot generate with no images provided!""")
|
||||
def test_generate_continue_from_inputs_embeds(self):
|
||||
pass
|
||||
|
||||
@pytest.mark.generate
|
||||
@unittest.skip(reason="""IDEFICS cannot do contrastive generation yet and it is not worth fixing""")
|
||||
def test_contrastive_generate(self):
|
||||
pass
|
||||
|
||||
@pytest.mark.generate
|
||||
@unittest.skip(reason="""IDEFICS cannot do contrastive generation yet and it is not worth fixing""")
|
||||
def test_contrastive_generate_low_memory(self):
|
||||
pass
|
||||
|
||||
@pytest.mark.generate
|
||||
@unittest.skip(reason="""IDEFICS cannot do contrastive generation yet and it is not worth fixing""")
|
||||
def test_contrastive_generate_dict_outputs_use_cache(self):
|
||||
pass
|
||||
|
||||
def test_attention_outputs(self):
|
||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
config.return_dict = True
|
||||
|
||||
@@ -626,40 +626,6 @@ class LongT5ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMix
|
||||
model = LongT5Model.from_pretrained(model_name)
|
||||
self.assertIsNotNone(model)
|
||||
|
||||
def test_generate_with_head_masking(self):
|
||||
attention_names = ["encoder_attentions", "decoder_attentions", "cross_attentions"]
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
config = config_and_inputs[0]
|
||||
max_length = config_and_inputs[1].shape[-1] + 3
|
||||
model = LongT5ForConditionalGeneration(config).eval()
|
||||
model.to(torch_device)
|
||||
|
||||
head_masking = {
|
||||
"head_mask": torch.zeros(config.num_layers, config.num_heads, device=torch_device),
|
||||
"decoder_head_mask": torch.zeros(config.num_decoder_layers, config.num_heads, device=torch_device),
|
||||
"cross_attn_head_mask": torch.zeros(config.num_decoder_layers, config.num_heads, device=torch_device),
|
||||
}
|
||||
|
||||
for attn_name, (name, mask) in zip(attention_names, head_masking.items()):
|
||||
head_masks = {name: mask}
|
||||
# Explicitly pass decoder_head_mask as it is required from LONGT5 model when head_mask specified
|
||||
if name == "head_mask":
|
||||
head_masks["decoder_head_mask"] = torch.ones(
|
||||
config.num_decoder_layers, config.num_heads, device=torch_device
|
||||
)
|
||||
|
||||
out = model.generate(
|
||||
config_and_inputs[1],
|
||||
num_beams=1,
|
||||
max_length=max_length,
|
||||
output_attentions=True,
|
||||
return_dict_in_generate=True,
|
||||
**head_masks,
|
||||
)
|
||||
# We check the state of decoder_attentions and cross_attentions just from the last step
|
||||
attn_weights = out[attn_name] if attn_name == attention_names[0] else out[attn_name][-1]
|
||||
self.assertEqual(sum([w.sum().item() for w in attn_weights]), 0.0)
|
||||
|
||||
def test_attention_outputs(self):
|
||||
if not self.has_attentions:
|
||||
self.skipTest(reason="has_attentions is set to False")
|
||||
|
||||
@@ -868,40 +868,6 @@ class MT5ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin,
|
||||
model = MT5Model.from_pretrained(model_name)
|
||||
self.assertIsNotNone(model)
|
||||
|
||||
def test_generate_with_head_masking(self):
|
||||
attention_names = ["encoder_attentions", "decoder_attentions", "cross_attentions"]
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
config = config_and_inputs[0]
|
||||
max_length = config_and_inputs[1].shape[-1] + 3
|
||||
model = MT5ForConditionalGeneration(config).eval()
|
||||
model.to(torch_device)
|
||||
|
||||
head_masking = {
|
||||
"head_mask": torch.zeros(config.num_layers, config.num_heads, device=torch_device),
|
||||
"decoder_head_mask": torch.zeros(config.num_decoder_layers, config.num_heads, device=torch_device),
|
||||
"cross_attn_head_mask": torch.zeros(config.num_decoder_layers, config.num_heads, device=torch_device),
|
||||
}
|
||||
|
||||
for attn_name, (name, mask) in zip(attention_names, head_masking.items()):
|
||||
head_masks = {name: mask}
|
||||
# Explicitly pass decoder_head_mask as it is required from MT5 model when head_mask specified
|
||||
if name == "head_mask":
|
||||
head_masks["decoder_head_mask"] = torch.ones(
|
||||
config.num_decoder_layers, config.num_heads, device=torch_device
|
||||
)
|
||||
|
||||
out = model.generate(
|
||||
config_and_inputs[1],
|
||||
num_beams=1,
|
||||
max_length=max_length,
|
||||
output_attentions=True,
|
||||
return_dict_in_generate=True,
|
||||
**head_masks,
|
||||
)
|
||||
# We check the state of decoder_attentions and cross_attentions just from the last step
|
||||
attn_weights = out[attn_name] if attn_name == attention_names[0] else out[attn_name][-1]
|
||||
self.assertEqual(sum([w.sum().item() for w in attn_weights]), 0.0)
|
||||
|
||||
|
||||
# Copied from tests.models.t5.test_modeling_t5.T5EncoderOnlyModelTester with T5->MT5
|
||||
class MT5EncoderOnlyModelTester:
|
||||
|
||||
@@ -1117,10 +1117,6 @@ class ProphetNetModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTeste
|
||||
self.assertIsNotNone(encoder_hidden_states.grad)
|
||||
self.assertIsNotNone(encoder_attentions.grad)
|
||||
|
||||
@unittest.skip(reason="Generating with head_masking has not been implemented for ProphetNet models yet.")
|
||||
def test_generate_with_head_masking(self):
|
||||
pass
|
||||
|
||||
|
||||
@require_torch
|
||||
class ProphetNetStandaloneDecoderModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
|
||||
|
||||
@@ -741,10 +741,6 @@ class SpeechT5ForSpeechToTextTest(ModelTesterMixin, unittest.TestCase, Generatio
|
||||
if hasattr(module, "masked_spec_embed") and module.masked_spec_embed is not None:
|
||||
module.masked_spec_embed.data.fill_(3)
|
||||
|
||||
@unittest.skip(reason="Temporarily broken") # TODO (joao, eustache): have a look at this test
|
||||
def test_generate_with_head_masking(self):
|
||||
pass
|
||||
|
||||
@unittest.skip(reason="Temporarily broken") # TODO (joao, eustache): have a look at this test
|
||||
def test_generate_without_input_ids(self):
|
||||
pass
|
||||
|
||||
@@ -709,40 +709,6 @@ class SwitchTransformersModelTest(ModelTesterMixin, GenerationTesterMixin, Pipel
|
||||
model = SwitchTransformersModel.from_pretrained(model_name)
|
||||
self.assertIsNotNone(model)
|
||||
|
||||
def test_generate_with_head_masking(self):
|
||||
attention_names = ["encoder_attentions", "decoder_attentions", "cross_attentions"]
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
config = config_and_inputs[0]
|
||||
max_length = config_and_inputs[1].shape[-1] + 3
|
||||
model = SwitchTransformersForConditionalGeneration(config).eval()
|
||||
model.to(torch_device)
|
||||
|
||||
head_masking = {
|
||||
"head_mask": torch.zeros(config.num_layers, config.num_heads, device=torch_device),
|
||||
"decoder_head_mask": torch.zeros(config.num_decoder_layers, config.num_heads, device=torch_device),
|
||||
"cross_attn_head_mask": torch.zeros(config.num_decoder_layers, config.num_heads, device=torch_device),
|
||||
}
|
||||
|
||||
for attn_name, (name, mask) in zip(attention_names, head_masking.items()):
|
||||
head_masks = {name: mask}
|
||||
# Explicitly pass decoder_head_mask as it is required from SWITCH_TRANSFORMERS model when head_mask specified
|
||||
if name == "head_mask":
|
||||
head_masks["decoder_head_mask"] = torch.ones(
|
||||
config.num_decoder_layers, config.num_heads, device=torch_device
|
||||
)
|
||||
|
||||
out = model.generate(
|
||||
config_and_inputs[1],
|
||||
num_beams=1,
|
||||
max_length=max_length,
|
||||
output_attentions=True,
|
||||
return_dict_in_generate=True,
|
||||
**head_masks,
|
||||
)
|
||||
# We check the state of decoder_attentions and cross_attentions just from the last step
|
||||
attn_weights = out[attn_name] if attn_name == attention_names[0] else out[attn_name][-1]
|
||||
self.assertEqual(sum([w.sum().item() for w in attn_weights]), 0.0)
|
||||
|
||||
@unittest.skip(
|
||||
reason="This architecture has tied weights by default and there is no way to remove it, check: https://github.com/huggingface/transformers/pull/31771#issuecomment-2210915245"
|
||||
)
|
||||
|
||||
@@ -873,40 +873,6 @@ class T5ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin,
|
||||
model = T5Model.from_pretrained(model_name)
|
||||
self.assertIsNotNone(model)
|
||||
|
||||
def test_generate_with_head_masking(self):
|
||||
attention_names = ["encoder_attentions", "decoder_attentions", "cross_attentions"]
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
config = config_and_inputs[0]
|
||||
max_length = config_and_inputs[1].shape[-1] + 3
|
||||
model = T5ForConditionalGeneration(config).eval()
|
||||
model.to(torch_device)
|
||||
|
||||
head_masking = {
|
||||
"head_mask": torch.zeros(config.num_layers, config.num_heads, device=torch_device),
|
||||
"decoder_head_mask": torch.zeros(config.num_decoder_layers, config.num_heads, device=torch_device),
|
||||
"cross_attn_head_mask": torch.zeros(config.num_decoder_layers, config.num_heads, device=torch_device),
|
||||
}
|
||||
|
||||
for attn_name, (name, mask) in zip(attention_names, head_masking.items()):
|
||||
head_masks = {name: mask}
|
||||
# Explicitly pass decoder_head_mask as it is required from T5 model when head_mask specified
|
||||
if name == "head_mask":
|
||||
head_masks["decoder_head_mask"] = torch.ones(
|
||||
config.num_decoder_layers, config.num_heads, device=torch_device
|
||||
)
|
||||
|
||||
out = model.generate(
|
||||
config_and_inputs[1],
|
||||
num_beams=1,
|
||||
max_length=max_length,
|
||||
output_attentions=True,
|
||||
return_dict_in_generate=True,
|
||||
**head_masks,
|
||||
)
|
||||
# We check the state of decoder_attentions and cross_attentions just from the last step
|
||||
attn_weights = out[attn_name] if attn_name == attention_names[0] else out[attn_name][-1]
|
||||
self.assertEqual(sum([w.sum().item() for w in attn_weights]), 0.0)
|
||||
|
||||
|
||||
class T5EncoderOnlyModelTester:
|
||||
def __init__(
|
||||
|
||||
@@ -419,10 +419,6 @@ class UdopModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin
|
||||
model = UdopForConditionalGeneration.from_pretrained(model_name)
|
||||
self.assertIsNotNone(model)
|
||||
|
||||
@unittest.skip(reason="TODO: Fix me @joao")
|
||||
def test_generate_with_head_masking(self):
|
||||
pass
|
||||
|
||||
@unittest.skip(reason="TODO: Fix me @joao")
|
||||
def test_generate_without_input_ids(self):
|
||||
pass
|
||||
|
||||
@@ -489,39 +489,6 @@ class UMT5ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_model_fp16_forward(*config_and_inputs)
|
||||
|
||||
def test_generate_with_head_masking(self):
|
||||
attention_names = ["encoder_attentions", "decoder_attentions", "cross_attentions"]
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
config = config_and_inputs[0]
|
||||
model = UMT5ForConditionalGeneration(config).eval()
|
||||
model.to(torch_device)
|
||||
|
||||
head_masking = {
|
||||
"head_mask": torch.zeros(config.num_layers, config.num_heads, device=torch_device),
|
||||
"decoder_head_mask": torch.zeros(config.num_decoder_layers, config.num_heads, device=torch_device),
|
||||
"cross_attn_head_mask": torch.zeros(config.num_decoder_layers, config.num_heads, device=torch_device),
|
||||
}
|
||||
|
||||
for attn_name, (name, mask) in zip(attention_names, head_masking.items()):
|
||||
head_masks = {name: mask}
|
||||
# Explicitly pass decoder_head_mask as it is required from T5 model when head_mask specified
|
||||
if name == "head_mask":
|
||||
head_masks["decoder_head_mask"] = torch.ones(
|
||||
config.num_decoder_layers, config.num_heads, device=torch_device
|
||||
)
|
||||
|
||||
out = model.generate(
|
||||
config_and_inputs[1]["input_ids"],
|
||||
num_beams=1,
|
||||
max_length=3,
|
||||
output_attentions=True,
|
||||
return_dict_in_generate=True,
|
||||
**head_masks,
|
||||
)
|
||||
# We check the state of decoder_attentions and cross_attentions just from the last step
|
||||
attn_weights = out[attn_name] if attn_name == attention_names[0] else out[attn_name][-1]
|
||||
self.assertEqual(sum([w.sum().item() for w in attn_weights]), 0.0)
|
||||
|
||||
@unittest.skip(
|
||||
reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user