From bc764f42639d245114eaa077b4712aac5643603b Mon Sep 17 00:00:00 2001 From: Joao Gante Date: Fri, 8 Mar 2024 10:06:46 +0000 Subject: [PATCH] Generate: left-padding test, revisited (#29515) * left-padding test revisited * Apply suggestions from code review Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> --------- Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> --- tests/generation/test_utils.py | 87 +++++++++++-------- tests/models/bart/test_modeling_bart.py | 4 - .../test_modeling_bigbird_pegasus.py | 4 - .../blenderbot/test_modeling_blenderbot.py | 4 - .../test_modeling_blenderbot_small.py | 4 - tests/models/ctrl/test_modeling_ctrl.py | 4 - tests/models/marian/test_modeling_marian.py | 4 - tests/models/mbart/test_modeling_mbart.py | 4 - tests/models/mvp/test_modeling_mvp.py | 4 - tests/models/pegasus/test_modeling_pegasus.py | 4 - tests/models/plbart/test_modeling_plbart.py | 4 - .../prophetnet/test_modeling_prophetnet.py | 4 - tests/models/whisper/test_modeling_whisper.py | 4 - 13 files changed, 53 insertions(+), 82 deletions(-) diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py index 8f7849ea97..425db5ecdc 100644 --- a/tests/generation/test_utils.py +++ b/tests/generation/test_utils.py @@ -1833,49 +1833,68 @@ class GenerationTesterMixin: self.assertEqual(sum([w.sum().item() for w in attn_weights]), 0.0) def test_left_padding_compatibility(self): - # The check done in this test is fairly difficult -- depending on the model architecture, passing the right - # position index for the position embeddings can still result in a different output, due to numerical masking. - # On the other hand, for some types of position embeddings, an incorrect position index can have a minimal - # impact on the output. - # There are two tricks employed to check whether left-padding compatibility is in place: - # 1 - To reduce the negative impact of the numerical attention mask on a correct position index, we set the - # padding size to 1. - # 2 - To reduce the chance of false positives (i.e. passing when it should be failing), we run the check - # multiple times with random inputs, and it has to pass with all of them. - # NOTE: because of 2), there is some chance of false positives in this test. + # NOTE: left-padding results in small numerical differences. This is expected. + # See https://github.com/huggingface/transformers/issues/25420#issuecomment-1775317535 + # First, filter out models that don't support left padding + # - The model must have generative capabilities + if len(self.all_generative_model_classes) == 0: + self.skipTest(reason="No generative architecture available for this model.") + + # - The model must be a decoder-only architecture (encoder-based architectures use right-padding) + decoder_only_classes = [] for model_class in self.all_generative_model_classes: config, _, _, _ = self._get_input_ids_and_config() if config.is_encoder_decoder: - continue # skip for encoder-decoder models -- they don't need left-padding compatibility + continue + else: + decoder_only_classes.append(model_class) + if len(decoder_only_classes) == 0: + self.skipTest(reason="No decoder-only architecture available for this model.") + + # - Decoder-only architectures derived from encoder-decoder models could support it in theory, but we haven't + # added support for it yet. We skip these models for now. + has_encoder_attributes = any( + attr_name + for attr_name in config.to_dict().keys() + if attr_name.startswith("encoder") and attr_name != "encoder_no_repeat_ngram_size" + ) + if has_encoder_attributes: + self.skipTest( + reason="The decoder-only derived from encoder-decoder models are not expected to support left-padding." + ) + + # Then, test left-padding + def _prepare_model_kwargs(input_ids, attention_mask, signature): + model_kwargs = {"input_ids": input_ids, "attention_mask": attention_mask} + if "position_ids" in signature: + position_ids = torch.cumsum(attention_mask, dim=-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + model_kwargs["position_ids"] = position_ids + if "cache_position" in signature: + cache_position = torch.arange(input_ids.shape[-1], device=torch_device) + model_kwargs["cache_position"] = cache_position + return model_kwargs + + for model_class in decoder_only_classes: + config, input_ids, attention_mask, _ = self._get_input_ids_and_config() model = model_class(config).to(torch_device).eval() signature = inspect.signature(model.forward).parameters.keys() - no_failures = True - for _ in range(10): # there may be false positives with 10 runs, we rely on the CI to catch the flakiness - _, input_ids, attention_mask, _ = self._get_input_ids_and_config() - model_kwargs = {"input_ids": input_ids, "attention_mask": attention_mask} - if "position_ids" in signature: - position_ids = torch.cumsum(attention_mask, dim=-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - model_kwargs["position_ids"] = position_ids - next_logits_wo_padding = model(**model_kwargs).logits[:, -1, :] + # Without padding + model_kwargs = _prepare_model_kwargs(input_ids, attention_mask, signature) + next_logits_wo_padding = model(**model_kwargs).logits[:, -1, :] - pad_size = (input_ids.shape[0], 1) - padding = torch.ones(pad_size, dtype=input_ids.dtype, device=torch_device) * config.pad_token_id - padded_input_ids = torch.cat((padding, input_ids), dim=1) - padded_attention_mask = torch.cat((torch.zeros_like(padding), attention_mask), dim=1) - model_kwargs = {"input_ids": padded_input_ids, "attention_mask": padded_attention_mask} - if "position_ids" in signature: - position_ids = torch.cumsum(padded_attention_mask, dim=-1) - 1 - position_ids.masked_fill_(padded_attention_mask == 0, 1) - model_kwargs["position_ids"] = position_ids - next_logits_with_padding = model(**model_kwargs).logits[:, -1, :] - if not torch.allclose(next_logits_wo_padding, next_logits_with_padding, atol=1e-7): - no_failures = False - break + # With left-padding (length 32) + pad_size = (input_ids.shape[0], 32) + padding = torch.ones(pad_size, dtype=input_ids.dtype, device=torch_device) * config.pad_token_id + padded_input_ids = torch.cat((padding, input_ids), dim=1) + padded_attention_mask = torch.cat((torch.zeros_like(padding), attention_mask), dim=1) + model_kwargs = _prepare_model_kwargs(padded_input_ids, padded_attention_mask, signature) + next_logits_with_padding = model(**model_kwargs).logits[:, -1, :] - self.assertTrue(no_failures) + # They should result in very similar logits + self.assertTrue(torch.allclose(next_logits_wo_padding, next_logits_with_padding, atol=1e-5)) def test_past_key_values_format(self): # Test that the KV cache is formatted correctly. Exceptions need to explicitly overwrite this test. Having a diff --git a/tests/models/bart/test_modeling_bart.py b/tests/models/bart/test_modeling_bart.py index 5e79de87c4..3804933735 100644 --- a/tests/models/bart/test_modeling_bart.py +++ b/tests/models/bart/test_modeling_bart.py @@ -1527,7 +1527,3 @@ class BartStandaloneDecoderModelTest(ModelTesterMixin, GenerationTesterMixin, un def test_save_load_fast_init_from_base(self): pass - - @unittest.skip("The model doesn't support left padding") # and it's not used enough to be worth fixing :) - def test_left_padding_compatibility(self): - pass diff --git a/tests/models/bigbird_pegasus/test_modeling_bigbird_pegasus.py b/tests/models/bigbird_pegasus/test_modeling_bigbird_pegasus.py index 90b71a7b82..96e7ce639f 100644 --- a/tests/models/bigbird_pegasus/test_modeling_bigbird_pegasus.py +++ b/tests/models/bigbird_pegasus/test_modeling_bigbird_pegasus.py @@ -818,7 +818,3 @@ class BigBirdPegasusStandaloneDecoderModelTest(ModelTesterMixin, GenerationTeste def test_retain_grad_hidden_states_attentions(self): # decoder cannot keep gradients return - - @unittest.skip("The model doesn't support left padding") # and it's not used enough to be worth fixing :) - def test_left_padding_compatibility(self): - pass diff --git a/tests/models/blenderbot/test_modeling_blenderbot.py b/tests/models/blenderbot/test_modeling_blenderbot.py index da7d8cc124..64ae71b24b 100644 --- a/tests/models/blenderbot/test_modeling_blenderbot.py +++ b/tests/models/blenderbot/test_modeling_blenderbot.py @@ -569,7 +569,3 @@ class BlenderbotStandaloneDecoderModelTest(ModelTesterMixin, GenerationTesterMix def test_retain_grad_hidden_states_attentions(self): # decoder cannot keep gradients return - - @unittest.skip("The model doesn't support left padding") # and it's not used enough to be worth fixing :) - def test_left_padding_compatibility(self): - pass diff --git a/tests/models/blenderbot_small/test_modeling_blenderbot_small.py b/tests/models/blenderbot_small/test_modeling_blenderbot_small.py index 7bb45bdabd..39e953490f 100644 --- a/tests/models/blenderbot_small/test_modeling_blenderbot_small.py +++ b/tests/models/blenderbot_small/test_modeling_blenderbot_small.py @@ -568,7 +568,3 @@ class BlenderbotSmallStandaloneDecoderModelTest(ModelTesterMixin, GenerationTest def test_retain_grad_hidden_states_attentions(self): # decoder cannot keep gradients return - - @unittest.skip("The model doesn't support left padding") # and it's not used enough to be worth fixing :) - def test_left_padding_compatibility(self): - pass diff --git a/tests/models/ctrl/test_modeling_ctrl.py b/tests/models/ctrl/test_modeling_ctrl.py index 13b3592611..71dcd02ed5 100644 --- a/tests/models/ctrl/test_modeling_ctrl.py +++ b/tests/models/ctrl/test_modeling_ctrl.py @@ -249,10 +249,6 @@ class CTRLModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin model = CTRLModel.from_pretrained(model_name) self.assertIsNotNone(model) - @unittest.skip("The model doesn't support left padding") # and it's not used enough to be worth fixing :) - def test_left_padding_compatibility(self): - pass - @require_torch class CTRLModelLanguageGenerationTest(unittest.TestCase): diff --git a/tests/models/marian/test_modeling_marian.py b/tests/models/marian/test_modeling_marian.py index 53a67c2045..593ef8e340 100644 --- a/tests/models/marian/test_modeling_marian.py +++ b/tests/models/marian/test_modeling_marian.py @@ -895,7 +895,3 @@ class MarianStandaloneDecoderModelTest(ModelTesterMixin, GenerationTesterMixin, def test_retain_grad_hidden_states_attentions(self): # decoder cannot keep gradients return - - @unittest.skip("The model doesn't support left padding") # and it's not used enough to be worth fixing :) - def test_left_padding_compatibility(self): - pass diff --git a/tests/models/mbart/test_modeling_mbart.py b/tests/models/mbart/test_modeling_mbart.py index 3cabf7d999..93294d6568 100644 --- a/tests/models/mbart/test_modeling_mbart.py +++ b/tests/models/mbart/test_modeling_mbart.py @@ -736,7 +736,3 @@ class MBartStandaloneDecoderModelTest(ModelTesterMixin, GenerationTesterMixin, u def test_retain_grad_hidden_states_attentions(self): # decoder cannot keep gradients return - - @unittest.skip("The model doesn't support left padding") # and it's not used enough to be worth fixing :) - def test_left_padding_compatibility(self): - pass diff --git a/tests/models/mvp/test_modeling_mvp.py b/tests/models/mvp/test_modeling_mvp.py index 3e0a480237..225ea4a786 100644 --- a/tests/models/mvp/test_modeling_mvp.py +++ b/tests/models/mvp/test_modeling_mvp.py @@ -823,7 +823,3 @@ class MvpStandaloneDecoderModelTest(ModelTesterMixin, GenerationTesterMixin, uni def test_retain_grad_hidden_states_attentions(self): # decoder cannot keep gradients return - - @unittest.skip("The model doesn't support left padding") # and it's not used enough to be worth fixing :) - def test_left_padding_compatibility(self): - pass diff --git a/tests/models/pegasus/test_modeling_pegasus.py b/tests/models/pegasus/test_modeling_pegasus.py index fbf79650f4..dbc7c9de7b 100644 --- a/tests/models/pegasus/test_modeling_pegasus.py +++ b/tests/models/pegasus/test_modeling_pegasus.py @@ -596,7 +596,3 @@ class PegasusStandaloneDecoderModelTest(ModelTesterMixin, GenerationTesterMixin, def test_retain_grad_hidden_states_attentions(self): # decoder cannot keep gradients return - - @unittest.skip("The model doesn't support left padding") # and it's not used enough to be worth fixing :) - def test_left_padding_compatibility(self): - pass diff --git a/tests/models/plbart/test_modeling_plbart.py b/tests/models/plbart/test_modeling_plbart.py index 0d5274b018..998bd6c84f 100644 --- a/tests/models/plbart/test_modeling_plbart.py +++ b/tests/models/plbart/test_modeling_plbart.py @@ -669,7 +669,3 @@ class PLBartStandaloneDecoderModelTest(ModelTesterMixin, GenerationTesterMixin, def test_retain_grad_hidden_states_attentions(self): # decoder cannot keep gradients return - - @unittest.skip("The model doesn't support left padding") # and it's not used enough to be worth fixing :) - def test_left_padding_compatibility(self): - pass diff --git a/tests/models/prophetnet/test_modeling_prophetnet.py b/tests/models/prophetnet/test_modeling_prophetnet.py index eee03134d3..2f43a09320 100644 --- a/tests/models/prophetnet/test_modeling_prophetnet.py +++ b/tests/models/prophetnet/test_modeling_prophetnet.py @@ -1146,10 +1146,6 @@ class ProphetNetStandaloneDecoderModelTest(ModelTesterMixin, GenerationTesterMix # decoder cannot keep gradients return - @unittest.skip("The model doesn't support left padding") # and it's not used enough to be worth fixing :) - def test_left_padding_compatibility(self): - pass - @require_torch class ProphetNetStandaloneEncoderModelTest(ModelTesterMixin, unittest.TestCase): diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py index dc24a5bc34..db7c3ae82a 100644 --- a/tests/models/whisper/test_modeling_whisper.py +++ b/tests/models/whisper/test_modeling_whisper.py @@ -3230,7 +3230,3 @@ class WhisperStandaloneDecoderModelTest(ModelTesterMixin, GenerationTesterMixin, @unittest.skip("The model doesn't support fast init from base") def test_save_load_fast_init_from_base(self): pass - - @unittest.skip("The model doesn't support left padding") # and it's not used enough to be worth fixing :) - def test_left_padding_compatibility(self): - pass