🔴 VLM: compile compatibility (#35724)

* llavas * add more models * fix `compile_forward` test for all models * fix copies * make style * also doesn't support cache class * fix some tests * not copied from * ci green? * fix tests * fix copies * fix tests * check with `numel` and remove `item` * fix copies * fix copies * Update src/transformers/models/cohere2/modeling_cohere2.py Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> * opt remove cross attn * gemma2 * fixup * fixup * fix newly added test * maybe fixed? * green please? --------- Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>
2025-02-14 15:23:49 +01:00
parent b45cf0e90a
commit 0c78ef6cd3
44 changed files with 464 additions and 1215 deletions
--- a/tests/models/aria/test_modeling_aria.py
+++ b/tests/models/aria/test_modeling_aria.py
@@ -286,10 +286,18 @@ class AriaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTesterMi
    def test_generate_from_inputs_embeds_1_beam_search(self):
        pass

-    @unittest.skip(reason="Unsupported")
+    @unittest.skip(reason="Dynamic control flow due to MoE")
    def test_generate_with_static_cache(self):
        pass

+    @unittest.skip(reason="Dynamic control flow due to MoE")
+    def test_generate_from_inputs_embeds_with_static_cache(self):
+        pass
+
+    @unittest.skip(reason="Dynamic control flow due to MoE")
+    def test_generate_compile_model_forward(self):
+        pass
+

@require_torch
 class AriaForConditionalGenerationIntegrationTest(unittest.TestCase):
--- a/tests/models/blip_2/test_modeling_blip_2.py
+++ b/tests/models/blip_2/test_modeling_blip_2.py
@@ -816,6 +816,10 @@ class Blip2ForConditionalGenerationDecoderOnlyTest(ModelTesterMixin, GenerationT
    def test_generate_from_inputs_embeds(self, _, num_beams):
        pass

+    @unittest.skip("BLIP2 cannot generate only from input ids, and requires pixel values in all cases to be present")
+    def test_generate_from_inputs_embeds_with_static_cache(self):
+        pass
+

 # this class is based on `T5ModelTester` found in tests/models/t5/test_modeling_t5.py
 class Blip2TextModelTester:
--- a/tests/models/emu3/test_modeling_emu3.py
+++ b/tests/models/emu3/test_modeling_emu3.py
@@ -386,10 +386,6 @@ class Emu3Vision2TextModelTest(ModelTesterMixin, GenerationTesterMixin, Pipeline
    def test_cpu_offload(self):
        pass

-    @unittest.skip("Doesn't work, tensors are not almost same")  # TODO raushan fixme
-    def test_custom_4d_attention_mask(self):
-        pass
-
    @unittest.skip("VQ-VAE module doesn't initialize weights properly")
    def test_initialization(self):
        pass
--- a/tests/models/got_ocr2/test_modeling_got_ocr2.py
+++ b/tests/models/got_ocr2/test_modeling_got_ocr2.py
@@ -256,12 +256,6 @@ class GotOcr2ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMi
    def test_past_key_values_format(self):
        pass

-    @unittest.skip(
-        reason="GotOcr2 needs a dynamic control flow to pass pixel values to the forward function only in the first generation step"
-    )
-    def test_generate_compile_1_end_to_end(self):
-        pass
-
    @unittest.skip("FlashAttention only support fp16 and bf16 data type")
    def test_flash_attn_2_fp32_ln(self):
        pass
--- a/tests/models/idefics/test_modeling_idefics.py
+++ b/tests/models/idefics/test_modeling_idefics.py
@@ -838,6 +838,14 @@ class IdeficsForVisionText2TextTest(IdeficsModelTest, GenerationTesterMixin, uni
    def test_custom_4d_attention_mask(self):
        pass

+    @unittest.skip(reason="IDEFICS cannot compile due to dynamic control flow when checking inputs")
+    def test_generate_with_static_cache(self):
+        pass
+
+    @unittest.skip(reason="IDEFICS cannot compile due to dynamic control flow when checking inputs")
+    def test_generate_compile_model_forward(self):
+        pass
+
    @unittest.skip(reason="We only test the model that takes in multiple images")
    def test_model(self):
        pass
--- a/tests/models/instructblip/test_modeling_instructblip.py
+++ b/tests/models/instructblip/test_modeling_instructblip.py
@@ -530,6 +530,12 @@ class InstructBlipForConditionalGenerationDecoderOnlyTest(ModelTesterMixin, Gene
    def test_save_load_fast_init_to_base(self):
        pass

+    @unittest.skip(
+        "InstructBLIP cannot generate only from input ids, and requires pixel values in all cases to be present"
+    )
+    def test_generate_from_inputs_embeds_with_static_cache(self):
+        pass
+
    def test_forward_signature(self):
        config, _ = self.model_tester.prepare_config_and_inputs_for_common()

--- a/tests/models/instructblipvideo/test_modeling_instructblipvideo.py
+++ b/tests/models/instructblipvideo/test_modeling_instructblipvideo.py
@@ -546,6 +546,12 @@ class InstructBlipVideoForConditionalGenerationDecoderOnlyTest(
    def test_save_load_fast_init_to_base(self):
        pass

+    @unittest.skip(
+        "InstructBLIPVideo cannot generate only from input ids, and requires pixel values in all cases to be present"
+    )
+    def test_generate_from_inputs_embeds_with_static_cache(self):
+        pass
+
    def test_forward_signature(self):
        config, _ = self.model_tester.prepare_config_and_inputs_for_common()

--- a/tests/models/llava/test_modeling_llava.py
+++ b/tests/models/llava/test_modeling_llava.py
@@ -316,14 +316,6 @@ class LlavaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTesterM
    def test_training_gradient_checkpointing_use_reentrant_false(self):
        pass

-    @unittest.skip(reason="Compile not yet supported because in LLava models")
-    def test_sdpa_can_compile_dynamic(self):
-        pass
-
-    @unittest.skip(reason="Compile not yet supported because in LLava models")
-    def test_sdpa_can_dispatch_on_flash(self):
-        pass
-
    @unittest.skip("FlashAttention only support fp16 and bf16 data type")
    def test_flash_attn_2_fp32_ln(self):
        pass
--- a/tests/models/llava_next/test_modeling_llava_next.py
+++ b/tests/models/llava_next/test_modeling_llava_next.py
@@ -365,22 +365,6 @@ class LlavaNextForConditionalGenerationModelTest(ModelTesterMixin, GenerationTes
    def test_training_gradient_checkpointing_use_reentrant_false(self):
        pass

-    @unittest.skip(reason="Feedforward chunking is not yet supported")
-    def test_feed_forward_chunking(self):
-        pass
-
-    @unittest.skip(reason="CPU offload is not yet supported")
-    def test_cpu_offload(self):
-        pass
-
-    @unittest.skip(reason="Compile not yet supported because in LLava models")
-    def test_sdpa_can_compile_dynamic(self):
-        pass
-
-    @unittest.skip(reason="Compile not yet supported because in LLava models")
-    def test_sdpa_can_dispatch_on_flash(self):
-        pass
-
    @unittest.skip("FlashAttention only support fp16 and bf16 data type")
    def test_flash_attn_2_fp32_ln(self):
        pass
@@ -391,6 +375,10 @@ class LlavaNextForConditionalGenerationModelTest(ModelTesterMixin, GenerationTes
    def test_flash_attention_2_padding_matches_padding_free_with_position_ids(self):
        pass

+    @unittest.skip("LLaVA Next has dynamic control flow in unpadding")
+    def test_generate_compile_model_forward(self):
+        pass
+

@require_torch
 class LlavaNextForConditionalGenerationIntegrationTest(unittest.TestCase):
--- a/tests/models/llava_next_video/test_modeling_llava_next_video.py
+++ b/tests/models/llava_next_video/test_modeling_llava_next_video.py
@@ -382,26 +382,6 @@ class LlavaNextVideoForConditionalGenerationModelTest(ModelTesterMixin, Generati
    def test_training_gradient_checkpointing_use_reentrant_false(self):
        pass

-    @unittest.skip(reason="Feedforward chunking is not yet supported")
-    def test_feed_forward_chunking(self):
-        pass
-
-    @unittest.skip(reason="CPU offload is not yet supported")
-    def test_cpu_offload(self):
-        pass
-
-    @unittest.skip(
-        reason="Compile not yet supported because in LLava models (https://github.com/huggingface/transformers/issues/29891)"
-    )
-    def test_sdpa_can_compile_dynamic(self):
-        pass
-
-    @unittest.skip(
-        reason="Compile not yet supported because in LLava models (https://github.com/huggingface/transformers/issues/29891)"
-    )
-    def test_sdpa_can_dispatch_on_flash(self):
-        pass
-
    @unittest.skip("FlashAttention only support fp16 and bf16 data type")
    def test_flash_attn_2_fp32_ln(self):
        pass
@@ -412,6 +392,10 @@ class LlavaNextVideoForConditionalGenerationModelTest(ModelTesterMixin, Generati
    def test_flash_attention_2_padding_matches_padding_free_with_position_ids(self):
        pass

+    @unittest.skip("LLaVA Next Video has dynamic control flow in unpadding")
+    def test_generate_compile_model_forward(self):
+        pass
+

@require_torch
 class LlavaNextVideoForConditionalGenerationIntegrationTest(unittest.TestCase):
--- a/tests/models/llava_onevision/test_modeling_llava_onevision.py
+++ b/tests/models/llava_onevision/test_modeling_llava_onevision.py
@@ -346,6 +346,10 @@ class LlavaOnevisionForConditionalGenerationModelTest(ModelTesterMixin, Generati
    def test_flash_attention_2_padding_matches_padding_free_with_position_ids(self):
        pass

+    @unittest.skip("LLaVA OneVision has dynamic control flow in unpadding")
+    def test_generate_compile_model_forward(self):
+        pass
+

@require_torch
 class LlavaOnevisionForConditionalGenerationIntegrationTest(unittest.TestCase):
--- a/tests/models/mt5/test_modeling_mt5.py
+++ b/tests/models/mt5/test_modeling_mt5.py
@@ -540,7 +540,6 @@ class MT5ModelTester:
            "attention_mask": attention_mask,
            "decoder_input_ids": decoder_input_ids,
            "decoder_attention_mask": decoder_attention_mask,
-            "use_cache": False,
        }
        return config, inputs_dict

--- a/tests/models/opt/test_modeling_opt.py
+++ b/tests/models/opt/test_modeling_opt.py
@@ -81,7 +81,7 @@ class OPTModelTester:
        hidden_act="gelu",
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
-        max_position_embeddings=20,
+        max_position_embeddings=50,
        eos_token_id=2,
        pad_token_id=1,
        bos_token_id=0,
@@ -89,7 +89,6 @@ class OPTModelTester:
        num_labels=3,
        word_embed_proj_dim=16,
        type_sequence_label_size=2,
-        attn_implementation="eager",
    ):
        self.parent = parent
        self.batch_size = batch_size
@@ -113,7 +112,6 @@ class OPTModelTester:
        self.type_sequence_label_size = type_sequence_label_size
        self.word_embed_proj_dim = word_embed_proj_dim
        self.is_encoder_decoder = False
-        self.attn_implementation = attn_implementation

    def prepare_config_and_inputs(self):
        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size).clamp(
@@ -143,7 +141,6 @@ class OPTModelTester:
            embed_dim=self.embed_dim,
            is_encoder_decoder=False,
            word_embed_proj_dim=self.word_embed_proj_dim,
-            attn_implementation=self.attn_implementation,
        )

    def get_pipeline_config(self):
--- a/tests/models/t5/test_modeling_t5.py
+++ b/tests/models/t5/test_modeling_t5.py
@@ -545,7 +545,6 @@ class T5ModelTester:
            "attention_mask": attention_mask,
            "decoder_input_ids": decoder_input_ids,
            "decoder_attention_mask": decoder_attention_mask,
-            "use_cache": False,
        }
        return config, inputs_dict

--- a/tests/models/video_llava/test_modeling_video_llava.py
+++ b/tests/models/video_llava/test_modeling_video_llava.py
@@ -226,14 +226,6 @@ class VideoLlavaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTe
    def test_training_gradient_checkpointing_use_reentrant_false(self):
        pass

-    @unittest.skip(reason="Pass because video-LLava requires `attention_mask is not None`")
-    def test_sdpa_can_compile_dynamic(self):
-        pass
-
-    @unittest.skip(reason="Pass because video-LLava requires `attention_mask is not None`")
-    def test_sdpa_can_dispatch_on_flash(self):
-        pass
-
    @unittest.skip("FlashAttention only support fp16 and bf16 data type")
    def test_flash_attn_2_fp32_ln(self):
        pass
--- a/tests/models/vipllava/test_modeling_vipllava.py
+++ b/tests/models/vipllava/test_modeling_vipllava.py
@@ -306,14 +306,6 @@ class VipLlavaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTest
    def test_training_gradient_checkpointing_use_reentrant_false(self):
        pass

-    @unittest.skip(reason="Compile not yet supported because it is not yet supported in LLava")
-    def test_sdpa_can_compile_dynamic(self):
-        pass
-
-    @unittest.skip(reason="Compile not yet supported because in LLava models")
-    def test_sdpa_can_dispatch_on_flash(self):
-        pass
-
    @unittest.skip("FlashAttention only support fp16 and bf16 data type")
    def test_flash_attn_2_fp32_ln(self):
        pass