support qwen2-vl (#32318)

* support-qwen2-vl * tidy * tidy * tidy * tidy * tidy * tidy * tidy * hyphen->underscore * make style * add-flash2-tipd * delete-tokenize=False * remove-image_processor-in-init-file * add-qwen2_vl-in-MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES * format-doct * support-Qwen2VLVisionConfig * remove-standardize_cache_format * fix-letter-varaibles * remove-torch-in-image-processor * remove-useless-docstring * fix-one-letter-varaible-name * change-block-name * default-quick-gelu-in-vision * remove-useless-doc * use-preimplemented-flash-forward * fix-doc * fix-image-processing-doc * fix-apply-rotary-embed * fix-flash-attn-sliding-window * refactor * remove-default_template * remove-reorder_cache * simple-get-rope_deltas * update-prepare_inputs_for_generation * update-attention-mask * update-rotary_seq_len * remove-state * kv_seq_length * remove-warning * _supports_static_cache * remove-legacy-cache * refactor * fix-replace * mrope-section-doc * code-quality * code-quality * polish-doc * fix-image-processing-test * update readme * Update qwen2_vl.md * fix-test * Update qwen2_vl.md * nit * processor-kwargs * hard-code-norm_layer * code-quality * discard-pixel-values-in-gen * fix-inconsistent-error-msg * unify-image-video * hidden_act * add-docstring * vision-encode-as-PreTrainedModel * pixel-to-target-dtype * update doc and low memoryvit * format * format * channel-foramt * fix vit_flashatt * format * inherit-Qwen2VLPreTrainedModel * simplify * format-test * remove-one-line-func-in-image-processing * avoid-one-line-reshape * simplify-rotary_seq_len * avoid-single-letter-variable * no-for-loop-sdpa * avoid-single-letter-variable * remove-one-line-reshape * remove-one-line-reshape * remove-no-rope-in-vit-logic * default-mrope * add-copied-from * more-docs-for-mrope * polish-doc * comment-and-link * polish-doc * single-letter-variables * simplify-image-processing * video->images * kv_seq_len-update * vision-rope-on-the-fly * vision-eager-attention * change-processor-order --------- Co-authored-by: baishuai <baishuai.bs@alibaba-inc.com> Co-authored-by: ShuaiBai623 <43326198+ShuaiBai623@users.noreply.github.com>
2024-08-26 21:16:44 +08:00
parent 8defc95df3
commit 19e6e80e10
22 changed files with 3784 additions and 44 deletions
--- a/utils/check_config_attributes.py
+++ b/utils/check_config_attributes.py
@@ -43,6 +43,7 @@ SPECIAL_CASES_TO_ALLOW = {
    ],
    "Qwen2Config": ["use_sliding_window"],
    "Qwen2MoeConfig": ["use_sliding_window"],
+    "Qwen2VLConfig": ["use_sliding_window"],
    "Gemma2Config": ["tie_word_embeddings"],
    # used to compute the property `self.chunk_length`
    "EncodecConfig": ["overlap"],
--- a/utils/check_repo.py
+++ b/utils/check_repo.py
@@ -70,6 +70,7 @@ PRIVATE_MODELS = [
    "UMT5Stack",
    "Pop2PianoStack",
    "Qwen2AudioEncoder",
+    "Qwen2VisionTransformerPretrainedModel",
    "SwitchTransformersStack",
    "TFDPRSpanPredictor",
    "MaskFormerSwinModel",
@@ -86,50 +87,54 @@ PRIVATE_MODELS = [

 # Update this list for models that are not tested with a comment explaining the reason it should not be.
 # Being in this list is an exception and should **not** be the rule.
-IGNORE_NON_TESTED = PRIVATE_MODELS.copy() + [
-    # models to ignore for not tested
-    "RecurrentGemmaModel",  # Building part of bigger (tested) model.
-    "FuyuForCausalLM",  # Not tested fort now
-    "InstructBlipQFormerModel",  # Building part of bigger (tested) model.
-    "InstructBlipVideoQFormerModel",  # Building part of bigger (tested) model.
-    "UMT5EncoderModel",  # Building part of bigger (tested) model.
-    "Blip2QFormerModel",  # Building part of bigger (tested) model.
-    "ErnieMForInformationExtraction",
-    "FastSpeech2ConformerHifiGan",  # Already tested by SpeechT5HifiGan (# Copied from)
-    "FastSpeech2ConformerWithHifiGan",  # Built with two smaller (tested) models.
-    "GraphormerDecoderHead",  # Building part of bigger (tested) model.
-    "JukeboxVQVAE",  # Building part of bigger (tested) model.
-    "JukeboxPrior",  # Building part of bigger (tested) model.
-    "DecisionTransformerGPT2Model",  # Building part of bigger (tested) model.
-    "SegformerDecodeHead",  # Building part of bigger (tested) model.
-    "MgpstrModel",  # Building part of bigger (tested) model.
-    "BertLMHeadModel",  # Needs to be setup as decoder.
-    "MegatronBertLMHeadModel",  # Building part of bigger (tested) model.
-    "RealmBertModel",  # Building part of bigger (tested) model.
-    "RealmReader",  # Not regular model.
-    "RealmScorer",  # Not regular model.
-    "RealmForOpenQA",  # Not regular model.
-    "ReformerForMaskedLM",  # Needs to be setup as decoder.
-    "TFElectraMainLayer",  # Building part of bigger (tested) model (should it be a TFPreTrainedModel ?)
-    "TFRobertaForMultipleChoice",  # TODO: fix
-    "TFRobertaPreLayerNormForMultipleChoice",  # TODO: fix
-    "SeparableConv1D",  # Building part of bigger (tested) model.
-    "FlaxBartForCausalLM",  # Building part of bigger (tested) model.
-    "FlaxBertForCausalLM",  # Building part of bigger (tested) model. Tested implicitly through FlaxRobertaForCausalLM.
-    "OPTDecoderWrapper",
-    "TFSegformerDecodeHead",  # Not a regular model.
-    "AltRobertaModel",  # Building part of bigger (tested) model.
-    "BlipTextLMHeadModel",  # No need to test it as it is tested by BlipTextVision models
-    "TFBlipTextLMHeadModel",  # No need to test it as it is tested by BlipTextVision models
-    "BridgeTowerTextModel",  # No need to test it as it is tested by BridgeTowerModel model.
-    "BridgeTowerVisionModel",  # No need to test it as it is tested by BridgeTowerModel model.
-    "BarkCausalModel",  # Building part of bigger (tested) model.
-    "BarkModel",  # Does not have a forward signature - generation tested with integration tests.
-    "SeamlessM4TTextToUnitModel",  # Building part of bigger (tested) model.
-    "SeamlessM4TCodeHifiGan",  # Building part of bigger (tested) model.
-    "SeamlessM4TTextToUnitForConditionalGeneration",  # Building part of bigger (tested) model.
-    "ChameleonVQVAE",  # VQVAE here is used only for encoding (discretizing) and is tested as part of bigger model
-]
+IGNORE_NON_TESTED = (
+    PRIVATE_MODELS.copy()
+    + [
+        # models to ignore for not tested
+        "RecurrentGemmaModel",  # Building part of bigger (tested) model.
+        "FuyuForCausalLM",  # Not tested fort now
+        "InstructBlipQFormerModel",  # Building part of bigger (tested) model.
+        "InstructBlipVideoQFormerModel",  # Building part of bigger (tested) model.
+        "UMT5EncoderModel",  # Building part of bigger (tested) model.
+        "Blip2QFormerModel",  # Building part of bigger (tested) model.
+        "ErnieMForInformationExtraction",
+        "FastSpeech2ConformerHifiGan",  # Already tested by SpeechT5HifiGan (# Copied from)
+        "FastSpeech2ConformerWithHifiGan",  # Built with two smaller (tested) models.
+        "GraphormerDecoderHead",  # Building part of bigger (tested) model.
+        "JukeboxVQVAE",  # Building part of bigger (tested) model.
+        "JukeboxPrior",  # Building part of bigger (tested) model.
+        "DecisionTransformerGPT2Model",  # Building part of bigger (tested) model.
+        "SegformerDecodeHead",  # Building part of bigger (tested) model.
+        "MgpstrModel",  # Building part of bigger (tested) model.
+        "BertLMHeadModel",  # Needs to be setup as decoder.
+        "MegatronBertLMHeadModel",  # Building part of bigger (tested) model.
+        "RealmBertModel",  # Building part of bigger (tested) model.
+        "RealmReader",  # Not regular model.
+        "RealmScorer",  # Not regular model.
+        "RealmForOpenQA",  # Not regular model.
+        "ReformerForMaskedLM",  # Needs to be setup as decoder.
+        "TFElectraMainLayer",  # Building part of bigger (tested) model (should it be a TFPreTrainedModel ?)
+        "TFRobertaForMultipleChoice",  # TODO: fix
+        "TFRobertaPreLayerNormForMultipleChoice",  # TODO: fix
+        "SeparableConv1D",  # Building part of bigger (tested) model.
+        "FlaxBartForCausalLM",  # Building part of bigger (tested) model.
+        "FlaxBertForCausalLM",  # Building part of bigger (tested) model. Tested implicitly through FlaxRobertaForCausalLM.
+        "OPTDecoderWrapper",
+        "TFSegformerDecodeHead",  # Not a regular model.
+        "AltRobertaModel",  # Building part of bigger (tested) model.
+        "BlipTextLMHeadModel",  # No need to test it as it is tested by BlipTextVision models
+        "TFBlipTextLMHeadModel",  # No need to test it as it is tested by BlipTextVision models
+        "BridgeTowerTextModel",  # No need to test it as it is tested by BridgeTowerModel model.
+        "BridgeTowerVisionModel",  # No need to test it as it is tested by BridgeTowerModel model.
+        "BarkCausalModel",  # Building part of bigger (tested) model.
+        "BarkModel",  # Does not have a forward signature - generation tested with integration tests.
+        "SeamlessM4TTextToUnitModel",  # Building part of bigger (tested) model.
+        "SeamlessM4TCodeHifiGan",  # Building part of bigger (tested) model.
+        "SeamlessM4TTextToUnitForConditionalGeneration",  # Building part of bigger (tested) model.
+        "ChameleonVQVAE",  # VQVAE here is used only for encoding (discretizing) and is tested as part of bigger model
+        "Qwen2VLModel",  # Building part of bigger (tested) model. Tested implicitly through Qwen2VLForConditionalGeneration.
+    ]
+)

 # Update this list with test files that don't have a tester with a `all_model_classes` variable and which don't
 # trigger the common tests.