support qwen2-vl (#32318)
* support-qwen2-vl * tidy * tidy * tidy * tidy * tidy * tidy * tidy * hyphen->underscore * make style * add-flash2-tipd * delete-tokenize=False * remove-image_processor-in-init-file * add-qwen2_vl-in-MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES * format-doct * support-Qwen2VLVisionConfig * remove-standardize_cache_format * fix-letter-varaibles * remove-torch-in-image-processor * remove-useless-docstring * fix-one-letter-varaible-name * change-block-name * default-quick-gelu-in-vision * remove-useless-doc * use-preimplemented-flash-forward * fix-doc * fix-image-processing-doc * fix-apply-rotary-embed * fix-flash-attn-sliding-window * refactor * remove-default_template * remove-reorder_cache * simple-get-rope_deltas * update-prepare_inputs_for_generation * update-attention-mask * update-rotary_seq_len * remove-state * kv_seq_length * remove-warning * _supports_static_cache * remove-legacy-cache * refactor * fix-replace * mrope-section-doc * code-quality * code-quality * polish-doc * fix-image-processing-test * update readme * Update qwen2_vl.md * fix-test * Update qwen2_vl.md * nit * processor-kwargs * hard-code-norm_layer * code-quality * discard-pixel-values-in-gen * fix-inconsistent-error-msg * unify-image-video * hidden_act * add-docstring * vision-encode-as-PreTrainedModel * pixel-to-target-dtype * update doc and low memoryvit * format * format * channel-foramt * fix vit_flashatt * format * inherit-Qwen2VLPreTrainedModel * simplify * format-test * remove-one-line-func-in-image-processing * avoid-one-line-reshape * simplify-rotary_seq_len * avoid-single-letter-variable * no-for-loop-sdpa * avoid-single-letter-variable * remove-one-line-reshape * remove-one-line-reshape * remove-no-rope-in-vit-logic * default-mrope * add-copied-from * more-docs-for-mrope * polish-doc * comment-and-link * polish-doc * single-letter-variables * simplify-image-processing * video->images * kv_seq_len-update * vision-rope-on-the-fly * vision-eager-attention * change-processor-order --------- 
Co-authored-by: baishuai <baishuai.bs@alibaba-inc.com> Co-authored-by: ShuaiBai623 <43326198+ShuaiBai623@users.noreply.github.com>
This commit is contained in:
@@ -43,6 +43,7 @@ SPECIAL_CASES_TO_ALLOW = {
|
||||
],
|
||||
"Qwen2Config": ["use_sliding_window"],
|
||||
"Qwen2MoeConfig": ["use_sliding_window"],
|
||||
"Qwen2VLConfig": ["use_sliding_window"],
|
||||
"Gemma2Config": ["tie_word_embeddings"],
|
||||
# used to compute the property `self.chunk_length`
|
||||
"EncodecConfig": ["overlap"],
|
||||
|
||||
@@ -70,6 +70,7 @@ PRIVATE_MODELS = [
|
||||
"UMT5Stack",
|
||||
"Pop2PianoStack",
|
||||
"Qwen2AudioEncoder",
|
||||
"Qwen2VisionTransformerPretrainedModel",
|
||||
"SwitchTransformersStack",
|
||||
"TFDPRSpanPredictor",
|
||||
"MaskFormerSwinModel",
|
||||
@@ -86,50 +87,54 @@ PRIVATE_MODELS = [
|
||||
|
||||
# Update this list with models that are not tested, adding a comment explaining why each one should not be.
|
||||
# Being in this list is an exception and should **not** be the rule.
|
||||
IGNORE_NON_TESTED = PRIVATE_MODELS.copy() + [
|
||||
# models to ignore for not tested
|
||||
"RecurrentGemmaModel", # Building part of bigger (tested) model.
|
||||
"FuyuForCausalLM", # Not tested for now
|
||||
"InstructBlipQFormerModel", # Building part of bigger (tested) model.
|
||||
"InstructBlipVideoQFormerModel", # Building part of bigger (tested) model.
|
||||
"UMT5EncoderModel", # Building part of bigger (tested) model.
|
||||
"Blip2QFormerModel", # Building part of bigger (tested) model.
|
||||
"ErnieMForInformationExtraction",
|
||||
"FastSpeech2ConformerHifiGan", # Already tested by SpeechT5HifiGan (# Copied from)
|
||||
"FastSpeech2ConformerWithHifiGan", # Built with two smaller (tested) models.
|
||||
"GraphormerDecoderHead", # Building part of bigger (tested) model.
|
||||
"JukeboxVQVAE", # Building part of bigger (tested) model.
|
||||
"JukeboxPrior", # Building part of bigger (tested) model.
|
||||
"DecisionTransformerGPT2Model", # Building part of bigger (tested) model.
|
||||
"SegformerDecodeHead", # Building part of bigger (tested) model.
|
||||
"MgpstrModel", # Building part of bigger (tested) model.
|
||||
"BertLMHeadModel", # Needs to be setup as decoder.
|
||||
"MegatronBertLMHeadModel", # Building part of bigger (tested) model.
|
||||
"RealmBertModel", # Building part of bigger (tested) model.
|
||||
"RealmReader", # Not regular model.
|
||||
"RealmScorer", # Not regular model.
|
||||
"RealmForOpenQA", # Not regular model.
|
||||
"ReformerForMaskedLM", # Needs to be setup as decoder.
|
||||
"TFElectraMainLayer", # Building part of bigger (tested) model (should it be a TFPreTrainedModel ?)
|
||||
"TFRobertaForMultipleChoice", # TODO: fix
|
||||
"TFRobertaPreLayerNormForMultipleChoice", # TODO: fix
|
||||
"SeparableConv1D", # Building part of bigger (tested) model.
|
||||
"FlaxBartForCausalLM", # Building part of bigger (tested) model.
|
||||
"FlaxBertForCausalLM", # Building part of bigger (tested) model. Tested implicitly through FlaxRobertaForCausalLM.
|
||||
"OPTDecoderWrapper",
|
||||
"TFSegformerDecodeHead", # Not a regular model.
|
||||
"AltRobertaModel", # Building part of bigger (tested) model.
|
||||
"BlipTextLMHeadModel", # No need to test it as it is tested by BlipTextVision models
|
||||
"TFBlipTextLMHeadModel", # No need to test it as it is tested by BlipTextVision models
|
||||
"BridgeTowerTextModel", # No need to test it as it is tested by BridgeTowerModel model.
|
||||
"BridgeTowerVisionModel", # No need to test it as it is tested by BridgeTowerModel model.
|
||||
"BarkCausalModel", # Building part of bigger (tested) model.
|
||||
"BarkModel", # Does not have a forward signature - generation tested with integration tests.
|
||||
"SeamlessM4TTextToUnitModel", # Building part of bigger (tested) model.
|
||||
"SeamlessM4TCodeHifiGan", # Building part of bigger (tested) model.
|
||||
"SeamlessM4TTextToUnitForConditionalGeneration", # Building part of bigger (tested) model.
|
||||
"ChameleonVQVAE", # VQVAE here is used only for encoding (discretizing) and is tested as part of bigger model
|
||||
]
|
||||
IGNORE_NON_TESTED = (
|
||||
PRIVATE_MODELS.copy()
|
||||
+ [
|
||||
# models to ignore for not tested
|
||||
"RecurrentGemmaModel", # Building part of bigger (tested) model.
|
||||
"FuyuForCausalLM", # Not tested for now
|
||||
"InstructBlipQFormerModel", # Building part of bigger (tested) model.
|
||||
"InstructBlipVideoQFormerModel", # Building part of bigger (tested) model.
|
||||
"UMT5EncoderModel", # Building part of bigger (tested) model.
|
||||
"Blip2QFormerModel", # Building part of bigger (tested) model.
|
||||
"ErnieMForInformationExtraction",
|
||||
"FastSpeech2ConformerHifiGan", # Already tested by SpeechT5HifiGan (# Copied from)
|
||||
"FastSpeech2ConformerWithHifiGan", # Built with two smaller (tested) models.
|
||||
"GraphormerDecoderHead", # Building part of bigger (tested) model.
|
||||
"JukeboxVQVAE", # Building part of bigger (tested) model.
|
||||
"JukeboxPrior", # Building part of bigger (tested) model.
|
||||
"DecisionTransformerGPT2Model", # Building part of bigger (tested) model.
|
||||
"SegformerDecodeHead", # Building part of bigger (tested) model.
|
||||
"MgpstrModel", # Building part of bigger (tested) model.
|
||||
"BertLMHeadModel", # Needs to be setup as decoder.
|
||||
"MegatronBertLMHeadModel", # Building part of bigger (tested) model.
|
||||
"RealmBertModel", # Building part of bigger (tested) model.
|
||||
"RealmReader", # Not regular model.
|
||||
"RealmScorer", # Not regular model.
|
||||
"RealmForOpenQA", # Not regular model.
|
||||
"ReformerForMaskedLM", # Needs to be setup as decoder.
|
||||
"TFElectraMainLayer", # Building part of bigger (tested) model (should it be a TFPreTrainedModel ?)
|
||||
"TFRobertaForMultipleChoice", # TODO: fix
|
||||
"TFRobertaPreLayerNormForMultipleChoice", # TODO: fix
|
||||
"SeparableConv1D", # Building part of bigger (tested) model.
|
||||
"FlaxBartForCausalLM", # Building part of bigger (tested) model.
|
||||
"FlaxBertForCausalLM", # Building part of bigger (tested) model. Tested implicitly through FlaxRobertaForCausalLM.
|
||||
"OPTDecoderWrapper",
|
||||
"TFSegformerDecodeHead", # Not a regular model.
|
||||
"AltRobertaModel", # Building part of bigger (tested) model.
|
||||
"BlipTextLMHeadModel", # No need to test it as it is tested by BlipTextVision models
|
||||
"TFBlipTextLMHeadModel", # No need to test it as it is tested by BlipTextVision models
|
||||
"BridgeTowerTextModel", # No need to test it as it is tested by BridgeTowerModel model.
|
||||
"BridgeTowerVisionModel", # No need to test it as it is tested by BridgeTowerModel model.
|
||||
"BarkCausalModel", # Building part of bigger (tested) model.
|
||||
"BarkModel", # Does not have a forward signature - generation tested with integration tests.
|
||||
"SeamlessM4TTextToUnitModel", # Building part of bigger (tested) model.
|
||||
"SeamlessM4TCodeHifiGan", # Building part of bigger (tested) model.
|
||||
"SeamlessM4TTextToUnitForConditionalGeneration", # Building part of bigger (tested) model.
|
||||
"ChameleonVQVAE", # VQVAE here is used only for encoding (discretizing) and is tested as part of bigger model
|
||||
"Qwen2VLModel", # Building part of bigger (tested) model. Tested implicitly through Qwen2VLForConditionalGeneration.
|
||||
]
|
||||
)
|
||||
|
||||
# Update this list with test files that don't have a tester with a `all_model_classes` variable and which don't
|
||||
# trigger the common tests.
|
||||
|
||||
Reference in New Issue
Block a user