Fix AutoConfig and AutoModel support for Llava-Next-Video (#32844)
* Fix: set every model_type for Llava-Next-Video to llava_next_video
* Fix doc for llava_next_video
* Fix formatting issues
* Rename llava-next-video.md to llava_next_video.md to make it compatible with the implementation
* Fix docs TOC for llava-next-video
This commit is contained in:
@@ -822,7 +822,7 @@
|
|||||||
title: Llava
|
title: Llava
|
||||||
- local: model_doc/llava_next
|
- local: model_doc/llava_next
|
||||||
title: LLaVA-NeXT
|
title: LLaVA-NeXT
|
||||||
- local: model_doc/llava-next-video
|
- local: model_doc/llava_next_video
|
||||||
title: LLaVa-NeXT-Video
|
title: LLaVa-NeXT-Video
|
||||||
- local: model_doc/lxmert
|
- local: model_doc/lxmert
|
||||||
title: LXMERT
|
title: LXMERT
|
||||||
|
|||||||
@@ -186,7 +186,7 @@ Flax), PyTorch, and/or TensorFlow.
|
|||||||
| [Llama3](model_doc/llama3) | ✅ | ❌ | ✅ |
|
| [Llama3](model_doc/llama3) | ✅ | ❌ | ✅ |
|
||||||
| [LLaVa](model_doc/llava) | ✅ | ❌ | ❌ |
|
| [LLaVa](model_doc/llava) | ✅ | ❌ | ❌ |
|
||||||
| [LLaVA-NeXT](model_doc/llava_next) | ✅ | ❌ | ❌ |
|
| [LLaVA-NeXT](model_doc/llava_next) | ✅ | ❌ | ❌ |
|
||||||
| [LLaVa-NeXT-Video](model_doc/llava-next-video) | ✅ | ❌ | ❌ |
|
| [LLaVa-NeXT-Video](model_doc/llava_next_video) | ✅ | ❌ | ❌ |
|
||||||
| [Longformer](model_doc/longformer) | ✅ | ✅ | ❌ |
|
| [Longformer](model_doc/longformer) | ✅ | ✅ | ❌ |
|
||||||
| [LongT5](model_doc/longt5) | ✅ | ❌ | ✅ |
|
| [LongT5](model_doc/longt5) | ✅ | ❌ | ✅ |
|
||||||
| [LUKE](model_doc/luke) | ✅ | ❌ | ❌ |
|
| [LUKE](model_doc/luke) | ✅ | ❌ | ❌ |
|
||||||
|
|||||||
@@ -145,8 +145,8 @@ CONFIG_MAPPING_NAMES = OrderedDict(
|
|||||||
("lilt", "LiltConfig"),
|
("lilt", "LiltConfig"),
|
||||||
("llama", "LlamaConfig"),
|
("llama", "LlamaConfig"),
|
||||||
("llava", "LlavaConfig"),
|
("llava", "LlavaConfig"),
|
||||||
("llava-next-video", "LlavaNextVideoConfig"),
|
|
||||||
("llava_next", "LlavaNextConfig"),
|
("llava_next", "LlavaNextConfig"),
|
||||||
|
("llava_next_video", "LlavaNextVideoConfig"),
|
||||||
("longformer", "LongformerConfig"),
|
("longformer", "LongformerConfig"),
|
||||||
("longt5", "LongT5Config"),
|
("longt5", "LongT5Config"),
|
||||||
("luke", "LukeConfig"),
|
("luke", "LukeConfig"),
|
||||||
@@ -436,8 +436,8 @@ MODEL_NAMES_MAPPING = OrderedDict(
|
|||||||
("llama2", "Llama2"),
|
("llama2", "Llama2"),
|
||||||
("llama3", "Llama3"),
|
("llama3", "Llama3"),
|
||||||
("llava", "LLaVa"),
|
("llava", "LLaVa"),
|
||||||
("llava-next-video", "LLaVa-NeXT-Video"),
|
|
||||||
("llava_next", "LLaVA-NeXT"),
|
("llava_next", "LLaVA-NeXT"),
|
||||||
|
("llava_next_video", "LLaVa-NeXT-Video"),
|
||||||
("longformer", "Longformer"),
|
("longformer", "Longformer"),
|
||||||
("longt5", "LongT5"),
|
("longt5", "LongT5"),
|
||||||
("luke", "LUKE"),
|
("luke", "LUKE"),
|
||||||
|
|||||||
@@ -97,8 +97,8 @@ else:
|
|||||||
("layoutlmv3", ("LayoutLMv3ImageProcessor",)),
|
("layoutlmv3", ("LayoutLMv3ImageProcessor",)),
|
||||||
("levit", ("LevitImageProcessor",)),
|
("levit", ("LevitImageProcessor",)),
|
||||||
("llava", ("CLIPImageProcessor",)),
|
("llava", ("CLIPImageProcessor",)),
|
||||||
("llava-next-video", ("LlavaNextVideoImageProcessor",)),
|
|
||||||
("llava_next", ("LlavaNextImageProcessor",)),
|
("llava_next", ("LlavaNextImageProcessor",)),
|
||||||
|
("llava_next_video", ("LlavaNextVideoImageProcessor",)),
|
||||||
("mask2former", ("Mask2FormerImageProcessor",)),
|
("mask2former", ("Mask2FormerImageProcessor",)),
|
||||||
("maskformer", ("MaskFormerImageProcessor",)),
|
("maskformer", ("MaskFormerImageProcessor",)),
|
||||||
("mgp-str", ("ViTImageProcessor", "ViTImageProcessorFast")),
|
("mgp-str", ("ViTImageProcessor", "ViTImageProcessorFast")),
|
||||||
|
|||||||
@@ -308,8 +308,8 @@ MODEL_FOR_PRETRAINING_MAPPING_NAMES = OrderedDict(
|
|||||||
("idefics2", "Idefics2ForConditionalGeneration"),
|
("idefics2", "Idefics2ForConditionalGeneration"),
|
||||||
("layoutlm", "LayoutLMForMaskedLM"),
|
("layoutlm", "LayoutLMForMaskedLM"),
|
||||||
("llava", "LlavaForConditionalGeneration"),
|
("llava", "LlavaForConditionalGeneration"),
|
||||||
("llava-next-video", "LlavaNextVideoForConditionalGeneration"),
|
|
||||||
("llava_next", "LlavaNextForConditionalGeneration"),
|
("llava_next", "LlavaNextForConditionalGeneration"),
|
||||||
|
("llava_next_video", "LlavaNextVideoForConditionalGeneration"),
|
||||||
("longformer", "LongformerForMaskedLM"),
|
("longformer", "LongformerForMaskedLM"),
|
||||||
("luke", "LukeForMaskedLM"),
|
("luke", "LukeForMaskedLM"),
|
||||||
("lxmert", "LxmertForPreTraining"),
|
("lxmert", "LxmertForPreTraining"),
|
||||||
@@ -721,8 +721,8 @@ MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES = OrderedDict(
|
|||||||
("instructblipvideo", "InstructBlipVideoForConditionalGeneration"),
|
("instructblipvideo", "InstructBlipVideoForConditionalGeneration"),
|
||||||
("kosmos-2", "Kosmos2ForConditionalGeneration"),
|
("kosmos-2", "Kosmos2ForConditionalGeneration"),
|
||||||
("llava", "LlavaForConditionalGeneration"),
|
("llava", "LlavaForConditionalGeneration"),
|
||||||
("llava-next-video", "LlavaNextVideoForConditionalGeneration"),
|
|
||||||
("llava_next", "LlavaNextForConditionalGeneration"),
|
("llava_next", "LlavaNextForConditionalGeneration"),
|
||||||
|
("llava_next_video", "LlavaNextVideoForConditionalGeneration"),
|
||||||
("paligemma", "PaliGemmaForConditionalGeneration"),
|
("paligemma", "PaliGemmaForConditionalGeneration"),
|
||||||
("pix2struct", "Pix2StructForConditionalGeneration"),
|
("pix2struct", "Pix2StructForConditionalGeneration"),
|
||||||
("video_llava", "VideoLlavaForConditionalGeneration"),
|
("video_llava", "VideoLlavaForConditionalGeneration"),
|
||||||
|
|||||||
@@ -71,8 +71,8 @@ PROCESSOR_MAPPING_NAMES = OrderedDict(
|
|||||||
("layoutlmv2", "LayoutLMv2Processor"),
|
("layoutlmv2", "LayoutLMv2Processor"),
|
||||||
("layoutlmv3", "LayoutLMv3Processor"),
|
("layoutlmv3", "LayoutLMv3Processor"),
|
||||||
("llava", "LlavaProcessor"),
|
("llava", "LlavaProcessor"),
|
||||||
("llava-next-video", "LlavaNextVideoProcessor"),
|
|
||||||
("llava_next", "LlavaNextProcessor"),
|
("llava_next", "LlavaNextProcessor"),
|
||||||
|
("llava_next_video", "LlavaNextVideoProcessor"),
|
||||||
("markuplm", "MarkupLMProcessor"),
|
("markuplm", "MarkupLMProcessor"),
|
||||||
("mctct", "MCTCTProcessor"),
|
("mctct", "MCTCTProcessor"),
|
||||||
("mgp-str", "MgpstrProcessor"),
|
("mgp-str", "MgpstrProcessor"),
|
||||||
|
|||||||
@@ -257,8 +257,8 @@ else:
|
|||||||
),
|
),
|
||||||
),
|
),
|
||||||
("llava", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)),
|
("llava", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)),
|
||||||
("llava-next-video", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)),
|
|
||||||
("llava_next", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)),
|
("llava_next", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)),
|
||||||
|
("llava_next_video", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)),
|
||||||
("longformer", ("LongformerTokenizer", "LongformerTokenizerFast" if is_tokenizers_available() else None)),
|
("longformer", ("LongformerTokenizer", "LongformerTokenizerFast" if is_tokenizers_available() else None)),
|
||||||
(
|
(
|
||||||
"longt5",
|
"longt5",
|
||||||
|
|||||||
Reference in New Issue
Block a user