Fix AutoConfig and AutoModel support for Llava-Next-Video (#32844)

* Fix: change every Llava-Next-Video model_type to llava_next_video

* Fix doc for llava_next_video

* Fix formatting issues
* Rename llava-next-video.md to llava_next_video.md so that it matches the implementation

* Fix docs TOC for llava-next-video
This commit is contained in:
Yangshen⚡Deng
2024-08-16 19:41:05 +08:00
committed by GitHub
parent cf32ee1753
commit a27182b7fc
8 changed files with 9 additions and 9 deletions

View File

@@ -822,7 +822,7 @@
title: Llava title: Llava
- local: model_doc/llava_next - local: model_doc/llava_next
title: LLaVA-NeXT title: LLaVA-NeXT
- local: model_doc/llava-next-video - local: model_doc/llava_next_video
title: LLaVa-NeXT-Video title: LLaVa-NeXT-Video
- local: model_doc/lxmert - local: model_doc/lxmert
title: LXMERT title: LXMERT

View File

@@ -186,7 +186,7 @@ Flax), PyTorch, and/or TensorFlow.
| [Llama3](model_doc/llama3) | ✅ | ❌ | ✅ | | [Llama3](model_doc/llama3) | ✅ | ❌ | ✅ |
| [LLaVa](model_doc/llava) | ✅ | ❌ | ❌ | | [LLaVa](model_doc/llava) | ✅ | ❌ | ❌ |
| [LLaVA-NeXT](model_doc/llava_next) | ✅ | ❌ | ❌ | | [LLaVA-NeXT](model_doc/llava_next) | ✅ | ❌ | ❌ |
| [LLaVa-NeXT-Video](model_doc/llava-next-video) | ✅ | ❌ | ❌ | | [LLaVa-NeXT-Video](model_doc/llava_next_video) | ✅ | ❌ | ❌ |
| [Longformer](model_doc/longformer) | ✅ | ✅ | ❌ | | [Longformer](model_doc/longformer) | ✅ | ✅ | ❌ |
| [LongT5](model_doc/longt5) | ✅ | ❌ | ✅ | | [LongT5](model_doc/longt5) | ✅ | ❌ | ✅ |
| [LUKE](model_doc/luke) | ✅ | ❌ | ❌ | | [LUKE](model_doc/luke) | ✅ | ❌ | ❌ |

View File

@@ -145,8 +145,8 @@ CONFIG_MAPPING_NAMES = OrderedDict(
("lilt", "LiltConfig"), ("lilt", "LiltConfig"),
("llama", "LlamaConfig"), ("llama", "LlamaConfig"),
("llava", "LlavaConfig"), ("llava", "LlavaConfig"),
("llava-next-video", "LlavaNextVideoConfig"),
("llava_next", "LlavaNextConfig"), ("llava_next", "LlavaNextConfig"),
("llava_next_video", "LlavaNextVideoConfig"),
("longformer", "LongformerConfig"), ("longformer", "LongformerConfig"),
("longt5", "LongT5Config"), ("longt5", "LongT5Config"),
("luke", "LukeConfig"), ("luke", "LukeConfig"),
@@ -436,8 +436,8 @@ MODEL_NAMES_MAPPING = OrderedDict(
("llama2", "Llama2"), ("llama2", "Llama2"),
("llama3", "Llama3"), ("llama3", "Llama3"),
("llava", "LLaVa"), ("llava", "LLaVa"),
("llava-next-video", "LLaVa-NeXT-Video"),
("llava_next", "LLaVA-NeXT"), ("llava_next", "LLaVA-NeXT"),
("llava_next_video", "LLaVa-NeXT-Video"),
("longformer", "Longformer"), ("longformer", "Longformer"),
("longt5", "LongT5"), ("longt5", "LongT5"),
("luke", "LUKE"), ("luke", "LUKE"),

View File

@@ -97,8 +97,8 @@ else:
("layoutlmv3", ("LayoutLMv3ImageProcessor",)), ("layoutlmv3", ("LayoutLMv3ImageProcessor",)),
("levit", ("LevitImageProcessor",)), ("levit", ("LevitImageProcessor",)),
("llava", ("CLIPImageProcessor",)), ("llava", ("CLIPImageProcessor",)),
("llava-next-video", ("LlavaNextVideoImageProcessor",)),
("llava_next", ("LlavaNextImageProcessor",)), ("llava_next", ("LlavaNextImageProcessor",)),
("llava_next_video", ("LlavaNextVideoImageProcessor",)),
("mask2former", ("Mask2FormerImageProcessor",)), ("mask2former", ("Mask2FormerImageProcessor",)),
("maskformer", ("MaskFormerImageProcessor",)), ("maskformer", ("MaskFormerImageProcessor",)),
("mgp-str", ("ViTImageProcessor", "ViTImageProcessorFast")), ("mgp-str", ("ViTImageProcessor", "ViTImageProcessorFast")),

View File

@@ -308,8 +308,8 @@ MODEL_FOR_PRETRAINING_MAPPING_NAMES = OrderedDict(
("idefics2", "Idefics2ForConditionalGeneration"), ("idefics2", "Idefics2ForConditionalGeneration"),
("layoutlm", "LayoutLMForMaskedLM"), ("layoutlm", "LayoutLMForMaskedLM"),
("llava", "LlavaForConditionalGeneration"), ("llava", "LlavaForConditionalGeneration"),
("llava-next-video", "LlavaNextVideoForConditionalGeneration"),
("llava_next", "LlavaNextForConditionalGeneration"), ("llava_next", "LlavaNextForConditionalGeneration"),
("llava_next_video", "LlavaNextVideoForConditionalGeneration"),
("longformer", "LongformerForMaskedLM"), ("longformer", "LongformerForMaskedLM"),
("luke", "LukeForMaskedLM"), ("luke", "LukeForMaskedLM"),
("lxmert", "LxmertForPreTraining"), ("lxmert", "LxmertForPreTraining"),
@@ -721,8 +721,8 @@ MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES = OrderedDict(
("instructblipvideo", "InstructBlipVideoForConditionalGeneration"), ("instructblipvideo", "InstructBlipVideoForConditionalGeneration"),
("kosmos-2", "Kosmos2ForConditionalGeneration"), ("kosmos-2", "Kosmos2ForConditionalGeneration"),
("llava", "LlavaForConditionalGeneration"), ("llava", "LlavaForConditionalGeneration"),
("llava-next-video", "LlavaNextVideoForConditionalGeneration"),
("llava_next", "LlavaNextForConditionalGeneration"), ("llava_next", "LlavaNextForConditionalGeneration"),
("llava_next_video", "LlavaNextVideoForConditionalGeneration"),
("paligemma", "PaliGemmaForConditionalGeneration"), ("paligemma", "PaliGemmaForConditionalGeneration"),
("pix2struct", "Pix2StructForConditionalGeneration"), ("pix2struct", "Pix2StructForConditionalGeneration"),
("video_llava", "VideoLlavaForConditionalGeneration"), ("video_llava", "VideoLlavaForConditionalGeneration"),

View File

@@ -71,8 +71,8 @@ PROCESSOR_MAPPING_NAMES = OrderedDict(
("layoutlmv2", "LayoutLMv2Processor"), ("layoutlmv2", "LayoutLMv2Processor"),
("layoutlmv3", "LayoutLMv3Processor"), ("layoutlmv3", "LayoutLMv3Processor"),
("llava", "LlavaProcessor"), ("llava", "LlavaProcessor"),
("llava-next-video", "LlavaNextVideoProcessor"),
("llava_next", "LlavaNextProcessor"), ("llava_next", "LlavaNextProcessor"),
("llava_next_video", "LlavaNextVideoProcessor"),
("markuplm", "MarkupLMProcessor"), ("markuplm", "MarkupLMProcessor"),
("mctct", "MCTCTProcessor"), ("mctct", "MCTCTProcessor"),
("mgp-str", "MgpstrProcessor"), ("mgp-str", "MgpstrProcessor"),

View File

@@ -257,8 +257,8 @@ else:
), ),
), ),
("llava", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), ("llava", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)),
("llava-next-video", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)),
("llava_next", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), ("llava_next", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)),
("llava_next_video", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)),
("longformer", ("LongformerTokenizer", "LongformerTokenizerFast" if is_tokenizers_available() else None)), ("longformer", ("LongformerTokenizer", "LongformerTokenizerFast" if is_tokenizers_available() else None)),
( (
"longt5", "longt5",