Add fast image processors for Janus, Deepseek VL, and Deepseek VL hybrid (#39739)

* add fast image processor Janus, deepseek_vl, deepseek_vl_hybrid

* fix after review
This commit is contained in:
Yoni Gozlan
2025-08-01 12:20:08 -04:00
committed by GitHub
parent 88ead3f518
commit 7b4d9843ba
19 changed files with 1268 additions and 97 deletions

View File

@@ -209,6 +209,10 @@ model = DeepseekVLForConditionalGeneration.from_pretrained(
[[autodoc]] DeepseekVLImageProcessor
## DeepseekVLImageProcessorFast
[[autodoc]] DeepseekVLImageProcessorFast
## DeepseekVLModel
[[autodoc]] DeepseekVLModel

View File

@@ -208,6 +208,10 @@ model = DeepseekVLHybridForConditionalGeneration.from_pretrained(
[[autodoc]] DeepseekVLHybridImageProcessor
## DeepseekVLHybridImageProcessorFast
[[autodoc]] DeepseekVLHybridImageProcessorFast
## DeepseekVLHybridModel
[[autodoc]] DeepseekVLHybridModel

View File

@@ -44,11 +44,11 @@ Here is an example of visual understanding with a single image.
> Note that the model has been trained with a specific prompt format for chatting. Use `processor.apply_chat_template(my_conversation_dict)` to correctly format your prompts.
```python
import torch
from PIL import Image
import requests
import torch
from PIL import Image
import requests
from transformers import JanusForConditionalGeneration, JanusProcessor
from transformers import JanusForConditionalGeneration, JanusProcessor
model_id = "deepseek-community/Janus-Pro-1B"
# Prepare Input for generation.
@@ -64,7 +64,7 @@ messages = [
# Set generation mode to `text` to perform text generation.
processor = JanusProcessor.from_pretrained(model_id)
model = JanusForConditionalGeneration.from_pretrained(model_id,
model = JanusForConditionalGeneration.from_pretrained(model_id,
torch_dtype=torch.bfloat16,
device_map="auto")
@@ -209,6 +209,10 @@ for i, image in enumerate(images['pixel_values']):
[[autodoc]] JanusImageProcessor
## JanusImageProcessorFast
[[autodoc]] JanusImageProcessorFast
## JanusVisionModel
[[autodoc]] JanusVisionModel