From 6ba8a1ff4550b4450a22a0b0d907312955ce0fd5 Mon Sep 17 00:00:00 2001 From: Kyle Duffy <155960770+kyle-cohere@users.noreply.github.com> Date: Thu, 31 Jul 2025 13:58:45 +0200 Subject: [PATCH] Update documentation for Cohere2Vision models (#39817) * Update docs with pipeline example * Add Cohere2Vision to list of vision models * Sort models --- docs/source/en/model_doc/cohere2_vision.md | 33 ++++++++++++++++++- src/transformers/models/auto/modeling_auto.py | 1 + 2 files changed, 33 insertions(+), 1 deletion(-) diff --git a/docs/source/en/model_doc/cohere2_vision.md b/docs/source/en/model_doc/cohere2_vision.md index 123f9573b9..b0fcddc6d3 100644 --- a/docs/source/en/model_doc/cohere2_vision.md +++ b/docs/source/en/model_doc/cohere2_vision.md @@ -19,9 +19,12 @@ Command A Vision is built upon a robust architecture that leverages the latest a The model and image processor can be loaded as follows: -```python + + +```python import torch + from transformers import AutoProcessor, AutoModelForImageTextToText model_id = "CohereLabs/command-a-vision-07-2025" @@ -68,6 +71,34 @@ print( ) ``` + + + +```python +from transformers import pipeline + +pipe = pipeline(model="CohereLabs/command-a-vision-07-2025", task="image-text-to-text", device_map="auto") + +messages = [ + { + "role": "user", + "content": [ + { + "type": "image", + "url": "https://media.istockphoto.com/id/458012057/photo/istanbul-turkey.jpg?s=612x612&w=0&k=20&c=qogAOVvkpfUyqLUMr_XJQyq-HkACXyYUSZbKhBlPrxo=", + }, + {"type": "text", "text": "Where was this taken ?"}, + ], + }, +] + +outputs = pipe(text=messages, max_new_tokens=300, return_full_text=False) + +print(outputs) +``` + + + ## Cohere2VisionConfig [[autodoc]] Cohere2VisionConfig diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index ea69ed911d..259a297bf6 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -712,6 +712,7 @@ MODEL_FOR_IMAGE_MAPPING_NAMES = OrderedDict( ("aimv2_vision_model", "Aimv2VisionModel"), ("beit", "BeitModel"), ("bit", "BitModel"), + ("cohere2_vision", "Cohere2VisionModel"), ("conditional_detr", "ConditionalDetrModel"), ("convnext", "ConvNextModel"), ("convnextv2", "ConvNextV2Model"),