From 3af425d4c6aa4af1c6dc79c6c1eb185826619f86 Mon Sep 17 00:00:00 2001 From: jp Date: Fri, 28 Mar 2025 18:46:24 +0900 Subject: [PATCH] fix: AttributeError: 'LlavaProcessor' object has no attribute 'image_token_id' (#37026) * Add image_token_id and video_token_id handling in Llava processors * fix: image to video * fix: correct image and video token ID handling in Llava processors * fix: improve image and video token ID handling in Llava processors --- src/transformers/models/llava/processing_llava.py | 5 +++++ .../models/llava_next/processing_llava_next.py | 5 +++++ .../llava_next_video/processing_llava_next_video.py | 10 ++++++++++ .../llava_onevision/processing_llava_onevision.py | 10 ++++++++++ 4 files changed, 30 insertions(+) diff --git a/src/transformers/models/llava/processing_llava.py b/src/transformers/models/llava/processing_llava.py index c477938250..6253e1992f 100644 --- a/src/transformers/models/llava/processing_llava.py +++ b/src/transformers/models/llava/processing_llava.py @@ -89,6 +89,11 @@ class LlavaProcessor(ProcessorMixin): self.num_additional_image_tokens = num_additional_image_tokens self.vision_feature_select_strategy = vision_feature_select_strategy self.image_token = tokenizer.image_token if hasattr(tokenizer, "image_token") else image_token + self.image_token_id = ( + tokenizer.image_token_id + if getattr(tokenizer, "image_token_id", None) + else tokenizer.convert_tokens_to_ids(self.image_token) + ) super().__init__(image_processor, tokenizer, chat_template=chat_template) def __call__( diff --git a/src/transformers/models/llava_next/processing_llava_next.py b/src/transformers/models/llava_next/processing_llava_next.py index 3c2c469e2f..61bb2cdd7f 100644 --- a/src/transformers/models/llava_next/processing_llava_next.py +++ b/src/transformers/models/llava_next/processing_llava_next.py @@ -92,6 +92,11 @@ class LlavaNextProcessor(ProcessorMixin): self.num_additional_image_tokens = num_additional_image_tokens self.vision_feature_select_strategy = vision_feature_select_strategy self.image_token = tokenizer.image_token if hasattr(tokenizer, "image_token") else image_token + self.image_token_id = ( + tokenizer.image_token_id + if getattr(tokenizer, "image_token_id", None) + else tokenizer.convert_tokens_to_ids(self.image_token) + ) super().__init__(image_processor, tokenizer, chat_template=chat_template) def __call__( diff --git a/src/transformers/models/llava_next_video/processing_llava_next_video.py b/src/transformers/models/llava_next_video/processing_llava_next_video.py index ffa7b2cceb..43b4102b96 100644 --- a/src/transformers/models/llava_next_video/processing_llava_next_video.py +++ b/src/transformers/models/llava_next_video/processing_llava_next_video.py @@ -107,6 +107,16 @@ class LlavaNextVideoProcessor(ProcessorMixin): self.vision_feature_select_strategy = vision_feature_select_strategy self.image_token = tokenizer.image_token if hasattr(tokenizer, "image_token") else image_token self.video_token = tokenizer.video_token if hasattr(tokenizer, "video_token") else video_token + self.image_token_id = ( + tokenizer.image_token_id + if getattr(tokenizer, "image_token_id", None) + else tokenizer.convert_tokens_to_ids(self.image_token) + ) + self.video_token_id = ( + tokenizer.video_token_id + if getattr(tokenizer, "video_token_id", None) + else tokenizer.convert_tokens_to_ids(self.video_token) + ) super().__init__(video_processor, image_processor, tokenizer, chat_template=chat_template) def __call__( diff --git a/src/transformers/models/llava_onevision/processing_llava_onevision.py b/src/transformers/models/llava_onevision/processing_llava_onevision.py index 502a514871..4b1443ab9e 100644 --- a/src/transformers/models/llava_onevision/processing_llava_onevision.py +++ b/src/transformers/models/llava_onevision/processing_llava_onevision.py @@ -100,6 +100,16 @@ class LlavaOnevisionProcessor(ProcessorMixin): self.vision_feature_select_strategy = vision_feature_select_strategy self.image_token = tokenizer.image_token if hasattr(tokenizer, "image_token") else image_token self.video_token = tokenizer.video_token if hasattr(tokenizer, "video_token") else video_token + self.image_token_id = ( + tokenizer.image_token_id + if getattr(tokenizer, "image_token_id", None) + else tokenizer.convert_tokens_to_ids(self.image_token) + ) + self.video_token_id = ( + tokenizer.video_token_id + if getattr(tokenizer, "video_token_id", None) + else tokenizer.convert_tokens_to_ids(self.video_token) + ) super().__init__(image_processor, tokenizer, video_processor, chat_template=chat_template) def __call__(