fix: AttributeError: 'LlavaProcessor' object has no attribute 'image_token_id' (#37026)
* Add image_token_id and video_token_id handling in Llava processors
* fix: image to video
* fix: correct image and video token ID handling in Llava processors
* fix: improve image and video token ID handling in Llava processors
This commit is contained in:
@@ -89,6 +89,11 @@ class LlavaProcessor(ProcessorMixin):
|
|||||||
self.num_additional_image_tokens = num_additional_image_tokens
|
self.num_additional_image_tokens = num_additional_image_tokens
|
||||||
self.vision_feature_select_strategy = vision_feature_select_strategy
|
self.vision_feature_select_strategy = vision_feature_select_strategy
|
||||||
self.image_token = tokenizer.image_token if hasattr(tokenizer, "image_token") else image_token
|
self.image_token = tokenizer.image_token if hasattr(tokenizer, "image_token") else image_token
|
||||||
|
self.image_token_id = (
|
||||||
|
tokenizer.image_token_id
|
||||||
|
if getattr(tokenizer, "image_token_id", None)
|
||||||
|
else tokenizer.convert_tokens_to_ids(self.image_token)
|
||||||
|
)
|
||||||
super().__init__(image_processor, tokenizer, chat_template=chat_template)
|
super().__init__(image_processor, tokenizer, chat_template=chat_template)
|
||||||
|
|
||||||
def __call__(
|
def __call__(
|
||||||
|
|||||||
@@ -92,6 +92,11 @@ class LlavaNextProcessor(ProcessorMixin):
|
|||||||
self.num_additional_image_tokens = num_additional_image_tokens
|
self.num_additional_image_tokens = num_additional_image_tokens
|
||||||
self.vision_feature_select_strategy = vision_feature_select_strategy
|
self.vision_feature_select_strategy = vision_feature_select_strategy
|
||||||
self.image_token = tokenizer.image_token if hasattr(tokenizer, "image_token") else image_token
|
self.image_token = tokenizer.image_token if hasattr(tokenizer, "image_token") else image_token
|
||||||
|
self.image_token_id = (
|
||||||
|
tokenizer.image_token_id
|
||||||
|
if getattr(tokenizer, "image_token_id", None)
|
||||||
|
else tokenizer.convert_tokens_to_ids(self.image_token)
|
||||||
|
)
|
||||||
super().__init__(image_processor, tokenizer, chat_template=chat_template)
|
super().__init__(image_processor, tokenizer, chat_template=chat_template)
|
||||||
|
|
||||||
def __call__(
|
def __call__(
|
||||||
|
|||||||
@@ -107,6 +107,16 @@ class LlavaNextVideoProcessor(ProcessorMixin):
|
|||||||
self.vision_feature_select_strategy = vision_feature_select_strategy
|
self.vision_feature_select_strategy = vision_feature_select_strategy
|
||||||
self.image_token = tokenizer.image_token if hasattr(tokenizer, "image_token") else image_token
|
self.image_token = tokenizer.image_token if hasattr(tokenizer, "image_token") else image_token
|
||||||
self.video_token = tokenizer.video_token if hasattr(tokenizer, "video_token") else video_token
|
self.video_token = tokenizer.video_token if hasattr(tokenizer, "video_token") else video_token
|
||||||
|
self.image_token_id = (
|
||||||
|
tokenizer.image_token_id
|
||||||
|
if getattr(tokenizer, "image_token_id", None)
|
||||||
|
else tokenizer.convert_tokens_to_ids(self.image_token)
|
||||||
|
)
|
||||||
|
self.video_token_id = (
|
||||||
|
tokenizer.video_token_id
|
||||||
|
if getattr(tokenizer, "video_token_id", None)
|
||||||
|
else tokenizer.convert_tokens_to_ids(self.video_token)
|
||||||
|
)
|
||||||
super().__init__(video_processor, image_processor, tokenizer, chat_template=chat_template)
|
super().__init__(video_processor, image_processor, tokenizer, chat_template=chat_template)
|
||||||
|
|
||||||
def __call__(
|
def __call__(
|
||||||
|
|||||||
@@ -100,6 +100,16 @@ class LlavaOnevisionProcessor(ProcessorMixin):
|
|||||||
self.vision_feature_select_strategy = vision_feature_select_strategy
|
self.vision_feature_select_strategy = vision_feature_select_strategy
|
||||||
self.image_token = tokenizer.image_token if hasattr(tokenizer, "image_token") else image_token
|
self.image_token = tokenizer.image_token if hasattr(tokenizer, "image_token") else image_token
|
||||||
self.video_token = tokenizer.video_token if hasattr(tokenizer, "video_token") else video_token
|
self.video_token = tokenizer.video_token if hasattr(tokenizer, "video_token") else video_token
|
||||||
|
self.image_token_id = (
|
||||||
|
tokenizer.image_token_id
|
||||||
|
if getattr(tokenizer, "image_token_id", None)
|
||||||
|
else tokenizer.convert_tokens_to_ids(self.image_token)
|
||||||
|
)
|
||||||
|
self.video_token_id = (
|
||||||
|
tokenizer.video_token_id
|
||||||
|
if getattr(tokenizer, "video_token_id", None)
|
||||||
|
else tokenizer.convert_tokens_to_ids(self.video_token)
|
||||||
|
)
|
||||||
super().__init__(image_processor, tokenizer, video_processor, chat_template=chat_template)
|
super().__init__(image_processor, tokenizer, video_processor, chat_template=chat_template)
|
||||||
|
|
||||||
def __call__(
|
def __call__(
|
||||||
|
|||||||
Reference in New Issue
Block a user