[vlm] adjust max length for special tokens (#37342)
* update * apply suggestion * fix tests for main branch * remove unused logger * add special tokens in tests * nit * fix more tests * fix test * pg also
This commit is contained in:
committed by
GitHub
parent
c94c59fc47
commit
32eca7197a
@@ -946,6 +946,8 @@ class AriaProcessor(ProcessorMixin):
|
|||||||
size_conversion = {490: 128, 980: 256}
|
size_conversion = {490: 128, 980: 256}
|
||||||
self.size_conversion = {int(k): v for k, v in size_conversion.items()}
|
self.size_conversion = {int(k): v for k, v in size_conversion.items()}
|
||||||
|
|
||||||
|
self.image_token = tokenizer.image_token
|
||||||
|
self.image_token_id = tokenizer.image_token_id
|
||||||
if tokenizer is not None and tokenizer.pad_token is None:
|
if tokenizer is not None and tokenizer.pad_token is None:
|
||||||
tokenizer.pad_token = tokenizer.unk_token
|
tokenizer.pad_token = tokenizer.unk_token
|
||||||
|
|
||||||
@@ -986,10 +988,12 @@ class AriaProcessor(ProcessorMixin):
|
|||||||
tokenizer_init_kwargs=self.tokenizer.init_kwargs,
|
tokenizer_init_kwargs=self.tokenizer.init_kwargs,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
|
|
||||||
if isinstance(text, str):
|
if isinstance(text, str):
|
||||||
text = [text]
|
text = [text]
|
||||||
elif not isinstance(text, list) and not isinstance(text[0], str):
|
elif not isinstance(text, list) and not isinstance(text[0], str):
|
||||||
raise ValueError("Invalid input text. Please provide a string, or a list of strings")
|
raise ValueError("Invalid input text. Please provide a string, or a list of strings")
|
||||||
|
|
||||||
if images is not None:
|
if images is not None:
|
||||||
image_inputs = self.image_processor(
|
image_inputs = self.image_processor(
|
||||||
images,
|
images,
|
||||||
@@ -1007,12 +1011,11 @@ class AriaProcessor(ProcessorMixin):
|
|||||||
image_inputs = {}
|
image_inputs = {}
|
||||||
prompt_strings = text
|
prompt_strings = text
|
||||||
|
|
||||||
text_inputs = self.tokenizer(
|
return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
|
||||||
prompt_strings,
|
text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"])
|
||||||
**output_kwargs["text_kwargs"],
|
self._check_special_mm_tokens(prompt_strings, text_inputs, modalities=["image"])
|
||||||
)
|
|
||||||
|
|
||||||
return BatchFeature(data={**text_inputs, **image_inputs})
|
return BatchFeature(data={**text_inputs, **image_inputs}, tensor_type=return_tensors)
|
||||||
|
|
||||||
def batch_decode(self, *args, **kwargs):
|
def batch_decode(self, *args, **kwargs):
|
||||||
"""
|
"""
|
||||||
|
|||||||
@@ -72,6 +72,8 @@ class AriaProcessor(ProcessorMixin):
|
|||||||
size_conversion = {490: 128, 980: 256}
|
size_conversion = {490: 128, 980: 256}
|
||||||
self.size_conversion = {int(k): v for k, v in size_conversion.items()}
|
self.size_conversion = {int(k): v for k, v in size_conversion.items()}
|
||||||
|
|
||||||
|
self.image_token = tokenizer.image_token
|
||||||
|
self.image_token_id = tokenizer.image_token_id
|
||||||
if tokenizer is not None and tokenizer.pad_token is None:
|
if tokenizer is not None and tokenizer.pad_token is None:
|
||||||
tokenizer.pad_token = tokenizer.unk_token
|
tokenizer.pad_token = tokenizer.unk_token
|
||||||
|
|
||||||
@@ -112,10 +114,12 @@ class AriaProcessor(ProcessorMixin):
|
|||||||
tokenizer_init_kwargs=self.tokenizer.init_kwargs,
|
tokenizer_init_kwargs=self.tokenizer.init_kwargs,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
|
|
||||||
if isinstance(text, str):
|
if isinstance(text, str):
|
||||||
text = [text]
|
text = [text]
|
||||||
elif not isinstance(text, list) and not isinstance(text[0], str):
|
elif not isinstance(text, list) and not isinstance(text[0], str):
|
||||||
raise ValueError("Invalid input text. Please provide a string, or a list of strings")
|
raise ValueError("Invalid input text. Please provide a string, or a list of strings")
|
||||||
|
|
||||||
if images is not None:
|
if images is not None:
|
||||||
image_inputs = self.image_processor(
|
image_inputs = self.image_processor(
|
||||||
images,
|
images,
|
||||||
@@ -133,12 +137,11 @@ class AriaProcessor(ProcessorMixin):
|
|||||||
image_inputs = {}
|
image_inputs = {}
|
||||||
prompt_strings = text
|
prompt_strings = text
|
||||||
|
|
||||||
text_inputs = self.tokenizer(
|
return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
|
||||||
prompt_strings,
|
text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"])
|
||||||
**output_kwargs["text_kwargs"],
|
self._check_special_mm_tokens(prompt_strings, text_inputs, modalities=["image"])
|
||||||
)
|
|
||||||
|
|
||||||
return BatchFeature(data={**text_inputs, **image_inputs})
|
return BatchFeature(data={**text_inputs, **image_inputs}, tensor_type=return_tensors)
|
||||||
|
|
||||||
def batch_decode(self, *args, **kwargs):
|
def batch_decode(self, *args, **kwargs):
|
||||||
"""
|
"""
|
||||||
|
|||||||
@@ -121,6 +121,7 @@ class AyaVisionProcessor(ProcessorMixin):
|
|||||||
super().__init__(image_processor, tokenizer, chat_template=chat_template)
|
super().__init__(image_processor, tokenizer, chat_template=chat_template)
|
||||||
|
|
||||||
self.image_token = image_token
|
self.image_token = image_token
|
||||||
|
self.image_token_id = tokenizer.convert_tokens_to_ids(self.image_token)
|
||||||
self.patch_size = patch_size * downsample_factor
|
self.patch_size = patch_size * downsample_factor
|
||||||
self.img_size = img_size
|
self.img_size = img_size
|
||||||
|
|
||||||
@@ -224,9 +225,11 @@ class AyaVisionProcessor(ProcessorMixin):
|
|||||||
|
|
||||||
text = processed_text
|
text = processed_text
|
||||||
|
|
||||||
|
return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
|
||||||
text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
|
text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
|
||||||
|
self._check_special_mm_tokens(text, text_inputs, modalities=["image"])
|
||||||
|
|
||||||
return BatchFeature(data={**text_inputs, **image_inputs})
|
return BatchFeature(data={**text_inputs, **image_inputs}, tensor_type=return_tensors)
|
||||||
|
|
||||||
def batch_decode(self, *args, **kwargs):
|
def batch_decode(self, *args, **kwargs):
|
||||||
"""
|
"""
|
||||||
|
|||||||
@@ -68,6 +68,7 @@ class ChameleonProcessor(ProcessorMixin):
|
|||||||
def __init__(self, image_processor, tokenizer, image_seq_length: int = 1024, image_token: str = "<image>"):
|
def __init__(self, image_processor, tokenizer, image_seq_length: int = 1024, image_token: str = "<image>"):
|
||||||
self.image_seq_length = image_seq_length
|
self.image_seq_length = image_seq_length
|
||||||
self.image_token = tokenizer.image_token if hasattr(tokenizer, "image_token") else image_token
|
self.image_token = tokenizer.image_token if hasattr(tokenizer, "image_token") else image_token
|
||||||
|
self.image_token_id = tokenizer.convert_tokens_to_ids(self.image_token)
|
||||||
self.image_start_token = (
|
self.image_start_token = (
|
||||||
tokenizer.boi_token if hasattr(tokenizer, "boi_token") else "<racm3:break>"
|
tokenizer.boi_token if hasattr(tokenizer, "boi_token") else "<racm3:break>"
|
||||||
) # fixed tokens for start and end, so can hardcode
|
) # fixed tokens for start and end, so can hardcode
|
||||||
@@ -140,12 +141,14 @@ class ChameleonProcessor(ProcessorMixin):
|
|||||||
sample += self.tokenizer.sep_token # special Chameleon treatment to add sep for chat mode
|
sample += self.tokenizer.sep_token # special Chameleon treatment to add sep for chat mode
|
||||||
prompt_strings.append(sample)
|
prompt_strings.append(sample)
|
||||||
|
|
||||||
|
return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
|
||||||
data = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"])
|
data = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"])
|
||||||
|
self._check_special_mm_tokens(prompt_strings, data, modalities=["image"])
|
||||||
|
|
||||||
if images is not None:
|
if images is not None:
|
||||||
data["pixel_values"] = self.image_processor(images, **output_kwargs["images_kwargs"])["pixel_values"]
|
data["pixel_values"] = self.image_processor(images, **output_kwargs["images_kwargs"])["pixel_values"]
|
||||||
|
|
||||||
return BatchFeature(data=data, tensor_type=output_kwargs["common_kwargs"]["return_tensors"])
|
return BatchFeature(data=data, tensor_type=return_tensors)
|
||||||
|
|
||||||
# Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Llama
|
# Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Llama
|
||||||
def batch_decode(self, *args, **kwargs):
|
def batch_decode(self, *args, **kwargs):
|
||||||
|
|||||||
@@ -75,6 +75,7 @@ class Emu3Processor(ProcessorMixin):
|
|||||||
**kwargs,
|
**kwargs,
|
||||||
):
|
):
|
||||||
self.image_token = tokenizer.image_token # image_token as placeholder to be replaced by vq-vae tokens
|
self.image_token = tokenizer.image_token # image_token as placeholder to be replaced by vq-vae tokens
|
||||||
|
self.image_token_id = tokenizer.image_token_id
|
||||||
self.image_start_token = tokenizer.boi_token # "<|image start|>" fixed tokens for start and end of image
|
self.image_start_token = tokenizer.boi_token # "<|image start|>" fixed tokens for start and end of image
|
||||||
self.image_end_token = tokenizer.eoi_token # "<|image end|>"
|
self.image_end_token = tokenizer.eoi_token # "<|image end|>"
|
||||||
self.fake_token_around_image = tokenizer.image_wrapper_token # "<|image token|>" every image starts with it
|
self.fake_token_around_image = tokenizer.image_wrapper_token # "<|image token|>" every image starts with it
|
||||||
@@ -177,10 +178,13 @@ class Emu3Processor(ProcessorMixin):
|
|||||||
image_features["image_sizes"] = [[height, width]] * len(text)
|
image_features["image_sizes"] = [[height, width]] * len(text)
|
||||||
|
|
||||||
# else just generate from text-only input, and we do no special treatment for text
|
# else just generate from text-only input, and we do no special treatment for text
|
||||||
|
return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
|
||||||
data = self.tokenizer(text, **output_kwargs["text_kwargs"])
|
data = self.tokenizer(text, **output_kwargs["text_kwargs"])
|
||||||
|
self._check_special_mm_tokens(text, data, modalities=["image"])
|
||||||
|
|
||||||
data.update(**image_features)
|
data.update(**image_features)
|
||||||
|
|
||||||
return BatchFeature(data=data, tensor_type=output_kwargs["common_kwargs"].pop("return_tensors", None))
|
return BatchFeature(data=data, tensor_type=return_tensors)
|
||||||
|
|
||||||
def calculate_generate_size(self, ratio, image_area, spatial_factor):
|
def calculate_generate_size(self, ratio, image_area, spatial_factor):
|
||||||
width, height = map(int, ratio.split(":"))
|
width, height = map(int, ratio.split(":"))
|
||||||
|
|||||||
@@ -65,7 +65,7 @@ class Gemma3Processor(ProcessorMixin):
|
|||||||
self.image_seq_length = image_seq_length
|
self.image_seq_length = image_seq_length
|
||||||
self.image_token_id = tokenizer.image_token_id
|
self.image_token_id = tokenizer.image_token_id
|
||||||
self.boi_token = tokenizer.boi_token
|
self.boi_token = tokenizer.boi_token
|
||||||
self.image_token = tokenizer.boi_token
|
self.image_token = tokenizer.image_token
|
||||||
image_tokens_expanded = "".join([tokenizer.image_token] * image_seq_length)
|
image_tokens_expanded = "".join([tokenizer.image_token] * image_seq_length)
|
||||||
self.full_image_sequence = f"\n\n{tokenizer.boi_token}{image_tokens_expanded}{tokenizer.eoi_token}\n\n"
|
self.full_image_sequence = f"\n\n{tokenizer.boi_token}{image_tokens_expanded}{tokenizer.eoi_token}\n\n"
|
||||||
|
|
||||||
@@ -138,6 +138,7 @@ class Gemma3Processor(ProcessorMixin):
|
|||||||
|
|
||||||
return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
|
return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
|
||||||
text_inputs = self.tokenizer(text=text, **output_kwargs["text_kwargs"], return_tensors="np")
|
text_inputs = self.tokenizer(text=text, **output_kwargs["text_kwargs"], return_tensors="np")
|
||||||
|
self._check_special_mm_tokens(text, text_inputs, modalities=["image"])
|
||||||
|
|
||||||
# Add token type ids manually, as tokenizer can't do arbitrary position token types
|
# Add token type ids manually, as tokenizer can't do arbitrary position token types
|
||||||
array_ids = text_inputs["input_ids"]
|
array_ids = text_inputs["input_ids"]
|
||||||
|
|||||||
@@ -107,6 +107,8 @@ class GotOcr2Processor(ProcessorMixin):
|
|||||||
self.img_start_token = "<img>"
|
self.img_start_token = "<img>"
|
||||||
self.img_end_token = "</img>"
|
self.img_end_token = "</img>"
|
||||||
self.img_pad_token = "<imgpad>"
|
self.img_pad_token = "<imgpad>"
|
||||||
|
self.image_token = "<imgpad>" # keep the above for BC, but we need to call it `image_token`
|
||||||
|
self.image_token_id = tokenizer.convert_tokens_to_ids(self.image_token)
|
||||||
self.system_query = "system\nYou should follow the instructions carefully and explain your answers in detail."
|
self.system_query = "system\nYou should follow the instructions carefully and explain your answers in detail."
|
||||||
|
|
||||||
def _make_list_of_inputs(self, images, text, box, color, multi_page):
|
def _make_list_of_inputs(self, images, text, box, color, multi_page):
|
||||||
@@ -250,8 +252,11 @@ class GotOcr2Processor(ProcessorMixin):
|
|||||||
)
|
)
|
||||||
text.append(prompt)
|
text.append(prompt)
|
||||||
|
|
||||||
|
return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
|
||||||
text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
|
text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
|
||||||
return BatchFeature(data={**text_inputs, **image_inputs})
|
self._check_special_mm_tokens(text, text_inputs, modalities=["image"])
|
||||||
|
|
||||||
|
return BatchFeature(data={**text_inputs, **image_inputs}, tensor_type=return_tensors)
|
||||||
|
|
||||||
def batch_decode(self, *args, **kwargs):
|
def batch_decode(self, *args, **kwargs):
|
||||||
"""
|
"""
|
||||||
|
|||||||
@@ -98,13 +98,15 @@ class Idefics2Processor(ProcessorMixin):
|
|||||||
raise ValueError("You need to specify a `tokenizer`.")
|
raise ValueError("You need to specify a `tokenizer`.")
|
||||||
|
|
||||||
if not hasattr(tokenizer, "image_token"):
|
if not hasattr(tokenizer, "image_token"):
|
||||||
self.fake_image_token = AddedToken("<fake_token_around_image>", normalized=False, special=True)
|
self.fake_image_token = AddedToken("<fake_token_around_image>", normalized=False, special=True).content
|
||||||
self.image_token = AddedToken("<image>", normalized=False, special=True)
|
self.image_token = AddedToken("<image>", normalized=False, special=True).content
|
||||||
tokens_to_add = {"additional_special_tokens": [self.fake_image_token, self.image_token]}
|
tokens_to_add = {"additional_special_tokens": [self.fake_image_token, self.image_token]}
|
||||||
tokenizer.add_special_tokens(tokens_to_add)
|
tokenizer.add_special_tokens(tokens_to_add)
|
||||||
|
self.image_token_id = tokenizer.convert_tokens_to_ids(self.image_token)
|
||||||
else:
|
else:
|
||||||
self.fake_image_token = tokenizer.image_boundary_token
|
self.fake_image_token = tokenizer.image_boundary_token
|
||||||
self.image_token = tokenizer.image_token
|
self.image_token = tokenizer.image_token
|
||||||
|
self.image_token_id = tokenizer.image_token_id
|
||||||
|
|
||||||
self.end_of_utterance_token = AddedToken("<end_of_utterance>", normalized=False, special=True)
|
self.end_of_utterance_token = AddedToken("<end_of_utterance>", normalized=False, special=True)
|
||||||
tokenizer.add_special_tokens({"additional_special_tokens": [self.end_of_utterance_token]})
|
tokenizer.add_special_tokens({"additional_special_tokens": [self.end_of_utterance_token]})
|
||||||
@@ -190,9 +192,10 @@ class Idefics2Processor(ProcessorMixin):
|
|||||||
)
|
)
|
||||||
image_seq_len = output_kwargs["images_kwargs"].pop("image_seq_len", None)
|
image_seq_len = output_kwargs["images_kwargs"].pop("image_seq_len", None)
|
||||||
image_seq_len = image_seq_len if image_seq_len is not None else self.image_seq_len
|
image_seq_len = image_seq_len if image_seq_len is not None else self.image_seq_len
|
||||||
|
return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
|
||||||
|
|
||||||
n_images_in_text = []
|
n_images_in_text = []
|
||||||
inputs = BatchFeature()
|
inputs = {}
|
||||||
|
|
||||||
if text is not None:
|
if text is not None:
|
||||||
if isinstance(text, str):
|
if isinstance(text, str):
|
||||||
@@ -201,13 +204,14 @@ class Idefics2Processor(ProcessorMixin):
|
|||||||
raise ValueError("Invalid input text. Please provide a string, or a list of strings")
|
raise ValueError("Invalid input text. Please provide a string, or a list of strings")
|
||||||
|
|
||||||
# Replace the image token with fake tokens around the expanded image token sequence of length `image_seq_len`
|
# Replace the image token with fake tokens around the expanded image token sequence of length `image_seq_len`
|
||||||
fake_image_token = self.fake_image_token.content
|
fake_image_token = self.fake_image_token
|
||||||
image_token = self.image_token.content
|
image_token = self.image_token
|
||||||
image_str = f"{fake_image_token}{image_token * image_seq_len}{fake_image_token}"
|
image_str = f"{fake_image_token}{image_token * image_seq_len}{fake_image_token}"
|
||||||
|
|
||||||
if self.image_processor.do_image_splitting:
|
if self.image_processor.do_image_splitting:
|
||||||
# A single image token is split into 4 patches + 1 original image
|
# A single image token is split into 4 patches + 1 original image
|
||||||
image_str = image_str * 5
|
image_str = image_str * 5
|
||||||
|
image_seq_len *= 5
|
||||||
|
|
||||||
prompt_strings = []
|
prompt_strings = []
|
||||||
for sample in text:
|
for sample in text:
|
||||||
@@ -218,6 +222,7 @@ class Idefics2Processor(ProcessorMixin):
|
|||||||
prompt_strings.append(sample)
|
prompt_strings.append(sample)
|
||||||
|
|
||||||
text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"])
|
text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"])
|
||||||
|
self._check_special_mm_tokens(prompt_strings, text_inputs, modalities=["image"])
|
||||||
inputs.update(text_inputs)
|
inputs.update(text_inputs)
|
||||||
|
|
||||||
if images is not None:
|
if images is not None:
|
||||||
@@ -259,7 +264,7 @@ class Idefics2Processor(ProcessorMixin):
|
|||||||
image_inputs = self.image_processor(images, **output_kwargs["images_kwargs"])
|
image_inputs = self.image_processor(images, **output_kwargs["images_kwargs"])
|
||||||
inputs.update(image_inputs)
|
inputs.update(image_inputs)
|
||||||
|
|
||||||
return inputs
|
return BatchFeature(inputs, tensor_type=return_tensors)
|
||||||
|
|
||||||
def batch_decode(self, *args, **kwargs):
|
def batch_decode(self, *args, **kwargs):
|
||||||
"""
|
"""
|
||||||
|
|||||||
@@ -141,9 +141,9 @@ class Idefics3Processor(ProcessorMixin):
|
|||||||
if tokenizer is None:
|
if tokenizer is None:
|
||||||
raise ValueError("You need to specify a `tokenizer`.")
|
raise ValueError("You need to specify a `tokenizer`.")
|
||||||
|
|
||||||
self.fake_image_token = AddedToken("<fake_token_around_image>", normalized=False, special=True)
|
self.fake_image_token = AddedToken("<fake_token_around_image>", normalized=False, special=True).content
|
||||||
self.image_token = AddedToken("<image>", normalized=False, special=True)
|
self.image_token = AddedToken("<image>", normalized=False, special=True).content
|
||||||
self.end_of_utterance_token = AddedToken("<end_of_utterance>", normalized=False, special=True)
|
self.end_of_utterance_token = AddedToken("<end_of_utterance>", normalized=False, special=True).content
|
||||||
self.global_image_tag = "<global-img>" # https://github.com/huggingface/transformers/pull/32473/files/8063e5e17362571b693f1db95167f5443a3be1b2#r1734825341
|
self.global_image_tag = "<global-img>" # https://github.com/huggingface/transformers/pull/32473/files/8063e5e17362571b693f1db95167f5443a3be1b2#r1734825341
|
||||||
self.image_seq_len = image_seq_len
|
self.image_seq_len = image_seq_len
|
||||||
|
|
||||||
@@ -159,6 +159,7 @@ class Idefics3Processor(ProcessorMixin):
|
|||||||
]
|
]
|
||||||
}
|
}
|
||||||
tokenizer.add_special_tokens(tokens_to_add)
|
tokenizer.add_special_tokens(tokens_to_add)
|
||||||
|
self.image_token_id = tokenizer.convert_tokens_to_ids(self.image_token)
|
||||||
|
|
||||||
super().__init__(image_processor, tokenizer, chat_template=chat_template, **kwargs)
|
super().__init__(image_processor, tokenizer, chat_template=chat_template, **kwargs)
|
||||||
|
|
||||||
@@ -240,17 +241,18 @@ class Idefics3Processor(ProcessorMixin):
|
|||||||
)
|
)
|
||||||
|
|
||||||
image_seq_len = image_seq_len if image_seq_len is not None else self.image_seq_len
|
image_seq_len = image_seq_len if image_seq_len is not None else self.image_seq_len
|
||||||
|
return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
|
||||||
|
|
||||||
n_images_in_text = []
|
n_images_in_text = []
|
||||||
n_images_in_images = []
|
n_images_in_images = []
|
||||||
inputs = BatchFeature()
|
inputs = {}
|
||||||
|
|
||||||
if text is not None:
|
if text is not None:
|
||||||
if isinstance(text, str):
|
if isinstance(text, str):
|
||||||
text = [text]
|
text = [text]
|
||||||
elif not isinstance(text, list) and not isinstance(text[0], str):
|
elif not isinstance(text, list) and not isinstance(text[0], str):
|
||||||
raise ValueError("Invalid input text. Please provide a string, or a list of strings")
|
raise ValueError("Invalid input text. Please provide a string, or a list of strings")
|
||||||
n_images_in_text = [sample.count(self.image_token.content) for sample in text]
|
n_images_in_text = [sample.count(self.image_token) for sample in text]
|
||||||
|
|
||||||
if images is not None:
|
if images is not None:
|
||||||
if is_image_or_image_url(images):
|
if is_image_or_image_url(images):
|
||||||
@@ -259,8 +261,8 @@ class Idefics3Processor(ProcessorMixin):
|
|||||||
if text is not None:
|
if text is not None:
|
||||||
if sum(n_images_in_text) != len(images):
|
if sum(n_images_in_text) != len(images):
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
f"The total number of {self.image_token.content} tokens in the prompts should be the same as the number of images passed."
|
f"The total number of {self.image_token} tokens in the prompts should be the same as the number of images passed."
|
||||||
f" Found {sum(n_images_in_text)} {self.image_token.content} tokens and {len(images)} images."
|
f" Found {sum(n_images_in_text)} {self.image_token} tokens and {len(images)} images."
|
||||||
)
|
)
|
||||||
# Reorganize the images to match the prompts
|
# Reorganize the images to match the prompts
|
||||||
cumsum_images_in_text = [0] + list(accumulate(n_images_in_text))
|
cumsum_images_in_text = [0] + list(accumulate(n_images_in_text))
|
||||||
@@ -295,8 +297,8 @@ class Idefics3Processor(ProcessorMixin):
|
|||||||
image_rows = inputs.pop("rows", [[0] * len(text)])
|
image_rows = inputs.pop("rows", [[0] * len(text)])
|
||||||
image_cols = inputs.pop("cols", [[0] * len(text)])
|
image_cols = inputs.pop("cols", [[0] * len(text)])
|
||||||
|
|
||||||
fake_image_token = self.fake_image_token.content
|
fake_image_token = self.fake_image_token
|
||||||
image_token = self.image_token.content
|
image_token = self.image_token
|
||||||
global_img_token = self.global_image_tag
|
global_img_token = self.global_image_tag
|
||||||
|
|
||||||
prompt_strings = []
|
prompt_strings = []
|
||||||
@@ -324,18 +326,19 @@ class Idefics3Processor(ProcessorMixin):
|
|||||||
sample += image_prompt_string + split_sample[i + 1]
|
sample += image_prompt_string + split_sample[i + 1]
|
||||||
prompt_strings.append(sample)
|
prompt_strings.append(sample)
|
||||||
|
|
||||||
text_inputs = self.tokenizer(text=prompt_strings, **output_kwargs["text_kwargs"])
|
text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"])
|
||||||
|
self._check_special_mm_tokens(prompt_strings, text_inputs, modalities=["image"])
|
||||||
inputs.update(text_inputs)
|
inputs.update(text_inputs)
|
||||||
|
|
||||||
elif text is not None:
|
elif text is not None:
|
||||||
if any(n_images_in_text):
|
if any(n_images_in_text):
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
f"Found {sum(n_images_in_text)} {self.image_token.content} tokens in the text but no images were passed."
|
f"Found {sum(n_images_in_text)} {self.image_token} tokens in the text but no images were passed."
|
||||||
)
|
)
|
||||||
text_inputs = self.tokenizer(text=text, **output_kwargs["text_kwargs"])
|
text_inputs = self.tokenizer(text=text, **output_kwargs["text_kwargs"])
|
||||||
inputs.update(text_inputs)
|
inputs.update(text_inputs)
|
||||||
|
|
||||||
return inputs
|
return BatchFeature(inputs, tensor_type=return_tensors)
|
||||||
|
|
||||||
def batch_decode(self, *args, **kwargs):
|
def batch_decode(self, *args, **kwargs):
|
||||||
"""
|
"""
|
||||||
|
|||||||
@@ -122,6 +122,7 @@ class Llama4Processor(ProcessorMixin):
|
|||||||
|
|
||||||
self.fake_image_token = fake_image_token
|
self.fake_image_token = fake_image_token
|
||||||
self.image_token = image_token
|
self.image_token = image_token
|
||||||
|
self.image_token_id = tokenizer.convert_tokens_to_ids(self.image_token)
|
||||||
self.start_of_img_token = start_of_image_token
|
self.start_of_img_token = start_of_image_token
|
||||||
self.end_of_img_token = end_of_image_token
|
self.end_of_img_token = end_of_image_token
|
||||||
self.img_patch_token = patch_token
|
self.img_patch_token = patch_token
|
||||||
@@ -148,6 +149,7 @@ class Llama4Processor(ProcessorMixin):
|
|||||||
img_string += "<|tile_x_separator|>"
|
img_string += "<|tile_x_separator|>"
|
||||||
|
|
||||||
img_string += "<|tile_y_separator|>"
|
img_string += "<|tile_y_separator|>"
|
||||||
|
|
||||||
img_string += "<|image|>"
|
img_string += "<|image|>"
|
||||||
img_string += "<|patch|>" * num_patches_per_chunk
|
img_string += "<|patch|>" * num_patches_per_chunk
|
||||||
img_string += "<|image_end|>"
|
img_string += "<|image_end|>"
|
||||||
@@ -247,9 +249,11 @@ class Llama4Processor(ProcessorMixin):
|
|||||||
|
|
||||||
text = processed_text
|
text = processed_text
|
||||||
|
|
||||||
|
return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
|
||||||
text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
|
text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
|
||||||
|
self._check_special_mm_tokens(text, text_inputs, modalities=["image"])
|
||||||
|
|
||||||
return BatchFeature(data={**text_inputs, **image_inputs})
|
return BatchFeature(data={**text_inputs, **image_inputs}, tensor_type=return_tensors)
|
||||||
|
|
||||||
def batch_decode(self, *args, **kwargs):
|
def batch_decode(self, *args, **kwargs):
|
||||||
"""
|
"""
|
||||||
|
|||||||
@@ -173,8 +173,10 @@ class LlavaProcessor(ProcessorMixin):
|
|||||||
sample = sample.replace(self.image_token, self.image_token * num_image_tokens)
|
sample = sample.replace(self.image_token, self.image_token * num_image_tokens)
|
||||||
prompt_strings.append(sample)
|
prompt_strings.append(sample)
|
||||||
|
|
||||||
|
return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
|
||||||
text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"])
|
text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"])
|
||||||
return BatchFeature(data={**text_inputs, **image_inputs})
|
self._check_special_mm_tokens(prompt_strings, text_inputs, modalities=["image"])
|
||||||
|
return BatchFeature(data={**text_inputs, **image_inputs}, tensor_type=return_tensors)
|
||||||
|
|
||||||
# Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Llama
|
# Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Llama
|
||||||
def batch_decode(self, *args, **kwargs):
|
def batch_decode(self, *args, **kwargs):
|
||||||
|
|||||||
@@ -171,9 +171,11 @@ class LlavaNextProcessor(ProcessorMixin):
|
|||||||
prompt_strings.append(sample)
|
prompt_strings.append(sample)
|
||||||
prompt_strings = [sample.replace("<placeholder>", self.image_token) for sample in prompt_strings]
|
prompt_strings = [sample.replace("<placeholder>", self.image_token) for sample in prompt_strings]
|
||||||
|
|
||||||
|
return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
|
||||||
text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"])
|
text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"])
|
||||||
|
self._check_special_mm_tokens(prompt_strings, text_inputs, modalities=["image"])
|
||||||
|
|
||||||
return BatchFeature(data={**text_inputs, **image_inputs})
|
return BatchFeature(data={**text_inputs, **image_inputs}, tensor_type=return_tensors)
|
||||||
|
|
||||||
def _get_number_of_features(self, orig_height: int, orig_width: int, height: int, width: int) -> int:
|
def _get_number_of_features(self, orig_height: int, orig_width: int, height: int, width: int) -> int:
|
||||||
image_grid_pinpoints = self.image_processor.image_grid_pinpoints
|
image_grid_pinpoints = self.image_processor.image_grid_pinpoints
|
||||||
|
|||||||
@@ -224,8 +224,11 @@ class LlavaNextVideoProcessor(ProcessorMixin):
|
|||||||
prompt_strings.append(sample)
|
prompt_strings.append(sample)
|
||||||
text = prompt_strings
|
text = prompt_strings
|
||||||
|
|
||||||
|
return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
|
||||||
text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
|
text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
|
||||||
return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs})
|
self._check_special_mm_tokens(text, text_inputs, modalities=["image", "video"])
|
||||||
|
|
||||||
|
return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs}, tensor_type=return_tensors)
|
||||||
|
|
||||||
# Copied from transformers.models.llava_next.processing_llava_next.LlavaNextProcessor._get_number_of_features
|
# Copied from transformers.models.llava_next.processing_llava_next.LlavaNextProcessor._get_number_of_features
|
||||||
def _get_number_of_features(self, orig_height: int, orig_width: int, height: int, width: int) -> int:
|
def _get_number_of_features(self, orig_height: int, orig_width: int, height: int, width: int) -> int:
|
||||||
|
|||||||
@@ -171,7 +171,7 @@ class LlavaOnevisionProcessor(ProcessorMixin):
|
|||||||
to_numpy_array(image_inputs["pixel_values"][0][0]),
|
to_numpy_array(image_inputs["pixel_values"][0][0]),
|
||||||
channel_dim=output_kwargs["images_kwargs"].get("data_format"),
|
channel_dim=output_kwargs["images_kwargs"].get("data_format"),
|
||||||
)
|
)
|
||||||
text = self._expand_image_tokens(text, image_sizes, height, width, self.image_token)
|
text, num_image_tokens = self._expand_image_tokens(text, image_sizes, height, width, self.image_token)
|
||||||
|
|
||||||
if videos is not None:
|
if videos is not None:
|
||||||
video_inputs = self.video_processor(videos, **output_kwargs["videos_kwargs"])
|
video_inputs = self.video_processor(videos, **output_kwargs["videos_kwargs"])
|
||||||
@@ -188,8 +188,11 @@ class LlavaOnevisionProcessor(ProcessorMixin):
|
|||||||
num_video_tokens = (num_frames * pooled_height_width * pooled_height_width) + 1 # +1 for newline token
|
num_video_tokens = (num_frames * pooled_height_width * pooled_height_width) + 1 # +1 for newline token
|
||||||
text = [sample.replace(self.video_token, self.video_token * num_video_tokens) for sample in text]
|
text = [sample.replace(self.video_token, self.video_token * num_video_tokens) for sample in text]
|
||||||
|
|
||||||
|
return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
|
||||||
text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
|
text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
|
||||||
return BatchFeature(data={**text_inputs, **image_inputs, **video_inputs})
|
self._check_special_mm_tokens(text, text_inputs, modalities=["image"])
|
||||||
|
|
||||||
|
return BatchFeature(data={**text_inputs, **image_inputs, **video_inputs}, tensor_type=return_tensors)
|
||||||
|
|
||||||
def _expand_image_tokens(
|
def _expand_image_tokens(
|
||||||
self,
|
self,
|
||||||
@@ -201,6 +204,7 @@ class LlavaOnevisionProcessor(ProcessorMixin):
|
|||||||
num_frames: int = 1,
|
num_frames: int = 1,
|
||||||
):
|
):
|
||||||
prompt_strings = []
|
prompt_strings = []
|
||||||
|
max_num_vision_tokens = 0
|
||||||
for sample in text:
|
for sample in text:
|
||||||
while special_token in sample:
|
while special_token in sample:
|
||||||
image_size_list = next(image_sizes)
|
image_size_list = next(image_sizes)
|
||||||
@@ -210,12 +214,13 @@ class LlavaOnevisionProcessor(ProcessorMixin):
|
|||||||
original_size = original_size.tolist()
|
original_size = original_size.tolist()
|
||||||
orig_height, orig_width = original_size
|
orig_height, orig_width = original_size
|
||||||
num_image_tokens = self._get_number_of_features(orig_height, orig_width, height, width)
|
num_image_tokens = self._get_number_of_features(orig_height, orig_width, height, width)
|
||||||
|
max_num_vision_tokens = max(max_num_vision_tokens, num_image_tokens)
|
||||||
if self.vision_feature_select_strategy == "default":
|
if self.vision_feature_select_strategy == "default":
|
||||||
num_image_tokens -= 1
|
num_image_tokens -= 1
|
||||||
sample = sample.replace(special_token, "<placeholder>" * num_image_tokens * num_frames, 1)
|
sample = sample.replace(special_token, "<placeholder>" * num_image_tokens * num_frames, 1)
|
||||||
prompt_strings.append(sample)
|
prompt_strings.append(sample)
|
||||||
text = [sample.replace("<placeholder>", special_token) for sample in prompt_strings]
|
text = [sample.replace("<placeholder>", special_token) for sample in prompt_strings]
|
||||||
return text
|
return text, max_num_vision_tokens
|
||||||
|
|
||||||
def _get_number_of_features(self, orig_height: int, orig_width: int, height: int, width: int) -> int:
|
def _get_number_of_features(self, orig_height: int, orig_width: int, height: int, width: int) -> int:
|
||||||
image_grid_pinpoints = self.image_processor.image_grid_pinpoints
|
image_grid_pinpoints = self.image_processor.image_grid_pinpoints
|
||||||
|
|||||||
@@ -274,6 +274,7 @@ class MllamaProcessor(ProcessorMixin):
|
|||||||
)
|
)
|
||||||
|
|
||||||
text_kwargs = output_kwargs["text_kwargs"]
|
text_kwargs = output_kwargs["text_kwargs"]
|
||||||
|
text_kwargs["return_tensors"] = None
|
||||||
images_kwargs = output_kwargs["images_kwargs"]
|
images_kwargs = output_kwargs["images_kwargs"]
|
||||||
common_kwargs = output_kwargs["common_kwargs"]
|
common_kwargs = output_kwargs["common_kwargs"]
|
||||||
|
|
||||||
@@ -287,6 +288,8 @@ class MllamaProcessor(ProcessorMixin):
|
|||||||
text = [build_string_from_input(text_item, self.bos_token, self.image_token) for text_item in text]
|
text = [build_string_from_input(text_item, self.bos_token, self.image_token) for text_item in text]
|
||||||
_ = text_kwargs.pop("padding_side", None) # hack until padding-side is an accepted kwarg by tokenizers
|
_ = text_kwargs.pop("padding_side", None) # hack until padding-side is an accepted kwarg by tokenizers
|
||||||
encoding = self.tokenizer(text, **text_kwargs)
|
encoding = self.tokenizer(text, **text_kwargs)
|
||||||
|
self._check_special_mm_tokens(text, encoding, modalities=["image"])
|
||||||
|
n_images_in_ids = [token_ids.count(self.image_token_id) for token_ids in encoding["input_ids"]]
|
||||||
data.update(encoding)
|
data.update(encoding)
|
||||||
|
|
||||||
n_images_in_images = [0]
|
n_images_in_images = [0]
|
||||||
@@ -301,13 +304,18 @@ class MllamaProcessor(ProcessorMixin):
|
|||||||
raise ValueError(
|
raise ValueError(
|
||||||
"If a batch of text is provided, there should be either no images or at least one image per sample"
|
"If a batch of text is provided, there should be either no images or at least one image per sample"
|
||||||
)
|
)
|
||||||
if sum(n_images_in_text) > 0 and n_images_in_images != n_images_in_text:
|
if sum(n_images_in_text) > 0 and (
|
||||||
|
n_images_in_images != n_images_in_text or n_images_in_ids != n_images_in_images
|
||||||
|
):
|
||||||
if images is None:
|
if images is None:
|
||||||
raise ValueError("No image were provided, but there are image tokens in the prompt")
|
raise ValueError("No image were provided, but there are image tokens in the prompt")
|
||||||
else:
|
else:
|
||||||
add_message = ""
|
add_message = ""
|
||||||
if sum(n_images_in_images) == sum(n_images_in_text):
|
if sum(n_images_in_images) == sum(n_images_in_text) and n_images_in_images != n_images_in_text:
|
||||||
add_message = "Make sure to pass your images as a nested list, where each sub-list holds images per batch"
|
add_message = "Make sure to pass your images as a nested list, where each sub-list holds images per batch"
|
||||||
|
elif n_images_in_ids != n_images_in_images:
|
||||||
|
add_message = "If you activated truncation with `max_length`, increase the `max_length` so image tokens aren't cropped."
|
||||||
|
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
f"The number of image tokens in each text ({n_images_in_text}) should be the same as the "
|
f"The number of image tokens in each text ({n_images_in_text}) should be the same as the "
|
||||||
f"number of provided images per batch ({n_images_in_images}). {add_message}"
|
f"number of provided images per batch ({n_images_in_images}). {add_message}"
|
||||||
|
|||||||
@@ -298,23 +298,21 @@ class PaliGemmaProcessor(ProcessorMixin):
|
|||||||
suffix = [sfx + self.tokenizer.eos_token for sfx in suffix]
|
suffix = [sfx + self.tokenizer.eos_token for sfx in suffix]
|
||||||
pixel_values = self.image_processor(images, **output_kwargs["images_kwargs"])["pixel_values"]
|
pixel_values = self.image_processor(images, **output_kwargs["images_kwargs"])["pixel_values"]
|
||||||
|
|
||||||
# max_length has to account for the image tokens
|
return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
|
||||||
if output_kwargs["text_kwargs"].get("max_length", None) is not None:
|
|
||||||
output_kwargs["text_kwargs"]["max_length"] += self.image_seq_length
|
|
||||||
|
|
||||||
inputs = self.tokenizer(
|
inputs = self.tokenizer(
|
||||||
input_strings,
|
input_strings,
|
||||||
text_pair=suffix,
|
text_pair=suffix,
|
||||||
return_token_type_ids=return_token_type_ids,
|
return_token_type_ids=return_token_type_ids,
|
||||||
**output_kwargs["text_kwargs"],
|
**output_kwargs["text_kwargs"],
|
||||||
)
|
)
|
||||||
|
self._check_special_mm_tokens(input_strings, inputs, modalities=["image"])
|
||||||
|
|
||||||
return_data = {**inputs, "pixel_values": pixel_values}
|
return_data = {**inputs, "pixel_values": pixel_values}
|
||||||
|
|
||||||
if return_token_type_ids:
|
if return_token_type_ids:
|
||||||
labels = inputs["input_ids"].masked_fill(inputs["token_type_ids"] == 0, -100)
|
labels = inputs["input_ids"].masked_fill(inputs["token_type_ids"] == 0, -100)
|
||||||
return_data.update({"labels": labels})
|
return_data.update({"labels": labels})
|
||||||
return BatchFeature(data=return_data)
|
return BatchFeature(data=return_data, tensor_type=return_tensors)
|
||||||
|
|
||||||
# Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Gemma
|
# Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Gemma
|
||||||
def batch_decode(self, *args, **kwargs):
|
def batch_decode(self, *args, **kwargs):
|
||||||
|
|||||||
@@ -71,6 +71,10 @@ class Phi4MultimodalProcessor(ProcessorMixin):
|
|||||||
tokenizer,
|
tokenizer,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
):
|
):
|
||||||
|
self.image_token = tokenizer.image_token
|
||||||
|
self.image_token_id = tokenizer.image_token_id
|
||||||
|
self.audio_token = tokenizer.audio_token
|
||||||
|
self.audio_token_id = tokenizer.audio_token_id
|
||||||
super().__init__(image_processor, audio_processor, tokenizer, **kwargs)
|
super().__init__(image_processor, audio_processor, tokenizer, **kwargs)
|
||||||
|
|
||||||
def __call__(
|
def __call__(
|
||||||
@@ -113,7 +117,6 @@ class Phi4MultimodalProcessor(ProcessorMixin):
|
|||||||
output_kwargs = self._merge_kwargs(Phi4MultimodalProcessorKwargs, self.tokenizer.init_kwargs, **kwargs)
|
output_kwargs = self._merge_kwargs(Phi4MultimodalProcessorKwargs, self.tokenizer.init_kwargs, **kwargs)
|
||||||
image_kwargs = output_kwargs["images_kwargs"]
|
image_kwargs = output_kwargs["images_kwargs"]
|
||||||
audio_kwargs = output_kwargs["audio_kwargs"]
|
audio_kwargs = output_kwargs["audio_kwargs"]
|
||||||
text_kwargs = output_kwargs["text_kwargs"]
|
|
||||||
|
|
||||||
image_inputs = self.image_processor(images, **image_kwargs) if images is not None else {}
|
image_inputs = self.image_processor(images, **image_kwargs) if images is not None else {}
|
||||||
audio_inputs = self.audio_processor(audio, **audio_kwargs) if audio is not None else {}
|
audio_inputs = self.audio_processor(audio, **audio_kwargs) if audio is not None else {}
|
||||||
@@ -154,7 +157,9 @@ class Phi4MultimodalProcessor(ProcessorMixin):
|
|||||||
re.sub(re.escape(audio_token), lambda _: audio_token * next(audio_count_iter), t) for t in processed_text
|
re.sub(re.escape(audio_token), lambda _: audio_token * next(audio_count_iter), t) for t in processed_text
|
||||||
]
|
]
|
||||||
|
|
||||||
text_inputs = self.tokenizer(processed_text, **text_kwargs)
|
return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
|
||||||
|
text_inputs = self.tokenizer(processed_text, **output_kwargs["text_kwargs"])
|
||||||
|
self._check_special_mm_tokens(processed_text, text_inputs, modalities=["image"])
|
||||||
|
|
||||||
# prepare batch feature
|
# prepare batch feature
|
||||||
data = {
|
data = {
|
||||||
@@ -163,7 +168,7 @@ class Phi4MultimodalProcessor(ProcessorMixin):
|
|||||||
**audio_inputs,
|
**audio_inputs,
|
||||||
}
|
}
|
||||||
|
|
||||||
return BatchFeature(data=data)
|
return BatchFeature(data=data, tensor_type=return_tensors)
|
||||||
|
|
||||||
def batch_decode(self, *args, **kwargs):
|
def batch_decode(self, *args, **kwargs):
|
||||||
"""
|
"""
|
||||||
|
|||||||
@@ -103,6 +103,7 @@ class PixtralProcessor(ProcessorMixin):
|
|||||||
self.patch_size = patch_size
|
self.patch_size = patch_size
|
||||||
self.spatial_merge_size = spatial_merge_size
|
self.spatial_merge_size = spatial_merge_size
|
||||||
self.image_token = image_token
|
self.image_token = image_token
|
||||||
|
self.image_token_id = tokenizer.convert_tokens_to_ids(self.image_token)
|
||||||
self.image_break_token = image_break_token
|
self.image_break_token = image_break_token
|
||||||
self.image_end_token = image_end_token
|
self.image_end_token = image_end_token
|
||||||
super().__init__(image_processor, tokenizer, chat_template=chat_template)
|
super().__init__(image_processor, tokenizer, chat_template=chat_template)
|
||||||
@@ -211,10 +212,10 @@ class PixtralProcessor(ProcessorMixin):
|
|||||||
sample = sample.replace("<placeholder>", replace_str, 1)
|
sample = sample.replace("<placeholder>", replace_str, 1)
|
||||||
prompt_strings.append(sample)
|
prompt_strings.append(sample)
|
||||||
|
|
||||||
|
return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
|
||||||
text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"])
|
text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"])
|
||||||
return BatchFeature(
|
self._check_special_mm_tokens(prompt_strings, text_inputs, modalities=["image"])
|
||||||
data={**text_inputs, **image_inputs}, tensor_type=output_kwargs["common_kwargs"]["return_tensors"]
|
return BatchFeature(data={**text_inputs, **image_inputs}, tensor_type=return_tensors)
|
||||||
)
|
|
||||||
|
|
||||||
# Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Llama
|
# Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Llama
|
||||||
def batch_decode(self, *args, **kwargs):
|
def batch_decode(self, *args, **kwargs):
|
||||||
|
|||||||
@@ -941,16 +941,14 @@ class Qwen2_5_VLProcessor(Qwen2VLProcessor):
|
|||||||
if not isinstance(text, list):
|
if not isinstance(text, list):
|
||||||
text = [text]
|
text = [text]
|
||||||
|
|
||||||
|
text = text.copy() # below lines change text in-place
|
||||||
if image_grid_thw is not None:
|
if image_grid_thw is not None:
|
||||||
merge_length = self.image_processor.merge_size**2
|
merge_length = self.image_processor.merge_size**2
|
||||||
index = 0
|
index = 0
|
||||||
for i in range(len(text)):
|
for i in range(len(text)):
|
||||||
while self.image_token in text[i]:
|
while self.image_token in text[i]:
|
||||||
text[i] = text[i].replace(
|
num_image_tokens = image_grid_thw[index].prod() // merge_length
|
||||||
self.image_token,
|
text[i] = text[i].replace(self.image_token, "<|placeholder|>" * num_image_tokens, 1)
|
||||||
"<|placeholder|>" * (image_grid_thw[index].prod() // merge_length),
|
|
||||||
1,
|
|
||||||
)
|
|
||||||
index += 1
|
index += 1
|
||||||
text[i] = text[i].replace("<|placeholder|>", self.image_token)
|
text[i] = text[i].replace("<|placeholder|>", self.image_token)
|
||||||
|
|
||||||
@@ -959,17 +957,16 @@ class Qwen2_5_VLProcessor(Qwen2VLProcessor):
|
|||||||
index = 0
|
index = 0
|
||||||
for i in range(len(text)):
|
for i in range(len(text)):
|
||||||
while self.video_token in text[i]:
|
while self.video_token in text[i]:
|
||||||
text[i] = text[i].replace(
|
num_video_tokens = video_grid_thw[index].prod() // merge_length
|
||||||
self.video_token,
|
text[i] = text[i].replace(self.video_token, "<|placeholder|>" * num_video_tokens, 1)
|
||||||
"<|placeholder|>" * (video_grid_thw[index].prod() // merge_length),
|
|
||||||
1,
|
|
||||||
)
|
|
||||||
index += 1
|
index += 1
|
||||||
text[i] = text[i].replace("<|placeholder|>", self.video_token)
|
text[i] = text[i].replace("<|placeholder|>", self.video_token)
|
||||||
|
|
||||||
|
return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
|
||||||
text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
|
text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
|
||||||
|
self._check_special_mm_tokens(text, text_inputs, modalities=["image", "video"])
|
||||||
|
|
||||||
return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs})
|
return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs}, tensor_type=return_tensors)
|
||||||
|
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
|
|||||||
@@ -77,6 +77,16 @@ class Qwen2_5_VLProcessor(ProcessorMixin):
|
|||||||
def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs):
|
def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs):
|
||||||
self.image_token = "<|image_pad|>" if not hasattr(tokenizer, "image_token") else tokenizer.image_token
|
self.image_token = "<|image_pad|>" if not hasattr(tokenizer, "image_token") else tokenizer.image_token
|
||||||
self.video_token = "<|video_pad|>" if not hasattr(tokenizer, "video_token") else tokenizer.video_token
|
self.video_token = "<|video_pad|>" if not hasattr(tokenizer, "video_token") else tokenizer.video_token
|
||||||
|
self.image_token_id = (
|
||||||
|
tokenizer.image_token_id
|
||||||
|
if getattr(tokenizer, "image_token_id", None)
|
||||||
|
else tokenizer.convert_tokens_to_ids(self.image_token)
|
||||||
|
)
|
||||||
|
self.video_token_id = (
|
||||||
|
tokenizer.video_token_id
|
||||||
|
if getattr(tokenizer, "video_token_id", None)
|
||||||
|
else tokenizer.convert_tokens_to_ids(self.video_token)
|
||||||
|
)
|
||||||
super().__init__(image_processor, tokenizer, chat_template=chat_template)
|
super().__init__(image_processor, tokenizer, chat_template=chat_template)
|
||||||
|
|
||||||
def __call__(
|
def __call__(
|
||||||
@@ -157,16 +167,14 @@ class Qwen2_5_VLProcessor(ProcessorMixin):
|
|||||||
if not isinstance(text, list):
|
if not isinstance(text, list):
|
||||||
text = [text]
|
text = [text]
|
||||||
|
|
||||||
|
text = text.copy() # below lines change text in-place
|
||||||
if image_grid_thw is not None:
|
if image_grid_thw is not None:
|
||||||
merge_length = self.image_processor.merge_size**2
|
merge_length = self.image_processor.merge_size**2
|
||||||
index = 0
|
index = 0
|
||||||
for i in range(len(text)):
|
for i in range(len(text)):
|
||||||
while self.image_token in text[i]:
|
while self.image_token in text[i]:
|
||||||
text[i] = text[i].replace(
|
num_image_tokens = image_grid_thw[index].prod() // merge_length
|
||||||
self.image_token,
|
text[i] = text[i].replace(self.image_token, "<|placeholder|>" * num_image_tokens, 1)
|
||||||
"<|placeholder|>" * (image_grid_thw[index].prod() // merge_length),
|
|
||||||
1,
|
|
||||||
)
|
|
||||||
index += 1
|
index += 1
|
||||||
text[i] = text[i].replace("<|placeholder|>", self.image_token)
|
text[i] = text[i].replace("<|placeholder|>", self.image_token)
|
||||||
|
|
||||||
@@ -175,17 +183,16 @@ class Qwen2_5_VLProcessor(ProcessorMixin):
|
|||||||
index = 0
|
index = 0
|
||||||
for i in range(len(text)):
|
for i in range(len(text)):
|
||||||
while self.video_token in text[i]:
|
while self.video_token in text[i]:
|
||||||
text[i] = text[i].replace(
|
num_video_tokens = video_grid_thw[index].prod() // merge_length
|
||||||
self.video_token,
|
text[i] = text[i].replace(self.video_token, "<|placeholder|>" * num_video_tokens, 1)
|
||||||
"<|placeholder|>" * (video_grid_thw[index].prod() // merge_length),
|
|
||||||
1,
|
|
||||||
)
|
|
||||||
index += 1
|
index += 1
|
||||||
text[i] = text[i].replace("<|placeholder|>", self.video_token)
|
text[i] = text[i].replace("<|placeholder|>", self.video_token)
|
||||||
|
|
||||||
|
return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
|
||||||
text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
|
text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
|
||||||
|
self._check_special_mm_tokens(text, text_inputs, modalities=["image", "video"])
|
||||||
|
|
||||||
return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs})
|
return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs}, tensor_type=return_tensors)
|
||||||
|
|
||||||
def batch_decode(self, *args, **kwargs):
|
def batch_decode(self, *args, **kwargs):
|
||||||
"""
|
"""
|
||||||
|
|||||||
@@ -76,6 +76,7 @@ class Qwen2AudioProcessor(ProcessorMixin):
|
|||||||
if chat_template is None:
|
if chat_template is None:
|
||||||
chat_template = self.default_chat_template
|
chat_template = self.default_chat_template
|
||||||
self.audio_token = tokenizer.audio_token if hasattr(tokenizer, "audio_token") else audio_token
|
self.audio_token = tokenizer.audio_token if hasattr(tokenizer, "audio_token") else audio_token
|
||||||
|
self.audio_token_id = tokenizer.convert_tokens_to_ids(self.audio_token)
|
||||||
self.audio_bos_token = tokenizer.audio_bos_token if hasattr(tokenizer, "audio_bos_token") else audio_bos_token
|
self.audio_bos_token = tokenizer.audio_bos_token if hasattr(tokenizer, "audio_bos_token") else audio_bos_token
|
||||||
self.audio_eos_token = tokenizer.audio_eos_token if hasattr(tokenizer, "audio_eos_token") else audio_eos_token
|
self.audio_eos_token = tokenizer.audio_eos_token if hasattr(tokenizer, "audio_eos_token") else audio_eos_token
|
||||||
super().__init__(feature_extractor, tokenizer, chat_template=chat_template)
|
super().__init__(feature_extractor, tokenizer, chat_template=chat_template)
|
||||||
@@ -179,12 +180,14 @@ class Qwen2AudioProcessor(ProcessorMixin):
|
|||||||
expanded_text.append(sample)
|
expanded_text.append(sample)
|
||||||
text = expanded_text
|
text = expanded_text
|
||||||
|
|
||||||
|
return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
|
||||||
inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
|
inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
|
||||||
|
self._check_special_mm_tokens(text, inputs, modalities=["audio"])
|
||||||
|
|
||||||
if audio is not None:
|
if audio is not None:
|
||||||
inputs.update(audio_inputs)
|
inputs.update(audio_inputs)
|
||||||
|
|
||||||
return BatchFeature(data={**inputs})
|
return BatchFeature(data={**inputs}, tensor_type=return_tensors)
|
||||||
|
|
||||||
def batch_decode(self, *args, **kwargs):
|
def batch_decode(self, *args, **kwargs):
|
||||||
"""
|
"""
|
||||||
|
|||||||
@@ -72,6 +72,16 @@ class Qwen2VLProcessor(ProcessorMixin):
|
|||||||
def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs):
|
def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs):
|
||||||
self.image_token = "<|image_pad|>" if not hasattr(tokenizer, "image_token") else tokenizer.image_token
|
self.image_token = "<|image_pad|>" if not hasattr(tokenizer, "image_token") else tokenizer.image_token
|
||||||
self.video_token = "<|video_pad|>" if not hasattr(tokenizer, "video_token") else tokenizer.video_token
|
self.video_token = "<|video_pad|>" if not hasattr(tokenizer, "video_token") else tokenizer.video_token
|
||||||
|
self.image_token_id = (
|
||||||
|
tokenizer.image_token_id
|
||||||
|
if getattr(tokenizer, "image_token_id", None)
|
||||||
|
else tokenizer.convert_tokens_to_ids(self.image_token)
|
||||||
|
)
|
||||||
|
self.video_token_id = (
|
||||||
|
tokenizer.video_token_id
|
||||||
|
if getattr(tokenizer, "video_token_id", None)
|
||||||
|
else tokenizer.convert_tokens_to_ids(self.video_token)
|
||||||
|
)
|
||||||
super().__init__(image_processor, tokenizer, chat_template=chat_template)
|
super().__init__(image_processor, tokenizer, chat_template=chat_template)
|
||||||
|
|
||||||
def __call__(
|
def __call__(
|
||||||
@@ -139,14 +149,15 @@ class Qwen2VLProcessor(ProcessorMixin):
|
|||||||
if not isinstance(text, list):
|
if not isinstance(text, list):
|
||||||
text = [text]
|
text = [text]
|
||||||
|
|
||||||
|
text = text.copy() # below lines change text in-place
|
||||||
|
|
||||||
if image_grid_thw is not None:
|
if image_grid_thw is not None:
|
||||||
merge_length = self.image_processor.merge_size**2
|
merge_length = self.image_processor.merge_size**2
|
||||||
index = 0
|
index = 0
|
||||||
for i in range(len(text)):
|
for i in range(len(text)):
|
||||||
while self.image_token in text[i]:
|
while self.image_token in text[i]:
|
||||||
text[i] = text[i].replace(
|
num_image_tokens = image_grid_thw[index].prod() // merge_length
|
||||||
self.image_token, "<|placeholder|>" * (image_grid_thw[index].prod() // merge_length), 1
|
text[i] = text[i].replace(self.image_token, "<|placeholder|>" * num_image_tokens, 1)
|
||||||
)
|
|
||||||
index += 1
|
index += 1
|
||||||
text[i] = text[i].replace("<|placeholder|>", self.image_token)
|
text[i] = text[i].replace("<|placeholder|>", self.image_token)
|
||||||
|
|
||||||
@@ -155,15 +166,15 @@ class Qwen2VLProcessor(ProcessorMixin):
|
|||||||
index = 0
|
index = 0
|
||||||
for i in range(len(text)):
|
for i in range(len(text)):
|
||||||
while self.video_token in text[i]:
|
while self.video_token in text[i]:
|
||||||
text[i] = text[i].replace(
|
num_video_tokens = video_grid_thw[index].prod() // merge_length
|
||||||
self.video_token, "<|placeholder|>" * (video_grid_thw[index].prod() // merge_length), 1
|
text[i] = text[i].replace(self.video_token, "<|placeholder|>" * num_video_tokens, 1)
|
||||||
)
|
|
||||||
index += 1
|
index += 1
|
||||||
text[i] = text[i].replace("<|placeholder|>", self.video_token)
|
text[i] = text[i].replace("<|placeholder|>", self.video_token)
|
||||||
|
|
||||||
|
return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
|
||||||
text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
|
text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
|
||||||
|
self._check_special_mm_tokens(text, text_inputs, modalities=["image", "video"])
|
||||||
return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs})
|
return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs}, tensor_type=return_tensors)
|
||||||
|
|
||||||
def batch_decode(self, *args, **kwargs):
|
def batch_decode(self, *args, **kwargs):
|
||||||
"""
|
"""
|
||||||
|
|||||||
@@ -149,6 +149,7 @@ class SmolVLMProcessor(ProcessorMixin):
|
|||||||
):
|
):
|
||||||
self.fake_image_token = getattr(tokenizer, "fake_image_token", "<fake_token_around_image>")
|
self.fake_image_token = getattr(tokenizer, "fake_image_token", "<fake_token_around_image>")
|
||||||
self.image_token = getattr(tokenizer, "image_token", "<image>")
|
self.image_token = getattr(tokenizer, "image_token", "<image>")
|
||||||
|
self.image_token_id = tokenizer.convert_tokens_to_ids(self.image_token)
|
||||||
self.end_of_utterance_token = getattr(tokenizer, "end_of_utterance_token", "<end_of_utterance>")
|
self.end_of_utterance_token = getattr(tokenizer, "end_of_utterance_token", "<end_of_utterance>")
|
||||||
self.global_image_token = getattr(tokenizer, "global_image_token", "<global-img>")
|
self.global_image_token = getattr(tokenizer, "global_image_token", "<global-img>")
|
||||||
self.image_seq_len = image_seq_len
|
self.image_seq_len = image_seq_len
|
||||||
@@ -290,7 +291,7 @@ class SmolVLMProcessor(ProcessorMixin):
|
|||||||
if n_images_in_text > 0 and (images is None and videos is None):
|
if n_images_in_text > 0 and (images is None and videos is None):
|
||||||
raise ValueError(f"We detected {n_images_in_text} tokens in the text but no images/videos were passed")
|
raise ValueError(f"We detected {n_images_in_text} tokens in the text but no images/videos were passed")
|
||||||
|
|
||||||
inputs = BatchFeature()
|
inputs = {}
|
||||||
# Images and videos are mutually exclusive, so process one which is present
|
# Images and videos are mutually exclusive, so process one which is present
|
||||||
if images is not None:
|
if images is not None:
|
||||||
images = make_nested_list_of_images(images)
|
images = make_nested_list_of_images(images)
|
||||||
@@ -313,11 +314,14 @@ class SmolVLMProcessor(ProcessorMixin):
|
|||||||
)
|
)
|
||||||
inputs.update(vision_inputs)
|
inputs.update(vision_inputs)
|
||||||
|
|
||||||
|
return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
|
||||||
|
|
||||||
if text is not None:
|
if text is not None:
|
||||||
text_inputs = self.tokenizer(text=text, **output_kwargs["text_kwargs"])
|
text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
|
||||||
|
self._check_special_mm_tokens(text, text_inputs, modalities=["image"])
|
||||||
inputs.update(text_inputs)
|
inputs.update(text_inputs)
|
||||||
|
|
||||||
return inputs
|
return BatchFeature(inputs, tensor_type=return_tensors)
|
||||||
|
|
||||||
def _process_messages_for_chat_template(
|
def _process_messages_for_chat_template(
|
||||||
self,
|
self,
|
||||||
|
|||||||
@@ -87,6 +87,8 @@ class VideoLlavaProcessor(ProcessorMixin):
|
|||||||
self.vision_feature_select_strategy = vision_feature_select_strategy
|
self.vision_feature_select_strategy = vision_feature_select_strategy
|
||||||
self.image_token = tokenizer.image_token if hasattr(tokenizer, "image_token") else image_token
|
self.image_token = tokenizer.image_token if hasattr(tokenizer, "image_token") else image_token
|
||||||
self.video_token = tokenizer.video_token if hasattr(tokenizer, "video_token") else video_token
|
self.video_token = tokenizer.video_token if hasattr(tokenizer, "video_token") else video_token
|
||||||
|
self.image_token_id = tokenizer.convert_tokens_to_ids(self.image_token)
|
||||||
|
self.video_token_id = tokenizer.convert_tokens_to_ids(self.video_token)
|
||||||
super().__init__(image_processor, tokenizer, chat_template=chat_template)
|
super().__init__(image_processor, tokenizer, chat_template=chat_template)
|
||||||
|
|
||||||
def __call__(
|
def __call__(
|
||||||
@@ -195,14 +197,16 @@ class VideoLlavaProcessor(ProcessorMixin):
|
|||||||
|
|
||||||
text_inputs = self.tokenizer(
|
text_inputs = self.tokenizer(
|
||||||
prompt_strings,
|
prompt_strings,
|
||||||
return_tensors=return_tensors,
|
return_tensors=None,
|
||||||
padding=padding,
|
padding=padding,
|
||||||
truncation=truncation,
|
truncation=truncation,
|
||||||
max_length=max_length,
|
max_length=max_length,
|
||||||
)
|
)
|
||||||
|
self._check_special_mm_tokens(prompt_strings, text_inputs, modalities=["image", "video"])
|
||||||
|
|
||||||
data.update(text_inputs)
|
data.update(text_inputs)
|
||||||
|
|
||||||
return BatchFeature(data=data)
|
return BatchFeature(data=data, tensor_type=return_tensors)
|
||||||
|
|
||||||
# Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Llama
|
# Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Llama
|
||||||
def batch_decode(self, *args, **kwargs):
|
def batch_decode(self, *args, **kwargs):
|
||||||
|
|||||||
@@ -31,6 +31,7 @@ from huggingface_hub.errors import EntryNotFoundError
|
|||||||
|
|
||||||
from .audio_utils import load_audio
|
from .audio_utils import load_audio
|
||||||
from .dynamic_module_utils import custom_object_save
|
from .dynamic_module_utils import custom_object_save
|
||||||
|
from .feature_extraction_utils import BatchFeature
|
||||||
from .image_utils import (
|
from .image_utils import (
|
||||||
ChannelDimension,
|
ChannelDimension,
|
||||||
ImageInput,
|
ImageInput,
|
||||||
@@ -1615,6 +1616,23 @@ class ProcessorMixin(PushToHubMixin):
|
|||||||
"""
|
"""
|
||||||
return self.tokenizer.batch_decode(generated_outputs, skip_special_tokens=skip_special_tokens, **kwargs)
|
return self.tokenizer.batch_decode(generated_outputs, skip_special_tokens=skip_special_tokens, **kwargs)
|
||||||
|
|
||||||
|
def _check_special_mm_tokens(self, text: list[str], text_inputs: "BatchFeature", modalities: list[str]):
|
||||||
|
"""
|
||||||
|
Checks that number of special tokens in text and processed text is same. The count can be different
|
||||||
|
if tokenized text was truncated, leading to issues in model code.
|
||||||
|
"""
|
||||||
|
for modality in modalities:
|
||||||
|
token_str = getattr(self, f"{modality}_token")
|
||||||
|
token_id = getattr(self, f"{modality}_token_id")
|
||||||
|
ids_count = [list(ids).count(token_id) for ids in text_inputs["input_ids"]]
|
||||||
|
text_count = [sample.count(token_str) for sample in text]
|
||||||
|
|
||||||
|
if ids_count != text_count:
|
||||||
|
raise ValueError(
|
||||||
|
f"Mismatch in `{modality}` token count between text and `input_ids`. Got ids={ids_count} and text={text_count}. "
|
||||||
|
"Likely due to `truncation='max_length'`. Please disable truncation or increase `max_length`."
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def _validate_images_text_input_order(images, text):
|
def _validate_images_text_input_order(images, text):
|
||||||
"""
|
"""
|
||||||
|
|||||||
@@ -271,3 +271,29 @@ And who is that?<|im_end|>
|
|||||||
return_tensors="np",
|
return_tensors="np",
|
||||||
)
|
)
|
||||||
self.assertListEqual(list(out_dict[self.images_input_name].shape), [1, 3, 980, 980])
|
self.assertListEqual(list(out_dict[self.images_input_name].shape), [1, 3, 980, 980])
|
||||||
|
|
||||||
|
def test_special_mm_token_truncation(self):
|
||||||
|
"""Tests that special vision tokens do not get truncated when `truncation=True` is set."""
|
||||||
|
|
||||||
|
processor = self.get_processor()
|
||||||
|
|
||||||
|
input_str = self.prepare_text_inputs(batch_size=2, modality="image")
|
||||||
|
image_input = self.prepare_image_inputs(batch_size=2)
|
||||||
|
|
||||||
|
_ = processor(
|
||||||
|
text=input_str,
|
||||||
|
images=image_input,
|
||||||
|
return_tensors="pt",
|
||||||
|
truncation=None,
|
||||||
|
padding=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
with self.assertRaises(ValueError):
|
||||||
|
_ = processor(
|
||||||
|
text=input_str,
|
||||||
|
images=image_input,
|
||||||
|
return_tensors="pt",
|
||||||
|
truncation=True,
|
||||||
|
padding=True,
|
||||||
|
max_length=3,
|
||||||
|
)
|
||||||
|
|||||||
@@ -40,10 +40,37 @@ class ChameleonProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
|||||||
tokenizer = LlamaTokenizer(vocab_file=SAMPLE_VOCAB)
|
tokenizer = LlamaTokenizer(vocab_file=SAMPLE_VOCAB)
|
||||||
tokenizer.pad_token_id = 0
|
tokenizer.pad_token_id = 0
|
||||||
tokenizer.sep_token_id = 1
|
tokenizer.sep_token_id = 1
|
||||||
|
tokenizer.add_special_tokens({"additional_special_tokens": ["<image>"]})
|
||||||
processor = cls.processor_class(image_processor=image_processor, tokenizer=tokenizer, image_seq_length=2)
|
processor = cls.processor_class(image_processor=image_processor, tokenizer=tokenizer, image_seq_length=2)
|
||||||
processor.save_pretrained(cls.tmpdirname)
|
processor.save_pretrained(cls.tmpdirname)
|
||||||
cls.image_token = processor.image_token
|
cls.image_token = processor.image_token
|
||||||
|
|
||||||
|
def test_special_mm_token_truncation(self):
|
||||||
|
"""Tests that special vision tokens do not get truncated when `truncation=True` is set."""
|
||||||
|
|
||||||
|
processor = self.get_processor()
|
||||||
|
|
||||||
|
input_str = self.prepare_text_inputs(batch_size=2, modality="image")
|
||||||
|
image_input = self.prepare_image_inputs(batch_size=2)
|
||||||
|
|
||||||
|
_ = processor(
|
||||||
|
text=input_str,
|
||||||
|
images=image_input,
|
||||||
|
return_tensors="pt",
|
||||||
|
truncation=None,
|
||||||
|
padding=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
with self.assertRaises(ValueError):
|
||||||
|
_ = processor(
|
||||||
|
text=input_str,
|
||||||
|
images=image_input,
|
||||||
|
return_tensors="pt",
|
||||||
|
truncation=True,
|
||||||
|
padding=True,
|
||||||
|
max_length=20,
|
||||||
|
)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def prepare_processor_dict():
|
def prepare_processor_dict():
|
||||||
return {"image_seq_length": 2} # fmt: skip
|
return {"image_seq_length": 2} # fmt: skip
|
||||||
|
|||||||
@@ -124,3 +124,28 @@ class Gemma3ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
|||||||
# base image + 4 crops
|
# base image + 4 crops
|
||||||
self.assertEqual(len(inputs[self.images_input_name]), 5)
|
self.assertEqual(len(inputs[self.images_input_name]), 5)
|
||||||
self.assertEqual(len(inputs[self.text_input_name][0]), 67)
|
self.assertEqual(len(inputs[self.text_input_name][0]), 67)
|
||||||
|
|
||||||
|
def test_special_mm_token_truncation(self):
|
||||||
|
"""Tests that special vision tokens do not get truncated when `truncation=True` is set."""
|
||||||
|
|
||||||
|
processor = self.get_processor()
|
||||||
|
|
||||||
|
input_str = self.prepare_text_inputs(batch_size=2, modality="image")
|
||||||
|
image_input = self.prepare_image_inputs(batch_size=2)
|
||||||
|
_ = processor(
|
||||||
|
text=input_str,
|
||||||
|
images=image_input,
|
||||||
|
return_tensors="pt",
|
||||||
|
truncation=None,
|
||||||
|
padding=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
with self.assertRaises(ValueError):
|
||||||
|
_ = processor(
|
||||||
|
text=input_str,
|
||||||
|
images=image_input,
|
||||||
|
return_tensors="pt",
|
||||||
|
truncation=True,
|
||||||
|
padding=True,
|
||||||
|
max_length=5,
|
||||||
|
)
|
||||||
|
|||||||
@@ -66,8 +66,8 @@ class Idefics2ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
|||||||
)
|
)
|
||||||
)
|
)
|
||||||
cls.bos_token = processor.tokenizer.bos_token
|
cls.bos_token = processor.tokenizer.bos_token
|
||||||
cls.image_token = processor.image_token.content
|
cls.image_token = processor.image_token
|
||||||
cls.fake_image_token = processor.fake_image_token.content
|
cls.fake_image_token = processor.fake_image_token
|
||||||
|
|
||||||
cls.bos_token_id = processor.tokenizer.convert_tokens_to_ids(cls.bos_token)
|
cls.bos_token_id = processor.tokenizer.convert_tokens_to_ids(cls.bos_token)
|
||||||
cls.image_token_id = processor.tokenizer.convert_tokens_to_ids(cls.image_token)
|
cls.image_token_id = processor.tokenizer.convert_tokens_to_ids(cls.image_token)
|
||||||
|
|||||||
@@ -60,8 +60,8 @@ class Idefics3ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
|||||||
)
|
)
|
||||||
)
|
)
|
||||||
cls.bos_token = processor.tokenizer.bos_token
|
cls.bos_token = processor.tokenizer.bos_token
|
||||||
cls.image_token = processor.image_token.content
|
cls.image_token = processor.image_token
|
||||||
cls.fake_image_token = processor.fake_image_token.content
|
cls.fake_image_token = processor.fake_image_token
|
||||||
cls.global_img_token = processor.global_image_tag
|
cls.global_img_token = processor.global_image_tag
|
||||||
|
|
||||||
cls.bos_token_id = processor.tokenizer.convert_tokens_to_ids(cls.bos_token)
|
cls.bos_token_id = processor.tokenizer.convert_tokens_to_ids(cls.bos_token)
|
||||||
|
|||||||
@@ -40,6 +40,7 @@ class LlavaProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
|||||||
|
|
||||||
image_processor = CLIPImageProcessor(do_center_crop=False)
|
image_processor = CLIPImageProcessor(do_center_crop=False)
|
||||||
tokenizer = LlamaTokenizerFast.from_pretrained("huggyllama/llama-7b")
|
tokenizer = LlamaTokenizerFast.from_pretrained("huggyllama/llama-7b")
|
||||||
|
tokenizer.add_special_tokens({"additional_special_tokens": ["<image>"]})
|
||||||
processor_kwargs = cls.prepare_processor_dict()
|
processor_kwargs = cls.prepare_processor_dict()
|
||||||
processor = LlavaProcessor(image_processor, tokenizer, **processor_kwargs)
|
processor = LlavaProcessor(image_processor, tokenizer, **processor_kwargs)
|
||||||
processor.save_pretrained(cls.tmpdirname)
|
processor.save_pretrained(cls.tmpdirname)
|
||||||
@@ -79,3 +80,29 @@ class LlavaProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
|||||||
processor = LlavaProcessor.from_pretrained(checkpoint)
|
processor = LlavaProcessor.from_pretrained(checkpoint)
|
||||||
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
|
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
|
||||||
self.assertEqual(processor.tokenizer.__class__, tokenizer.__class__)
|
self.assertEqual(processor.tokenizer.__class__, tokenizer.__class__)
|
||||||
|
|
||||||
|
def test_special_mm_token_truncation(self):
|
||||||
|
"""Tests that special vision tokens do not get truncated when `truncation=True` is set."""
|
||||||
|
|
||||||
|
processor = LlavaProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")
|
||||||
|
|
||||||
|
input_str = self.prepare_text_inputs(batch_size=2, modality="image")
|
||||||
|
image_input = self.prepare_image_inputs(batch_size=2)
|
||||||
|
|
||||||
|
_ = processor(
|
||||||
|
text=input_str,
|
||||||
|
images=image_input,
|
||||||
|
return_tensors="pt",
|
||||||
|
truncation=None,
|
||||||
|
padding=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
with self.assertRaises(ValueError):
|
||||||
|
_ = processor(
|
||||||
|
text=input_str,
|
||||||
|
images=image_input,
|
||||||
|
return_tensors="pt",
|
||||||
|
truncation=True,
|
||||||
|
padding=True,
|
||||||
|
max_length=5,
|
||||||
|
)
|
||||||
|
|||||||
@@ -40,6 +40,7 @@ class LlavaNextProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
|||||||
|
|
||||||
image_processor = LlavaNextImageProcessor()
|
image_processor = LlavaNextImageProcessor()
|
||||||
tokenizer = LlamaTokenizerFast.from_pretrained("huggyllama/llama-7b")
|
tokenizer = LlamaTokenizerFast.from_pretrained("huggyllama/llama-7b")
|
||||||
|
tokenizer.add_special_tokens({"additional_special_tokens": ["<image>"]})
|
||||||
processor_kwargs = cls.prepare_processor_dict()
|
processor_kwargs = cls.prepare_processor_dict()
|
||||||
processor = LlavaNextProcessor(image_processor, tokenizer, **processor_kwargs)
|
processor = LlavaNextProcessor(image_processor, tokenizer, **processor_kwargs)
|
||||||
processor.save_pretrained(cls.tmpdirname)
|
processor.save_pretrained(cls.tmpdirname)
|
||||||
|
|||||||
@@ -41,6 +41,7 @@ class LlavaNextVideoProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
|||||||
image_processor = LlavaNextImageProcessor()
|
image_processor = LlavaNextImageProcessor()
|
||||||
video_processor = LlavaNextVideoImageProcessor()
|
video_processor = LlavaNextVideoImageProcessor()
|
||||||
tokenizer = LlamaTokenizerFast.from_pretrained("llava-hf/LLaVA-NeXT-Video-7B-hf")
|
tokenizer = LlamaTokenizerFast.from_pretrained("llava-hf/LLaVA-NeXT-Video-7B-hf")
|
||||||
|
tokenizer.add_special_tokens({"additional_special_tokens": ["<image>", "<video>"]})
|
||||||
processor_kwargs = cls.prepare_processor_dict()
|
processor_kwargs = cls.prepare_processor_dict()
|
||||||
|
|
||||||
processor = LlavaNextVideoProcessor(
|
processor = LlavaNextVideoProcessor(
|
||||||
|
|||||||
@@ -45,6 +45,7 @@ class LlavaOnevisionProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
|||||||
image_processor = LlavaOnevisionImageProcessor()
|
image_processor = LlavaOnevisionImageProcessor()
|
||||||
video_processor = LlavaOnevisionVideoProcessor()
|
video_processor = LlavaOnevisionVideoProcessor()
|
||||||
tokenizer = Qwen2TokenizerFast.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
|
tokenizer = Qwen2TokenizerFast.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
|
||||||
|
tokenizer.add_special_tokens({"additional_special_tokens": ["<image>", "<video>"]})
|
||||||
processor_kwargs = cls.prepare_processor_dict()
|
processor_kwargs = cls.prepare_processor_dict()
|
||||||
|
|
||||||
processor = LlavaOnevisionProcessor(
|
processor = LlavaOnevisionProcessor(
|
||||||
|
|||||||
@@ -290,3 +290,29 @@ class Mistral3ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
|||||||
inputs_image = processor(text=prompt_string, images=image_inputs, return_tensors="pt", padding=True)
|
inputs_image = processor(text=prompt_string, images=image_inputs, return_tensors="pt", padding=True)
|
||||||
self.assertIn("input_ids", inputs_image)
|
self.assertIn("input_ids", inputs_image)
|
||||||
self.assertTrue(len(inputs_image["input_ids"]) == 5)
|
self.assertTrue(len(inputs_image["input_ids"]) == 5)
|
||||||
|
|
||||||
|
def test_special_mm_token_truncation(self):
|
||||||
|
"""Tests that special vision tokens do not get truncated when `truncation=True` is set."""
|
||||||
|
|
||||||
|
processor = self.get_processor()
|
||||||
|
|
||||||
|
input_str = self.prepare_text_inputs(batch_size=2, modality="image")
|
||||||
|
image_input = self.prepare_image_inputs(batch_size=2)
|
||||||
|
|
||||||
|
_ = processor(
|
||||||
|
text=input_str,
|
||||||
|
images=image_input,
|
||||||
|
return_tensors="pt",
|
||||||
|
truncation=None,
|
||||||
|
padding=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
with self.assertRaises(ValueError):
|
||||||
|
_ = processor(
|
||||||
|
text=input_str,
|
||||||
|
images=image_input,
|
||||||
|
return_tensors="pt",
|
||||||
|
truncation=True,
|
||||||
|
padding=True,
|
||||||
|
max_length=3,
|
||||||
|
)
|
||||||
|
|||||||
@@ -360,3 +360,29 @@ class MllamaProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
|||||||
len(inputs[self.text_input_name][0]) == len(inputs[self.text_input_name][1])
|
len(inputs[self.text_input_name][0]) == len(inputs[self.text_input_name][1])
|
||||||
and len(inputs[self.text_input_name][1]) < 76
|
and len(inputs[self.text_input_name][1]) < 76
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def test_special_mm_token_truncation(self):
|
||||||
|
"""Tests that special vision tokens do not get truncated when `truncation=True` is set."""
|
||||||
|
|
||||||
|
processor = self.get_processor()
|
||||||
|
|
||||||
|
input_str = self.prepare_text_inputs(batch_size=2, modality="image")
|
||||||
|
image_input = self.prepare_image_inputs(batch_size=2)
|
||||||
|
image_input = [[image_input[0]], [image_input[1]]]
|
||||||
|
_ = processor(
|
||||||
|
text=input_str,
|
||||||
|
images=image_input,
|
||||||
|
return_tensors="pt",
|
||||||
|
truncation=None,
|
||||||
|
padding=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
with self.assertRaises(ValueError):
|
||||||
|
_ = processor(
|
||||||
|
text=input_str,
|
||||||
|
images=image_input,
|
||||||
|
return_tensors="pt",
|
||||||
|
truncation=True,
|
||||||
|
padding=True,
|
||||||
|
max_length=3,
|
||||||
|
)
|
||||||
|
|||||||
@@ -39,6 +39,7 @@ class PaliGemmaProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
|||||||
image_processor = SiglipImageProcessor.from_pretrained("google/siglip-so400m-patch14-384")
|
image_processor = SiglipImageProcessor.from_pretrained("google/siglip-so400m-patch14-384")
|
||||||
image_processor.image_seq_length = 0 # TODO: raushan fix me in #37342
|
image_processor.image_seq_length = 0 # TODO: raushan fix me in #37342
|
||||||
tokenizer = GemmaTokenizer(SAMPLE_VOCAB, keep_accents=True)
|
tokenizer = GemmaTokenizer(SAMPLE_VOCAB, keep_accents=True)
|
||||||
|
tokenizer.add_special_tokens({"additional_special_tokens": ["<image>"]})
|
||||||
processor = PaliGemmaProcessor(image_processor=image_processor, tokenizer=tokenizer)
|
processor = PaliGemmaProcessor(image_processor=image_processor, tokenizer=tokenizer)
|
||||||
processor.save_pretrained(cls.tmpdirname)
|
processor.save_pretrained(cls.tmpdirname)
|
||||||
cls.image_token = processor.image_token
|
cls.image_token = processor.image_token
|
||||||
@@ -59,7 +60,7 @@ class PaliGemmaProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
|||||||
inputs = processor(
|
inputs = processor(
|
||||||
text=input_str, images=image_input, return_tensors="pt", max_length=112, padding="max_length"
|
text=input_str, images=image_input, return_tensors="pt", max_length=112, padding="max_length"
|
||||||
)
|
)
|
||||||
self.assertEqual(len(inputs["input_ids"][0]), 112 + 14)
|
self.assertEqual(len(inputs["input_ids"][0]), 112)
|
||||||
|
|
||||||
def test_text_with_image_tokens(self):
|
def test_text_with_image_tokens(self):
|
||||||
image_processor = self.get_component("image_processor")
|
image_processor = self.get_component("image_processor")
|
||||||
|
|||||||
@@ -397,3 +397,29 @@ class Qwen2VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
|||||||
self.assertEqual(inputs[self.images_input_name].shape[0], 100)
|
self.assertEqual(inputs[self.images_input_name].shape[0], 100)
|
||||||
inputs = processor(text=input_str, images=image_input, max_pixels=56 * 56 * 4, return_tensors="pt")
|
inputs = processor(text=input_str, images=image_input, max_pixels=56 * 56 * 4, return_tensors="pt")
|
||||||
self.assertEqual(inputs[self.images_input_name].shape[0], 612)
|
self.assertEqual(inputs[self.images_input_name].shape[0], 612)
|
||||||
|
|
||||||
|
def test_special_mm_token_truncation(self):
|
||||||
|
"""Tests that special vision tokens do not get truncated when `truncation=True` is set."""
|
||||||
|
|
||||||
|
processor = self.get_processor()
|
||||||
|
|
||||||
|
input_str = self.prepare_text_inputs(batch_size=2, modality="image")
|
||||||
|
image_input = self.prepare_image_inputs(batch_size=2)
|
||||||
|
|
||||||
|
_ = processor(
|
||||||
|
text=input_str,
|
||||||
|
images=image_input,
|
||||||
|
return_tensors="pt",
|
||||||
|
truncation=None,
|
||||||
|
padding=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
with self.assertRaises(ValueError):
|
||||||
|
_ = processor(
|
||||||
|
text=input_str,
|
||||||
|
images=image_input,
|
||||||
|
return_tensors="pt",
|
||||||
|
truncation=True,
|
||||||
|
padding=True,
|
||||||
|
max_length=20,
|
||||||
|
)
|
||||||
|
|||||||
@@ -435,7 +435,8 @@ class SmolVLMProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
|||||||
image_processor = self.get_component("image_processor")
|
image_processor = self.get_component("image_processor")
|
||||||
tokenizer = self.get_component("tokenizer")
|
tokenizer = self.get_component("tokenizer")
|
||||||
|
|
||||||
processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
|
processor_kwargs = self.prepare_processor_dict()
|
||||||
|
processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor, **processor_kwargs)
|
||||||
self.skip_processor_without_typed_kwargs(processor)
|
self.skip_processor_without_typed_kwargs(processor)
|
||||||
|
|
||||||
input_str = self.prepare_text_inputs(batch_size=2, modality="image")
|
input_str = self.prepare_text_inputs(batch_size=2, modality="image")
|
||||||
@@ -445,14 +446,14 @@ class SmolVLMProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
|||||||
text=input_str,
|
text=input_str,
|
||||||
images=image_input,
|
images=image_input,
|
||||||
return_tensors="pt",
|
return_tensors="pt",
|
||||||
padding="longest",
|
padding="max_length",
|
||||||
max_length=76,
|
max_length=76,
|
||||||
truncation=True,
|
truncation=True,
|
||||||
max_image_size={"longest_edge": 30},
|
max_image_size={"longest_edge": 300},
|
||||||
)
|
)
|
||||||
|
|
||||||
self.assertEqual(inputs["pixel_values"].shape[2], 3)
|
self.assertEqual(inputs["pixel_values"].shape[2], 3)
|
||||||
self.assertEqual(inputs["pixel_values"].shape[3], 30)
|
self.assertEqual(inputs["pixel_values"].shape[3], 300)
|
||||||
self.assertEqual(len(inputs["input_ids"][0]), 76)
|
self.assertEqual(len(inputs["input_ids"][0]), 76)
|
||||||
|
|
||||||
@require_torch
|
@require_torch
|
||||||
@@ -529,3 +530,29 @@ class SmolVLMProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
|||||||
with self.assertRaises(ValueError) as context:
|
with self.assertRaises(ValueError) as context:
|
||||||
processor(text=texts, images=None)
|
processor(text=texts, images=None)
|
||||||
self.assertTrue("tokens in the text but no images/videos were passed" in str(context.exception))
|
self.assertTrue("tokens in the text but no images/videos were passed" in str(context.exception))
|
||||||
|
|
||||||
|
def test_special_mm_token_truncation(self):
|
||||||
|
"""Tests that special vision tokens do not get truncated when `truncation=True` is set."""
|
||||||
|
|
||||||
|
processor = self.get_processor()
|
||||||
|
|
||||||
|
input_str = self.prepare_text_inputs(batch_size=2, modality="image")
|
||||||
|
image_input = self.prepare_image_inputs(batch_size=2)
|
||||||
|
image_input = [[image_input[0]], [image_input[1]]]
|
||||||
|
_ = processor(
|
||||||
|
text=input_str,
|
||||||
|
images=image_input,
|
||||||
|
return_tensors="pt",
|
||||||
|
truncation=None,
|
||||||
|
padding=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
with self.assertRaises(ValueError):
|
||||||
|
_ = processor(
|
||||||
|
text=input_str,
|
||||||
|
images=image_input,
|
||||||
|
return_tensors="pt",
|
||||||
|
truncation=True,
|
||||||
|
padding=True,
|
||||||
|
max_length=20,
|
||||||
|
)
|
||||||
|
|||||||
Reference in New Issue
Block a user