VLMs: patch_size -> num_image_tokens in processing (#33424)

* use `num_additional_image_tokens`

* fix copies + docs

* another fix copies :)

* add docs

* move order for backward compatibility (BC)
This commit is contained in:
Raushan Turganbay
2024-11-18 13:21:07 +01:00
committed by GitHub
parent 3ee24e2208
commit 1646ffb4d1
17 changed files with 131 additions and 15 deletions

View File

@@ -558,12 +558,14 @@ class LlavaNextVideoForConditionalGenerationIntegrationTest(unittest.TestCase):
# check processing with expansion of inputs
processor.vision_feature_select_strategy = "default"
processor.patch_size = 14
processor.num_additional_image_tokens = 1
inputs_expanded = processor(self.prompt_video, videos=[self.video], return_tensors="pt").to(torch_device)
self.assertTrue(inputs_expanded.input_ids.shape[-1] == 1170)
# check processing without expansion of inputs (legacy behavior)
processor.vision_feature_select_strategy = None
processor.patch_size = None
processor.num_additional_image_tokens = None
inputs = processor(self.prompt_video, videos=[self.video], return_tensors="pt").to(torch_device)
self.assertTrue(inputs.input_ids.shape[-1] == 19)
@@ -586,12 +588,14 @@ class LlavaNextVideoForConditionalGenerationIntegrationTest(unittest.TestCase):
# check processing with expansion of inputs
processor.vision_feature_select_strategy = "default"
processor.patch_size = 14
processor.num_additional_image_tokens = 1
inputs_expanded = processor(self.prompt_image, images=[self.image], return_tensors="pt").to(torch_device)
self.assertTrue(inputs_expanded.input_ids.shape[-1] == 2652)
# check processing without expansion of inputs (legacy behavior)
processor.vision_feature_select_strategy = None
processor.patch_size = None
processor.num_additional_image_tokens = None
inputs = processor(self.prompt_image, images=[self.image], return_tensors="pt").to(torch_device)
self.assertTrue(inputs.input_ids.shape[-1] == 19)
@@ -624,6 +628,7 @@ class LlavaNextVideoForConditionalGenerationIntegrationTest(unittest.TestCase):
# check processing with expansion of inputs
processor.vision_feature_select_strategy = "default"
processor.patch_size = 14
processor.num_additional_image_tokens = 1
inputs_expanded = processor(text=prompt, images=[raw_image, deer_image], return_tensors="pt").to(
torch_device, torch.float16
)
@@ -632,6 +637,7 @@ class LlavaNextVideoForConditionalGenerationIntegrationTest(unittest.TestCase):
# check processing without expansion of inputs (legacy behavior)
processor.vision_feature_select_strategy = None
processor.patch_size = None
processor.num_additional_image_tokens = None
inputs = processor(text=prompt, images=[raw_image, deer_image], return_tensors="pt").to(
torch_device, torch.float16
)