@@ -241,19 +241,19 @@ class AriaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTesterMi
|
||||
torch.testing.assert_close(out_embeds, out_ids)
|
||||
|
||||
@unittest.skip(
|
||||
reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
|
||||
reason="This architecture seems to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
|
||||
)
|
||||
def test_training_gradient_checkpointing(self):
|
||||
pass
|
||||
|
||||
@unittest.skip(
|
||||
reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
|
||||
reason="This architecture seems to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
|
||||
)
|
||||
def test_training_gradient_checkpointing_use_reentrant(self):
|
||||
pass
|
||||
|
||||
@unittest.skip(
|
||||
reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
|
||||
reason="This architecture seems to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
|
||||
)
|
||||
def test_training_gradient_checkpointing_use_reentrant_false(self):
|
||||
pass
|
||||
@@ -311,7 +311,7 @@ class AriaForConditionalGenerationIntegrationTest(unittest.TestCase):
|
||||
@slow
|
||||
@require_bitsandbytes
|
||||
def test_small_model_integration_test(self):
|
||||
# Let' s make sure we test the preprocessing to replace what is used
|
||||
# Let's make sure we test the preprocessing to replace what is used
|
||||
model = AriaForConditionalGeneration.from_pretrained("rhymes-ai/Aria", load_in_4bit=True)
|
||||
|
||||
prompt = "<image>\nUSER: What are the things I should be cautious about when I visit this place?\nASSISTANT:"
|
||||
@@ -333,7 +333,7 @@ class AriaForConditionalGenerationIntegrationTest(unittest.TestCase):
|
||||
@slow
|
||||
@require_bitsandbytes
|
||||
def test_small_model_integration_test_llama_single(self):
|
||||
# Let' s make sure we test the preprocessing to replace what is used
|
||||
# Let's make sure we test the preprocessing to replace what is used
|
||||
model_id = "rhymes-ai/Aria"
|
||||
|
||||
model = AriaForConditionalGeneration.from_pretrained(model_id, load_in_4bit=True)
|
||||
@@ -355,7 +355,7 @@ class AriaForConditionalGenerationIntegrationTest(unittest.TestCase):
|
||||
@slow
|
||||
@require_bitsandbytes
|
||||
def test_small_model_integration_test_llama_batched(self):
|
||||
# Let' s make sure we test the preprocessing to replace what is used
|
||||
# Let's make sure we test the preprocessing to replace what is used
|
||||
model_id = "rhymes-ai/Aria"
|
||||
|
||||
model = AriaForConditionalGeneration.from_pretrained(model_id, load_in_4bit=True)
|
||||
@@ -382,7 +382,7 @@ class AriaForConditionalGenerationIntegrationTest(unittest.TestCase):
|
||||
@slow
|
||||
@require_bitsandbytes
|
||||
def test_small_model_integration_test_batch(self):
|
||||
# Let' s make sure we test the preprocessing to replace what is used
|
||||
# Let's make sure we test the preprocessing to replace what is used
|
||||
model = AriaForConditionalGeneration.from_pretrained("rhymes-ai/Aria", load_in_4bit=True)
|
||||
# The first batch is longer in terms of text, but only has 1 image. The second batch will be padded in text, but the first will be padded because images take more space!
|
||||
prompts = [
|
||||
@@ -408,7 +408,7 @@ class AriaForConditionalGenerationIntegrationTest(unittest.TestCase):
|
||||
@slow
|
||||
@require_bitsandbytes
|
||||
def test_small_model_integration_test_llama_batched_regression(self):
|
||||
# Let' s make sure we test the preprocessing to replace what is used
|
||||
# Let's make sure we test the preprocessing to replace what is used
|
||||
model_id = "rhymes-ai/Aria"
|
||||
|
||||
# Multi-image & multi-prompt (e.g. 3 images and 2 prompts now fails with SDPA, this tests if "eager" works as before)
|
||||
@@ -442,7 +442,7 @@ class AriaForConditionalGenerationIntegrationTest(unittest.TestCase):
|
||||
|
||||
processor = AutoProcessor.from_pretrained("rhymes-ai/Aria")
|
||||
|
||||
prompt1 = "<image>\n<image>\nUSER: What's the the difference of two images?\nASSISTANT:"
|
||||
prompt1 = "<image>\n<image>\nUSER: What's the difference of two images?\nASSISTANT:"
|
||||
prompt2 = "<image>\nUSER: Describe the image.\nASSISTANT:"
|
||||
prompt3 = "<image>\nUSER: Describe the image.\nASSISTANT:"
|
||||
url1 = "https://images.unsplash.com/photo-1552053831-71594a27632d?q=80&w=3062&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D"
|
||||
@@ -460,7 +460,7 @@ class AriaForConditionalGenerationIntegrationTest(unittest.TestCase):
|
||||
model = model.eval()
|
||||
|
||||
EXPECTED_OUTPUT = [
|
||||
"\n \nUSER: What's the the difference of two images?\nASSISTANT: The difference between the two images is that one shows a dog standing on a grassy field, while",
|
||||
"\n \nUSER: What's the difference of two images?\nASSISTANT: The difference between the two images is that one shows a dog standing on a grassy field, while",
|
||||
"\nUSER: Describe the image.\nASSISTANT: The image features a brown and white dog sitting on a sidewalk. The dog is holding a small",
|
||||
"\nUSER: Describe the image.\nASSISTANT: The image features a lone llama standing on a grassy hill. The llama is the",
|
||||
]
|
||||
|
||||
@@ -253,7 +253,7 @@ class LlavaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTesterM
|
||||
def test_mismatching_num_image_tokens(self):
|
||||
"""
|
||||
Tests that VLMs throw an error with an explicit message saying what is wrong
|
||||
when number of images don't match number of image tokens in the text.
|
||||
when number of images doesn't match number of image tokens in the text.
|
||||
Also we need to test multi-image cases when one prompt has multiple image tokens.
|
||||
"""
|
||||
config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
@@ -306,19 +306,19 @@ class LlavaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTesterM
|
||||
model(**input_dict)
|
||||
|
||||
@unittest.skip(
|
||||
reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
|
||||
reason="This architecture seems to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
|
||||
)
|
||||
def test_training_gradient_checkpointing(self):
|
||||
pass
|
||||
|
||||
@unittest.skip(
|
||||
reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
|
||||
reason="This architecture seems to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
|
||||
)
|
||||
def test_training_gradient_checkpointing_use_reentrant(self):
|
||||
pass
|
||||
|
||||
@unittest.skip(
|
||||
reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
|
||||
reason="This architecture seems to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
|
||||
)
|
||||
def test_training_gradient_checkpointing_use_reentrant_false(self):
|
||||
pass
|
||||
@@ -345,7 +345,7 @@ class LlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
|
||||
@slow
|
||||
@require_bitsandbytes
|
||||
def test_small_model_integration_test(self):
|
||||
# Let' s make sure we test the preprocessing to replace what is used
|
||||
# Let's make sure we test the preprocessing to replace what is used
|
||||
model = LlavaForConditionalGeneration.from_pretrained("llava-hf/bakLlava-v1-hf", load_in_4bit=True)
|
||||
|
||||
prompt = "<image>\nUSER: What are the things I should be cautious about when I visit this place?\nASSISTANT:"
|
||||
@@ -364,7 +364,7 @@ class LlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
|
||||
@slow
|
||||
@require_bitsandbytes
|
||||
def test_small_model_integration_test_llama_single(self):
|
||||
# Let' s make sure we test the preprocessing to replace what is used
|
||||
# Let's make sure we test the preprocessing to replace what is used
|
||||
model_id = "llava-hf/llava-1.5-7b-hf"
|
||||
|
||||
model = LlavaForConditionalGeneration.from_pretrained("llava-hf/llava-1.5-7b-hf", load_in_4bit=True)
|
||||
@@ -386,7 +386,7 @@ class LlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
|
||||
@slow
|
||||
@require_bitsandbytes
|
||||
def test_small_model_integration_test_llama_batched(self):
|
||||
# Let' s make sure we test the preprocessing to replace what is used
|
||||
# Let's make sure we test the preprocessing to replace what is used
|
||||
model_id = "llava-hf/llava-1.5-7b-hf"
|
||||
|
||||
model = LlavaForConditionalGeneration.from_pretrained("llava-hf/llava-1.5-7b-hf", load_in_4bit=True)
|
||||
@@ -413,7 +413,7 @@ class LlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
|
||||
@slow
|
||||
@require_bitsandbytes
|
||||
def test_small_model_integration_test_batch(self):
|
||||
# Let' s make sure we test the preprocessing to replace what is used
|
||||
# Let's make sure we test the preprocessing to replace what is used
|
||||
model = LlavaForConditionalGeneration.from_pretrained("llava-hf/bakLlava-v1-hf", load_in_4bit=True)
|
||||
# The first batch is longer in terms of text, but only has 1 image. The second batch will be padded in text, but the first will be padded because images take more space!
|
||||
prompts = [
|
||||
@@ -441,7 +441,7 @@ class LlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
|
||||
@slow
|
||||
@require_bitsandbytes
|
||||
def test_small_model_integration_test_llama_batched_regression(self):
|
||||
# Let' s make sure we test the preprocessing to replace what is used
|
||||
# Let's make sure we test the preprocessing to replace what is used
|
||||
model_id = "llava-hf/llava-1.5-7b-hf"
|
||||
|
||||
# Multi-image & multi-prompt (e.g. 3 images and 2 prompts now fails with SDPA, this tests if "eager" works as before)
|
||||
@@ -478,7 +478,7 @@ class LlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
|
||||
|
||||
processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")
|
||||
|
||||
prompt1 = "<image>\n<image>\nUSER: What's the the difference of two images?\nASSISTANT:"
|
||||
prompt1 = "<image>\n<image>\nUSER: What's the difference of two images?\nASSISTANT:"
|
||||
prompt2 = "<image>\nUSER: Describe the image.\nASSISTANT:"
|
||||
prompt3 = "<image>\nUSER: Describe the image.\nASSISTANT:"
|
||||
url1 = "https://images.unsplash.com/photo-1552053831-71594a27632d?q=80&w=3062&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D"
|
||||
@@ -496,7 +496,7 @@ class LlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
|
||||
model = model.eval()
|
||||
|
||||
EXPECTED_OUTPUT = [
|
||||
"\n \nUSER: What's the the difference of two images?\nASSISTANT: The difference between the two images is that one shows a dog standing on a grassy field, while",
|
||||
"\n \nUSER: What's the difference of two images?\nASSISTANT: The difference between the two images is that one shows a dog standing on a grassy field, while",
|
||||
"\nUSER: Describe the image.\nASSISTANT: The image features a brown and white dog sitting on a sidewalk. The dog is holding a small",
|
||||
"\nUSER: Describe the image.\nASSISTANT: The image features a lone llama standing on a grassy hill. The llama is the",
|
||||
]
|
||||
@@ -617,7 +617,7 @@ These descriptions provide a detailed overview of the content and atmosphere of
|
||||
generate_ids = model.generate(**inputs, max_new_tokens=50)
|
||||
output = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
|
||||
|
||||
EXPECTED_GENERATION = "Describe the images.The image showcases a dog, which is prominently positioned in the center, taking up a significant portion of the frame. The dog is situated against a backdrop of a wooden surface, which spans the entire image. The dog appears to be a black Labrador" # fmt: skip
|
||||
EXPECTED_GENERATION = "Describe the images. The image showcases a dog, which is prominently positioned in the center, taking up a significant portion of the frame. The dog is situated against a backdrop of a wooden surface, which spans the entire image. The dog appears to be a black Labrador" # fmt: skip
|
||||
self.assertEqual(output, EXPECTED_GENERATION)
|
||||
|
||||
@slow
|
||||
|
||||
@@ -237,7 +237,7 @@ class PaliGemmaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTes
|
||||
def test_mismatching_num_image_tokens(self):
|
||||
"""
|
||||
Tests that VLMs throw an error with an explicit message saying what is wrong
|
||||
when number of images don't match number of image tokens in the text.
|
||||
when number of images doesn't match number of image tokens in the text.
|
||||
Also we need to test multi-image cases when one prompt has multiple image tokens.
|
||||
"""
|
||||
config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
|
||||
@@ -234,7 +234,7 @@ class PaliGemma2ForConditionalGenerationModelTest(ModelTesterMixin, GenerationTe
|
||||
def test_mismatching_num_image_tokens(self):
|
||||
"""
|
||||
Tests that VLMs throw an error with an explicit message saying what is wrong
|
||||
when number of images don't match number of image tokens in the text.
|
||||
when number of images doesn't match number of image tokens in the text.
|
||||
Also we need to test multi-image cases when one prompt has multiple image tokens.
|
||||
"""
|
||||
config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
|
||||
@@ -231,7 +231,7 @@ class VipLlavaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTest
|
||||
def test_mismatching_num_image_tokens(self):
|
||||
"""
|
||||
Tests that VLMs throw an error with an explicit message saying what is wrong
|
||||
when number of images don't match number of image tokens in the text.
|
||||
when number of images doesn't match number of image tokens in the text.
|
||||
Also we need to test multi-image cases when one prompt has multiple image tokens.
|
||||
"""
|
||||
config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
|
||||
@@ -458,7 +458,7 @@ class GPTQTestExllamaV2(unittest.TestCase):
|
||||
|
||||
def test_generate_quality(self):
|
||||
"""
|
||||
Simple test to check the quality of the model by comparing the the generated tokens with the expected tokens
|
||||
Simple test to check the quality of the model by comparing the generated tokens with the expected tokens
|
||||
"""
|
||||
self.check_inference_correctness(self.quantized_model)
|
||||
|
||||
|
||||
@@ -1090,7 +1090,7 @@ class ProcessorTesterMixin:
|
||||
]
|
||||
]
|
||||
|
||||
def dummmy_sample_indices_fn(metadata, **fn_kwargs):
|
||||
def dummy_sample_indices_fn(metadata, **fn_kwargs):
|
||||
# always sample only the first two frames
|
||||
return [0, 1]
|
||||
|
||||
@@ -1099,7 +1099,7 @@ class ProcessorTesterMixin:
|
||||
add_generation_prompt=True,
|
||||
tokenize=True,
|
||||
return_dict=True,
|
||||
sample_indices_fn=dummmy_sample_indices_fn,
|
||||
sample_indices_fn=dummy_sample_indices_fn,
|
||||
)
|
||||
self.assertTrue(self.videos_input_name in out_dict_with_video)
|
||||
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 1)
|
||||
|
||||
@@ -429,7 +429,7 @@ class ImageFeatureExtractionTester(unittest.TestCase):
|
||||
self.assertEqual(len(videos_list), 1)
|
||||
self.assertTrue(np.array_equal(videos_list[0][0], images))
|
||||
|
||||
# Test a 4d array of images is converted to a a list of 1 video
|
||||
# Test a 4d array of images is converted to a list of 1 video
|
||||
images = np.random.randint(0, 256, (4, 16, 32, 3))
|
||||
videos_list = make_batched_videos(images)
|
||||
self.assertIsInstance(videos_list[0], list)
|
||||
|
||||
Reference in New Issue
Block a user