|
|
|
@@ -292,7 +292,7 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
|
|
|
|
def test_qwen2_small_model_integration_generate(self):
|
|
|
|
def test_qwen2_small_model_integration_generate(self):
|
|
|
|
processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
|
|
|
|
processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
|
|
|
|
model = InternVLForConditionalGeneration.from_pretrained(
|
|
|
|
model = InternVLForConditionalGeneration.from_pretrained(
|
|
|
|
self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16
|
|
|
|
self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.float16
|
|
|
|
)
|
|
|
|
)
|
|
|
|
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
|
|
|
|
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
|
|
|
|
image = Image.open(requests.get(url, stream=True).raw)
|
|
|
|
image = Image.open(requests.get(url, stream=True).raw)
|
|
|
|
@@ -300,19 +300,20 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
|
|
|
|
prompt = (
|
|
|
|
prompt = (
|
|
|
|
"<|im_start|>user\n<IMG_CONTEXT>\nPlease describe the image explicitly.<|im_end|>\n<|im_start|>assistant\n"
|
|
|
|
"<|im_start|>user\n<IMG_CONTEXT>\nPlease describe the image explicitly.<|im_end|>\n<|im_start|>assistant\n"
|
|
|
|
)
|
|
|
|
)
|
|
|
|
inputs = processor(images=image, text=prompt, return_tensors="pt").to(torch_device, dtype=torch.bfloat16)
|
|
|
|
inputs = processor(images=image, text=prompt, return_tensors="pt").to(torch_device, dtype=torch.float16)
|
|
|
|
with torch.no_grad():
|
|
|
|
with torch.no_grad():
|
|
|
|
generate_ids = model.generate(**inputs, max_new_tokens=20, do_sample=False)
|
|
|
|
generate_ids = model.generate(**inputs, max_new_tokens=20, do_sample=False)
|
|
|
|
decoded_output = processor.decode(
|
|
|
|
decoded_output = processor.decode(
|
|
|
|
generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True
|
|
|
|
generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True
|
|
|
|
)
|
|
|
|
)
|
|
|
|
expected_output = "The image shows two cats lying on a pink blanket. The cat on the left is a tabby"
|
|
|
|
expected_output = "The image shows two cats lying on a pink surface, which appears to be a bed or couch."
|
|
|
|
|
|
|
|
|
|
|
|
self.assertEqual(decoded_output, expected_output)
|
|
|
|
self.assertEqual(decoded_output, expected_output)
|
|
|
|
|
|
|
|
|
|
|
|
def test_qwen2_small_model_integration_forward(self):
|
|
|
|
def test_qwen2_small_model_integration_forward(self):
|
|
|
|
processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
|
|
|
|
processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
|
|
|
|
model = InternVLForConditionalGeneration.from_pretrained(
|
|
|
|
model = InternVLForConditionalGeneration.from_pretrained(
|
|
|
|
self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16
|
|
|
|
self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.float16
|
|
|
|
)
|
|
|
|
)
|
|
|
|
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
|
|
|
|
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
|
|
|
|
image = Image.open(requests.get(url, stream=True).raw)
|
|
|
|
image = Image.open(requests.get(url, stream=True).raw)
|
|
|
|
@@ -320,7 +321,7 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
|
|
|
|
prompt = (
|
|
|
|
prompt = (
|
|
|
|
"<|im_start|>user\n<IMG_CONTEXT>\nPlease describe the image explicitly.<|im_end|>\n<|im_start|>assistant\n"
|
|
|
|
"<|im_start|>user\n<IMG_CONTEXT>\nPlease describe the image explicitly.<|im_end|>\n<|im_start|>assistant\n"
|
|
|
|
)
|
|
|
|
)
|
|
|
|
inputs = processor(images=image, text=prompt, return_tensors="pt").to(torch_device, dtype=torch.bfloat16)
|
|
|
|
inputs = processor(images=image, text=prompt, return_tensors="pt").to(torch_device, dtype=torch.float16)
|
|
|
|
|
|
|
|
|
|
|
|
# Forward
|
|
|
|
# Forward
|
|
|
|
with torch.inference_mode():
|
|
|
|
with torch.inference_mode():
|
|
|
|
@@ -329,9 +330,9 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
|
|
|
|
actual_logits = output.logits[0, -1, :5].cpu()
|
|
|
|
actual_logits = output.logits[0, -1, :5].cpu()
|
|
|
|
expected_logits_all = Expectations(
|
|
|
|
expected_logits_all = Expectations(
|
|
|
|
{
|
|
|
|
{
|
|
|
|
("xpu", 3): torch.tensor([11.7500, 14.7500, 14.1250, 10.5625, 6.7812], dtype=torch.bfloat16),
|
|
|
|
("xpu", 3): torch.tensor([11.7500, 14.7500, 14.1250, 10.5625, 6.7812], dtype=torch.float16),
|
|
|
|
("cuda", 7): torch.tensor([11.9375, 14.7500, 14.4375, 10.8125, 7.0938], dtype=torch.bfloat16),
|
|
|
|
("cuda", 7): torch.tensor([11.9531, 14.7031, 14.2734, 10.6562, 6.9219], dtype=torch.float16),
|
|
|
|
("cuda", 8): torch.tensor([11.8750, 14.8125, 14.3125, 10.8125, 6.9375], dtype=torch.bfloat16),
|
|
|
|
("cuda", 8): torch.tensor([11.9609, 14.7188, 14.2734, 10.6484, 6.9141], dtype=torch.float16),
|
|
|
|
}
|
|
|
|
}
|
|
|
|
) # fmt: skip
|
|
|
|
) # fmt: skip
|
|
|
|
expected_logits = expected_logits_all.get_expectation()
|
|
|
|
expected_logits = expected_logits_all.get_expectation()
|
|
|
|
@@ -347,10 +348,10 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
|
|
|
|
def test_qwen2_small_model_integration_generate_text_only(self):
|
|
|
|
def test_qwen2_small_model_integration_generate_text_only(self):
|
|
|
|
processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
|
|
|
|
processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
|
|
|
|
model = InternVLForConditionalGeneration.from_pretrained(
|
|
|
|
model = InternVLForConditionalGeneration.from_pretrained(
|
|
|
|
self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16
|
|
|
|
self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.float16
|
|
|
|
)
|
|
|
|
)
|
|
|
|
prompt = "<|im_start|>user\nWrite a haiku<|im_end|>\n<|im_start|>assistant\n"
|
|
|
|
prompt = "<|im_start|>user\nWrite a haiku<|im_end|>\n<|im_start|>assistant\n"
|
|
|
|
inputs = processor(text=prompt, return_tensors="pt").to(torch_device, dtype=torch.bfloat16)
|
|
|
|
inputs = processor(text=prompt, return_tensors="pt").to(torch_device, dtype=torch.float16)
|
|
|
|
with torch.no_grad():
|
|
|
|
with torch.no_grad():
|
|
|
|
generate_ids = model.generate(**inputs, max_new_tokens=200, do_sample=False)
|
|
|
|
generate_ids = model.generate(**inputs, max_new_tokens=200, do_sample=False)
|
|
|
|
decoded_output = processor.decode(
|
|
|
|
decoded_output = processor.decode(
|
|
|
|
@@ -360,8 +361,8 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
|
|
|
|
expected_outputs = Expectations(
|
|
|
|
expected_outputs = Expectations(
|
|
|
|
{
|
|
|
|
{
|
|
|
|
("xpu", 3): "Whispers of dawn,\nSilent whispers of the night,\nNew day's light.",
|
|
|
|
("xpu", 3): "Whispers of dawn,\nSilent whispers of the night,\nNew day's light.",
|
|
|
|
("cuda", 7): "Whispers of dawn,\nSilent whispers of the night,\nNew day's light.",
|
|
|
|
("cuda", 7): 'Whispers of dawn,\nSilent whispers of night,\nPeace in the stillness.',
|
|
|
|
("cuda", 8): "Whispers of dawn,\nSilent whispers of the night,\nNew day's light begins.",
|
|
|
|
("cuda", 8): 'Whispers of dawn,\nSilent whispers of night,\nPeace in the stillness.',
|
|
|
|
}
|
|
|
|
}
|
|
|
|
) # fmt: skip
|
|
|
|
) # fmt: skip
|
|
|
|
expected_output = expected_outputs.get_expectation()
|
|
|
|
expected_output = expected_outputs.get_expectation()
|
|
|
|
@@ -371,7 +372,7 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
|
|
|
|
def test_qwen2_small_model_integration_generate_chat_template(self):
|
|
|
|
def test_qwen2_small_model_integration_generate_chat_template(self):
|
|
|
|
processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
|
|
|
|
processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
|
|
|
|
model = InternVLForConditionalGeneration.from_pretrained(
|
|
|
|
model = InternVLForConditionalGeneration.from_pretrained(
|
|
|
|
self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16
|
|
|
|
self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.float16
|
|
|
|
)
|
|
|
|
)
|
|
|
|
messages = [
|
|
|
|
messages = [
|
|
|
|
{
|
|
|
|
{
|
|
|
|
@@ -385,20 +386,21 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
|
|
|
|
|
|
|
|
|
|
|
|
inputs = processor.apply_chat_template(
|
|
|
|
inputs = processor.apply_chat_template(
|
|
|
|
messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt"
|
|
|
|
messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt"
|
|
|
|
).to(torch_device, dtype=torch.bfloat16)
|
|
|
|
).to(torch_device, dtype=torch.float16)
|
|
|
|
with torch.no_grad():
|
|
|
|
with torch.no_grad():
|
|
|
|
generate_ids = model.generate(**inputs, max_new_tokens=20, do_sample=False)
|
|
|
|
generate_ids = model.generate(**inputs, max_new_tokens=20, do_sample=False)
|
|
|
|
decoded_output = processor.decode(
|
|
|
|
decoded_output = processor.decode(
|
|
|
|
generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True
|
|
|
|
generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True
|
|
|
|
)
|
|
|
|
)
|
|
|
|
expected_output = "The image shows two cats lying on a pink blanket. The cat on the left is a tabby"
|
|
|
|
expected_output = "The image shows two cats lying on a pink surface, which appears to be a bed or couch."
|
|
|
|
|
|
|
|
|
|
|
|
self.assertEqual(decoded_output, expected_output)
|
|
|
|
self.assertEqual(decoded_output, expected_output)
|
|
|
|
|
|
|
|
|
|
|
|
@require_deterministic_for_xpu
|
|
|
|
@require_deterministic_for_xpu
|
|
|
|
def test_qwen2_small_model_integration_batched_generate(self):
|
|
|
|
def test_qwen2_small_model_integration_batched_generate(self):
|
|
|
|
processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
|
|
|
|
processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
|
|
|
|
model = InternVLForConditionalGeneration.from_pretrained(
|
|
|
|
model = InternVLForConditionalGeneration.from_pretrained(
|
|
|
|
self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16
|
|
|
|
self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.float16
|
|
|
|
)
|
|
|
|
)
|
|
|
|
# Prepare inputs
|
|
|
|
# Prepare inputs
|
|
|
|
prompt = [
|
|
|
|
prompt = [
|
|
|
|
@@ -409,7 +411,7 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
|
|
|
|
image2 = Image.open(requests.get("https://www.ilankelman.org/stopsigns/australia.jpg", stream=True).raw)
|
|
|
|
image2 = Image.open(requests.get("https://www.ilankelman.org/stopsigns/australia.jpg", stream=True).raw)
|
|
|
|
|
|
|
|
|
|
|
|
inputs = processor(text=prompt, images=[[image1], [image2]], padding=True, return_tensors="pt").to(
|
|
|
|
inputs = processor(text=prompt, images=[[image1], [image2]], padding=True, return_tensors="pt").to(
|
|
|
|
torch_device, dtype=torch.bfloat16
|
|
|
|
torch_device, dtype=torch.float16
|
|
|
|
)
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
output = model.generate(**inputs, do_sample=False, max_new_tokens=25)
|
|
|
|
output = model.generate(**inputs, do_sample=False, max_new_tokens=25)
|
|
|
|
@@ -417,6 +419,7 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
|
|
|
|
# Check first output
|
|
|
|
# Check first output
|
|
|
|
decoded_output = processor.decode(output[0], skip_special_tokens=True)
|
|
|
|
decoded_output = processor.decode(output[0], skip_special_tokens=True)
|
|
|
|
expected_output = "user\n\nWrite a haiku for this image\nassistant\nSilky lake, \nWooden pier, \nNature's peace." # fmt: skip
|
|
|
|
expected_output = "user\n\nWrite a haiku for this image\nassistant\nSilky lake, \nWooden pier, \nNature's peace." # fmt: skip
|
|
|
|
|
|
|
|
|
|
|
|
self.assertEqual(
|
|
|
|
self.assertEqual(
|
|
|
|
decoded_output,
|
|
|
|
decoded_output,
|
|
|
|
expected_output,
|
|
|
|
expected_output,
|
|
|
|
@@ -428,7 +431,7 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
|
|
|
|
expected_outputs = Expectations(
|
|
|
|
expected_outputs = Expectations(
|
|
|
|
{
|
|
|
|
{
|
|
|
|
("xpu", 3): 'user\n\nDescribe this image\nassistant\nThe image shows a street scene with a traditional Chinese archway, known as a "Chinese Gate" or "Chinese Gate"',
|
|
|
|
("xpu", 3): 'user\n\nDescribe this image\nassistant\nThe image shows a street scene with a traditional Chinese archway, known as a "Chinese Gate" or "Chinese Gate"',
|
|
|
|
("cuda", 7): 'user\n\nDescribe this image\nassistant\nThe image shows a street scene with a traditional Chinese archway, known as a "Chinese Gate" or "Chinese Arch,"',
|
|
|
|
("cuda", 7): 'user\n\nDescribe this image\nassistant\nThe image shows a street scene with a traditional Chinese archway, known as a "Chinese Gate" or "Chinese Gate of',
|
|
|
|
}
|
|
|
|
}
|
|
|
|
) # fmt: skip
|
|
|
|
) # fmt: skip
|
|
|
|
expected_output = expected_outputs.get_expectation()
|
|
|
|
expected_output = expected_outputs.get_expectation()
|
|
|
|
@@ -442,7 +445,7 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
|
|
|
|
def test_qwen2_small_model_integration_batched_generate_multi_image(self):
|
|
|
|
def test_qwen2_small_model_integration_batched_generate_multi_image(self):
|
|
|
|
processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
|
|
|
|
processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
|
|
|
|
model = InternVLForConditionalGeneration.from_pretrained(
|
|
|
|
model = InternVLForConditionalGeneration.from_pretrained(
|
|
|
|
self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16
|
|
|
|
self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.float16
|
|
|
|
)
|
|
|
|
)
|
|
|
|
# Prepare inputs
|
|
|
|
# Prepare inputs
|
|
|
|
prompt = [
|
|
|
|
prompt = [
|
|
|
|
@@ -466,7 +469,7 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
|
|
|
|
)
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
inputs = processor(text=prompt, images=[[image1], [image2, image3]], padding=True, return_tensors="pt").to(
|
|
|
|
inputs = processor(text=prompt, images=[[image1], [image2, image3]], padding=True, return_tensors="pt").to(
|
|
|
|
torch_device, dtype=torch.bfloat16
|
|
|
|
torch_device, dtype=torch.float16
|
|
|
|
)
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
output = model.generate(**inputs, do_sample=False, max_new_tokens=25)
|
|
|
|
output = model.generate(**inputs, do_sample=False, max_new_tokens=25)
|
|
|
|
@@ -548,7 +551,7 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
|
|
|
|
def test_qwen2_small_model_integration_interleaved_images_videos(self):
|
|
|
|
def test_qwen2_small_model_integration_interleaved_images_videos(self):
|
|
|
|
processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
|
|
|
|
processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
|
|
|
|
model = InternVLForConditionalGeneration.from_pretrained(
|
|
|
|
model = InternVLForConditionalGeneration.from_pretrained(
|
|
|
|
self.small_model_checkpoint, torch_dtype=torch.bfloat16, device_map=torch_device
|
|
|
|
self.small_model_checkpoint, torch_dtype=torch.float16, device_map=torch_device
|
|
|
|
)
|
|
|
|
)
|
|
|
|
messages = [
|
|
|
|
messages = [
|
|
|
|
[
|
|
|
|
[
|
|
|
|
@@ -600,7 +603,7 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
|
|
|
|
return_tensors="pt",
|
|
|
|
return_tensors="pt",
|
|
|
|
padding=True,
|
|
|
|
padding=True,
|
|
|
|
num_frames=8,
|
|
|
|
num_frames=8,
|
|
|
|
).to(torch_device, dtype=torch.bfloat16)
|
|
|
|
).to(torch_device, dtype=torch.float16)
|
|
|
|
|
|
|
|
|
|
|
|
output = model.generate(**inputs, do_sample=False, max_new_tokens=25)
|
|
|
|
output = model.generate(**inputs, do_sample=False, max_new_tokens=25)
|
|
|
|
|
|
|
|
|
|
|
|
@@ -609,10 +612,11 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
|
|
|
|
expected_outputs = Expectations(
|
|
|
|
expected_outputs = Expectations(
|
|
|
|
{
|
|
|
|
{
|
|
|
|
("xpu", 3): "user\n\n\nWhat are the differences between these two images?\nassistant\nThe images depict two distinct scenes:\n\n1. **Left Image:**\n - The Statue of Liberty is prominently featured on an",
|
|
|
|
("xpu", 3): "user\n\n\nWhat are the differences between these two images?\nassistant\nThe images depict two distinct scenes:\n\n1. **Left Image:**\n - The Statue of Liberty is prominently featured on an",
|
|
|
|
("cuda", 7): "user\n\n\nWhat are the differences between these two images?\nassistant\nThe images depict two distinct scenes:\n\n1. **Left Image:**\n - The Statue of Liberty is prominently featured on an",
|
|
|
|
("cuda", 7): 'user\n\n\nWhat are the differences between these two images?\nassistant\nThe images depict two distinct scenes:\n\n1. **Left Image:**\n - The Statue of Liberty is prominently featured on an',
|
|
|
|
}
|
|
|
|
}
|
|
|
|
) # fmt: skip
|
|
|
|
) # fmt: skip
|
|
|
|
expected_output = expected_outputs.get_expectation()
|
|
|
|
expected_output = expected_outputs.get_expectation()
|
|
|
|
|
|
|
|
|
|
|
|
self.assertEqual(
|
|
|
|
self.assertEqual(
|
|
|
|
decoded_output,
|
|
|
|
decoded_output,
|
|
|
|
expected_output,
|
|
|
|
expected_output,
|
|
|
|
@@ -623,7 +627,7 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
|
|
|
|
expected_outputs = Expectations(
|
|
|
|
expected_outputs = Expectations(
|
|
|
|
{
|
|
|
|
{
|
|
|
|
("xpu", 3): "user\nFrame1: \nFrame2: \nFrame3: \nFrame4: \nFrame5: \nFrame6: \nFrame7: \nFrame8: \nWhat type of shot is the man performing?\nassistant\nThe man is performing a forehand shot.",
|
|
|
|
("xpu", 3): "user\nFrame1: \nFrame2: \nFrame3: \nFrame4: \nFrame5: \nFrame6: \nFrame7: \nFrame8: \nWhat type of shot is the man performing?\nassistant\nThe man is performing a forehand shot.",
|
|
|
|
("cuda", 7): "user\nFrame1: \nFrame2: \nFrame3: \nFrame4: \nFrame5: \nFrame6: \nFrame7: \nFrame8: \nWhat type of shot is the man performing?\nassistant\nThe man is performing a forehand shot.",
|
|
|
|
("cuda", 7): 'user\nFrame1: \nFrame2: \nFrame3: \nFrame4: \nFrame5: \nFrame6: \nFrame7: \nFrame8: \nWhat type of shot is the man performing?\nassistant\nA forehand shot',
|
|
|
|
}
|
|
|
|
}
|
|
|
|
) # fmt: skip
|
|
|
|
) # fmt: skip
|
|
|
|
expected_output = expected_outputs.get_expectation()
|
|
|
|
expected_output = expected_outputs.get_expectation()
|
|
|
|
@@ -635,7 +639,9 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
|
|
|
|
|
|
|
|
|
|
|
|
# Check third output
|
|
|
|
# Check third output
|
|
|
|
decoded_output = processor.decode(output[2], skip_special_tokens=True)
|
|
|
|
decoded_output = processor.decode(output[2], skip_special_tokens=True)
|
|
|
|
expected_output = "user\n\nWrite a haiku for this image\nassistant\nSilky lake, \nWooden pier, \nNature's peace." # fmt: skip
|
|
|
|
expected_output = (
|
|
|
|
|
|
|
|
"user\n\nWrite a haiku for this image\nassistant\nSilky lake, \nWooden pier, \nNature's peace."
|
|
|
|
|
|
|
|
)
|
|
|
|
self.assertEqual(
|
|
|
|
self.assertEqual(
|
|
|
|
decoded_output,
|
|
|
|
decoded_output,
|
|
|
|
expected_output,
|
|
|
|
expected_output,
|
|
|
|
@@ -657,7 +663,7 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
|
|
|
|
def test_llama_small_model_integration_generate(self):
|
|
|
|
def test_llama_small_model_integration_generate(self):
|
|
|
|
processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
|
|
|
|
processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
|
|
|
|
model = InternVLForConditionalGeneration.from_pretrained(
|
|
|
|
model = InternVLForConditionalGeneration.from_pretrained(
|
|
|
|
self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16
|
|
|
|
self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.float16
|
|
|
|
)
|
|
|
|
)
|
|
|
|
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
|
|
|
|
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
|
|
|
|
image = Image.open(requests.get(url, stream=True).raw)
|
|
|
|
image = Image.open(requests.get(url, stream=True).raw)
|
|
|
|
@@ -665,7 +671,7 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
|
|
|
|
prompt = (
|
|
|
|
prompt = (
|
|
|
|
"<|im_start|>user\n<IMG_CONTEXT>\nPlease describe the image explicitly.<|im_end|>\n<|im_start|>assistant\n"
|
|
|
|
"<|im_start|>user\n<IMG_CONTEXT>\nPlease describe the image explicitly.<|im_end|>\n<|im_start|>assistant\n"
|
|
|
|
)
|
|
|
|
)
|
|
|
|
inputs = processor(images=image, text=prompt, return_tensors="pt").to(torch_device, dtype=torch.bfloat16)
|
|
|
|
inputs = processor(images=image, text=prompt, return_tensors="pt").to(torch_device, dtype=torch.float16)
|
|
|
|
with torch.no_grad():
|
|
|
|
with torch.no_grad():
|
|
|
|
generate_ids = model.generate(**inputs, max_new_tokens=20, do_sample=False)
|
|
|
|
generate_ids = model.generate(**inputs, max_new_tokens=20, do_sample=False)
|
|
|
|
decoded_output = processor.decode(
|
|
|
|
decoded_output = processor.decode(
|
|
|
|
@@ -677,7 +683,7 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
|
|
|
|
def test_llama_small_model_integration_forward(self):
|
|
|
|
def test_llama_small_model_integration_forward(self):
|
|
|
|
processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
|
|
|
|
processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
|
|
|
|
model = InternVLForConditionalGeneration.from_pretrained(
|
|
|
|
model = InternVLForConditionalGeneration.from_pretrained(
|
|
|
|
self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16
|
|
|
|
self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.float16
|
|
|
|
)
|
|
|
|
)
|
|
|
|
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
|
|
|
|
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
|
|
|
|
image = Image.open(requests.get(url, stream=True).raw)
|
|
|
|
image = Image.open(requests.get(url, stream=True).raw)
|
|
|
|
@@ -685,7 +691,7 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
|
|
|
|
prompt = (
|
|
|
|
prompt = (
|
|
|
|
"<|im_start|>user\n<IMG_CONTEXT>\nPlease describe the image explicitly.<|im_end|>\n<|im_start|>assistant\n"
|
|
|
|
"<|im_start|>user\n<IMG_CONTEXT>\nPlease describe the image explicitly.<|im_end|>\n<|im_start|>assistant\n"
|
|
|
|
)
|
|
|
|
)
|
|
|
|
inputs = processor(images=image, text=prompt, return_tensors="pt").to(torch_device, dtype=torch.bfloat16)
|
|
|
|
inputs = processor(images=image, text=prompt, return_tensors="pt").to(torch_device, dtype=torch.float16)
|
|
|
|
|
|
|
|
|
|
|
|
# Forward
|
|
|
|
# Forward
|
|
|
|
with torch.inference_mode():
|
|
|
|
with torch.inference_mode():
|
|
|
|
@@ -695,12 +701,12 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
|
|
|
|
|
|
|
|
|
|
|
|
expected_logits_all = Expectations(
|
|
|
|
expected_logits_all = Expectations(
|
|
|
|
{
|
|
|
|
{
|
|
|
|
("xpu", 3): torch.tensor([-9.8750, -0.5703, 1.4297, -10.3125, -10.3125], dtype=torch.bfloat16),
|
|
|
|
("xpu", 3): torch.tensor([-9.8750, -0.5703, 1.4297, -10.3125, -10.3125], dtype=torch.float16),
|
|
|
|
("cuda", 7): torch.tensor([-9.8750, -0.5703, 1.4297, -10.3125, -10.3125], dtype=torch.bfloat16),
|
|
|
|
("cuda", 7): torch.tensor([-9.8750, -0.4861, 1.4648, -10.3359, -10.3359], dtype=torch.float16),
|
|
|
|
("cuda", 8): torch.tensor([-9.8750, -0.5117, 1.4297, -10.3750, -10.3750], dtype=torch.bfloat16),
|
|
|
|
("cuda", 8): torch.tensor([-9.8906, -0.4995, 1.4473, -10.3359, -10.3438], dtype=torch.float16),
|
|
|
|
}
|
|
|
|
}
|
|
|
|
) # fmt: skip
|
|
|
|
) # fmt: skip
|
|
|
|
expected_logits = torch.tensor(expected_logits_all.get_expectation(), dtype=torch.bfloat16)
|
|
|
|
expected_logits = torch.tensor(expected_logits_all.get_expectation(), dtype=torch.float16)
|
|
|
|
|
|
|
|
|
|
|
|
# The original implementation and the transformers implementation do not match exactly, hence the higher tolerance.
|
|
|
|
# The original implementation and the transformers implementation do not match exactly, hence the higher tolerance.
|
|
|
|
# The difference is likely due to the different implementations of the attention mechanism (different order of operations)
|
|
|
|
# The difference is likely due to the different implementations of the attention mechanism (different order of operations)
|
|
|
|
@@ -716,22 +722,30 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
|
|
|
|
def test_llama_small_model_integration_generate_text_only(self):
|
|
|
|
def test_llama_small_model_integration_generate_text_only(self):
|
|
|
|
processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
|
|
|
|
processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
|
|
|
|
model = InternVLForConditionalGeneration.from_pretrained(
|
|
|
|
model = InternVLForConditionalGeneration.from_pretrained(
|
|
|
|
self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16
|
|
|
|
self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.float16
|
|
|
|
)
|
|
|
|
)
|
|
|
|
prompt = "<|im_start|>user\nWrite a haiku<|im_end|>\n<|im_start|>assistant\n"
|
|
|
|
prompt = "<|im_start|>user\nWrite a haiku<|im_end|>\n<|im_start|>assistant\n"
|
|
|
|
inputs = processor(text=prompt, return_tensors="pt").to(torch_device, dtype=torch.bfloat16)
|
|
|
|
inputs = processor(text=prompt, return_tensors="pt").to(torch_device, dtype=torch.float16)
|
|
|
|
with torch.no_grad():
|
|
|
|
with torch.no_grad():
|
|
|
|
generate_ids = model.generate(**inputs, max_new_tokens=200, do_sample=False)
|
|
|
|
generate_ids = model.generate(**inputs, max_new_tokens=200, do_sample=False)
|
|
|
|
decoded_output = processor.decode(
|
|
|
|
decoded_output = processor.decode(
|
|
|
|
generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True
|
|
|
|
generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True
|
|
|
|
)
|
|
|
|
)
|
|
|
|
expected_output = "Autumn leaves fall,\nNature's breath, a season's sigh,\nSilent woods awake."
|
|
|
|
|
|
|
|
|
|
|
|
expected_outputs = Expectations(
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
("cuda", 7): "Autumn leaves fall,\nNature's breath, a gentle sigh,\nSilent whispers.",
|
|
|
|
|
|
|
|
("cuda", 8): "Autumn leaves fall,\nNature's breath, a silent sigh,\nWinter's chill approaches.",
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
)
|
|
|
|
|
|
|
|
expected_output = expected_outputs.get_expectation()
|
|
|
|
|
|
|
|
|
|
|
|
self.assertEqual(decoded_output, expected_output)
|
|
|
|
self.assertEqual(decoded_output, expected_output)
|
|
|
|
|
|
|
|
|
|
|
|
def test_llama_small_model_integration_generate_chat_template(self):
|
|
|
|
def test_llama_small_model_integration_generate_chat_template(self):
|
|
|
|
processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
|
|
|
|
processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
|
|
|
|
model = InternVLForConditionalGeneration.from_pretrained(
|
|
|
|
model = InternVLForConditionalGeneration.from_pretrained(
|
|
|
|
self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16
|
|
|
|
self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.float16
|
|
|
|
)
|
|
|
|
)
|
|
|
|
messages = [
|
|
|
|
messages = [
|
|
|
|
{
|
|
|
|
{
|
|
|
|
@@ -745,7 +759,7 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
|
|
|
|
|
|
|
|
|
|
|
|
inputs = processor.apply_chat_template(
|
|
|
|
inputs = processor.apply_chat_template(
|
|
|
|
messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt"
|
|
|
|
messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt"
|
|
|
|
).to(torch_device, dtype=torch.bfloat16)
|
|
|
|
).to(torch_device, dtype=torch.float16)
|
|
|
|
with torch.no_grad():
|
|
|
|
with torch.no_grad():
|
|
|
|
generate_ids = model.generate(**inputs, max_new_tokens=20, do_sample=False)
|
|
|
|
generate_ids = model.generate(**inputs, max_new_tokens=20, do_sample=False)
|
|
|
|
decoded_output = processor.decode(
|
|
|
|
decoded_output = processor.decode(
|
|
|
|
@@ -757,7 +771,7 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
|
|
|
|
def test_llama_small_model_integration_batched_generate(self):
|
|
|
|
def test_llama_small_model_integration_batched_generate(self):
|
|
|
|
processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
|
|
|
|
processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
|
|
|
|
model = InternVLForConditionalGeneration.from_pretrained(
|
|
|
|
model = InternVLForConditionalGeneration.from_pretrained(
|
|
|
|
self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16
|
|
|
|
self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.float16
|
|
|
|
)
|
|
|
|
)
|
|
|
|
# Prepare inputs
|
|
|
|
# Prepare inputs
|
|
|
|
prompt = [
|
|
|
|
prompt = [
|
|
|
|
@@ -768,7 +782,7 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
|
|
|
|
image2 = Image.open(requests.get("https://www.ilankelman.org/stopsigns/australia.jpg", stream=True).raw)
|
|
|
|
image2 = Image.open(requests.get("https://www.ilankelman.org/stopsigns/australia.jpg", stream=True).raw)
|
|
|
|
|
|
|
|
|
|
|
|
inputs = processor(text=prompt, images=[[image1], [image2]], padding=True, return_tensors="pt").to(
|
|
|
|
inputs = processor(text=prompt, images=[[image1], [image2]], padding=True, return_tensors="pt").to(
|
|
|
|
torch_device, dtype=torch.bfloat16
|
|
|
|
torch_device, dtype=torch.float16
|
|
|
|
)
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
output = model.generate(**inputs, do_sample=False, max_new_tokens=25)
|
|
|
|
output = model.generate(**inputs, do_sample=False, max_new_tokens=25)
|
|
|
|
@@ -778,11 +792,12 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
|
|
|
|
expected_outputs = Expectations(
|
|
|
|
expected_outputs = Expectations(
|
|
|
|
{
|
|
|
|
{
|
|
|
|
("xpu", 3): "user\n\nWrite a haiku for this image\nassistant\nMajestic snow-capped peaks,\nWooden path leads to calm lake,\nNature's peaceful grace.",
|
|
|
|
("xpu", 3): "user\n\nWrite a haiku for this image\nassistant\nMajestic snow-capped peaks,\nWooden path leads to calm lake,\nNature's peaceful grace.",
|
|
|
|
("cuda", 7): "user\n\nWrite a haiku for this image\nassistant\nMajestic snow-capped peaks,\nWooden path leads to calm lake,\nNature's peaceful grace.",
|
|
|
|
("cuda", 7): 'user\n\nWrite a haiku for this image\nassistant\nMajestic snow-capped peaks,\nWooden dock stretches to the sea,\nSilent water mirrors.',
|
|
|
|
("cuda", 8): "user\n\nWrite a haiku for this image\nassistant\nMajestic snow-capped peaks,\nA wooden path leads to the sea,\nPeaceful, still waters.",
|
|
|
|
("cuda", 8): 'user\n\nWrite a haiku for this image\nassistant\nMajestic snow-capped peaks,\nWooden dock stretches to the sea,\nSilent water mirrors.',
|
|
|
|
}
|
|
|
|
}
|
|
|
|
) # fmt: skip
|
|
|
|
) # fmt: skip
|
|
|
|
expected_output = expected_outputs.get_expectation()
|
|
|
|
expected_output = expected_outputs.get_expectation()
|
|
|
|
|
|
|
|
|
|
|
|
self.assertEqual(
|
|
|
|
self.assertEqual(
|
|
|
|
decoded_output,
|
|
|
|
decoded_output,
|
|
|
|
expected_output,
|
|
|
|
expected_output,
|
|
|
|
@@ -791,7 +806,7 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
|
|
|
|
|
|
|
|
|
|
|
|
# Check second output
|
|
|
|
# Check second output
|
|
|
|
decoded_output = processor.decode(output[1], skip_special_tokens=True)
|
|
|
|
decoded_output = processor.decode(output[1], skip_special_tokens=True)
|
|
|
|
expected_output = 'user\n\nDescribe this image\nassistant\nThe image shows a street scene with a traditional Chinese gate in the background, adorned with red and gold colors and Chinese characters' # fmt: skip
|
|
|
|
expected_output = "user\n\nDescribe this image\nassistant\nThe image shows a street scene with a traditional Chinese gate in the background, adorned with red and gold colors and Chinese characters"
|
|
|
|
self.assertEqual(
|
|
|
|
self.assertEqual(
|
|
|
|
decoded_output,
|
|
|
|
decoded_output,
|
|
|
|
expected_output,
|
|
|
|
expected_output,
|
|
|
|
@@ -801,7 +816,7 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
|
|
|
|
def test_llama_small_model_integration_batched_generate_multi_image(self):
|
|
|
|
def test_llama_small_model_integration_batched_generate_multi_image(self):
|
|
|
|
processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
|
|
|
|
processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
|
|
|
|
model = InternVLForConditionalGeneration.from_pretrained(
|
|
|
|
model = InternVLForConditionalGeneration.from_pretrained(
|
|
|
|
self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16
|
|
|
|
self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.float16
|
|
|
|
)
|
|
|
|
)
|
|
|
|
# Prepare inputs
|
|
|
|
# Prepare inputs
|
|
|
|
prompt = [
|
|
|
|
prompt = [
|
|
|
|
@@ -825,7 +840,7 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
|
|
|
|
)
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
inputs = processor(text=prompt, images=[[image1], [image2, image3]], padding=True, return_tensors="pt").to(
|
|
|
|
inputs = processor(text=prompt, images=[[image1], [image2, image3]], padding=True, return_tensors="pt").to(
|
|
|
|
torch_device, dtype=torch.bfloat16
|
|
|
|
torch_device, dtype=torch.float16
|
|
|
|
)
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
output = model.generate(**inputs, do_sample=False, max_new_tokens=25)
|
|
|
|
output = model.generate(**inputs, do_sample=False, max_new_tokens=25)
|
|
|
|
@@ -833,7 +848,8 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
|
|
|
|
# Check first output
|
|
|
|
# Check first output
|
|
|
|
decoded_output = processor.decode(output[0], skip_special_tokens=True)
|
|
|
|
decoded_output = processor.decode(output[0], skip_special_tokens=True)
|
|
|
|
# Batching seems to alter the output slightly, but it is also the case in the original implementation. This seems to be expected: https://github.com/huggingface/transformers/issues/23017#issuecomment-1649630232
|
|
|
|
# Batching seems to alter the output slightly, but it is also the case in the original implementation. This seems to be expected: https://github.com/huggingface/transformers/issues/23017#issuecomment-1649630232
|
|
|
|
expected_output = "user\n\nWrite a haiku for this image\nassistant\nMajestic snow-capped peaks,\nWooden path leads to calm lake,\nNature's peaceful grace." # fmt: skip
|
|
|
|
expected_output = "user\n\nWrite a haiku for this image\nassistant\nMajestic snow-capped peaks,\nWooden dock stretches to the sea,\nSilent water mirrors."
|
|
|
|
|
|
|
|
|
|
|
|
self.assertEqual(
|
|
|
|
self.assertEqual(
|
|
|
|
decoded_output,
|
|
|
|
decoded_output,
|
|
|
|
expected_output,
|
|
|
|
expected_output,
|
|
|
|
@@ -842,7 +858,7 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
|
|
|
|
|
|
|
|
|
|
|
|
# Check second output
|
|
|
|
# Check second output
|
|
|
|
decoded_output = processor.decode(output[1], skip_special_tokens=True)
|
|
|
|
decoded_output = processor.decode(output[1], skip_special_tokens=True)
|
|
|
|
expected_output = 'user\n\nWhat are the difference between these two images?\nassistant\nI apologize for the confusion in my previous response. After closely examining the images again, I can see that there are several differences' # fmt: skip
|
|
|
|
expected_output = "user\n\nWhat are the difference between these two images?\nassistant\nI apologize for the confusion in my previous response. After closely examining the images again, I can see that there are several differences"
|
|
|
|
self.assertEqual(
|
|
|
|
self.assertEqual(
|
|
|
|
decoded_output,
|
|
|
|
decoded_output,
|
|
|
|
expected_output,
|
|
|
|
expected_output,
|
|
|
|
@@ -893,7 +909,7 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
|
|
|
|
def test_llama_small_model_integration_interleaved_images_videos(self):
|
|
|
|
def test_llama_small_model_integration_interleaved_images_videos(self):
|
|
|
|
processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
|
|
|
|
processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
|
|
|
|
model = InternVLForConditionalGeneration.from_pretrained(
|
|
|
|
model = InternVLForConditionalGeneration.from_pretrained(
|
|
|
|
self.small_model_checkpoint, torch_dtype=torch.bfloat16, device_map=torch_device
|
|
|
|
self.small_model_checkpoint, torch_dtype=torch.float16, device_map=torch_device
|
|
|
|
)
|
|
|
|
)
|
|
|
|
messages = [
|
|
|
|
messages = [
|
|
|
|
[
|
|
|
|
[
|
|
|
|
@@ -945,7 +961,7 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
|
|
|
|
return_tensors="pt",
|
|
|
|
return_tensors="pt",
|
|
|
|
padding=True,
|
|
|
|
padding=True,
|
|
|
|
num_frames=8,
|
|
|
|
num_frames=8,
|
|
|
|
).to(torch_device, dtype=torch.bfloat16)
|
|
|
|
).to(torch_device, dtype=torch.float16)
|
|
|
|
|
|
|
|
|
|
|
|
output = model.generate(**inputs, do_sample=False, max_new_tokens=25)
|
|
|
|
output = model.generate(**inputs, do_sample=False, max_new_tokens=25)
|
|
|
|
|
|
|
|
|
|
|
|
@@ -954,8 +970,8 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
|
|
|
|
expected_outputs = Expectations(
|
|
|
|
expected_outputs = Expectations(
|
|
|
|
{
|
|
|
|
{
|
|
|
|
("xpu", 3): "user\n\n\nWhat are the difference between these two images?\nassistant\nI apologize for the confusion in my previous response. After re-examining the images, I can see that they are actually",
|
|
|
|
("xpu", 3): "user\n\n\nWhat are the difference between these two images?\nassistant\nI apologize for the confusion in my previous response. After re-examining the images, I can see that they are actually",
|
|
|
|
("cuda", 7): "user\n\n\nWhat are the difference between these two images?\nassistant\nI apologize for the confusion in my previous response. After re-examining the images, I can see that they are actually",
|
|
|
|
("cuda", 7): 'user\n\n\nWhat are the difference between these two images?\nassistant\nI apologize for the confusion in my previous response. Upon closer inspection, the differences between the two images are:\n\n1. **',
|
|
|
|
("cuda", 8): "user\n\n\nWhat are the difference between these two images?\nassistant\nI apologize for the confusion in my previous response. After closely examining the images again, I can see that there are several differences",
|
|
|
|
("cuda", 8): 'user\n\n\nWhat are the difference between these two images?\nassistant\nI apologize for the confusion in my previous response. After re-examining the images, I can see that there are no',
|
|
|
|
}
|
|
|
|
}
|
|
|
|
) # fmt: skip
|
|
|
|
) # fmt: skip
|
|
|
|
expected_output = expected_outputs.get_expectation()
|
|
|
|
expected_output = expected_outputs.get_expectation()
|
|
|
|
@@ -970,8 +986,8 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
|
|
|
|
expected_outputs = Expectations(
|
|
|
|
expected_outputs = Expectations(
|
|
|
|
{
|
|
|
|
{
|
|
|
|
("xpu", 3): "user\nFrame1: \nFrame2: \nFrame3: \nFrame4: \nFrame5: \nFrame6: \nFrame7: \nFrame8: \nWhat type of shot is the man performing?\nassistant\nThe man is performing a forehand shot. This is a common shot in tennis where the player swings the racket across their",
|
|
|
|
("xpu", 3): "user\nFrame1: \nFrame2: \nFrame3: \nFrame4: \nFrame5: \nFrame6: \nFrame7: \nFrame8: \nWhat type of shot is the man performing?\nassistant\nThe man is performing a forehand shot. This is a common shot in tennis where the player swings the racket across their",
|
|
|
|
("cuda", 7): "user\nFrame1: \nFrame2: \nFrame3: \nFrame4: \nFrame5: \nFrame6: \nFrame7: \nFrame8: \nWhat type of shot is the man performing?\nassistant\nThe man is performing a forehand shot. This is a common shot in tennis where the player swings the racket across their",
|
|
|
|
("cuda", 7): 'user\nFrame1: \nFrame2: \nFrame3: \nFrame4: \nFrame5: \nFrame6: \nFrame7: \nFrame8: \nWhat type of shot is the man performing?\nassistant\nThe man is performing a forehand shot. This is a common stroke in tennis where the player swings the racket across their',
|
|
|
|
("cuda", 8): "user\nFrame1: \nFrame2: \nFrame3: \nFrame4: \nFrame5: \nFrame6: \nFrame7: \nFrame8: \nWhat type of shot is the man performing?\nassistant\nThe man is performing a forehand shot. This is a common shot in tennis where the player swings the racket across their",
|
|
|
|
("cuda", 8): 'user\nFrame1: \nFrame2: \nFrame3: \nFrame4: \nFrame5: \nFrame6: \nFrame7: \nFrame8: \nWhat type of shot is the man performing?\nassistant\nThe man is performing a forehand shot. This is a common stroke in tennis where the player swings the racket across their',
|
|
|
|
}
|
|
|
|
}
|
|
|
|
) # fmt: skip
|
|
|
|
) # fmt: skip
|
|
|
|
expected_output = expected_outputs.get_expectation()
|
|
|
|
expected_output = expected_outputs.get_expectation()
|
|
|
|
@@ -986,8 +1002,8 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
|
|
|
|
expected_outputs = Expectations(
|
|
|
|
expected_outputs = Expectations(
|
|
|
|
{
|
|
|
|
{
|
|
|
|
("xpu", 3): "user\n\nWrite a haiku for this image\nassistant\nMajestic snow-capped peaks,\nWooden dock stretches to the sea,\nSilent water mirrors.",
|
|
|
|
("xpu", 3): "user\n\nWrite a haiku for this image\nassistant\nMajestic snow-capped peaks,\nWooden dock stretches to the sea,\nSilent water mirrors.",
|
|
|
|
("cuda", 7): "user\n\nWrite a haiku for this image\nassistant\nMajestic snow-capped peaks,\nWooden dock stretches to the sea,\nSilent water mirrors.",
|
|
|
|
("cuda", 7): 'user\n\nWrite a haiku for this image\nassistant\nMajestic snow-capped peaks,\nWooden dock stretches to the sea,\nSilent water mirrors.',
|
|
|
|
("cuda", 8): "user\n\nWrite a haiku for this image\nassistant\nMajestic snow-capped peaks,\nWooden dock stretches to the sea,\nSilent water mirrors.",
|
|
|
|
("cuda", 8): 'user\n\nWrite a haiku for this image\nassistant\nMajestic snow-capped peaks,\nWooden dock stretches to the sea,\nSilent water mirrors.',
|
|
|
|
}
|
|
|
|
}
|
|
|
|
) # fmt: skip
|
|
|
|
) # fmt: skip
|
|
|
|
expected_output = expected_outputs.get_expectation()
|
|
|
|
expected_output = expected_outputs.get_expectation()
|
|
|
|
|