Update some tests for torch 2.7.1 (#38701)

* fix 1

* fix 2

* fix 3

* fix 4

* fp16

* break

* fix

* fix

* fix

* fix

* fix

* fix

* fix

* fix

* fix

* fix

* fix

* fix

---------

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
This commit is contained in:
Yih-Dar
2025-06-10 11:46:52 +02:00
committed by GitHub
parent afdb821318
commit 04cdf83244
5 changed files with 88 additions and 70 deletions

View File

@@ -292,7 +292,7 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
def test_qwen2_small_model_integration_generate(self):
processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
model = InternVLForConditionalGeneration.from_pretrained(
self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16
self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.float16
)
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
@@ -300,19 +300,20 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
prompt = (
"<|im_start|>user\n<IMG_CONTEXT>\nPlease describe the image explicitly.<|im_end|>\n<|im_start|>assistant\n"
)
inputs = processor(images=image, text=prompt, return_tensors="pt").to(torch_device, dtype=torch.bfloat16)
inputs = processor(images=image, text=prompt, return_tensors="pt").to(torch_device, dtype=torch.float16)
with torch.no_grad():
generate_ids = model.generate(**inputs, max_new_tokens=20, do_sample=False)
decoded_output = processor.decode(
generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True
)
expected_output = "The image shows two cats lying on a pink blanket. The cat on the left is a tabby"
expected_output = "The image shows two cats lying on a pink surface, which appears to be a bed or couch."
self.assertEqual(decoded_output, expected_output)
def test_qwen2_small_model_integration_forward(self):
processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
model = InternVLForConditionalGeneration.from_pretrained(
self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16
self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.float16
)
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
@@ -320,7 +321,7 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
prompt = (
"<|im_start|>user\n<IMG_CONTEXT>\nPlease describe the image explicitly.<|im_end|>\n<|im_start|>assistant\n"
)
inputs = processor(images=image, text=prompt, return_tensors="pt").to(torch_device, dtype=torch.bfloat16)
inputs = processor(images=image, text=prompt, return_tensors="pt").to(torch_device, dtype=torch.float16)
# Forward
with torch.inference_mode():
@@ -329,9 +330,9 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
actual_logits = output.logits[0, -1, :5].cpu()
expected_logits_all = Expectations(
{
("xpu", 3): torch.tensor([11.7500, 14.7500, 14.1250, 10.5625, 6.7812], dtype=torch.bfloat16),
("cuda", 7): torch.tensor([11.9375, 14.7500, 14.4375, 10.8125, 7.0938], dtype=torch.bfloat16),
("cuda", 8): torch.tensor([11.8750, 14.8125, 14.3125, 10.8125, 6.9375], dtype=torch.bfloat16),
("xpu", 3): torch.tensor([11.7500, 14.7500, 14.1250, 10.5625, 6.7812], dtype=torch.float16),
("cuda", 7): torch.tensor([11.9531, 14.7031, 14.2734, 10.6562, 6.9219], dtype=torch.float16),
("cuda", 8): torch.tensor([11.9609, 14.7188, 14.2734, 10.6484, 6.9141], dtype=torch.float16),
}
) # fmt: skip
expected_logits = expected_logits_all.get_expectation()
@@ -347,10 +348,10 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
def test_qwen2_small_model_integration_generate_text_only(self):
processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
model = InternVLForConditionalGeneration.from_pretrained(
self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16
self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.float16
)
prompt = "<|im_start|>user\nWrite a haiku<|im_end|>\n<|im_start|>assistant\n"
inputs = processor(text=prompt, return_tensors="pt").to(torch_device, dtype=torch.bfloat16)
inputs = processor(text=prompt, return_tensors="pt").to(torch_device, dtype=torch.float16)
with torch.no_grad():
generate_ids = model.generate(**inputs, max_new_tokens=200, do_sample=False)
decoded_output = processor.decode(
@@ -360,8 +361,8 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
expected_outputs = Expectations(
{
("xpu", 3): "Whispers of dawn,\nSilent whispers of the night,\nNew day's light.",
("cuda", 7): "Whispers of dawn,\nSilent whispers of the night,\nNew day's light.",
("cuda", 8): "Whispers of dawn,\nSilent whispers of the night,\nNew day's light begins.",
("cuda", 7): 'Whispers of dawn,\nSilent whispers of night,\nPeace in the stillness.',
("cuda", 8): 'Whispers of dawn,\nSilent whispers of night,\nPeace in the stillness.',
}
) # fmt: skip
expected_output = expected_outputs.get_expectation()
@@ -371,7 +372,7 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
def test_qwen2_small_model_integration_generate_chat_template(self):
processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
model = InternVLForConditionalGeneration.from_pretrained(
self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16
self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.float16
)
messages = [
{
@@ -385,20 +386,21 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
inputs = processor.apply_chat_template(
messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt"
).to(torch_device, dtype=torch.bfloat16)
).to(torch_device, dtype=torch.float16)
with torch.no_grad():
generate_ids = model.generate(**inputs, max_new_tokens=20, do_sample=False)
decoded_output = processor.decode(
generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True
)
expected_output = "The image shows two cats lying on a pink blanket. The cat on the left is a tabby"
expected_output = "The image shows two cats lying on a pink surface, which appears to be a bed or couch."
self.assertEqual(decoded_output, expected_output)
@require_deterministic_for_xpu
def test_qwen2_small_model_integration_batched_generate(self):
processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
model = InternVLForConditionalGeneration.from_pretrained(
self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16
self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.float16
)
# Prepare inputs
prompt = [
@@ -409,7 +411,7 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
image2 = Image.open(requests.get("https://www.ilankelman.org/stopsigns/australia.jpg", stream=True).raw)
inputs = processor(text=prompt, images=[[image1], [image2]], padding=True, return_tensors="pt").to(
torch_device, dtype=torch.bfloat16
torch_device, dtype=torch.float16
)
output = model.generate(**inputs, do_sample=False, max_new_tokens=25)
@@ -417,6 +419,7 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
# Check first output
decoded_output = processor.decode(output[0], skip_special_tokens=True)
expected_output = "user\n\nWrite a haiku for this image\nassistant\nSilky lake, \nWooden pier, \nNature's peace." # fmt: skip
self.assertEqual(
decoded_output,
expected_output,
@@ -428,7 +431,7 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
expected_outputs = Expectations(
{
("xpu", 3): 'user\n\nDescribe this image\nassistant\nThe image shows a street scene with a traditional Chinese archway, known as a "Chinese Gate" or "Chinese Gate"',
("cuda", 7): 'user\n\nDescribe this image\nassistant\nThe image shows a street scene with a traditional Chinese archway, known as a "Chinese Gate" or "Chinese Arch,"',
("cuda", 7): 'user\n\nDescribe this image\nassistant\nThe image shows a street scene with a traditional Chinese archway, known as a "Chinese Gate" or "Chinese Gate of',
}
) # fmt: skip
expected_output = expected_outputs.get_expectation()
@@ -442,7 +445,7 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
def test_qwen2_small_model_integration_batched_generate_multi_image(self):
processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
model = InternVLForConditionalGeneration.from_pretrained(
self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16
self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.float16
)
# Prepare inputs
prompt = [
@@ -466,7 +469,7 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
)
inputs = processor(text=prompt, images=[[image1], [image2, image3]], padding=True, return_tensors="pt").to(
torch_device, dtype=torch.bfloat16
torch_device, dtype=torch.float16
)
output = model.generate(**inputs, do_sample=False, max_new_tokens=25)
@@ -548,7 +551,7 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
def test_qwen2_small_model_integration_interleaved_images_videos(self):
processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
model = InternVLForConditionalGeneration.from_pretrained(
self.small_model_checkpoint, torch_dtype=torch.bfloat16, device_map=torch_device
self.small_model_checkpoint, torch_dtype=torch.float16, device_map=torch_device
)
messages = [
[
@@ -600,7 +603,7 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
return_tensors="pt",
padding=True,
num_frames=8,
).to(torch_device, dtype=torch.bfloat16)
).to(torch_device, dtype=torch.float16)
output = model.generate(**inputs, do_sample=False, max_new_tokens=25)
@@ -609,10 +612,11 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
expected_outputs = Expectations(
{
("xpu", 3): "user\n\n\nWhat are the differences between these two images?\nassistant\nThe images depict two distinct scenes:\n\n1. **Left Image:**\n - The Statue of Liberty is prominently featured on an",
("cuda", 7): "user\n\n\nWhat are the differences between these two images?\nassistant\nThe images depict two distinct scenes:\n\n1. **Left Image:**\n - The Statue of Liberty is prominently featured on an",
("cuda", 7): 'user\n\n\nWhat are the differences between these two images?\nassistant\nThe images depict two distinct scenes:\n\n1. **Left Image:**\n - The Statue of Liberty is prominently featured on an',
}
) # fmt: skip
expected_output = expected_outputs.get_expectation()
self.assertEqual(
decoded_output,
expected_output,
@@ -623,7 +627,7 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
expected_outputs = Expectations(
{
("xpu", 3): "user\nFrame1: \nFrame2: \nFrame3: \nFrame4: \nFrame5: \nFrame6: \nFrame7: \nFrame8: \nWhat type of shot is the man performing?\nassistant\nThe man is performing a forehand shot.",
("cuda", 7): "user\nFrame1: \nFrame2: \nFrame3: \nFrame4: \nFrame5: \nFrame6: \nFrame7: \nFrame8: \nWhat type of shot is the man performing?\nassistant\nThe man is performing a forehand shot.",
("cuda", 7): 'user\nFrame1: \nFrame2: \nFrame3: \nFrame4: \nFrame5: \nFrame6: \nFrame7: \nFrame8: \nWhat type of shot is the man performing?\nassistant\nA forehand shot',
}
) # fmt: skip
expected_output = expected_outputs.get_expectation()
@@ -635,7 +639,9 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
# Check third output
decoded_output = processor.decode(output[2], skip_special_tokens=True)
expected_output = "user\n\nWrite a haiku for this image\nassistant\nSilky lake, \nWooden pier, \nNature's peace." # fmt: skip
expected_output = (
"user\n\nWrite a haiku for this image\nassistant\nSilky lake, \nWooden pier, \nNature's peace."
)
self.assertEqual(
decoded_output,
expected_output,
@@ -657,7 +663,7 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
def test_llama_small_model_integration_generate(self):
processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
model = InternVLForConditionalGeneration.from_pretrained(
self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16
self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.float16
)
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
@@ -665,7 +671,7 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
prompt = (
"<|im_start|>user\n<IMG_CONTEXT>\nPlease describe the image explicitly.<|im_end|>\n<|im_start|>assistant\n"
)
inputs = processor(images=image, text=prompt, return_tensors="pt").to(torch_device, dtype=torch.bfloat16)
inputs = processor(images=image, text=prompt, return_tensors="pt").to(torch_device, dtype=torch.float16)
with torch.no_grad():
generate_ids = model.generate(**inputs, max_new_tokens=20, do_sample=False)
decoded_output = processor.decode(
@@ -677,7 +683,7 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
def test_llama_small_model_integration_forward(self):
processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
model = InternVLForConditionalGeneration.from_pretrained(
self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16
self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.float16
)
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
@@ -685,7 +691,7 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
prompt = (
"<|im_start|>user\n<IMG_CONTEXT>\nPlease describe the image explicitly.<|im_end|>\n<|im_start|>assistant\n"
)
inputs = processor(images=image, text=prompt, return_tensors="pt").to(torch_device, dtype=torch.bfloat16)
inputs = processor(images=image, text=prompt, return_tensors="pt").to(torch_device, dtype=torch.float16)
# Forward
with torch.inference_mode():
@@ -695,12 +701,12 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
expected_logits_all = Expectations(
{
("xpu", 3): torch.tensor([-9.8750, -0.5703, 1.4297, -10.3125, -10.3125], dtype=torch.bfloat16),
("cuda", 7): torch.tensor([-9.8750, -0.5703, 1.4297, -10.3125, -10.3125], dtype=torch.bfloat16),
("cuda", 8): torch.tensor([-9.8750, -0.5117, 1.4297, -10.3750, -10.3750], dtype=torch.bfloat16),
("xpu", 3): torch.tensor([-9.8750, -0.5703, 1.4297, -10.3125, -10.3125], dtype=torch.float16),
("cuda", 7): torch.tensor([-9.8750, -0.4861, 1.4648, -10.3359, -10.3359], dtype=torch.float16),
("cuda", 8): torch.tensor([-9.8906, -0.4995, 1.4473, -10.3359, -10.3438], dtype=torch.float16),
}
) # fmt: skip
expected_logits = torch.tensor(expected_logits_all.get_expectation(), dtype=torch.bfloat16)
expected_logits = torch.tensor(expected_logits_all.get_expectation(), dtype=torch.float16)
# The original implementation and the transformers implementation do not match exactly, hence the higher tolerance.
# The difference is likely due to the different implementations of the attention mechanism (different order of operations)
@@ -716,22 +722,30 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
def test_llama_small_model_integration_generate_text_only(self):
processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
model = InternVLForConditionalGeneration.from_pretrained(
self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16
self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.float16
)
prompt = "<|im_start|>user\nWrite a haiku<|im_end|>\n<|im_start|>assistant\n"
inputs = processor(text=prompt, return_tensors="pt").to(torch_device, dtype=torch.bfloat16)
inputs = processor(text=prompt, return_tensors="pt").to(torch_device, dtype=torch.float16)
with torch.no_grad():
generate_ids = model.generate(**inputs, max_new_tokens=200, do_sample=False)
decoded_output = processor.decode(
generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True
)
expected_output = "Autumn leaves fall,\nNature's breath, a season's sigh,\nSilent woods awake."
expected_outputs = Expectations(
{
("cuda", 7): "Autumn leaves fall,\nNature's breath, a gentle sigh,\nSilent whispers.",
("cuda", 8): "Autumn leaves fall,\nNature's breath, a silent sigh,\nWinter's chill approaches.",
}
)
expected_output = expected_outputs.get_expectation()
self.assertEqual(decoded_output, expected_output)
def test_llama_small_model_integration_generate_chat_template(self):
processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
model = InternVLForConditionalGeneration.from_pretrained(
self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16
self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.float16
)
messages = [
{
@@ -745,7 +759,7 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
inputs = processor.apply_chat_template(
messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt"
).to(torch_device, dtype=torch.bfloat16)
).to(torch_device, dtype=torch.float16)
with torch.no_grad():
generate_ids = model.generate(**inputs, max_new_tokens=20, do_sample=False)
decoded_output = processor.decode(
@@ -757,7 +771,7 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
def test_llama_small_model_integration_batched_generate(self):
processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
model = InternVLForConditionalGeneration.from_pretrained(
self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16
self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.float16
)
# Prepare inputs
prompt = [
@@ -768,7 +782,7 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
image2 = Image.open(requests.get("https://www.ilankelman.org/stopsigns/australia.jpg", stream=True).raw)
inputs = processor(text=prompt, images=[[image1], [image2]], padding=True, return_tensors="pt").to(
torch_device, dtype=torch.bfloat16
torch_device, dtype=torch.float16
)
output = model.generate(**inputs, do_sample=False, max_new_tokens=25)
@@ -778,11 +792,12 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
expected_outputs = Expectations(
{
("xpu", 3): "user\n\nWrite a haiku for this image\nassistant\nMajestic snow-capped peaks,\nWooden path leads to calm lake,\nNature's peaceful grace.",
("cuda", 7): "user\n\nWrite a haiku for this image\nassistant\nMajestic snow-capped peaks,\nWooden path leads to calm lake,\nNature's peaceful grace.",
("cuda", 8): "user\n\nWrite a haiku for this image\nassistant\nMajestic snow-capped peaks,\nA wooden path leads to the sea,\nPeaceful, still waters.",
("cuda", 7): 'user\n\nWrite a haiku for this image\nassistant\nMajestic snow-capped peaks,\nWooden dock stretches to the sea,\nSilent water mirrors.',
("cuda", 8): 'user\n\nWrite a haiku for this image\nassistant\nMajestic snow-capped peaks,\nWooden dock stretches to the sea,\nSilent water mirrors.',
}
) # fmt: skip
expected_output = expected_outputs.get_expectation()
self.assertEqual(
decoded_output,
expected_output,
@@ -791,7 +806,7 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
# Check second output
decoded_output = processor.decode(output[1], skip_special_tokens=True)
expected_output = 'user\n\nDescribe this image\nassistant\nThe image shows a street scene with a traditional Chinese gate in the background, adorned with red and gold colors and Chinese characters' # fmt: skip
expected_output = "user\n\nDescribe this image\nassistant\nThe image shows a street scene with a traditional Chinese gate in the background, adorned with red and gold colors and Chinese characters"
self.assertEqual(
decoded_output,
expected_output,
@@ -801,7 +816,7 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
def test_llama_small_model_integration_batched_generate_multi_image(self):
processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
model = InternVLForConditionalGeneration.from_pretrained(
self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16
self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.float16
)
# Prepare inputs
prompt = [
@@ -825,7 +840,7 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
)
inputs = processor(text=prompt, images=[[image1], [image2, image3]], padding=True, return_tensors="pt").to(
torch_device, dtype=torch.bfloat16
torch_device, dtype=torch.float16
)
output = model.generate(**inputs, do_sample=False, max_new_tokens=25)
@@ -833,7 +848,8 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
# Check first output
decoded_output = processor.decode(output[0], skip_special_tokens=True)
# Batching seems to alter the output slightly, but it is also the case in the original implementation. This seems to be expected: https://github.com/huggingface/transformers/issues/23017#issuecomment-1649630232
expected_output = "user\n\nWrite a haiku for this image\nassistant\nMajestic snow-capped peaks,\nWooden path leads to calm lake,\nNature's peaceful grace." # fmt: skip
expected_output = "user\n\nWrite a haiku for this image\nassistant\nMajestic snow-capped peaks,\nWooden dock stretches to the sea,\nSilent water mirrors."
self.assertEqual(
decoded_output,
expected_output,
@@ -842,7 +858,7 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
# Check second output
decoded_output = processor.decode(output[1], skip_special_tokens=True)
expected_output = 'user\n\nWhat are the difference between these two images?\nassistant\nI apologize for the confusion in my previous response. After closely examining the images again, I can see that there are several differences' # fmt: skip
expected_output = "user\n\nWhat are the difference between these two images?\nassistant\nI apologize for the confusion in my previous response. After closely examining the images again, I can see that there are several differences"
self.assertEqual(
decoded_output,
expected_output,
@@ -893,7 +909,7 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
def test_llama_small_model_integration_interleaved_images_videos(self):
processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
model = InternVLForConditionalGeneration.from_pretrained(
self.small_model_checkpoint, torch_dtype=torch.bfloat16, device_map=torch_device
self.small_model_checkpoint, torch_dtype=torch.float16, device_map=torch_device
)
messages = [
[
@@ -945,7 +961,7 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
return_tensors="pt",
padding=True,
num_frames=8,
).to(torch_device, dtype=torch.bfloat16)
).to(torch_device, dtype=torch.float16)
output = model.generate(**inputs, do_sample=False, max_new_tokens=25)
@@ -954,8 +970,8 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
expected_outputs = Expectations(
{
("xpu", 3): "user\n\n\nWhat are the difference between these two images?\nassistant\nI apologize for the confusion in my previous response. After re-examining the images, I can see that they are actually",
("cuda", 7): "user\n\n\nWhat are the difference between these two images?\nassistant\nI apologize for the confusion in my previous response. After re-examining the images, I can see that they are actually",
("cuda", 8): "user\n\n\nWhat are the difference between these two images?\nassistant\nI apologize for the confusion in my previous response. After closely examining the images again, I can see that there are several differences",
("cuda", 7): 'user\n\n\nWhat are the difference between these two images?\nassistant\nI apologize for the confusion in my previous response. Upon closer inspection, the differences between the two images are:\n\n1. **',
("cuda", 8): 'user\n\n\nWhat are the difference between these two images?\nassistant\nI apologize for the confusion in my previous response. After re-examining the images, I can see that there are no',
}
) # fmt: skip
expected_output = expected_outputs.get_expectation()
@@ -970,8 +986,8 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
expected_outputs = Expectations(
{
("xpu", 3): "user\nFrame1: \nFrame2: \nFrame3: \nFrame4: \nFrame5: \nFrame6: \nFrame7: \nFrame8: \nWhat type of shot is the man performing?\nassistant\nThe man is performing a forehand shot. This is a common shot in tennis where the player swings the racket across their",
("cuda", 7): "user\nFrame1: \nFrame2: \nFrame3: \nFrame4: \nFrame5: \nFrame6: \nFrame7: \nFrame8: \nWhat type of shot is the man performing?\nassistant\nThe man is performing a forehand shot. This is a common shot in tennis where the player swings the racket across their",
("cuda", 8): "user\nFrame1: \nFrame2: \nFrame3: \nFrame4: \nFrame5: \nFrame6: \nFrame7: \nFrame8: \nWhat type of shot is the man performing?\nassistant\nThe man is performing a forehand shot. This is a common shot in tennis where the player swings the racket across their",
("cuda", 7): 'user\nFrame1: \nFrame2: \nFrame3: \nFrame4: \nFrame5: \nFrame6: \nFrame7: \nFrame8: \nWhat type of shot is the man performing?\nassistant\nThe man is performing a forehand shot. This is a common stroke in tennis where the player swings the racket across their',
("cuda", 8): 'user\nFrame1: \nFrame2: \nFrame3: \nFrame4: \nFrame5: \nFrame6: \nFrame7: \nFrame8: \nWhat type of shot is the man performing?\nassistant\nThe man is performing a forehand shot. This is a common stroke in tennis where the player swings the racket across their',
}
) # fmt: skip
expected_output = expected_outputs.get_expectation()
@@ -986,8 +1002,8 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
expected_outputs = Expectations(
{
("xpu", 3): "user\n\nWrite a haiku for this image\nassistant\nMajestic snow-capped peaks,\nWooden dock stretches to the sea,\nSilent water mirrors.",
("cuda", 7): "user\n\nWrite a haiku for this image\nassistant\nMajestic snow-capped peaks,\nWooden dock stretches to the sea,\nSilent water mirrors.",
("cuda", 8): "user\n\nWrite a haiku for this image\nassistant\nMajestic snow-capped peaks,\nWooden dock stretches to the sea,\nSilent water mirrors.",
("cuda", 7): 'user\n\nWrite a haiku for this image\nassistant\nMajestic snow-capped peaks,\nWooden dock stretches to the sea,\nSilent water mirrors.',
("cuda", 8): 'user\n\nWrite a haiku for this image\nassistant\nMajestic snow-capped peaks,\nWooden dock stretches to the sea,\nSilent water mirrors.',
}
) # fmt: skip
expected_output = expected_outputs.get_expectation()