qwen2.5vl: fix bugs when using flash2+bf16 or num_return_sequences>1 (#36083)
* qwen2.5vl: fix bugs when using flash2+bf16 or num_return_sequences>1 * fix * fix * fix * fix * add tests * fix test bugs * fix * fix failed tests * fix
This commit is contained in:
@@ -171,7 +171,9 @@ class Qwen2_5_VLVisionText2TextModelTester:
|
||||
input_ids[:, -1] = self.pad_token_id
|
||||
input_ids[input_ids == self.video_token_id] = self.pad_token_id
|
||||
input_ids[input_ids == self.image_token_id] = self.pad_token_id
|
||||
input_ids[input_ids == self.vision_start_token_id] = self.pad_token_id
|
||||
input_ids[:, self.num_image_tokens] = self.image_token_id
|
||||
input_ids[:, self.num_image_tokens - 1] = self.vision_start_token_id
|
||||
labels = torch.zeros(
|
||||
(self.batch_size, self.seq_length),
|
||||
dtype=torch.long,
|
||||
@@ -426,6 +428,26 @@ class Qwen2_5_VLIntegrationTest(unittest.TestCase):
|
||||
EXPECTED_DECODED_TEXT,
|
||||
)
|
||||
|
||||
@slow
def test_small_model_integration_test_expand(self):
    """Check generation with ``num_return_sequences=3`` on a single image prompt.

    One chat-templated prompt and one image are passed to ``generate``; the
    three returned sequences are decoded and each must equal the same
    expected text.
    """
    checkpoint = "Qwen/Qwen2.5-VL-7B-Instruct"
    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(checkpoint, torch_dtype="auto", device_map="auto")

    prompt = self.processor.apply_chat_template(self.messages, tokenize=False, add_generation_prompt=True)
    model_inputs = self.processor(text=[prompt], images=[self.image], return_tensors="pt")
    model_inputs = model_inputs.to(torch_device)

    # A single prompt goes in; three sequences are requested back.
    generated_ids = model.generate(**model_inputs, max_new_tokens=30, num_return_sequences=3)
    decoded_texts = self.processor.batch_decode(generated_ids, skip_special_tokens=True)

    expected_single = 'system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and intelligent nature, making them popular choices'  # fmt: skip
    # All three returned sequences are expected to decode to the same text.
    self.assertEqual(decoded_texts, [expected_single] * 3)
|
||||
|
||||
@slow
|
||||
def test_small_model_integration_test_batch_wo_image(self):
|
||||
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
|
||||
|
||||
@@ -167,7 +167,9 @@ class Qwen2VLVisionText2TextModelTester:
|
||||
input_ids[:, -1] = self.pad_token_id
|
||||
input_ids[input_ids == self.video_token_id] = self.pad_token_id
|
||||
input_ids[input_ids == self.image_token_id] = self.pad_token_id
|
||||
input_ids[input_ids == self.vision_start_token_id] = self.pad_token_id
|
||||
input_ids[:, self.num_image_tokens] = self.image_token_id
|
||||
input_ids[:, self.num_image_tokens - 1] = self.vision_start_token_id
|
||||
labels = torch.zeros(
|
||||
(self.batch_size, self.seq_length),
|
||||
dtype=torch.long,
|
||||
@@ -435,6 +437,26 @@ class Qwen2VLIntegrationTest(unittest.TestCase):
|
||||
EXPECTED_DECODED_TEXT,
|
||||
)
|
||||
|
||||
@slow
def test_small_model_integration_test_expand(self):
    """Check generation with ``num_return_sequences=3`` on a single image prompt.

    One chat-templated prompt and one image are passed to ``generate``; the
    three returned sequences are decoded and each must equal the same
    expected text.
    """
    checkpoint = "Qwen/Qwen2-VL-7B-Instruct"
    model = Qwen2VLForConditionalGeneration.from_pretrained(checkpoint, torch_dtype="auto", device_map="auto")

    prompt = self.processor.apply_chat_template(self.messages, tokenize=False, add_generation_prompt=True)
    model_inputs = self.processor(text=[prompt], images=[self.image], return_tensors="pt")
    model_inputs = model_inputs.to(torch_device)

    # A single prompt goes in; three sequences are requested back.
    generated_ids = model.generate(**model_inputs, max_new_tokens=30, num_return_sequences=3)
    decoded_texts = self.processor.batch_decode(generated_ids, skip_special_tokens=True)

    expected_single = 'system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and intelligent nature, making them popular choices'  # fmt: skip
    # All three returned sequences are expected to decode to the same text.
    self.assertEqual(decoded_texts, [expected_single] * 3)
|
||||
|
||||
@slow
|
||||
def test_small_model_integration_test_batch_wo_image(self):
|
||||
model = Qwen2VLForConditionalGeneration.from_pretrained(
|
||||
|
||||
Reference in New Issue
Block a user