qwen2.5vl: fix bugs when using flash2+bf16 or num_return_sequences>1 (#36083)

* qwen2.5vl: fix bugs when using flash2+bf16 or num_return_sequences>1

* fix

* fix

* fix

* fix

* add tests

* fix test bugs

* fix

* fix failed tests

* fix
This commit is contained in:
gewenbin0992
2025-02-13 18:35:28 +08:00
committed by GitHub
parent d419862889
commit 6a1ab634b6
6 changed files with 299 additions and 7 deletions

View File

@@ -171,7 +171,9 @@ class Qwen2_5_VLVisionText2TextModelTester:
input_ids[:, -1] = self.pad_token_id
input_ids[input_ids == self.video_token_id] = self.pad_token_id
input_ids[input_ids == self.image_token_id] = self.pad_token_id
input_ids[input_ids == self.vision_start_token_id] = self.pad_token_id
input_ids[:, self.num_image_tokens] = self.image_token_id
input_ids[:, self.num_image_tokens - 1] = self.vision_start_token_id
labels = torch.zeros(
(self.batch_size, self.seq_length),
dtype=torch.long,
@@ -426,6 +428,26 @@ class Qwen2_5_VLIntegrationTest(unittest.TestCase):
EXPECTED_DECODED_TEXT,
)
@slow
def test_small_model_integration_test_expand(self):
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
"Qwen/Qwen2.5-VL-7B-Instruct", torch_dtype="auto", device_map="auto"
)
text = self.processor.apply_chat_template(self.messages, tokenize=False, add_generation_prompt=True)
inputs = self.processor(text=[text], images=[self.image], return_tensors="pt").to(torch_device)
output = model.generate(**inputs, max_new_tokens=30, num_return_sequences=3)
EXPECTED_DECODED_TEXT = [
'system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and intelligent nature, making them popular choices',
'system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and intelligent nature, making them popular choices',
'system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and intelligent nature, making them popular choices',
] # fmt: skip
self.assertEqual(
self.processor.batch_decode(output, skip_special_tokens=True),
EXPECTED_DECODED_TEXT,
)
@slow
def test_small_model_integration_test_batch_wo_image(self):
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(

View File

@@ -167,7 +167,9 @@ class Qwen2VLVisionText2TextModelTester:
input_ids[:, -1] = self.pad_token_id
input_ids[input_ids == self.video_token_id] = self.pad_token_id
input_ids[input_ids == self.image_token_id] = self.pad_token_id
input_ids[input_ids == self.vision_start_token_id] = self.pad_token_id
input_ids[:, self.num_image_tokens] = self.image_token_id
input_ids[:, self.num_image_tokens - 1] = self.vision_start_token_id
labels = torch.zeros(
(self.batch_size, self.seq_length),
dtype=torch.long,
@@ -435,6 +437,26 @@ class Qwen2VLIntegrationTest(unittest.TestCase):
EXPECTED_DECODED_TEXT,
)
@slow
def test_small_model_integration_test_expand(self):
model = Qwen2VLForConditionalGeneration.from_pretrained(
"Qwen/Qwen2-VL-7B-Instruct", torch_dtype="auto", device_map="auto"
)
text = self.processor.apply_chat_template(self.messages, tokenize=False, add_generation_prompt=True)
inputs = self.processor(text=[text], images=[self.image], return_tensors="pt").to(torch_device)
output = model.generate(**inputs, max_new_tokens=30, num_return_sequences=3)
EXPECTED_DECODED_TEXT = [
'system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and intelligent nature, making them popular choices',
'system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and intelligent nature, making them popular choices',
'system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and intelligent nature, making them popular choices',
] # fmt: skip
self.assertEqual(
self.processor.batch_decode(output, skip_special_tokens=True),
EXPECTED_DECODED_TEXT,
)
@slow
def test_small_model_integration_test_batch_wo_image(self):
model = Qwen2VLForConditionalGeneration.from_pretrained(