Add Qwen2.5-Omni (#36752)

* Add qwen2.5-omni

* Remove einops dependency

* Add torchdiffeq dependency

* Sort init

* Add torchdiffeq to extras['diffeq']

* Fix repo consistency

* use cached_file

* del odeint

* renew pytest

* format

* Remove torchdiffeq

* format

* fixed batch infer bug

* Change positional_embedding to parameter

* Change default speaker

* Config revision

* Use modular & code clean

* code clean

* decouple padding with model & code cleaning

* sort init

* fix

* fix

* Second code review

* fix

* fix

* rename vars to full name + some comments

* update pytest

* Code clean & fix

* fix

* style

* more clean up

* fixup

* smaller vision model in tests

* fix processor test

* deflake a bit the tests (still flaky though)

* de-flake tests finally + add generation mixin

* final nits i hope

* make sure processor tests are complete

* replace with Qwen2_5OmniForConditionalGeneration

* fix tests after updating ckpt

* fix typos when cleaning, also we can't change ckpt

* fixup

* images and videos kwargs for processor

* thinker and talker loadable from hub ckpt

* address comments and update tests after rebase

* fixup

* skip for now

* fixup

* fixup

* remove torch dependency in processors

---------

Co-authored-by: lvyuanjun.lyj <lvyuanjun.lyj@alibaba-inc.con>
Co-authored-by: feizi.wx <feizi.wx@alibaba-inc.com>
Co-authored-by: raushan <raushan@huggingface.co>
This commit is contained in:
BakerBunker
2025-04-14 18:36:41 +08:00
committed by GitHub
parent ac1df5fccd
commit 4b8c6d4cf8
35 changed files with 12063 additions and 49 deletions

View File

@@ -129,6 +129,7 @@ VLM_CLASS_NAMES = [
"gemma3",
"mistral3",
"chameleon",
"qwen2_5_omni",
]

View File

View File

@@ -0,0 +1,606 @@
# coding=utf-8
# Copyright 2025 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
#
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Testing suite for the PyTorch Qwen2.5-Omni model."""
import tempfile
import unittest
from io import BytesIO
from urllib.request import urlopen
import librosa
import requests
from parameterized import parameterized
from transformers import (
AutoProcessor,
Qwen2_5OmniForConditionalGeneration,
Qwen2_5OmniThinkerConfig,
Qwen2_5OmniThinkerForConditionalGeneration,
is_torch_available,
is_vision_available,
)
from transformers.testing_utils import (
cleanup,
require_flash_attn,
require_torch,
require_torch_gpu,
require_torch_sdpa,
slow,
torch_device,
)
from ...generation.test_utils import GenerationTesterMixin
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import (
ModelTesterMixin,
floats_tensor,
ids_tensor,
)
if is_torch_available():
import torch
if is_vision_available():
from PIL import Image
class Qwen2_5OmniThinkerForConditionalGenerationTester:
def __init__(
self,
parent,
batch_size=3,
feat_seq_length=30,
num_channels=3,
image_size=14,
seq_length=39,
vision_config={
"depth": 2,
"embed_dim": 32,
"hidden_act": "quick_gelu",
"hidden_size": 32,
"out_hidden_size": 32,
"intermediate_size": 24,
"mlp_ratio": 4,
"num_heads": 4,
"patch_size": 14,
"spatial_merge_size": 1,
"temporal_patch_size": 2,
"fullatt_block_indexes": [0],
"initializer_range": 0.02,
},
audio_config={
"model_type": "qwen_omni_thinker_audio_encoder",
"d_model": 32,
"encoder_attention_heads": 4,
"encoder_ffn_dim": 32,
"encoder_layers": 2,
"num_mel_bins": 20,
"max_source_positions": 1500,
"initializer_range": 0.02,
"n_window": 100,
"output_dim": 32,
},
text_config={
"rope_scaling": {"mrope_section": [1, 1, 2], "rope_type": "default", "type": "default"},
"vocab_size": 99,
"hidden_size": 32,
"intermediate_size": 37,
"num_hidden_layers": 4,
"num_attention_heads": 4,
"num_key_value_heads": 2,
"hidden_act": "silu",
"max_position_embeddings": 1024,
"rms_norm_eps": 1e-06,
"use_cache": True,
"tie_word_embeddings": False,
"rope_theta": 1000000.0,
"use_sliding_window": False,
"sliding_window": 50,
"max_window_layers": 3,
"attention_dropout": 0.0,
"pad_token_id": 0,
"initializer_range": 0.02,
},
audio_token_index=1,
image_token_index=2,
video_token_index=3,
position_id_per_seconds=25,
seconds_per_chunk=2,
audio_start_token_id=4,
audio_end_token_id=5,
user_token_id=6,
vision_start_token_id=7,
vision_end_token_id=8,
initializer_range=0.02,
):
self.parent = parent
self.audio_config = audio_config
self.vision_config = vision_config
self.text_config = text_config
self.audio_token_index = audio_token_index
self.image_token_index = image_token_index
self.video_token_index = video_token_index
self.position_id_per_seconds = position_id_per_seconds
self.seconds_per_chunk = seconds_per_chunk
self.audio_start_token_id = audio_start_token_id
self.audio_end_token_id = audio_end_token_id
self.vision_start_token_id = vision_start_token_id
self.vision_end_token_id = vision_end_token_id
self.user_token_id = user_token_id
self.initializer_range = initializer_range
self.batch_size = batch_size
self.feat_seq_length = feat_seq_length
self.num_channels = num_channels
self.image_size = image_size
self.seq_length = seq_length
self.is_training = False
# Used from `self.model_tester` by common model tests
self.num_hidden_layers = self.text_config["num_hidden_layers"]
self.hidden_size = self.text_config["hidden_size"]
self.num_attention_heads = self.text_config["num_attention_heads"]
self.vocab_size = self.text_config["vocab_size"]
def get_config(self):
return Qwen2_5OmniThinkerConfig(
audio_config=self.audio_config,
vision_config=self.vision_config,
text_config=self.text_config,
audio_token_index=self.audio_token_index,
image_token_index=self.image_token_index,
video_token_index=self.video_token_index,
position_id_per_seconds=self.position_id_per_seconds,
seconds_per_chunk=self.seconds_per_chunk,
audio_start_token_id=self.audio_start_token_id,
audio_end_token_id=self.audio_end_token_id,
vision_start_token_id=self.vision_start_token_id,
vision_end_token_id=self.vision_end_token_id,
user_token_id=self.user_token_id,
initializer_range=self.initializer_range,
)
def prepare_config_and_inputs(self):
config = self.get_config()
patch_size = config.vision_config.patch_size
temporal_patch_size = config.vision_config.temporal_patch_size
pixel_values = floats_tensor(
[
self.batch_size * (self.image_size**2) // (patch_size**2),
self.num_channels * (patch_size**2) * temporal_patch_size,
]
)
pixel_grid_thw = torch.LongTensor(
[[1, self.image_size / patch_size, self.image_size / patch_size]] * self.batch_size
).to(pixel_values.device)
input_features_values = floats_tensor(
[self.batch_size, self.audio_config["num_mel_bins"], self.feat_seq_length]
)
feature_attention_mask = torch.ones([self.batch_size, self.feat_seq_length], dtype=torch.long).to(torch_device)
return config, pixel_values, pixel_grid_thw, input_features_values, feature_attention_mask
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
config, pixel_values, pixel_grid_thw, input_features_values, feature_attention_mask = config_and_inputs
input_ids = ids_tensor([self.batch_size, self.seq_length], config.get_text_config().vocab_size - 3) + 3
attention_mask = torch.ones(input_ids.shape, dtype=torch.long).to(torch_device)
# Make sure no other tokens are set to special, to prevetn flakiness
tokens_to_replace = torch.tensor(
[
config.image_token_index,
config.audio_token_index,
config.audio_start_token_id,
config.audio_end_token_id,
config.vision_start_token_id,
config.vision_end_token_id,
],
device=input_ids.device,
)
input_ids[torch.isin(input_ids, tokens_to_replace)] = config.text_config.pad_token_id
attention_mask[:, :1] = 0
# Audio token placeholders should be wrapped in start and end token ids
audio_feat_length = ((self.feat_seq_length - 1) // 2 + 1 - 2) // 2 + 1
input_ids[:, 1] = config.audio_start_token_id
input_ids[:, 2 : (2 + audio_feat_length)] = config.audio_token_index
input_ids[:, 2 + audio_feat_length] = config.audio_end_token_id
# Image token placeholders should be wrapped in start and end token ids
input_ids[:, -4:-1] = torch.tensor(
[config.vision_start_token_id, config.image_token_index, config.vision_end_token_id]
)
inputs_dict = {
"input_features": input_features_values,
"feature_attention_mask": feature_attention_mask,
"input_ids": input_ids,
"attention_mask": attention_mask,
"image_grid_thw": pixel_grid_thw,
"pixel_values": pixel_values,
}
return config, inputs_dict
def create_and_check_qwenomnithinker_model_fp16_forward(self, config, input_ids, pixel_values, attention_mask):
model = Qwen2_5OmniThinkerForConditionalGeneration(config=config)
model.to(torch_device)
model.eval()
with torch.autocast(device_type=torch_device, dtype=torch.float16):
logits = model(
input_ids=input_ids,
attention_mask=attention_mask,
pixel_values=pixel_values.to(torch.bfloat16),
return_dict=True,
)["logits"]
self.parent.assertFalse(torch.isnan(logits).any().item())
@require_torch
class Qwen2_5OmniThinkerForConditionalGenerationModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
"""
Model tester for `Qwen2_5OmniThinkerForConditionalGeneration`.
"""
all_model_classes = (Qwen2_5OmniThinkerForConditionalGeneration,) if is_torch_available() else ()
all_generative_model_classes = (Qwen2_5OmniThinkerForConditionalGeneration,) if is_torch_available() else ()
test_pruning = False
test_head_masking = False
_is_composite = True
def setUp(self):
self.model_tester = Qwen2_5OmniThinkerForConditionalGenerationTester(self)
self.config_tester = ConfigTester(self, config_class=Qwen2_5OmniThinkerConfig, has_text_modality=False)
@unittest.skip(reason="Cpu not yet supported because in QwenOmniThinker models")
def test_disk_offload_bin(self):
pass
@unittest.skip(reason="Disk offload bin not yet supported because in QwenOmniThinker models")
def test_cpu_offload(self):
pass
@unittest.skip(reason="Disk offload safetensors not yet supported because in QwenOmniThinker models")
def test_disk_offload_safetensors(self):
pass
@unittest.skip(reason="Correct missing keys not yet supported because in QwenOmniThinker models")
def test_correct_missing_keys(self):
pass
@unittest.skip(reason="Compile not yet supported because in QwenOmniThinker models")
def test_sdpa_can_compile_dynamic(self):
pass
@unittest.skip(reason="Sdpa dispatch not yet supported because in QwenOmniThinker models")
def test_sdpa_can_dispatch_on_flash(self):
pass
@unittest.skip(reason="QwenOmniThinker does not use inputs_embeds")
def test_inputs_embeds(self):
pass
@unittest.skip(reason="QwenOmniThinker does not support output_hidden_states test")
def test_model_outputs_equivalence(self):
pass
@require_torch_sdpa
def test_sdpa_can_dispatch_composite_models(self):
# overwrite because Qwen2 is audio+text model (not vision+text)
if not self.has_attentions:
self.skipTest(reason="Model architecture does not support attentions")
if not self._is_composite:
self.skipTest(f"{self.all_model_classes[0].__name__} does not support SDPA")
for model_class in self.all_model_classes:
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
model = model_class(config)
with tempfile.TemporaryDirectory() as tmpdirname:
model.save_pretrained(tmpdirname)
model_sdpa = model_class.from_pretrained(tmpdirname)
model_sdpa = model_sdpa.eval().to(torch_device)
text_attn = "sdpa" if model.model._supports_sdpa else "eager"
audio_attn = "sdpa" if model.audio_tower._supports_sdpa else "eager"
vision_attn = "sdpa" if model.visual._supports_sdpa else "eager"
# `None` as it is the requested one which will be assigned to each sub-config
# Sub-model will dispatch to SDPA if it can (checked below that `SDPA` layers are present)
self.assertTrue(model_sdpa.config._attn_implementation == "sdpa")
self.assertTrue(model.model.config._attn_implementation == text_attn)
self.assertTrue(model.audio_tower.config._attn_implementation == audio_attn)
self.assertTrue(model.visual.config._attn_implementation == vision_attn)
model_eager = model_class.from_pretrained(tmpdirname, attn_implementation="eager")
model_eager = model_eager.eval().to(torch_device)
self.assertTrue(model_eager.config._attn_implementation == "eager")
self.assertTrue(model_eager.model.config._attn_implementation == "eager")
self.assertTrue(model_eager.audio_tower.config._attn_implementation == "eager")
self.assertTrue(model_eager.visual.config._attn_implementation == "eager")
for name, submodule in model_eager.named_modules():
class_name = submodule.__class__.__name__
if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name:
raise ValueError("The eager model should not have SDPA attention layers")
@parameterized.expand([("greedy", 1), ("beam search", 2)])
@unittest.skip("Cannot generate from inputs embeds")
def test_generate_from_inputs_embeds(self):
pass
@unittest.skip("Cannot do contrastive generation, has custom `generate()`")
def test_contrastive_generate(self):
pass
@unittest.skip("Cannot do contrastive generation, has custom `generate()`")
def test_contrastive_generate_dict_outputs_use_cache(self):
pass
@unittest.skip("Cannot do contrastive generation, has custom `generate()`")
def test_contrastive_generate_low_memory(self):
pass
@unittest.skip("Cannot do constraint generation, has custom `generate()`")
def test_constrained_beam_search_generate_dict_output(self):
pass
@unittest.skip("Cannot do dola generation, has custom `generate()`")
def test_dola_decoding_sample(self):
pass
@unittest.skip("Cannot generate from inputs embeds")
def test_generate_from_inputs_embeds_with_static_cache(self):
pass
@unittest.skip("Cannot handle 4D attention mask")
def test_generate_compile_model_forward(self):
pass
@unittest.skip("Cannot handle 4D attention mask")
def test_generate_compilation_all_outputs(self):
pass
@unittest.skip("Cannot handle 4D attention mask")
def test_generate_with_static_cache(self):
pass
@unittest.skip("Cannot handle 4D attention mask")
def test_custom_4d_attention_mask(self):
pass
@require_torch
class Qwen2_5OmniModelIntegrationTest(unittest.TestCase):
def setUp(self):
self.processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-Omni-7B")
self.audio_url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/glass-breaking-151256.mp3"
self.audio_url_additional = (
"https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/f2641_0_throatclearing.wav"
)
self.image_url = "https://qianwen-res.oss-accelerate-overseas.aliyuncs.com/Qwen2-VL/demo_small.jpg"
self.messages = [
{
"role": "user",
"content": [
{"type": "audio", "audio_url": self.audio_url},
{"type": "image", "image_url": self.image_url},
{"type": "text", "text": "What's that sound and what kind of dog is this?"},
],
}
]
self.raw_audio, _ = librosa.load(
BytesIO(urlopen(self.audio_url).read()), sr=self.processor.feature_extractor.sampling_rate
)
self.raw_audio_additional, _ = librosa.load(
BytesIO(urlopen(self.audio_url_additional).read()), sr=self.processor.feature_extractor.sampling_rate
)
self.raw_image = Image.open(requests.get(self.image_url, stream=True).raw)
def tearDown(self):
cleanup(torch_device, gc_collect=True)
@slow
def test_small_model_integration_test(self):
model = Qwen2_5OmniForConditionalGeneration.from_pretrained(
"Qwen/Qwen2.5-Omni-7B", torch_dtype=torch.float32, device_map="auto"
)
text = self.processor.apply_chat_template(self.messages, tokenize=False, add_generation_prompt=True)
inputs = self.processor(
text=[text], audio=[self.raw_audio], images=[self.raw_image], return_tensors="pt", padding=True
)
expected_input_ids = torch.tensor(
[
151644,
8948,
198,
2610,
525,
264,
10950,
17847,
13,
151645,
198,
151644,
872,
198,
151647,
151646,
151648,
]
)
assert torch.allclose(expected_input_ids, inputs.input_ids[0][:17], atol=3e-3)
expected_pixel_slice = torch.tensor(
[
[0.8792, 0.8792, 0.9084],
[1.1858, 1.1858, 1.2296],
[1.2004, 1.2004, 1.2150],
[1.4340, 1.4340, 1.4194],
[1.3902, 1.4048, 1.4194],
[1.5216, 1.5362, 1.5362],
],
dtype=torch.float32,
device="cpu",
)
assert torch.allclose(expected_pixel_slice, inputs.pixel_values[:6, :3], atol=3e-3)
# verify generation
inputs = inputs.to(torch_device)
output = model.generate(**inputs, thinker_temperature=0, thinker_do_sample=False, return_audio=False)
EXPECTED_DECODED_TEXT = "system\nYou are a helpful assistant.\nuser\nWhat's that sound and what kind of dog is this?\nassistant\nThe sound is glass shattering, and the dog appears to be a Labrador Retriever."
self.assertEqual(
self.processor.decode(output[0], skip_special_tokens=True),
EXPECTED_DECODED_TEXT,
)
@slow
def test_small_model_integration_test_batch(self):
model = Qwen2_5OmniForConditionalGeneration.from_pretrained(
"Qwen/Qwen2.5-Omni-7B", torch_dtype=torch.float32, device_map="auto"
)
text = self.processor.apply_chat_template(self.messages, tokenize=False, add_generation_prompt=True)
inputs = self.processor(
text=[text, text],
audio=[self.raw_audio, self.raw_audio],
images=[self.raw_image, self.raw_image],
return_tensors="pt",
padding=True,
).to(torch_device)
output = model.generate(**inputs, thinker_temperature=0, thinker_do_sample=False, return_audio=False)
EXPECTED_DECODED_TEXT = [
"system\nYou are a helpful assistant.\nuser\nWhat's that sound and what kind of dog is this?\nassistant\nThe sound is glass shattering, and the dog appears to be a Labrador Retriever.",
"system\nYou are a helpful assistant.\nuser\nWhat's that sound and what kind of dog is this?\nassistant\nThe sound is glass shattering, and the dog appears to be a Labrador Retriever.",
]
self.assertEqual(
self.processor.batch_decode(output, skip_special_tokens=True),
EXPECTED_DECODED_TEXT,
)
@slow
def test_small_model_integration_test_multiturn(self):
model = Qwen2_5OmniForConditionalGeneration.from_pretrained(
"Qwen/Qwen2.5-Omni-7B", torch_dtype=torch.float32, device_map="auto"
)
messages = [
self.messages[0],
{
"role": "assistant",
"content": "The sound is glass shattering, and the dog appears to be a Labrador Retriever.",
},
{
"role": "user",
"content": [
{"type": "audio", "audio_url": self.audio_url_additional},
{"type": "text", "text": "How about this one?"},
],
},
]
text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = self.processor(
text=[text],
audio=[self.raw_audio, self.raw_audio_additional],
images=[self.raw_image],
return_tensors="pt",
padding=True,
).to(torch_device)
output = model.generate(**inputs, thinker_temperature=0, thinker_do_sample=False, return_audio=False)
EXPECTED_DECODED_TEXT = "system\nYou are a helpful assistant.\nuser\nWhat's that sound and what kind of dog is this?\nassistant\nThe sound is glass shattering, and the dog appears to be a Labrador Retriever.\nuser\nHow about this one?\nassistant\nThe sound is a cough."
self.assertEqual(
self.processor.decode(output[0], skip_special_tokens=True),
EXPECTED_DECODED_TEXT,
)
@slow
def test_small_model_integration_test_w_audio(self):
model = Qwen2_5OmniForConditionalGeneration.from_pretrained(
"Qwen/Qwen2.5-Omni-7B", torch_dtype=torch.float32, device_map="auto"
)
audio_url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/guess_age_gender.wav"
messages = [
{
"role": "system",
"content": "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech.",
},
{
"role": "user",
"content": [{"type": "audio", "audio": audio_url}],
},
]
audio, _ = librosa.load(BytesIO(urlopen(audio_url).read()), sr=self.processor.feature_extractor.sampling_rate)
text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = self.processor(text=[text], audio=[audio], return_tensors="pt", padding=True).to(torch_device)
output = model.generate(**inputs, thinker_temperature=0, thinker_do_sample=False)
EXPECTED_DECODED_TEXT = "system\nYou are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech.\nuser\n\nassistant\nWell, I can't really guess your age and gender just from your voice. There are so many factors that can affect how a voice sounds, like the environment you're in, how you're feeling at the moment, and even the microphone you're using. But if you want to share more about your voice, like if it's high - pitched or low - pitched, that might give me a bit of an idea. So, what can you tell me about your voice?"
self.assertEqual(
self.processor.decode(output[0][0], skip_special_tokens=True),
EXPECTED_DECODED_TEXT,
)
self.assertFalse(torch.isnan(output[1]).any().item())
@slow
@require_flash_attn
@require_torch_gpu
def test_small_model_integration_test_batch_flashatt2(self):
model = Qwen2_5OmniForConditionalGeneration.from_pretrained(
"Qwen/Qwen2.5-Omni-7B",
torch_dtype=torch.bfloat16,
attn_implementation="flash_attention_2",
device_map="auto",
)
text = self.processor.apply_chat_template(self.messages, tokenize=False, add_generation_prompt=True)
inputs = self.processor(
text=[text, text],
audio=[self.raw_audio, self.raw_audio],
images=[self.raw_image, self.raw_image],
return_tensors="pt",
padding=True,
).to(torch_device)
output = model.generate(**inputs, thinker_temperature=0, thinker_do_sample=False, return_audio=False)
EXPECTED_DECODED_TEXT = [
"system\nYou are a helpful assistant.\nuser\nWhat's that sound and what kind of dog is this?\nassistant\nThe sound is glass shattering, and the dog appears to be a Labrador Retriever.",
"system\nYou are a helpful assistant.\nuser\nWhat's that sound and what kind of dog is this?\nassistant\nThe sound is glass shattering, and the dog appears to be a Labrador Retriever.",
]
self.assertEqual(
self.processor.batch_decode(output, skip_special_tokens=True),
EXPECTED_DECODED_TEXT,
)
self.assertEqual(
self.processor.batch_decode(output, skip_special_tokens=True)[0],
self.processor.batch_decode(output, skip_special_tokens=True)[1],
)

View File

@@ -0,0 +1,616 @@
# coding=utf-8
# Copyright 2025 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
#
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import inspect
import shutil
import tempfile
import unittest
import numpy as np
import pytest
from huggingface_hub import hf_hub_download
from transformers import (
AutoProcessor,
Qwen2_5OmniProcessor,
Qwen2Tokenizer,
WhisperFeatureExtractor,
)
from transformers.testing_utils import require_av, require_librosa, require_torch, require_torchaudio, require_vision
from transformers.utils import is_torch_available, is_vision_available
from ...test_processing_common import ProcessorTesterMixin
if is_torch_available():
import torch
if is_vision_available():
from transformers import Qwen2VLImageProcessor
@require_vision
@require_torch
@require_torchaudio
class Qwen2_5OmniProcessorTest(ProcessorTesterMixin, unittest.TestCase):
processor_class = Qwen2_5OmniProcessor
# text + audio kwargs testing
@require_torch
def test_tokenizer_defaults_preserved_by_kwargs_audio(self):
if "feature_extractor" not in self.processor_class.attributes:
self.skipTest(f"feature_extractor attribute not present in {self.processor_class}")
feature_extractor = self.get_component("feature_extractor")
if hasattr(self, "get_tokenizer"):
tokenizer = self.get_tokenizer(max_length=800, padding="max_length")
elif hasattr(self, "get_component"):
tokenizer = self.get_component("tokenizer", max_length=800, padding="max_length")
else:
self.assertTrue(False, "Processor doesn't have get_tokenizer or get_component defined")
if not tokenizer.pad_token:
tokenizer.pad_token = "[TEST_PAD]"
if "image_processor" not in self.processor_class.attributes:
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
image_processor = self.get_component("image_processor")
processor = self.processor_class(
tokenizer=tokenizer, feature_extractor=feature_extractor, image_processor=image_processor
)
self.skip_processor_without_typed_kwargs(processor)
input_str = "lower newer"
raw_speech = self.prepare_audio_inputs()
inputs = processor(text=input_str, audio=raw_speech, return_tensors="pt")
if "input_ids" in inputs:
self.assertEqual(len(inputs["input_ids"][0]), 800)
elif "labels" in inputs:
self.assertEqual(len(inputs["labels"][0]), 800)
@require_torch
@require_vision
def test_structured_kwargs_audio_nested(self):
if "feature_extractor" not in self.processor_class.attributes:
self.skipTest(f"feature_extractor attribute not present in {self.processor_class}")
feature_extractor = self.get_component("feature_extractor")
if hasattr(self, "get_tokenizer"):
tokenizer = self.get_tokenizer()
elif hasattr(self, "get_component"):
tokenizer = self.get_component("tokenizer")
if not tokenizer.pad_token:
tokenizer.pad_token = "[TEST_PAD]"
if "image_processor" not in self.processor_class.attributes:
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
image_processor = self.get_component("image_processor")
processor = self.processor_class(
tokenizer=tokenizer, feature_extractor=feature_extractor, image_processor=image_processor
)
self.skip_processor_without_typed_kwargs(processor)
input_str = ["lower newer"]
raw_speech = self.prepare_audio_inputs()
# Define the kwargs for each modality
all_kwargs = {
"common_kwargs": {"return_tensors": "pt"},
"audio_kwargs": {"max_length": 800},
}
inputs = processor(text=input_str, audio=raw_speech, **all_kwargs)
if "input_ids" in inputs:
self.assertEqual(len(inputs["input_ids"][0]), 2)
elif "labels" in inputs:
self.assertEqual(len(inputs["labels"][0]), 2)
@require_torch
def test_unstructured_kwargs_audio(self):
if "feature_extractor" not in self.processor_class.attributes:
self.skipTest(f"feature_extractor attribute not present in {self.processor_class}")
feature_extractor = self.get_component("feature_extractor")
if hasattr(self, "get_tokenizer"):
tokenizer = self.get_tokenizer(max_length=117)
elif hasattr(self, "get_component"):
tokenizer = self.get_component("tokenizer", max_length=117)
if not tokenizer.pad_token:
tokenizer.pad_token = "[TEST_PAD]"
if "image_processor" not in self.processor_class.attributes:
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
image_processor = self.get_component("image_processor")
processor = self.processor_class(
tokenizer=tokenizer, feature_extractor=feature_extractor, image_processor=image_processor
)
self.skip_processor_without_typed_kwargs(processor)
input_str = "lower newer"
raw_speech = self.prepare_audio_inputs()
inputs = processor(
text=input_str,
audio=raw_speech,
return_tensors="pt",
padding="max_length",
max_length=800,
)
if "input_ids" in inputs:
self.assertEqual(len(inputs["input_ids"][0]), 800)
elif "labels" in inputs:
self.assertEqual(len(inputs["labels"][0]), 800)
@require_torch
def test_doubly_passed_kwargs_audio(self):
if "feature_extractor" not in self.processor_class.attributes:
self.skipTest(f"feature_extractor attribute not present in {self.processor_class}")
feature_extractor = self.get_component("feature_extractor")
if hasattr(self, "get_tokenizer"):
tokenizer = self.get_tokenizer()
elif hasattr(self, "get_component"):
tokenizer = self.get_component("tokenizer")
if not tokenizer.pad_token:
tokenizer.pad_token = "[TEST_PAD]"
if "image_processor" not in self.processor_class.attributes:
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
image_processor = self.get_component("image_processor")
self.processor_class(tokenizer=tokenizer, feature_extractor=feature_extractor, image_processor=image_processor)
@require_torch
def test_kwargs_overrides_default_tokenizer_kwargs_audio(self):
if "feature_extractor" not in self.processor_class.attributes:
self.skipTest(f"feature_extractor attribute not present in {self.processor_class}")
feature_extractor = self.get_component("feature_extractor")
if hasattr(self, "get_tokenizer"):
tokenizer = self.get_tokenizer(max_length=117)
elif hasattr(self, "get_component"):
tokenizer = self.get_component("tokenizer", max_length=117)
if not tokenizer.pad_token:
tokenizer.pad_token = "[TEST_PAD]"
if "image_processor" not in self.processor_class.attributes:
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
image_processor = self.get_component("image_processor")
self.processor_class(tokenizer=tokenizer, feature_extractor=feature_extractor, image_processor=image_processor)
@classmethod
def setUpClass(cls):
cls.tmpdirname = tempfile.mkdtemp()
processor_kwargs = cls.prepare_processor_dict()
processor = Qwen2_5OmniProcessor.from_pretrained("Qwen/Qwen2.5-Omni-7B", **processor_kwargs)
processor.save_pretrained(cls.tmpdirname)
def get_tokenizer(self, **kwargs):
return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer
def get_image_processor(self, **kwargs):
return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor
def get_feature_extractor(self, **kwargs):
return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).feature_extractor
@staticmethod
def prepare_processor_dict():
return {
"chat_template": "{% set audio_count = namespace(value=0) %}{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_bos|><|IMAGE|><|vision_eos|>{% elif content['type'] == 'audio' or 'audio' in content or 'audio_url' in content %}{% set audio_count.value = audio_count.value + 1 %}{% if add_audio_id %}Audio {{ audio_count.value }}: {% endif %}<|audio_bos|><|AUDIO|><|audio_eos|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_bos|><|VIDEO|><|vision_eos|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}"
}
@classmethod
def tearDownClass(cls):
shutil.rmtree(cls.tmpdirname, ignore_errors=True)
def prepare_audio_inputs(self):
"""This function prepares a list of numpy audios."""
audio_inputs = [np.random.rand(160000) * 2 - 1] * 3 # batch-size=3
return audio_inputs
def test_save_load_pretrained_default(self):
image_processor = self.get_image_processor()
tokenizer = self.get_tokenizer()
feature_extractor = self.get_feature_extractor()
processor = Qwen2_5OmniProcessor(
image_processor=image_processor, feature_extractor=feature_extractor, tokenizer=tokenizer
)
processor.save_pretrained(self.tmpdirname)
processor = Qwen2_5OmniProcessor.from_pretrained(self.tmpdirname, use_fast=False)
self.assertEqual(processor.tokenizer.get_vocab(), tokenizer.get_vocab())
self.assertEqual(processor.image_processor.to_json_string(), image_processor.to_json_string())
self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor.to_json_string())
self.assertIsInstance(processor.tokenizer, Qwen2Tokenizer)
self.assertIsInstance(processor.image_processor, Qwen2VLImageProcessor)
self.assertIsInstance(processor.feature_extractor, WhisperFeatureExtractor)
def test_image_processor(self):
image_processor = self.get_image_processor()
tokenizer = self.get_tokenizer()
feature_extractor = self.get_feature_extractor()
processor = Qwen2_5OmniProcessor(
image_processor=image_processor, feature_extractor=feature_extractor, tokenizer=tokenizer
)
image_input = self.prepare_image_inputs()
input_image_proc = image_processor(image_input, return_tensors="np")
input_processor = processor(images=image_input, text="dummy", return_tensors="np")
for key in input_image_proc.keys():
self.assertAlmostEqual(input_image_proc[key].sum(), input_processor[key].sum(), delta=1e-2)
def test_processor(self):
image_processor = self.get_image_processor()
tokenizer = self.get_tokenizer()
feature_extractor = self.get_feature_extractor()
processor = Qwen2_5OmniProcessor(
image_processor=image_processor, feature_extractor=feature_extractor, tokenizer=tokenizer
)
input_str = "lower newer"
image_input = self.prepare_image_inputs()
audio_input = self.prepare_audio_inputs()
inputs = processor(text=input_str, images=image_input, audio=audio_input)
keys = list(inputs.keys())
self.assertListEqual(
keys,
[
"input_ids",
"attention_mask",
"pixel_values",
"image_grid_thw",
"feature_attention_mask",
"input_features",
],
)
# test if it raises when no input is passed
with pytest.raises(ValueError):
processor()
# test if it raises when no text is passed
with pytest.raises(ValueError):
processor(images=image_input)
def test_model_input_names(self):
image_processor = self.get_image_processor()
tokenizer = self.get_tokenizer()
feature_extractor = self.get_feature_extractor()
processor = Qwen2_5OmniProcessor(
image_processor=image_processor, feature_extractor=feature_extractor, tokenizer=tokenizer
)
input_str = "lower newer"
image_input = self.prepare_image_inputs()
video_inputs = self.prepare_video_inputs()
audio_input = self.prepare_audio_inputs()
inputs = processor(text=input_str, images=image_input, videos=video_inputs, audio=audio_input)
self.assertListEqual(sorted(inputs.keys()), sorted(processor.model_input_names))
@require_torch
def _test_apply_chat_template(
self,
modality: str,
batch_size: int,
return_tensors: str,
input_name: str,
processor_name: str,
input_data: list[str],
):
processor = self.get_processor()
if processor.chat_template is None:
self.skipTest("Processor has no chat template")
if processor_name not in self.processor_class.attributes:
self.skipTest(f"{processor_name} attribute not present in {self.processor_class}")
batch_messages = [
[
{
"role": "user",
"content": [{"type": "text", "text": "Describe this."}],
},
]
] * batch_size
# Test that jinja can be applied
formatted_prompt = processor.apply_chat_template(batch_messages, add_generation_prompt=True, tokenize=False)
self.assertEqual(len(formatted_prompt), batch_size)
# Test that tokenizing with template and directly with `self.tokenizer` gives same output
formatted_prompt_tokenized = processor.apply_chat_template(
batch_messages, add_generation_prompt=True, tokenize=True, return_tensors=return_tensors
)
add_special_tokens = True
if processor.tokenizer.bos_token is not None and formatted_prompt[0].startswith(processor.tokenizer.bos_token):
add_special_tokens = False
tok_output = processor.tokenizer(
formatted_prompt, return_tensors=return_tensors, add_special_tokens=add_special_tokens
)
expected_output = tok_output.input_ids
self.assertListEqual(expected_output.tolist(), formatted_prompt_tokenized.tolist())
# Test that kwargs passed to processor's `__call__` are actually used
tokenized_prompt_100 = processor.apply_chat_template(
batch_messages,
add_generation_prompt=True,
tokenize=True,
padding="max_length",
truncation=True,
return_tensors=return_tensors,
max_length=100,
)
self.assertEqual(len(tokenized_prompt_100[0]), 100)
# Test that `return_dict=True` returns text related inputs in the dict
out_dict_text = processor.apply_chat_template(
batch_messages,
add_generation_prompt=True,
tokenize=True,
return_dict=True,
return_tensors=return_tensors,
)
self.assertTrue(all(key in out_dict_text for key in ["input_ids", "attention_mask"]))
self.assertEqual(len(out_dict_text["input_ids"]), batch_size)
self.assertEqual(len(out_dict_text["attention_mask"]), batch_size)
# Test that with modality URLs and `return_dict=True`, we get modality inputs in the dict
for idx, url in enumerate(input_data[:batch_size]):
batch_messages[idx][0]["content"] = [batch_messages[idx][0]["content"][0], {"type": modality, "url": url}]
out_dict = processor.apply_chat_template(
batch_messages,
add_generation_prompt=True,
tokenize=True,
return_dict=True,
return_tensors=return_tensors,
num_frames=4, # by default no more than 4 frames, otherwise too slow
)
input_name = getattr(self, input_name)
self.assertTrue(input_name in out_dict)
self.assertEqual(len(out_dict["input_ids"]), batch_size)
self.assertEqual(len(out_dict["attention_mask"]), batch_size)
self.assertEqual(len(out_dict[input_name]), batch_size * 1564)
return_tensor_to_type = {"pt": torch.Tensor, "np": np.ndarray, None: list}
for k in out_dict:
self.assertIsInstance(out_dict[k], return_tensor_to_type[return_tensors])
@require_av
def test_apply_chat_template_video_frame_sampling(self):
processor = self.get_processor()
if processor.chat_template is None:
self.skipTest("Processor has no chat template")
signature = inspect.signature(processor.__call__)
if "videos" not in {*signature.parameters.keys()} or (
signature.parameters.get("videos") is not None
and signature.parameters["videos"].annotation == inspect._empty
):
self.skipTest("Processor doesn't accept videos at input")
messages = [
[
{
"role": "user",
"content": [
{"type": "text", "text": "What is shown in this video?"},
],
},
]
]
formatted_prompt = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
self.assertEqual(len(formatted_prompt), 1)
formatted_prompt_tokenized = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True)
expected_output = processor.tokenizer(formatted_prompt, return_tensors=None).input_ids
self.assertListEqual(expected_output, formatted_prompt_tokenized)
out_dict = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True, return_dict=True)
self.assertListEqual(list(out_dict.keys()), ["input_ids", "attention_mask"])
# Add video URL for return dict and load with `num_frames` arg
messages[0][0]["content"].append(
{
"type": "video",
"url": "https://test-videos.co.uk/vids/bigbuckbunny/mp4/h264/720/Big_Buck_Bunny_720_10s_10MB.mp4",
}
)
num_frames = 3
out_dict_with_video = processor.apply_chat_template(
messages,
add_generation_prompt=True,
tokenize=True,
return_dict=True,
num_frames=num_frames,
)
self.assertTrue(self.videos_input_name in out_dict_with_video)
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 9568)
# Load with `video_fps` arg
video_fps = 1
out_dict_with_video = processor.apply_chat_template(
messages,
add_generation_prompt=True,
tokenize=True,
return_dict=True,
video_fps=video_fps,
)
self.assertTrue(self.videos_input_name in out_dict_with_video)
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 23920)
# Load with `video_fps` and `num_frames` args, should raise an error
with self.assertRaises(ValueError):
out_dict_with_video = processor.apply_chat_template(
messages,
add_generation_prompt=True,
tokenize=True,
return_dict=True,
video_fps=video_fps,
num_frames=num_frames,
)
# Load without any arg should load the whole video
out_dict_with_video = processor.apply_chat_template(
messages,
add_generation_prompt=True,
tokenize=True,
return_dict=True,
)
self.assertTrue(self.videos_input_name in out_dict_with_video)
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 717600)
# Load video as a list of frames (i.e. images). NOTE: each frame should have same size
# because we assume they come from one video
messages[0][0]["content"][-1] = {
"type": "video",
"url": [
"https://www.ilankelman.org/stopsigns/australia.jpg",
"https://www.ilankelman.org/stopsigns/australia.jpg",
],
}
out_dict_with_video = processor.apply_chat_template(
messages,
add_generation_prompt=True,
tokenize=True,
return_dict=True,
)
self.assertTrue(self.videos_input_name in out_dict_with_video)
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 5704)
@require_av
def test_apply_chat_template_video_special_processing(self):
"""
Tests that models can use their own preprocessing to preprocess conversations.
"""
processor = self.get_processor()
if processor.chat_template is None:
self.skipTest("Processor has no chat template")
signature = inspect.signature(processor.__call__)
if "videos" not in {*signature.parameters.keys()} or (
signature.parameters.get("videos") is not None
and signature.parameters["videos"].annotation == inspect._empty
):
self.skipTest("Processor doesn't accept videos at input")
video_file_path = hf_hub_download(
repo_id="raushan-testing-hf/videos-test", filename="sample_demo_1.mp4", repo_type="dataset"
)
messages = [
[
{
"role": "user",
"content": [
{"type": "video", "path": video_file_path},
{"type": "text", "text": "What is shown in this video?"},
],
},
]
]
def _process_messages_for_chat_template(
conversation,
batch_images,
batch_videos,
batch_video_metadata,
**chat_template_kwargs,
):
# Let us just always return a dummy prompt
new_msg = [
[
{
"role": "user",
"content": [
{"type": "video"}, # no need to use path, video is loaded already by this moment
{"type": "text", "text": "Dummy prompt for preprocess testing"},
],
},
]
]
return new_msg
processor._process_messages_for_chat_template = _process_messages_for_chat_template
out_dict_with_video = processor.apply_chat_template(
messages,
add_generation_prompt=True,
tokenize=True,
return_dict=True,
)
self.assertTrue(self.videos_input_name in out_dict_with_video)
# Check with `in` because we don't know how each template formats the prompt with BOS/EOS/etc
formatted_text = processor.batch_decode(out_dict_with_video["input_ids"], skip_special_tokens=True)[0]
self.assertTrue("Dummy prompt for preprocess testing" in formatted_text)
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 145912)
@require_librosa
@require_av
@unittest.skip(
"@raushan: librosa can'r decode this audio in CI runner, fix after adding moviepy or another decoder"
)
def test_chat_template_audio_from_video(self):
processor = self.get_processor()
if processor.chat_template is None:
self.skipTest("Processor has no chat template")
signature = inspect.signature(processor.__call__)
if "videos" not in {*signature.parameters.keys()} or (
signature.parameters.get("videos") is not None
and signature.parameters["videos"].annotation == inspect._empty
):
self.skipTest(f"{self.processor_class} does not suport video inputs")
if "feature_extractor" not in self.processor_class.attributes:
self.skipTest(f"feature_extractor attribute not present in {self.processor_class}")
video_file_path = hf_hub_download(
repo_id="raushan-testing-hf/videos-test", filename="sample_demo_1.mp4", repo_type="dataset"
)
messages = [
{
"role": "user",
"content": [
{"type": "video", "path": video_file_path},
{"type": "text", "text": "Which of these animals is making the sound?"},
],
},
{
"role": "assistant",
"content": [{"type": "text", "text": "It is a cow."}],
},
{
"role": "user",
"content": [
{"type": "text", "text": "Tell me all about this animal."},
],
},
]
formatted_prompt = processor.apply_chat_template([messages], add_generation_prompt=True, tokenize=False)
self.assertEqual(len(formatted_prompt), 1) # batch size=1
out_dict = processor.apply_chat_template(
messages,
add_generation_prompt=True,
tokenize=True,
return_dict=True,
return_tensors="np",
load_audio_from_video=True,
)
self.assertTrue(self.audio_input_name in out_dict)
self.assertTrue(self.videos_input_name in out_dict)
# should always have input_ids and attention_mask
self.assertEqual(len(out_dict["input_ids"]), 1) # batch-size=1
self.assertEqual(len(out_dict["attention_mask"]), 1) # batch-size=1
self.assertEqual(len(out_dict[self.audio_input_name]), 1) # 1 audio in the conversation
self.assertEqual(len(out_dict[self.videos_input_name]), 145912) # 1 video in the conversation

View File

@@ -719,11 +719,9 @@ class ProcessorTesterMixin:
if "feature_extractor" not in self.processor_class.attributes:
self.skipTest(f"feature_extractor attribute not present in {self.processor_class}")
feature_extractor = self.get_component("feature_extractor")
tokenizer = self.get_component("tokenizer")
processor_components = self.prepare_components()
processor_kwargs = self.prepare_processor_dict()
processor = self.processor_class(tokenizer=tokenizer, feature_extractor=feature_extractor, **processor_kwargs)
processor = self.processor_class(**processor_components, **processor_kwargs)
self.skip_processor_without_typed_kwargs(processor)
input_str = self.prepare_text_inputs(batch_size=3, modality="audio")
@@ -1128,11 +1126,7 @@ class ProcessorTesterMixin:
{
"role": "user",
"content": [
{
"type": "audio",
"url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/glass-breaking-151256.mp3",
},
{"type": "text", "text": "Is it the same sound?"},
{"type": "text", "text": "Tell me all about this animal."},
],
},
]
@@ -1154,5 +1148,5 @@ class ProcessorTesterMixin:
# should always have input_ids and attention_mask
self.assertEqual(len(out_dict["input_ids"]), 1) # batch-size=1
self.assertEqual(len(out_dict["attention_mask"]), 1) # batch-size=1
self.assertEqual(len(out_dict[self.audio_input_name]), 2) # 2 audios in the conversation
self.assertEqual(len(out_dict[self.audio_input_name]), 1) # 1 audio in the conversation
self.assertEqual(len(out_dict[self.videos_input_name]), 1) # 1 video in the conversation