Add Qwen2.5-Omni (#36752)

* Add qwen2.5-omni

* Remove einops dependency

* Add torchdiffeq dependency

* Sort init

* Add torchdiffeq to extras['diffeq']

* Fix repo consistency

* use cached_file

* del odeint

* renew pytest

* format

* Remove torchdiffeq

* format

* fixed batch infer bug

* Change positional_embedding to parameter

* Change default speaker

* Config revision

* Use modular & code clean

* code clean

* decouple padding with model & code cleaning

* sort init

* fix

* fix

* Second code review

* fix

* fix

* rename vars to full name + some comments

* update pytest

* Code clean & fix

* fix

* style

* more clean up

* fixup

* smaller vision model in tests

* fix processor test

* deflake a bit the tests (still flaky though)

* de-flake tests finally + add generation mixin

* final nits i hope

* make sure processor tests are complete

* replace with Qwen2_5OmniForConditionalGeneration

* fix tests after updating ckpt

* fix typos when cleaning, also we can't change ckpt

* fixup

* images and videos kwargs for processor

* thinker and talker loadable from hub ckpt

* address comments and update tests after rebase

* fixup

* skip for now

* fixup

* fixup

* remove torch dependency in processors

---------

Co-authored-by: lvyuanjun.lyj <lvyuanjun.lyj@alibaba-inc.con>
Co-authored-by: feizi.wx <feizi.wx@alibaba-inc.com>
Co-authored-by: raushan <raushan@huggingface.co>
This commit is contained in:
BakerBunker
2025-04-14 18:36:41 +08:00
committed by GitHub
parent ac1df5fccd
commit 4b8c6d4cf8
35 changed files with 12063 additions and 49 deletions

View File

@@ -719,11 +719,9 @@ class ProcessorTesterMixin:
if "feature_extractor" not in self.processor_class.attributes:
self.skipTest(f"feature_extractor attribute not present in {self.processor_class}")
feature_extractor = self.get_component("feature_extractor")
tokenizer = self.get_component("tokenizer")
processor_components = self.prepare_components()
processor_kwargs = self.prepare_processor_dict()
processor = self.processor_class(tokenizer=tokenizer, feature_extractor=feature_extractor, **processor_kwargs)
processor = self.processor_class(**processor_components, **processor_kwargs)
self.skip_processor_without_typed_kwargs(processor)
input_str = self.prepare_text_inputs(batch_size=3, modality="audio")
@@ -1128,11 +1126,7 @@ class ProcessorTesterMixin:
{
"role": "user",
"content": [
{
"type": "audio",
"url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/glass-breaking-151256.mp3",
},
{"type": "text", "text": "Is it the same sound?"},
{"type": "text", "text": "Tell me all about this animal."},
],
},
]
@@ -1154,5 +1148,5 @@ class ProcessorTesterMixin:
# should always have input_ids and attention_mask
self.assertEqual(len(out_dict["input_ids"]), 1) # batch-size=1
self.assertEqual(len(out_dict["attention_mask"]), 1) # batch-size=1
self.assertEqual(len(out_dict[self.audio_input_name]), 2) # 2 audios in the conversation
self.assertEqual(len(out_dict[self.audio_input_name]), 1) # 1 audio in the conversation
self.assertEqual(len(out_dict[self.videos_input_name]), 1) # 1 video in the conversation