Add Qwen2.5-Omni (#36752)
* Add qwen2.5-omni * Remove einops dependency * Add torchdiffeq dependency * Sort init * Add torchdiffeq to extras['diffeq'] * Fix repo consistency * use cached_file * del odeint * renew pytest * format * Remove torchdiffeq * format * fixed batch infer bug * Change positional_embedding to parameter * Change default speaker * Config revision * Use modular & code clean * code clean * decouple padding with model & code cleaning * sort init * fix * fix * Second code review * fix * fix * rename vars to full name + some comments * update pytest * Code clean & fix * fix * style * more clean up * fixup * smaller vision model in tests * fix processor test * deflake a bit the tests (still flaky though) * de-flake tests finally + add generation mixin * final nits i hope * make sure processor tests are complete * replace with Qwen2_5OmniForConditionalGeneration * fix tests after updating ckpt * fix typos when cleaning, also we can't change ckpt * fixup * images and videos kwargs for processor * thinker and talker loadable from hub ckpt * address comments and update tests after rebase * fixup * skip for now * fixup * fixup * remove torch dependency in processors --------- Co-authored-by: lvyuanjun.lyj <lvyuanjun.lyj@alibaba-inc.con> Co-authored-by: feizi.wx <feizi.wx@alibaba-inc.com> Co-authored-by: raushan <raushan@huggingface.co>
This commit is contained in:
@@ -719,11 +719,9 @@ class ProcessorTesterMixin:
|
||||
if "feature_extractor" not in self.processor_class.attributes:
|
||||
self.skipTest(f"feature_extractor attribute not present in {self.processor_class}")
|
||||
|
||||
feature_extractor = self.get_component("feature_extractor")
|
||||
tokenizer = self.get_component("tokenizer")
|
||||
processor_components = self.prepare_components()
|
||||
processor_kwargs = self.prepare_processor_dict()
|
||||
|
||||
processor = self.processor_class(tokenizer=tokenizer, feature_extractor=feature_extractor, **processor_kwargs)
|
||||
processor = self.processor_class(**processor_components, **processor_kwargs)
|
||||
self.skip_processor_without_typed_kwargs(processor)
|
||||
|
||||
input_str = self.prepare_text_inputs(batch_size=3, modality="audio")
|
||||
@@ -1128,11 +1126,7 @@ class ProcessorTesterMixin:
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "audio",
|
||||
"url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/glass-breaking-151256.mp3",
|
||||
},
|
||||
{"type": "text", "text": "Is it the same sound?"},
|
||||
{"type": "text", "text": "Tell me all about this animal."},
|
||||
],
|
||||
},
|
||||
]
|
||||
@@ -1154,5 +1148,5 @@ class ProcessorTesterMixin:
|
||||
# should always have input_ids and attention_mask
|
||||
self.assertEqual(len(out_dict["input_ids"]), 1) # batch-size=1
|
||||
self.assertEqual(len(out_dict["attention_mask"]), 1) # batch-size=1
|
||||
self.assertEqual(len(out_dict[self.audio_input_name]), 2) # 2 audios in the conversation
|
||||
self.assertEqual(len(out_dict[self.audio_input_name]), 1) # 1 audio in the conversation
|
||||
self.assertEqual(len(out_dict[self.videos_input_name]), 1) # 1 video in the conversation
|
||||
|
||||
Reference in New Issue
Block a user