Add Qwen2.5-Omni (#36752)

* Add qwen2.5-omni * Remove einops dependency * Add torchdiffeq dependency * Sort init * Add torchdiffeq to extras['diffeq'] * Fix repo consistency * use cached_file * del odeint * renew pytest * format * Remove torchdiffeq * format * fixed batch infer bug * Change positional_embedding to parameter * Change default speaker * Config revision * Use modular & code clean * code clean * decouple padding with model & code cleaning * sort init * fix * fix * Second code review * fix * fix * rename vars to full name + some comments * update pytest * Code clean & fix * fix * style * more clean up * fixup * smaller vision model in tests * fix processor test * deflake a bit the tests (still flaky though) * de-flake tests finally + add generation mixin * final nits i hope * make sure processor tests are complete * replace with Qwen2_5OmniForConditionalGeneration * fix tests after updating ckpt * fix typos when cleaning, also we can't change ckpt * fixup * images and videos kwargs for processor * thinker and talker loadable from hub ckpt * address comments and update tests after rebase * fixup * skip for now * fixup * fixup * remove torch dependency in processors --------- Co-authored-by: lvyuanjun.lyj <lvyuanjun.lyj@alibaba-inc.con> Co-authored-by: feizi.wx <feizi.wx@alibaba-inc.com> Co-authored-by: raushan <raushan@huggingface.co>
2025-04-14 18:36:41 +08:00
parent ac1df5fccd
commit 4b8c6d4cf8
35 changed files with 12063 additions and 49 deletions
--- a/utils/check_repo.py
+++ b/utils/check_repo.py
@@ -143,6 +143,13 @@ IGNORE_NON_TESTED = (
        "ChameleonVQVAE",  # VQVAE here is used only for encoding (discretizing) and is tested as part of bigger model
        "Qwen2VLModel",  # Building part of bigger (tested) model. Tested implicitly through Qwen2VLForConditionalGeneration.
        "Qwen2_5_VLModel",  # Building part of bigger (tested) model. Tested implicitly through Qwen2_5_VLForConditionalGeneration.
+        "Qwen2_5OmniForConditionalGeneration",  # Not a regular model. Testted in Qwen2_5OmniModelIntergrationTest
+        "Qwen2_5OmniTalkerForConditionalGeneration",  #  Building part of bigger (tested) model. Tested implicitly through Qwen2_5OmniModelIntergrationTest.
+        "Qwen2_5OmniTalkerModel",  # Building part of bigger (tested) model. Tested implicitly through Qwen2_5OmniModelIntergrationTest.
+        "Qwen2_5OmniThinkerTextModel",  # Building part of bigger (tested) model. Tested implicitly through Qwen2_5OmniModelIntergrationTest.
+        "Qwen2_5OmniToken2WavModel",  # Building part of bigger (tested) model. Tested implicitly through Qwen2_5OmniModelIntergrationTest.
+        "Qwen2_5OmniToken2WavDiTModel",  # Building part of bigger (tested) model. Tested implicitly through Qwen2_5OmniModelIntergrationTest.
+        "Qwen2_5OmniToken2WavBigVGANModel",  # Building part of bigger (tested) model. Tested implicitly through Qwen2_5OmniModelIntergrationTest.
        "MllamaTextModel",  # Building part of bigger (tested) model. # TODO: add tests
        "MllamaVisionModel",  # Building part of bigger (tested) model. # TODO: add tests
        "Llama4TextModel",  # Building part of bigger (tested) model. # TODO: add tests
@@ -348,6 +355,13 @@ IGNORE_NON_AUTO_CONFIGURED = PRIVATE_MODELS.copy() + [
    "MoshiForConditionalGeneration",  # no auto class for speech-to-speech
    "Emu3VQVAE",  # no autoclass for VQ-VAE models
    "Emu3TextModel",  # Building part of bigger (tested) model
+    "Qwen2_5OmniTalkerForConditionalGeneration",  # Building part of a bigger model
+    "Qwen2_5OmniTalkerModel",  # Building part of a bigger model
+    "Qwen2_5OmniThinkerForConditionalGeneration",  # Building part of a bigger model
+    "Qwen2_5OmniThinkerTextModel",  # Building part of a bigger model
+    "Qwen2_5OmniToken2WavModel",  # Building part of a bigger model
+    "Qwen2_5OmniToken2WavBigVGANModel",  # Building part of a bigger model
+    "Qwen2_5OmniToken2WavDiTModel",  # Building part of a bigger model
 ]

 # DO NOT edit this list!