From 9bcdd5cde9411477cba66bc9e6d1c59e80b60b60 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Ouazan?= <83456801+remi-or@users.noreply.github.com> Date: Fri, 20 Jun 2025 11:22:32 +0200 Subject: [PATCH] Modernbert fixes (#38912) * Removed deprecated argument in modernbert RotaryEmbedding * Skip test_sdpa_can_dispatch_on_flash for modernbert --------- Co-authored-by: ivarflakstad <69173633+ivarflakstad@users.noreply.github.com> Co-authored-by: Yih-Dar <2521628+ydshieh@users.noreply.github.com> --- src/transformers/models/modernbert/modeling_modernbert.py | 2 +- src/transformers/models/modernbert/modular_modernbert.py | 2 +- tests/test_modeling_common.py | 4 ++++ 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/modernbert/modeling_modernbert.py b/src/transformers/models/modernbert/modeling_modernbert.py index d984c523d0..05fb1af62b 100644 --- a/src/transformers/models/modernbert/modeling_modernbert.py +++ b/src/transformers/models/modernbert/modeling_modernbert.py @@ -154,7 +154,7 @@ class ModernBertUnpaddedRotaryEmbedding(RotaryEmbedding): up to max_seqlen. If the max_seqlen, device, or dtype during training/inference differ, the cos_sin_cache will be recomputed during the forward pass. """ - super().__init__(dim=dim, base=base, pos_idx_in_fp32=True, device=device, interleaved=False) + super().__init__(dim=dim, base=base, device=device, interleaved=False) self.max_seqlen = max_seqlen if max_seqlen is not None and device is not None and dtype is not None: diff --git a/src/transformers/models/modernbert/modular_modernbert.py b/src/transformers/models/modernbert/modular_modernbert.py index ff46a523a6..a707c659fb 100644 --- a/src/transformers/models/modernbert/modular_modernbert.py +++ b/src/transformers/models/modernbert/modular_modernbert.py @@ -417,7 +417,7 @@ class ModernBertUnpaddedRotaryEmbedding(RotaryEmbedding): up to max_seqlen. If the max_seqlen, device, or dtype during training/inference differ, the cos_sin_cache will be recomputed during the forward pass. """ - super().__init__(dim=dim, base=base, pos_idx_in_fp32=True, device=device, interleaved=False) + super().__init__(dim=dim, base=base, device=device, interleaved=False) self.max_seqlen = max_seqlen if max_seqlen is not None and device is not None and dtype is not None: diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index 4c7cef05c3..4e2555b57e 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -3795,6 +3795,10 @@ class ModelTesterMixin: self.skipTest( "PaliGemma-like models currently (transformers==4.41.0) requires an attention_mask input" ) + if config.model_type in ["modernbert"]: + self.skipTest( + reason="ModernBert currently (transformers==4.52.0) automatically adds an attention_mask input" + ) if config.model_type in ["idefics", "idefics2", "idefics3"]: self.skipTest(reason="Idefics currently (transformers==4.39.1) requires an image_attention_mask input") if config.model_type in ["sam"]: