Modernbert fixes (#38912)

* Removed deprecated argument in modernbert RotaryEmbedding

* Skip test_sdpa_can_dispatch_on_flash for modernbert

---------

Co-authored-by: ivarflakstad <69173633+ivarflakstad@users.noreply.github.com>
Co-authored-by: Yih-Dar <2521628+ydshieh@users.noreply.github.com>
This commit is contained in:
Rémi Ouazan
2025-06-20 11:22:32 +02:00
committed by GitHub
parent 31d30b7224
commit 9bcdd5cde9
3 changed files with 6 additions and 2 deletions

View File

@@ -154,7 +154,7 @@ class ModernBertUnpaddedRotaryEmbedding(RotaryEmbedding):
up to max_seqlen. If the max_seqlen, device, or dtype during training/inference differ,
the cos_sin_cache will be recomputed during the forward pass.
"""
- super().__init__(dim=dim, base=base, pos_idx_in_fp32=True, device=device, interleaved=False)
+ super().__init__(dim=dim, base=base, device=device, interleaved=False)
self.max_seqlen = max_seqlen
if max_seqlen is not None and device is not None and dtype is not None:

View File

@@ -417,7 +417,7 @@ class ModernBertUnpaddedRotaryEmbedding(RotaryEmbedding):
up to max_seqlen. If the max_seqlen, device, or dtype during training/inference differ,
the cos_sin_cache will be recomputed during the forward pass.
"""
- super().__init__(dim=dim, base=base, pos_idx_in_fp32=True, device=device, interleaved=False)
+ super().__init__(dim=dim, base=base, device=device, interleaved=False)
self.max_seqlen = max_seqlen
if max_seqlen is not None and device is not None and dtype is not None:

View File

@@ -3795,6 +3795,10 @@ class ModelTesterMixin:
self.skipTest(
"PaliGemma-like models currently (transformers==4.41.0) requires an attention_mask input"
)
+ if config.model_type in ["modernbert"]:
+ self.skipTest(
+ reason="ModernBert currently (transformers==4.52.0) automatically adds an attention_mask input"
+ )
if config.model_type in ["idefics", "idefics2", "idefics3"]:
self.skipTest(reason="Idefics currently (transformers==4.39.1) requires an image_attention_mask input")
if config.model_type in ["sam"]: