From 7ff896c0f22227ae999f991cff500e70452d5dcc Mon Sep 17 00:00:00 2001 From: duanjunwen <935724073@qq.com> Date: Fri, 11 Apr 2025 01:00:58 +0800 Subject: [PATCH] [Feat] Support npu in modeling models (#37369) --- src/transformers/models/aria/modeling_aria.py | 2 +- src/transformers/models/bamba/modeling_bamba.py | 2 +- src/transformers/models/bamba/modular_bamba.py | 2 +- src/transformers/models/bloom/modeling_bloom.py | 2 +- src/transformers/models/chameleon/modeling_chameleon.py | 2 +- src/transformers/models/codegen/modeling_codegen.py | 2 +- src/transformers/models/cohere/modeling_cohere.py | 2 +- src/transformers/models/dbrx/modeling_dbrx.py | 2 +- src/transformers/models/deepseek_v3/modeling_deepseek_v3.py | 2 +- src/transformers/models/diffllama/modeling_diffllama.py | 2 +- src/transformers/models/emu3/modeling_emu3.py | 2 +- src/transformers/models/falcon/modeling_falcon.py | 2 +- src/transformers/models/gemma/modeling_gemma.py | 2 +- src/transformers/models/glm/modeling_glm.py | 2 +- src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py | 6 +++++- src/transformers/models/gpt_neo/modeling_gpt_neo.py | 2 +- src/transformers/models/gpt_neox/modeling_gpt_neox.py | 2 +- .../models/gpt_neox_japanese/modeling_gpt_neox_japanese.py | 2 +- src/transformers/models/gptj/modeling_gptj.py | 2 +- src/transformers/models/granite/modeling_granite.py | 2 +- src/transformers/models/granitemoe/modeling_granitemoe.py | 2 +- .../models/granitemoeshared/modeling_granitemoeshared.py | 2 +- src/transformers/models/helium/modeling_helium.py | 2 +- src/transformers/models/idefics/modeling_idefics.py | 2 +- src/transformers/models/jamba/modeling_jamba.py | 2 +- src/transformers/models/jetmoe/modeling_jetmoe.py | 2 +- src/transformers/models/llama/modeling_llama.py | 2 +- src/transformers/models/llama4/modeling_llama4.py | 2 +- src/transformers/models/longt5/modeling_longt5.py | 2 +- src/transformers/models/mimi/modeling_mimi.py | 2 +- src/transformers/models/mistral/modeling_mistral.py | 2 +- src/transformers/models/mistral/modular_mistral.py | 2 +- src/transformers/models/mixtral/modeling_mixtral.py | 2 +- src/transformers/models/mllama/modeling_mllama.py | 2 +- src/transformers/models/moonshine/modeling_moonshine.py | 2 +- src/transformers/models/moshi/modeling_moshi.py | 4 ++-- src/transformers/models/mt5/modeling_mt5.py | 2 +- src/transformers/models/nemotron/modeling_nemotron.py | 2 +- src/transformers/models/olmo/modeling_olmo.py | 2 +- src/transformers/models/olmo2/modeling_olmo2.py | 2 +- src/transformers/models/olmoe/modeling_olmoe.py | 2 +- src/transformers/models/opt/modeling_opt.py | 2 +- src/transformers/models/persimmon/modeling_persimmon.py | 2 +- src/transformers/models/phi/modeling_phi.py | 2 +- src/transformers/models/phi3/modeling_phi3.py | 2 +- .../models/phi4_multimodal/modeling_phi4_multimodal.py | 2 +- src/transformers/models/phimoe/modeling_phimoe.py | 2 +- src/transformers/models/pix2struct/modeling_pix2struct.py | 2 +- src/transformers/models/pop2piano/modeling_pop2piano.py | 2 +- src/transformers/models/qwen2/modeling_qwen2.py | 2 +- src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py | 2 +- src/transformers/models/qwen2_moe/modeling_qwen2_moe.py | 2 +- src/transformers/models/qwen2_vl/modeling_qwen2_vl.py | 2 +- src/transformers/models/qwen3/modeling_qwen3.py | 2 +- src/transformers/models/qwen3_moe/modeling_qwen3_moe.py | 2 +- .../models/recurrent_gemma/modeling_recurrent_gemma.py | 2 +- src/transformers/models/stablelm/modeling_stablelm.py | 2 +- src/transformers/models/starcoder2/modeling_starcoder2.py | 2 +- .../switch_transformers/modeling_switch_transformers.py | 2 +- src/transformers/models/t5/modeling_t5.py | 2 +- src/transformers/models/udop/modeling_udop.py | 2 +- src/transformers/models/umt5/modeling_umt5.py | 2 +- src/transformers/models/whisper/modeling_whisper.py | 2 +- src/transformers/models/zamba/modeling_zamba.py | 2 +- src/transformers/models/zamba2/modeling_zamba2.py | 2 +- 65 files changed, 70 insertions(+), 66 deletions(-) diff --git a/src/transformers/models/aria/modeling_aria.py b/src/transformers/models/aria/modeling_aria.py index 19997e2a9b..d64e2746d4 100644 --- a/src/transformers/models/aria/modeling_aria.py +++ b/src/transformers/models/aria/modeling_aria.py @@ -1040,7 +1040,7 @@ class AriaTextModel(AriaTextPreTrainedModel): if ( self.config._attn_implementation == "sdpa" and attention_mask is not None - and attention_mask.device.type in ["cuda", "xpu"] + and attention_mask.device.type in ["cuda", "xpu", "npu"] and not output_attentions ): # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when diff --git a/src/transformers/models/bamba/modeling_bamba.py b/src/transformers/models/bamba/modeling_bamba.py index 7c084d37ed..0cf23edb75 100644 --- a/src/transformers/models/bamba/modeling_bamba.py +++ b/src/transformers/models/bamba/modeling_bamba.py @@ -1331,7 +1331,7 @@ class BambaModel(BambaPreTrainedModel): if ( self.config._attn_implementation == "sdpa" and attention_mask is not None - and attention_mask.device.type in ["cuda", "xpu"] + and attention_mask.device.type in ["cuda", "xpu", "npu"] and not output_attentions ): # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when diff --git a/src/transformers/models/bamba/modular_bamba.py b/src/transformers/models/bamba/modular_bamba.py index b6cdf90774..5aa4c8fb40 100644 --- a/src/transformers/models/bamba/modular_bamba.py +++ b/src/transformers/models/bamba/modular_bamba.py @@ -1098,7 +1098,7 @@ class BambaModel(BambaPreTrainedModel): if ( self.config._attn_implementation == "sdpa" and attention_mask is not None - and attention_mask.device.type in ["cuda", "xpu"] + and attention_mask.device.type in ["cuda", "xpu", "npu"] and not output_attentions ): # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when diff --git a/src/transformers/models/bloom/modeling_bloom.py b/src/transformers/models/bloom/modeling_bloom.py index f91cd85779..48f89810bd 100644 --- a/src/transformers/models/bloom/modeling_bloom.py +++ b/src/transformers/models/bloom/modeling_bloom.py @@ -799,7 +799,7 @@ class BloomModel(BloomPreTrainedModel): if ( self.config._attn_implementation == "sdpa" and attention_mask is not None - and attention_mask.device.type in ["cuda", "xpu"] + and attention_mask.device.type in ["cuda", "xpu", "npu"] and not output_attentions ): # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when diff --git a/src/transformers/models/chameleon/modeling_chameleon.py b/src/transformers/models/chameleon/modeling_chameleon.py index 65ace7cbcc..4d950ec668 100644 --- a/src/transformers/models/chameleon/modeling_chameleon.py +++ b/src/transformers/models/chameleon/modeling_chameleon.py @@ -1441,7 +1441,7 @@ class ChameleonModel(ChameleonPreTrainedModel): if ( self.config._attn_implementation == "sdpa" and attention_mask is not None - and attention_mask.device.type in ["cuda", "xpu"] + and attention_mask.device.type in ["cuda", "xpu", "npu"] and not output_attentions ): # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when diff --git a/src/transformers/models/codegen/modeling_codegen.py b/src/transformers/models/codegen/modeling_codegen.py index 1d7d3b6d27..4f06094567 100644 --- a/src/transformers/models/codegen/modeling_codegen.py +++ b/src/transformers/models/codegen/modeling_codegen.py @@ -645,7 +645,7 @@ class CodeGenModel(CodeGenPreTrainedModel): if ( self.config._attn_implementation == "sdpa" and attention_mask is not None - and attention_mask.device.type in ["cuda", "xpu"] + and attention_mask.device.type in ["cuda", "xpu", "npu"] and not output_attentions ): # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when diff --git a/src/transformers/models/cohere/modeling_cohere.py b/src/transformers/models/cohere/modeling_cohere.py index 2083bb63ab..fd888c38d7 100644 --- a/src/transformers/models/cohere/modeling_cohere.py +++ b/src/transformers/models/cohere/modeling_cohere.py @@ -689,7 +689,7 @@ class CohereModel(CoherePreTrainedModel): if ( self.config._attn_implementation == "sdpa" and attention_mask is not None - and attention_mask.device.type in ["cuda", "xpu"] + and attention_mask.device.type in ["cuda", "xpu", "npu"] and not output_attentions ): # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when diff --git a/src/transformers/models/dbrx/modeling_dbrx.py b/src/transformers/models/dbrx/modeling_dbrx.py index 258023c60b..9f89398685 100644 --- a/src/transformers/models/dbrx/modeling_dbrx.py +++ b/src/transformers/models/dbrx/modeling_dbrx.py @@ -1171,7 +1171,7 @@ class DbrxModel(DbrxPreTrainedModel): if ( self.config._attn_implementation == "sdpa" and attention_mask is not None - and attention_mask.device.type in ["cuda", "xpu"] + and attention_mask.device.type in ["cuda", "xpu", "npu"] and not output_attentions ): # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when diff --git a/src/transformers/models/deepseek_v3/modeling_deepseek_v3.py b/src/transformers/models/deepseek_v3/modeling_deepseek_v3.py index e7ad5c5cf0..c1a020b7c7 100644 --- a/src/transformers/models/deepseek_v3/modeling_deepseek_v3.py +++ b/src/transformers/models/deepseek_v3/modeling_deepseek_v3.py @@ -834,7 +834,7 @@ class DeepseekV3Model(DeepseekV3PreTrainedModel): if ( self.config._attn_implementation == "sdpa" and attention_mask is not None - and attention_mask.device.type in ["cuda", "xpu"] + and attention_mask.device.type in ["cuda", "xpu", "npu"] and not output_attentions ): # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when diff --git a/src/transformers/models/diffllama/modeling_diffllama.py b/src/transformers/models/diffllama/modeling_diffllama.py index e8031be755..ed536cbeba 100644 --- a/src/transformers/models/diffllama/modeling_diffllama.py +++ b/src/transformers/models/diffllama/modeling_diffllama.py @@ -931,7 +931,7 @@ class DiffLlamaModel(DiffLlamaPreTrainedModel): if ( self.config._attn_implementation == "sdpa" and attention_mask is not None - and attention_mask.device.type in ["cuda", "xpu"] + and attention_mask.device.type in ["cuda", "xpu", "npu"] and not output_attentions ): # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when diff --git a/src/transformers/models/emu3/modeling_emu3.py b/src/transformers/models/emu3/modeling_emu3.py index 341944a236..4646b9f9bd 100644 --- a/src/transformers/models/emu3/modeling_emu3.py +++ b/src/transformers/models/emu3/modeling_emu3.py @@ -1518,7 +1518,7 @@ class Emu3TextModel(Emu3PreTrainedModel): if ( self.config._attn_implementation == "sdpa" and attention_mask is not None - and attention_mask.device.type in ["cuda", "xpu"] + and attention_mask.device.type in ["cuda", "xpu", "npu"] and not output_attentions ): # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when diff --git a/src/transformers/models/falcon/modeling_falcon.py b/src/transformers/models/falcon/modeling_falcon.py index ee76f7cbb0..82a4dfee08 100644 --- a/src/transformers/models/falcon/modeling_falcon.py +++ b/src/transformers/models/falcon/modeling_falcon.py @@ -1060,7 +1060,7 @@ class FalconModel(FalconPreTrainedModel): if ( self.config._attn_implementation == "sdpa" and attention_mask is not None - and attention_mask.device.type in ["cuda", "xpu"] + and attention_mask.device.type in ["cuda", "xpu", "npu"] and not output_attentions ): # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when diff --git a/src/transformers/models/gemma/modeling_gemma.py b/src/transformers/models/gemma/modeling_gemma.py index 99e65dbae9..679bc08698 100644 --- a/src/transformers/models/gemma/modeling_gemma.py +++ b/src/transformers/models/gemma/modeling_gemma.py @@ -656,7 +656,7 @@ class GemmaModel(GemmaPreTrainedModel): if ( self.config._attn_implementation == "sdpa" and attention_mask is not None - and attention_mask.device.type in ["cuda", "xpu"] + and attention_mask.device.type in ["cuda", "xpu", "npu"] and not output_attentions ): # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when diff --git a/src/transformers/models/glm/modeling_glm.py b/src/transformers/models/glm/modeling_glm.py index f2acc45c66..2a41815d74 100644 --- a/src/transformers/models/glm/modeling_glm.py +++ b/src/transformers/models/glm/modeling_glm.py @@ -670,7 +670,7 @@ class GlmModel(GlmPreTrainedModel): if ( self.config._attn_implementation == "sdpa" and attention_mask is not None - and attention_mask.device.type in ["cuda", "xpu"] + and attention_mask.device.type in ["cuda", "xpu", "npu"] and not output_attentions ): # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when diff --git a/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py b/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py index d9a84d6f46..857735838f 100644 --- a/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +++ b/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py @@ -926,7 +926,11 @@ class GPTBigCodeModel(GPTBigCodePreTrainedModel): # [batch_size, target_length, 1, source_length], not compatible with SDPA, hence this transpose. self_attention_mask = self_attention_mask.transpose(1, 2) - if query_length > 1 and attention_mask is not None and attention_mask.device.type in ["cuda", "xpu"]: + if ( + query_length > 1 + and attention_mask is not None + and attention_mask.device.type in ["cuda", "xpu", "npu"] + ): # From PyTorch 2.1 onwards, F.scaled_dot_product_attention with the memory-efficient attention backend # produces nans if sequences are completely unattended in the attention mask. Details: https://github.com/pytorch/pytorch/issues/110213 self_attention_mask = AttentionMaskConverter._unmask_unattended( diff --git a/src/transformers/models/gpt_neo/modeling_gpt_neo.py b/src/transformers/models/gpt_neo/modeling_gpt_neo.py index 9cc18c0ea9..9151590b92 100755 --- a/src/transformers/models/gpt_neo/modeling_gpt_neo.py +++ b/src/transformers/models/gpt_neo/modeling_gpt_neo.py @@ -848,7 +848,7 @@ class GPTNeoModel(GPTNeoPreTrainedModel): if ( self.config._attn_implementation == "sdpa" and attention_mask is not None - and attention_mask.device.type in ["cuda", "xpu"] + and attention_mask.device.type in ["cuda", "xpu", "npu"] and not output_attentions ): # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when diff --git a/src/transformers/models/gpt_neox/modeling_gpt_neox.py b/src/transformers/models/gpt_neox/modeling_gpt_neox.py index 86ad00d3c2..5a0bc1af2f 100755 --- a/src/transformers/models/gpt_neox/modeling_gpt_neox.py +++ b/src/transformers/models/gpt_neox/modeling_gpt_neox.py @@ -657,7 +657,7 @@ class GPTNeoXModel(GPTNeoXPreTrainedModel): if ( self.config._attn_implementation == "sdpa" and attention_mask is not None - and attention_mask.device.type in ["cuda", "xpu"] + and attention_mask.device.type in ["cuda", "xpu", "npu"] and not output_attentions ): # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when diff --git a/src/transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py b/src/transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py index 83955ff050..e08624bf81 100755 --- a/src/transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +++ b/src/transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py @@ -694,7 +694,7 @@ class GPTNeoXJapaneseModel(GPTNeoXJapanesePreTrainedModel): if ( self.config._attn_implementation == "sdpa" and attention_mask is not None - and attention_mask.device.type in ["cuda", "xpu"] + and attention_mask.device.type in ["cuda", "xpu", "npu"] and not output_attentions ): # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when diff --git a/src/transformers/models/gptj/modeling_gptj.py b/src/transformers/models/gptj/modeling_gptj.py index fca8b0e5a1..adfe6c584b 100644 --- a/src/transformers/models/gptj/modeling_gptj.py +++ b/src/transformers/models/gptj/modeling_gptj.py @@ -947,7 +947,7 @@ class GPTJModel(GPTJPreTrainedModel): if ( self.config._attn_implementation == "sdpa" and attention_mask is not None - and attention_mask.device.type in ["cuda", "xpu"] + and attention_mask.device.type in ["cuda", "xpu", "npu"] and not output_attentions ): # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when diff --git a/src/transformers/models/granite/modeling_granite.py b/src/transformers/models/granite/modeling_granite.py index 4160a658f8..bb06757fb8 100644 --- a/src/transformers/models/granite/modeling_granite.py +++ b/src/transformers/models/granite/modeling_granite.py @@ -671,7 +671,7 @@ class GraniteModel(GranitePreTrainedModel): if ( self.config._attn_implementation == "sdpa" and attention_mask is not None - and attention_mask.device.type in ["cuda", "xpu"] + and attention_mask.device.type in ["cuda", "xpu", "npu"] and not output_attentions ): # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when diff --git a/src/transformers/models/granitemoe/modeling_granitemoe.py b/src/transformers/models/granitemoe/modeling_granitemoe.py index 3b71ca4da6..9ea68ab0aa 100644 --- a/src/transformers/models/granitemoe/modeling_granitemoe.py +++ b/src/transformers/models/granitemoe/modeling_granitemoe.py @@ -1147,7 +1147,7 @@ class GraniteMoeModel(GraniteMoePreTrainedModel): if ( self.config._attn_implementation == "sdpa" and attention_mask is not None - and attention_mask.device.type in ["cuda", "xpu"] + and attention_mask.device.type in ["cuda", "xpu", "npu"] and not output_attentions ): # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when diff --git a/src/transformers/models/granitemoeshared/modeling_granitemoeshared.py b/src/transformers/models/granitemoeshared/modeling_granitemoeshared.py index 81644fc3c3..886bed0968 100644 --- a/src/transformers/models/granitemoeshared/modeling_granitemoeshared.py +++ b/src/transformers/models/granitemoeshared/modeling_granitemoeshared.py @@ -1092,7 +1092,7 @@ class GraniteMoeSharedModel(GraniteMoeSharedPreTrainedModel): if ( self.config._attn_implementation == "sdpa" and attention_mask is not None - and attention_mask.device.type in ["cuda", "xpu"] + and attention_mask.device.type in ["cuda", "xpu", "npu"] and not output_attentions ): # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when diff --git a/src/transformers/models/helium/modeling_helium.py b/src/transformers/models/helium/modeling_helium.py index 599c8a5667..2597ce27fa 100644 --- a/src/transformers/models/helium/modeling_helium.py +++ b/src/transformers/models/helium/modeling_helium.py @@ -657,7 +657,7 @@ class HeliumModel(HeliumPreTrainedModel): if ( self.config._attn_implementation == "sdpa" and attention_mask is not None - and attention_mask.device.type in ["cuda", "xpu"] + and attention_mask.device.type in ["cuda", "xpu", "npu"] and not output_attentions ): # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when diff --git a/src/transformers/models/idefics/modeling_idefics.py b/src/transformers/models/idefics/modeling_idefics.py index 02d5fb3c05..fb5006d2e2 100644 --- a/src/transformers/models/idefics/modeling_idefics.py +++ b/src/transformers/models/idefics/modeling_idefics.py @@ -1423,7 +1423,7 @@ class IdeficsModel(IdeficsPreTrainedModel): if ( self.config._attn_implementation == "sdpa" and attention_mask is not None - and attention_mask.device.type in ["cuda", "xpu"] + and attention_mask.device.type in ["cuda", "xpu", "npu"] and not output_attentions ): # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when diff --git a/src/transformers/models/jamba/modeling_jamba.py b/src/transformers/models/jamba/modeling_jamba.py index 9a797c81d9..abf4ad4b3d 100755 --- a/src/transformers/models/jamba/modeling_jamba.py +++ b/src/transformers/models/jamba/modeling_jamba.py @@ -1372,7 +1372,7 @@ class JambaModel(JambaPreTrainedModel): if ( self.config._attn_implementation == "sdpa" and attention_mask is not None - and attention_mask.device.type in ["cuda", "xpu"] + and attention_mask.device.type in ["cuda", "xpu", "npu"] ): # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path. diff --git a/src/transformers/models/jetmoe/modeling_jetmoe.py b/src/transformers/models/jetmoe/modeling_jetmoe.py index 7a4b6b36f1..70cb8126dc 100644 --- a/src/transformers/models/jetmoe/modeling_jetmoe.py +++ b/src/transformers/models/jetmoe/modeling_jetmoe.py @@ -1151,7 +1151,7 @@ class JetMoeModel(JetMoePreTrainedModel): if ( self.config._attn_implementation == "sdpa" and attention_mask is not None - and attention_mask.device.type in ["cuda", "xpu"] + and attention_mask.device.type in ["cuda", "xpu", "npu"] and not output_attentions ): # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when diff --git a/src/transformers/models/llama/modeling_llama.py b/src/transformers/models/llama/modeling_llama.py index 1a598f21a3..e8dd139526 100644 --- a/src/transformers/models/llama/modeling_llama.py +++ b/src/transformers/models/llama/modeling_llama.py @@ -661,7 +661,7 @@ class LlamaModel(LlamaPreTrainedModel): if ( self.config._attn_implementation == "sdpa" and attention_mask is not None - and attention_mask.device.type in ["cuda", "xpu"] + and attention_mask.device.type in ["cuda", "xpu", "npu"] and not output_attentions ): # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when diff --git a/src/transformers/models/llama4/modeling_llama4.py b/src/transformers/models/llama4/modeling_llama4.py index 7fd98deb99..4363001f4d 100644 --- a/src/transformers/models/llama4/modeling_llama4.py +++ b/src/transformers/models/llama4/modeling_llama4.py @@ -810,7 +810,7 @@ class Llama4TextModel(Llama4PreTrainedModel): if ( self.config._attn_implementation == "sdpa" and attention_mask is not None - and attention_mask.device.type in ["cuda", "xpu"] + and attention_mask.device.type in ["cuda", "xpu", "npu"] and attention_mask.ndim == 4 and not output_attentions # Only unmask for 4d masks ): diff --git a/src/transformers/models/longt5/modeling_longt5.py b/src/transformers/models/longt5/modeling_longt5.py index d076926406..da8bdca3e7 100644 --- a/src/transformers/models/longt5/modeling_longt5.py +++ b/src/transformers/models/longt5/modeling_longt5.py @@ -1657,7 +1657,7 @@ class LongT5Stack(LongT5PreTrainedModel): if ( self.config._attn_implementation == "sdpa" and attention_mask is not None - and attention_mask.device.type in ["cuda", "xpu"] + and attention_mask.device.type in ["cuda", "xpu", "npu"] and not output_attentions ): # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when diff --git a/src/transformers/models/mimi/modeling_mimi.py b/src/transformers/models/mimi/modeling_mimi.py index 6313ff4b65..f79fea53c1 100644 --- a/src/transformers/models/mimi/modeling_mimi.py +++ b/src/transformers/models/mimi/modeling_mimi.py @@ -1105,7 +1105,7 @@ class MimiTransformerModel(nn.Module): if ( self.config._attn_implementation == "sdpa" and attention_mask is not None - and attention_mask.device.type in ["cuda", "xpu"] + and attention_mask.device.type in ["cuda", "xpu", "npu"] and not output_attentions ): # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when diff --git a/src/transformers/models/mistral/modeling_mistral.py b/src/transformers/models/mistral/modeling_mistral.py index 72a9c88eac..b235c74413 100644 --- a/src/transformers/models/mistral/modeling_mistral.py +++ b/src/transformers/models/mistral/modeling_mistral.py @@ -638,7 +638,7 @@ class MistralModel(MistralPreTrainedModel): if ( self.config._attn_implementation == "sdpa" and attention_mask is not None - and attention_mask.device.type in ["cuda", "xpu"] + and attention_mask.device.type in ["cuda", "xpu", "npu"] and not output_attentions ): # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when diff --git a/src/transformers/models/mistral/modular_mistral.py b/src/transformers/models/mistral/modular_mistral.py index 4f36181cd7..9aae2e5505 100644 --- a/src/transformers/models/mistral/modular_mistral.py +++ b/src/transformers/models/mistral/modular_mistral.py @@ -186,7 +186,7 @@ class MistralModel(LlamaModel): if ( self.config._attn_implementation == "sdpa" and attention_mask is not None - and attention_mask.device.type in ["cuda", "xpu"] + and attention_mask.device.type in ["cuda", "xpu", "npu"] and not output_attentions ): # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when diff --git a/src/transformers/models/mixtral/modeling_mixtral.py b/src/transformers/models/mixtral/modeling_mixtral.py index cb4e0a9c36..d6da7b1d78 100644 --- a/src/transformers/models/mixtral/modeling_mixtral.py +++ b/src/transformers/models/mixtral/modeling_mixtral.py @@ -766,7 +766,7 @@ class MixtralModel(MixtralPreTrainedModel): if ( self.config._attn_implementation == "sdpa" and attention_mask is not None - and attention_mask.device.type in ["cuda", "xpu"] + and attention_mask.device.type in ["cuda", "xpu", "npu"] and not output_attentions ): # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when diff --git a/src/transformers/models/mllama/modeling_mllama.py b/src/transformers/models/mllama/modeling_mllama.py index 48e863d686..64329c2abd 100644 --- a/src/transformers/models/mllama/modeling_mllama.py +++ b/src/transformers/models/mllama/modeling_mllama.py @@ -1108,7 +1108,7 @@ class MllamaPreTrainedModel(PreTrainedModel): if ( self.config._attn_implementation == "sdpa" and attention_mask is not None - and attention_mask.device.type in ["cuda", "xpu"] + and attention_mask.device.type in ["cuda", "xpu", "npu"] and not output_attentions ): # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when diff --git a/src/transformers/models/moonshine/modeling_moonshine.py b/src/transformers/models/moonshine/modeling_moonshine.py index 6e470f7bf7..c9e6eb756c 100644 --- a/src/transformers/models/moonshine/modeling_moonshine.py +++ b/src/transformers/models/moonshine/modeling_moonshine.py @@ -1014,7 +1014,7 @@ class MoonshineDecoder(MoonshinePreTrainedModel): if ( self.config._attn_implementation == "sdpa" and attention_mask is not None - and attention_mask.device.type in ["cuda", "xpu"] + and attention_mask.device.type in ["cuda", "xpu", "npu"] and not output_attentions ): # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when diff --git a/src/transformers/models/moshi/modeling_moshi.py b/src/transformers/models/moshi/modeling_moshi.py index 0cf575187f..d77d32edf7 100644 --- a/src/transformers/models/moshi/modeling_moshi.py +++ b/src/transformers/models/moshi/modeling_moshi.py @@ -1335,7 +1335,7 @@ class MoshiDepthDecoder(MoshiPreTrainedModel, GenerationMixin): if ( self.config._attn_implementation == "sdpa" and attention_mask is not None - and attention_mask.device.type in ["cuda", "xpu"] + and attention_mask.device.type in ["cuda", "xpu", "npu"] and not output_attentions ): # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when @@ -1649,7 +1649,7 @@ class MoshiModel(MoshiPreTrainedModel): if ( self.config._attn_implementation == "sdpa" and attention_mask is not None - and attention_mask.device.type in ["cuda", "xpu"] + and attention_mask.device.type in ["cuda", "xpu", "npu"] and not output_attentions ): # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when diff --git a/src/transformers/models/mt5/modeling_mt5.py b/src/transformers/models/mt5/modeling_mt5.py index f8fbc8e343..0199c7d779 100644 --- a/src/transformers/models/mt5/modeling_mt5.py +++ b/src/transformers/models/mt5/modeling_mt5.py @@ -1248,7 +1248,7 @@ class MT5Stack(MT5PreTrainedModel): if ( self.config._attn_implementation == "sdpa" and attention_mask is not None - and attention_mask.device.type in ["cuda", "xpu"] + and attention_mask.device.type in ["cuda", "xpu", "npu"] and not output_attentions ): # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when diff --git a/src/transformers/models/nemotron/modeling_nemotron.py b/src/transformers/models/nemotron/modeling_nemotron.py index b29b4bf0ad..31d14fb5da 100644 --- a/src/transformers/models/nemotron/modeling_nemotron.py +++ b/src/transformers/models/nemotron/modeling_nemotron.py @@ -906,7 +906,7 @@ class NemotronModel(NemotronPreTrainedModel): if ( self.config._attn_implementation == "sdpa" and attention_mask is not None - and attention_mask.device.type in ["cuda", "xpu"] + and attention_mask.device.type in ["cuda", "xpu", "npu"] and not output_attentions ): # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when diff --git a/src/transformers/models/olmo/modeling_olmo.py b/src/transformers/models/olmo/modeling_olmo.py index 8b015057ef..dc09278fe2 100644 --- a/src/transformers/models/olmo/modeling_olmo.py +++ b/src/transformers/models/olmo/modeling_olmo.py @@ -634,7 +634,7 @@ class OlmoModel(OlmoPreTrainedModel): if ( self.config._attn_implementation == "sdpa" and attention_mask is not None - and attention_mask.device.type in ["cuda", "xpu"] + and attention_mask.device.type in ["cuda", "xpu", "npu"] and not output_attentions ): # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when diff --git a/src/transformers/models/olmo2/modeling_olmo2.py b/src/transformers/models/olmo2/modeling_olmo2.py index c99ba1f0de..bd0a47eaf1 100644 --- a/src/transformers/models/olmo2/modeling_olmo2.py +++ b/src/transformers/models/olmo2/modeling_olmo2.py @@ -637,7 +637,7 @@ class Olmo2Model(Olmo2PreTrainedModel): if ( self.config._attn_implementation == "sdpa" and attention_mask is not None - and attention_mask.device.type in ["cuda", "xpu"] + and attention_mask.device.type in ["cuda", "xpu", "npu"] and not output_attentions ): # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when diff --git a/src/transformers/models/olmoe/modeling_olmoe.py b/src/transformers/models/olmoe/modeling_olmoe.py index f6d21fb05e..ce38e57cfd 100644 --- a/src/transformers/models/olmoe/modeling_olmoe.py +++ b/src/transformers/models/olmoe/modeling_olmoe.py @@ -1056,7 +1056,7 @@ class OlmoeModel(OlmoePreTrainedModel): if ( self.config._attn_implementation == "sdpa" and attention_mask is not None - and attention_mask.device.type in ["cuda", "xpu"] + and attention_mask.device.type in ["cuda", "xpu", "npu"] and not output_attentions ): # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when diff --git a/src/transformers/models/opt/modeling_opt.py b/src/transformers/models/opt/modeling_opt.py index 01639f5b66..5a1f15f94c 100644 --- a/src/transformers/models/opt/modeling_opt.py +++ b/src/transformers/models/opt/modeling_opt.py @@ -695,7 +695,7 @@ class OPTDecoder(OPTPreTrainedModel): if ( self.config._attn_implementation == "sdpa" and attention_mask is not None - and attention_mask.device.type in ["cuda", "xpu"] + and attention_mask.device.type in ["cuda", "xpu", "npu"] and not output_attentions ): # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when diff --git a/src/transformers/models/persimmon/modeling_persimmon.py b/src/transformers/models/persimmon/modeling_persimmon.py index 508e0c8073..029e376eaf 100644 --- a/src/transformers/models/persimmon/modeling_persimmon.py +++ b/src/transformers/models/persimmon/modeling_persimmon.py @@ -706,7 +706,7 @@ class PersimmonModel(PersimmonPreTrainedModel): if ( self.config._attn_implementation == "sdpa" and attention_mask is not None - and attention_mask.device.type in ["cuda", "xpu"] + and attention_mask.device.type in ["cuda", "xpu", "npu"] and not output_attentions ): # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when diff --git a/src/transformers/models/phi/modeling_phi.py b/src/transformers/models/phi/modeling_phi.py index 51e518ffc2..664cb571ad 100644 --- a/src/transformers/models/phi/modeling_phi.py +++ b/src/transformers/models/phi/modeling_phi.py @@ -627,7 +627,7 @@ class PhiModel(PhiPreTrainedModel): if ( self.config._attn_implementation == "sdpa" and attention_mask is not None - and attention_mask.device.type in ["cuda", "xpu"] + and attention_mask.device.type in ["cuda", "xpu", "npu"] and not output_attentions ): # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when diff --git a/src/transformers/models/phi3/modeling_phi3.py b/src/transformers/models/phi3/modeling_phi3.py index 0b0d7b626b..e777e5a28b 100644 --- a/src/transformers/models/phi3/modeling_phi3.py +++ b/src/transformers/models/phi3/modeling_phi3.py @@ -692,7 +692,7 @@ class Phi3Model(Phi3PreTrainedModel): if ( self.config._attn_implementation == "sdpa" and attention_mask is not None - and attention_mask.device.type in ["cuda", "xpu"] + and attention_mask.device.type in ["cuda", "xpu", "npu"] and not output_attentions ): # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when diff --git a/src/transformers/models/phi4_multimodal/modeling_phi4_multimodal.py b/src/transformers/models/phi4_multimodal/modeling_phi4_multimodal.py index 68a9ae5536..112dca6e6d 100644 --- a/src/transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +++ b/src/transformers/models/phi4_multimodal/modeling_phi4_multimodal.py @@ -1986,7 +1986,7 @@ class Phi4MultimodalModel(Phi4MultimodalPreTrainedModel): if ( self.config._attn_implementation == "sdpa" and attention_mask is not None - and attention_mask.device.type in ["cuda", "xpu"] + and attention_mask.device.type in ["cuda", "xpu", "npu"] and not output_attentions ): # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when diff --git a/src/transformers/models/phimoe/modeling_phimoe.py b/src/transformers/models/phimoe/modeling_phimoe.py index 9cbfe776bb..7e94c27cf9 100644 --- a/src/transformers/models/phimoe/modeling_phimoe.py +++ b/src/transformers/models/phimoe/modeling_phimoe.py @@ -1240,7 +1240,7 @@ class PhimoeModel(PhimoePreTrainedModel): if ( self.config._attn_implementation == "sdpa" and attention_mask is not None - and attention_mask.device.type in ["cuda", "xpu"] + and attention_mask.device.type in ["cuda", "xpu", "npu"] and not output_attentions ): # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when diff --git a/src/transformers/models/pix2struct/modeling_pix2struct.py b/src/transformers/models/pix2struct/modeling_pix2struct.py index 675859f310..da65dbc369 100644 --- a/src/transformers/models/pix2struct/modeling_pix2struct.py +++ b/src/transformers/models/pix2struct/modeling_pix2struct.py @@ -1644,7 +1644,7 @@ class Pix2StructTextModel(Pix2StructPreTrainedModel): if ( self.config._attn_implementation == "sdpa" and attention_mask is not None - and attention_mask.device.type in ["cuda", "xpu"] + and attention_mask.device.type in ["cuda", "xpu", "npu"] and not output_attentions ): # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when diff --git a/src/transformers/models/pop2piano/modeling_pop2piano.py b/src/transformers/models/pop2piano/modeling_pop2piano.py index 73ff1b9d7b..18f6659dc0 100644 --- a/src/transformers/models/pop2piano/modeling_pop2piano.py +++ b/src/transformers/models/pop2piano/modeling_pop2piano.py @@ -1057,7 +1057,7 @@ class Pop2PianoStack(Pop2PianoPreTrainedModel): if ( self.config._attn_implementation == "sdpa" and attention_mask is not None - and attention_mask.device.type in ["cuda", "xpu"] + and attention_mask.device.type in ["cuda", "xpu", "npu"] and not output_attentions ): # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when diff --git a/src/transformers/models/qwen2/modeling_qwen2.py b/src/transformers/models/qwen2/modeling_qwen2.py index 661e3181d7..c3d735e8c4 100644 --- a/src/transformers/models/qwen2/modeling_qwen2.py +++ b/src/transformers/models/qwen2/modeling_qwen2.py @@ -651,7 +651,7 @@ class Qwen2Model(Qwen2PreTrainedModel): if ( self.config._attn_implementation == "sdpa" and attention_mask is not None - and attention_mask.device.type in ["cuda", "xpu"] + and attention_mask.device.type in ["cuda", "xpu", "npu"] and not output_attentions ): # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when diff --git a/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py b/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py index e80f9f0c27..ee963a7e27 100644 --- a/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +++ b/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py @@ -1277,7 +1277,7 @@ class Qwen2_5_VLModel(Qwen2_5_VLPreTrainedModel): if ( self.config._attn_implementation == "sdpa" and attention_mask is not None - and attention_mask.device.type in ["cuda", "xpu"] + and attention_mask.device.type in ["cuda", "xpu", "npu"] and not output_attentions ): # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when diff --git a/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py b/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py index 930899a0c9..42fbdd9c57 100644 --- a/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py +++ b/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py @@ -1101,7 +1101,7 @@ class Qwen2MoeModel(Qwen2MoePreTrainedModel): if ( self.config._attn_implementation == "sdpa" and attention_mask is not None - and attention_mask.device.type in ["cuda", "xpu"] + and attention_mask.device.type in ["cuda", "xpu", "npu"] and not output_attentions ): # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when diff --git a/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py b/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py index 10d8c60a56..e4593a2b4f 100644 --- a/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py +++ b/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py @@ -1231,7 +1231,7 @@ class Qwen2VLModel(Qwen2VLPreTrainedModel): if ( self.config._attn_implementation == "sdpa" and attention_mask is not None - and attention_mask.device.type in ["cuda", "xpu"] + and attention_mask.device.type in ["cuda", "xpu", "npu"] and not output_attentions ): # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when diff --git a/src/transformers/models/qwen3/modeling_qwen3.py b/src/transformers/models/qwen3/modeling_qwen3.py index dbf31b6d52..ea04ded17b 100644 --- a/src/transformers/models/qwen3/modeling_qwen3.py +++ b/src/transformers/models/qwen3/modeling_qwen3.py @@ -678,7 +678,7 @@ class Qwen3Model(Qwen3PreTrainedModel): if ( self.config._attn_implementation == "sdpa" and attention_mask is not None - and attention_mask.device.type in ["cuda", "xpu"] + and attention_mask.device.type in ["cuda", "xpu", "npu"] and not output_attentions ): # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when diff --git a/src/transformers/models/qwen3_moe/modeling_qwen3_moe.py b/src/transformers/models/qwen3_moe/modeling_qwen3_moe.py index e56b121b3a..2052fc39f8 100644 --- a/src/transformers/models/qwen3_moe/modeling_qwen3_moe.py +++ b/src/transformers/models/qwen3_moe/modeling_qwen3_moe.py @@ -780,7 +780,7 @@ class Qwen3MoeModel(Qwen3MoePreTrainedModel): if ( self.config._attn_implementation == "sdpa" and attention_mask is not None - and attention_mask.device.type in ["cuda", "xpu"] + and attention_mask.device.type in ["cuda", "xpu", "npu"] and not output_attentions ): # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when diff --git a/src/transformers/models/recurrent_gemma/modeling_recurrent_gemma.py b/src/transformers/models/recurrent_gemma/modeling_recurrent_gemma.py index 450da13493..3bf5b43706 100644 --- a/src/transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +++ b/src/transformers/models/recurrent_gemma/modeling_recurrent_gemma.py @@ -764,7 +764,7 @@ class RecurrentGemmaModel(RecurrentGemmaPreTrainedModel): padding_mask = causal_mask[..., :mask_length].eq(0.0) * attention_mask[:, None, None, :].eq(0.0) causal_mask[..., :mask_length] = causal_mask[..., :mask_length].masked_fill(padding_mask, min_dtype) - if attention_mask is not None and attention_mask.device.type in ["cuda", "xpu"]: + if attention_mask is not None and attention_mask.device.type in ["cuda", "xpu", "npu"]: # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path. # Details: https://github.com/pytorch/pytorch/issues/110213 diff --git a/src/transformers/models/stablelm/modeling_stablelm.py b/src/transformers/models/stablelm/modeling_stablelm.py index 0d824c568c..69d2ff3c0d 100755 --- a/src/transformers/models/stablelm/modeling_stablelm.py +++ b/src/transformers/models/stablelm/modeling_stablelm.py @@ -960,7 +960,7 @@ class StableLmModel(StableLmPreTrainedModel): if ( self.config._attn_implementation == "sdpa" and attention_mask is not None - and attention_mask.device.type in ["cuda", "xpu"] + and attention_mask.device.type in ["cuda", "xpu", "npu"] and not output_attentions ): # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when diff --git a/src/transformers/models/starcoder2/modeling_starcoder2.py b/src/transformers/models/starcoder2/modeling_starcoder2.py index 4ca5ecf636..3d78926561 100644 --- a/src/transformers/models/starcoder2/modeling_starcoder2.py +++ b/src/transformers/models/starcoder2/modeling_starcoder2.py @@ -626,7 +626,7 @@ class Starcoder2Model(Starcoder2PreTrainedModel): if ( self.config._attn_implementation == "sdpa" and attention_mask is not None - and attention_mask.device.type in ["cuda", "xpu"] + and attention_mask.device.type in ["cuda", "xpu", "npu"] and not output_attentions ): # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when diff --git a/src/transformers/models/switch_transformers/modeling_switch_transformers.py b/src/transformers/models/switch_transformers/modeling_switch_transformers.py index d7a158e9e8..4a2696a610 100644 --- a/src/transformers/models/switch_transformers/modeling_switch_transformers.py +++ b/src/transformers/models/switch_transformers/modeling_switch_transformers.py @@ -1191,7 +1191,7 @@ class SwitchTransformersStack(SwitchTransformersPreTrainedModel): if ( self.config._attn_implementation == "sdpa" and attention_mask is not None - and attention_mask.device.type in ["cuda", "xpu"] + and attention_mask.device.type in ["cuda", "xpu", "npu"] and not output_attentions ): # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when diff --git a/src/transformers/models/t5/modeling_t5.py b/src/transformers/models/t5/modeling_t5.py index 306944bae1..8e3cd8d965 100644 --- a/src/transformers/models/t5/modeling_t5.py +++ b/src/transformers/models/t5/modeling_t5.py @@ -1262,7 +1262,7 @@ class T5Stack(T5PreTrainedModel): if ( self.config._attn_implementation == "sdpa" and attention_mask is not None - and attention_mask.device.type in ["cuda", "xpu"] + and attention_mask.device.type in ["cuda", "xpu", "npu"] and not output_attentions ): # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when diff --git a/src/transformers/models/udop/modeling_udop.py b/src/transformers/models/udop/modeling_udop.py index 4f355c9ed5..5a3ce836d4 100644 --- a/src/transformers/models/udop/modeling_udop.py +++ b/src/transformers/models/udop/modeling_udop.py @@ -1594,7 +1594,7 @@ class UdopStack(UdopPreTrainedModel): if ( self.config._attn_implementation == "sdpa" and attention_mask is not None - and attention_mask.device.type in ["cuda", "xpu"] + and attention_mask.device.type in ["cuda", "xpu", "npu"] and not output_attentions ): # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when diff --git a/src/transformers/models/umt5/modeling_umt5.py b/src/transformers/models/umt5/modeling_umt5.py index a25a99653a..37015121cf 100644 --- a/src/transformers/models/umt5/modeling_umt5.py +++ b/src/transformers/models/umt5/modeling_umt5.py @@ -905,7 +905,7 @@ class UMT5Stack(UMT5PreTrainedModel): if ( self.config._attn_implementation == "sdpa" and attention_mask is not None - and attention_mask.device.type in ["cuda", "xpu"] + and attention_mask.device.type in ["cuda", "xpu", "npu"] and not output_attentions ): # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when diff --git a/src/transformers/models/whisper/modeling_whisper.py b/src/transformers/models/whisper/modeling_whisper.py index 42fb928b7a..dec1e03937 100644 --- a/src/transformers/models/whisper/modeling_whisper.py +++ b/src/transformers/models/whisper/modeling_whisper.py @@ -1428,7 +1428,7 @@ class WhisperDecoder(WhisperPreTrainedModel): if ( self.config._attn_implementation == "sdpa" and attention_mask is not None - and attention_mask.device.type in ["cuda", "xpu"] + and attention_mask.device.type in ["cuda", "xpu", "npu"] and not output_attentions ): # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when diff --git a/src/transformers/models/zamba/modeling_zamba.py b/src/transformers/models/zamba/modeling_zamba.py index 3018d3aaf6..27d75129e8 100644 --- a/src/transformers/models/zamba/modeling_zamba.py +++ b/src/transformers/models/zamba/modeling_zamba.py @@ -1167,7 +1167,7 @@ class ZambaModel(ZambaPreTrainedModel): if ( self.config._attn_implementation == "sdpa" and attention_mask is not None - and attention_mask.device.type in ["cuda", "xpu"] + and attention_mask.device.type in ["cuda", "xpu", "npu"] ): # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path. diff --git a/src/transformers/models/zamba2/modeling_zamba2.py b/src/transformers/models/zamba2/modeling_zamba2.py index fb3f9b481f..843420d7db 100644 --- a/src/transformers/models/zamba2/modeling_zamba2.py +++ b/src/transformers/models/zamba2/modeling_zamba2.py @@ -1529,7 +1529,7 @@ class Zamba2Model(Zamba2PreTrainedModel): if ( self.config._attn_implementation == "sdpa" and attention_mask is not None - and attention_mask.device.type in ["cuda", "xpu"] + and attention_mask.device.type in ["cuda", "xpu", "npu"] ): # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.