From 686bb3b098910adb28df3a8c65e2dbe9567ca32e Mon Sep 17 00:00:00 2001 From: Cyril Vallez Date: Mon, 28 Jul 2025 15:43:41 +0200 Subject: [PATCH] Remove all expired deprecation cycles (#39725) * remove all deprecation cycles * style * fix * remove * remove * fix * Update modular_dpt.py * back * typo * typo * final fix * remove all args --- .../generation/candidate_generator.py | 14 +------ .../generation/configuration_utils.py | 2 - .../models/align/modeling_align.py | 27 ------------- .../models/altclip/modeling_altclip.py | 39 ------------------- .../models/beit/image_processing_beit.py | 15 +------ .../models/beit/image_processing_beit_fast.py | 12 +----- src/transformers/models/bert/modeling_bert.py | 20 ---------- .../modeling_bert_generation.py | 12 ------ .../models/blip_2/modeling_blip_2.py | 16 -------- .../bridgetower/modeling_bridgetower.py | 12 ------ src/transformers/models/bros/modeling_bros.py | 15 ------- .../models/camembert/modeling_camembert.py | 20 ---------- .../chinese_clip/modeling_chinese_clip.py | 27 ------------- src/transformers/models/clap/modeling_clap.py | 35 ----------------- .../models/cohere2/modeling_cohere2.py | 2 - .../models/cohere2/modular_cohere2.py | 2 - .../data2vec/modeling_data2vec_audio.py | 2 - .../models/data2vec/modeling_data2vec_text.py | 12 ------ .../modeling_decision_transformer.py | 3 -- src/transformers/models/dpt/modular_dpt.py | 3 -- .../models/electra/modeling_electra.py | 12 ------ .../models/ernie/modeling_ernie.py | 12 ------ src/transformers/models/esm/modeling_esm.py | 17 -------- .../models/evolla/modeling_evolla.py | 13 ------- .../models/gemma2/modeling_gemma2.py | 2 - .../models/gemma2/modular_gemma2.py | 2 - .../models/gemma3/modeling_gemma3.py | 2 - .../models/gemma3/modular_gemma3.py | 2 - .../models/gemma3n/modeling_gemma3n.py | 2 - src/transformers/models/gpt2/modeling_gpt2.py | 3 -- .../processing_grounding_dino.py | 2 - .../models/hubert/modeling_hubert.py | 2 - .../instructblip/modeling_instructblip.py | 16 -------- .../modeling_instructblipvideo.py | 16 -------- .../models/layoutlm/modeling_layoutlm.py | 35 ----------------- .../models/markuplm/modeling_markuplm.py | 27 ------------- .../image_processing_mask2former.py | 21 ---------- .../image_processing_mask2former_fast.py | 19 --------- .../maskformer/image_processing_maskformer.py | 21 ---------- .../image_processing_maskformer_fast.py | 19 --------- .../megatron_bert/modeling_megatron_bert.py | 9 ----- .../omdet_turbo/processing_omdet_turbo.py | 3 -- .../oneformer/image_processing_oneformer.py | 14 ------- .../patchtsmixer/modeling_patchtsmixer.py | 2 - .../models/patchtst/modeling_patchtst.py | 2 - .../qwen2_audio/processing_qwen2_audio.py | 14 ------- .../models/rembert/modeling_rembert.py | 12 ------ .../models/roberta/modeling_roberta.py | 20 ---------- .../modeling_roberta_prelayernorm.py | 12 ------ .../models/roc_bert/modeling_roc_bert.py | 12 ------ .../segformer/image_processing_segformer.py | 15 +------ src/transformers/models/sew/modeling_sew.py | 2 - .../models/splinter/modeling_splinter.py | 35 ----------------- .../models/tapas/modeling_tapas.py | 12 ------ .../models/unispeech/modeling_unispeech.py | 2 - .../unispeech_sat/modeling_unispeech_sat.py | 2 - .../models/wav2vec2/modeling_wav2vec2.py | 2 - .../xlm_roberta/modeling_xlm_roberta.py | 20 ---------- .../xlm_roberta_xl/modeling_xlm_roberta_xl.py | 20 ---------- src/transformers/models/xmod/modeling_xmod.py | 12 ------ .../test_image_processing_mask2former.py | 26 ------------- .../test_image_processing_maskformer.py | 25 ------------ .../test_image_processing_oneformer.py | 14 ------- .../test_image_processing_segformer.py | 13 ------- 64 files changed, 4 insertions(+), 831 deletions(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index 9ee097bba2..b62d6e7ed1 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -37,8 +37,6 @@ if TYPE_CHECKING: from ..tokenization_utils_base import PreTrainedTokenizerBase from .configuration_utils import GenerationConfig -from ..utils.deprecation import deprecate_kwarg - class CandidateGenerator: """Abstract base class for all candidate generators that can be applied during assisted generation.""" @@ -685,9 +683,6 @@ class AssistantToTargetTranslator: The tokenizer used by the assistant model. target_vocab_size (`int`): The size of the target model's vocabulary. If not provided, will be inferred from the target tokenizer. - assistant_model_device (str, optional): The device on which the assistant model is loaded. - Defaults to "cpu". - assistant_model_device (`str`, defaults to "cpu"): The device where the assistant model is located. Used for placing tensors. assistant_model (Optional[PreTrainedModel], optional): The assistant model to be used. Defaults to None for backward compatibility. assistant_prune_lm_head (bool): Whether to prune the assistant model's language model head to match the target vocabulary. This is only applicable if `assistant_model` is provided. @@ -697,21 +692,17 @@ class AssistantToTargetTranslator: FILTER_VALUE: float = -float("Inf") # The value used to filter out unmapped tokens in the logits. SUPPRESS_TOKEN_ID: int = -1 # The ID used to mark suppressed tokens in the mapping. - @deprecate_kwarg("assistant_model_device", version="4.53") def __init__( self, target_tokenizer: "PreTrainedTokenizerBase", assistant_tokenizer: "PreTrainedTokenizerBase", target_vocab_size: int, # required since target_vocab_size can be different from the length of target_tokenizer.get_vocab() - assistant_model_device: str = "cpu", assistant_model: Optional["PreTrainedModel"] = None, assistant_prune_lm_head: bool = False, ): self._target_tokenizer: PreTrainedTokenizerBase = target_tokenizer self._assistant_tokenizer: PreTrainedTokenizerBase = assistant_tokenizer - self._assistant_model_device: str = ( - assistant_model_device if assistant_model is None else assistant_model.device - ) + self._assistant_model_device = assistant_model.device if assistant_model is not None else "cpu" self.target_vocab_size: int = target_vocab_size self._assistant_to_target_input_ids, self.target_to_assistant_input_ids = ( self._get_assistant_to_target_input_ids() @@ -845,13 +836,11 @@ class AssistantVocabTranslatorCache: _cache = weakref.WeakKeyDictionary() @classmethod - @deprecate_kwarg("assistant_model_device", version="4.53") def get_translator( cls, target_tokenizer: "PreTrainedTokenizerBase", assistant_tokenizer: "PreTrainedTokenizerBase", target_vocab_size: int, - assistant_model_device: str = "cpu", assistant_model: Optional["PreTrainedModel"] = None, assistant_prune_lm_head: bool = False, ) -> AssistantToTargetTranslator: @@ -866,7 +855,6 @@ class AssistantVocabTranslatorCache: target_tokenizer, assistant_tokenizer, target_vocab_size, - assistant_model_device, assistant_model, assistant_prune_lm_head, ) diff --git a/src/transformers/generation/configuration_utils.py b/src/transformers/generation/configuration_utils.py index 84b892251d..697046b3ed 100644 --- a/src/transformers/generation/configuration_utils.py +++ b/src/transformers/generation/configuration_utils.py @@ -35,7 +35,6 @@ from ..utils import ( is_torch_available, logging, ) -from ..utils.deprecation import deprecate_kwarg if TYPE_CHECKING: @@ -576,7 +575,6 @@ class GenerationConfig(PushToHubMixin): ) return generation_mode - @deprecate_kwarg("is_init", version="4.54.0") def validate(self, strict=False): """ Validates the values of the attributes of the [`GenerationConfig`] instance. Raises exceptions in the presence diff --git a/src/transformers/models/align/modeling_align.py b/src/transformers/models/align/modeling_align.py index 455cab2f77..24cc3639ce 100644 --- a/src/transformers/models/align/modeling_align.py +++ b/src/transformers/models/align/modeling_align.py @@ -33,7 +33,6 @@ from ...modeling_outputs import ( from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer from ...utils import ModelOutput, auto_docstring, can_return_tuple, logging -from ...utils.deprecation import deprecate_kwarg from .configuration_align import AlignConfig, AlignTextConfig, AlignVisionConfig @@ -621,17 +620,11 @@ class AlignTextSelfAttention(nn.Module): self.attention_dropout = config.attention_probs_dropout_prob self.scaling = self.attention_head_size**-0.5 - @deprecate_kwarg("encoder_hidden_states", version="4.54.0") - @deprecate_kwarg("encoder_attention_mask", version="4.54.0") - @deprecate_kwarg("past_key_value", version="4.54.0") def forward( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.FloatTensor] = None, head_mask: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - past_key_value: Optional[tuple[tuple[torch.FloatTensor]]] = None, output_attentions: Optional[bool] = False, **kwargs, ) -> tuple[torch.Tensor]: @@ -703,17 +696,11 @@ class AlignTextAttention(nn.Module): self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads self.pruned_heads = self.pruned_heads.union(heads) - @deprecate_kwarg("encoder_hidden_states", version="4.54.0") - @deprecate_kwarg("encoder_attention_mask", version="4.54.0") - @deprecate_kwarg("past_key_value", version="4.54.0") def forward( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.FloatTensor] = None, head_mask: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - past_key_value: Optional[tuple[tuple[torch.FloatTensor]]] = None, output_attentions: Optional[bool] = False, **kwargs, ) -> tuple[torch.Tensor]: @@ -769,17 +756,11 @@ class AlignTextLayer(GradientCheckpointingLayer): self.intermediate = AlignTextIntermediate(config) self.output = AlignTextOutput(config) - @deprecate_kwarg("encoder_hidden_states", version="4.54.0") - @deprecate_kwarg("encoder_attention_mask", version="4.54.0") - @deprecate_kwarg("past_key_value", version="4.54.0") def forward( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.FloatTensor] = None, head_mask: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - past_key_value: Optional[tuple[tuple[torch.FloatTensor]]] = None, output_attentions: Optional[bool] = False, **kwargs, ) -> tuple[torch.Tensor]: @@ -813,20 +794,12 @@ class AlignTextEncoder(nn.Module): self.layer = nn.ModuleList([AlignTextLayer(config) for i in range(config.num_hidden_layers)]) self.gradient_checkpointing = False - @deprecate_kwarg("encoder_hidden_states", version="4.54.0") - @deprecate_kwarg("encoder_attention_mask", version="4.54.0") - @deprecate_kwarg("past_key_values", version="4.54.0") - @deprecate_kwarg("use_cache", version="4.54.0") @can_return_tuple def forward( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.FloatTensor] = None, head_mask: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - past_key_values: Optional[tuple[tuple[torch.FloatTensor]]] = None, - use_cache: Optional[bool] = None, output_attentions: Optional[bool] = False, output_hidden_states: Optional[bool] = False, return_dict: Optional[bool] = True, diff --git a/src/transformers/models/altclip/modeling_altclip.py b/src/transformers/models/altclip/modeling_altclip.py index 296d8a5d80..6581e2f18c 100755 --- a/src/transformers/models/altclip/modeling_altclip.py +++ b/src/transformers/models/altclip/modeling_altclip.py @@ -33,7 +33,6 @@ from ...modeling_outputs import ( from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer from ...utils import ModelOutput, auto_docstring, can_return_tuple, logging, torch_int -from ...utils.deprecation import deprecate_kwarg from .configuration_altclip import AltCLIPConfig, AltCLIPTextConfig, AltCLIPVisionConfig @@ -205,17 +204,11 @@ class AltRobertaSelfAttention(nn.Module): self.max_position_embeddings = config.max_position_embeddings self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) - @deprecate_kwarg("encoder_hidden_states", version="4.54.0") - @deprecate_kwarg("encoder_attention_mask", version="4.54.0") - @deprecate_kwarg("past_key_value", version="4.54.0") def forward( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.FloatTensor] = None, head_mask: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - past_key_value: Optional[tuple[tuple[torch.FloatTensor]]] = None, output_attentions: Optional[bool] = False, ) -> tuple[torch.Tensor]: input_shape = hidden_states.shape[:-1] @@ -319,17 +312,11 @@ class AltRobertaAttention(nn.Module): self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads self.pruned_heads = self.pruned_heads.union(heads) - @deprecate_kwarg("encoder_hidden_states", version="4.54.0") - @deprecate_kwarg("encoder_attention_mask", version="4.54.0") - @deprecate_kwarg("past_key_value", version="4.54.0") def forward( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.FloatTensor] = None, head_mask: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - past_key_value: Optional[tuple[tuple[torch.FloatTensor]]] = None, output_attentions: Optional[bool] = False, ) -> tuple[torch.Tensor]: self_outputs = self.self( @@ -384,17 +371,11 @@ class AltRobertaLayer(GradientCheckpointingLayer): self.intermediate = AltRobertaIntermediate(config) self.output = AltRobertaOutput(config) - @deprecate_kwarg("encoder_hidden_states", version="4.54.0") - @deprecate_kwarg("encoder_attention_mask", version="4.54.0") - @deprecate_kwarg("past_key_value", version="4.54.0") def forward( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.FloatTensor] = None, head_mask: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - past_key_value: Optional[tuple[tuple[torch.FloatTensor]]] = None, output_attentions: Optional[bool] = False, **kwargs, ) -> tuple[torch.Tensor]: @@ -429,20 +410,12 @@ class AltRobertaEncoder(nn.Module): self.layer = nn.ModuleList([AltRobertaLayer(config) for i in range(config.num_hidden_layers)]) self.gradient_checkpointing = False - @deprecate_kwarg("encoder_hidden_states", version="4.54.0") - @deprecate_kwarg("encoder_attention_mask", version="4.54.0") - @deprecate_kwarg("past_key_values", version="4.54.0") - @deprecate_kwarg("use_cache", version="4.54.0") @can_return_tuple def forward( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.FloatTensor] = None, head_mask: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - past_key_values: Optional[tuple[tuple[torch.FloatTensor]]] = None, - use_cache: Optional[bool] = None, output_attentions: Optional[bool] = False, output_hidden_states: Optional[bool] = False, return_dict: Optional[bool] = True, @@ -1036,10 +1009,6 @@ class AltRobertaModel(AltCLIPPreTrainedModel): for layer, heads in heads_to_prune.items(): self.encoder.layer[layer].attention.prune_heads(heads) - @deprecate_kwarg("encoder_hidden_states", version="4.54.0") - @deprecate_kwarg("encoder_attention_mask", version="4.54.0") - @deprecate_kwarg("past_key_values", version="4.54.0") - @deprecate_kwarg("use_cache", version="4.54.0") @auto_docstring # Copied from transformers.models.clap.modeling_clap.ClapTextModel.forward def forward( @@ -1050,10 +1019,6 @@ class AltRobertaModel(AltCLIPPreTrainedModel): position_ids: Optional[torch.Tensor] = None, head_mask: Optional[torch.Tensor] = None, inputs_embeds: Optional[torch.Tensor] = None, - encoder_hidden_states: Optional[torch.Tensor] = None, - encoder_attention_mask: Optional[torch.Tensor] = None, - past_key_values: Optional[list[torch.FloatTensor]] = None, - use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, @@ -1139,8 +1104,6 @@ class AltCLIPTextModel(AltCLIPPreTrainedModel): def resize_token_embeddings(self, new_num_tokens: Optional[int] = None) -> nn.Embedding: return super().resize_token_embeddings(new_num_tokens) - @deprecate_kwarg("encoder_hidden_states", version="4.54.0") - @deprecate_kwarg("encoder_attention_mask", version="4.54.0") @can_return_tuple @auto_docstring def forward( @@ -1151,8 +1114,6 @@ class AltCLIPTextModel(AltCLIPPreTrainedModel): position_ids: Optional[torch.Tensor] = None, head_mask: Optional[torch.Tensor] = None, inputs_embeds: Optional[torch.Tensor] = None, - encoder_hidden_states: Optional[torch.Tensor] = None, - encoder_attention_mask: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, return_dict: Optional[bool] = None, output_hidden_states: Optional[bool] = None, diff --git a/src/transformers/models/beit/image_processing_beit.py b/src/transformers/models/beit/image_processing_beit.py index 95e8ec54f1..4f549bcff6 100644 --- a/src/transformers/models/beit/image_processing_beit.py +++ b/src/transformers/models/beit/image_processing_beit.py @@ -14,7 +14,7 @@ # limitations under the License. """Image processor class for Beit.""" -from typing import Any, Optional, Union +from typing import Optional, Union import numpy as np @@ -41,7 +41,6 @@ from ...utils import ( is_vision_available, logging, ) -from ...utils.deprecation import deprecate_kwarg from ...utils.import_utils import requires @@ -101,7 +100,6 @@ class BeitImageProcessor(BaseImageProcessor): model_input_names = ["pixel_values"] - @deprecate_kwarg("reduce_labels", new_name="do_reduce_labels", version="4.41.0") @filter_out_non_signature_kwargs(extra=INIT_SERVICE_KWARGS) def __init__( self, @@ -135,16 +133,6 @@ class BeitImageProcessor(BaseImageProcessor): self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD self.do_reduce_labels = do_reduce_labels - @classmethod - def from_dict(cls, image_processor_dict: dict[str, Any], **kwargs): - """ - Overrides the `from_dict` method from the base class to save support of deprecated `reduce_labels` in old configs - """ - image_processor_dict = image_processor_dict.copy() - if "reduce_labels" in image_processor_dict: - image_processor_dict["do_reduce_labels"] = image_processor_dict.pop("reduce_labels") - return super().from_dict(image_processor_dict, **kwargs) - def resize( self, image: np.ndarray, @@ -313,7 +301,6 @@ class BeitImageProcessor(BaseImageProcessor): # be passed in as positional arguments. return super().__call__(images, segmentation_maps=segmentation_maps, **kwargs) - @deprecate_kwarg("reduce_labels", new_name="do_reduce_labels", version="4.41.0") @filter_out_non_signature_kwargs() def preprocess( self, diff --git a/src/transformers/models/beit/image_processing_beit_fast.py b/src/transformers/models/beit/image_processing_beit_fast.py index f2b94f3836..3b7b7efd7a 100644 --- a/src/transformers/models/beit/image_processing_beit_fast.py +++ b/src/transformers/models/beit/image_processing_beit_fast.py @@ -14,7 +14,7 @@ # limitations under the License. """Fast Image processor class for Beit.""" -from typing import Any, Optional, Union +from typing import Optional, Union import torch from torchvision.transforms import functional as F @@ -68,16 +68,6 @@ class BeitImageProcessorFast(BaseImageProcessorFast): def __init__(self, **kwargs: Unpack[BeitFastImageProcessorKwargs]): super().__init__(**kwargs) - @classmethod - def from_dict(cls, image_processor_dict: dict[str, Any], **kwargs): - """ - Overrides the `from_dict` method from the base class to save support of deprecated `reduce_labels` in old configs - """ - image_processor_dict = image_processor_dict.copy() - if "reduce_labels" in image_processor_dict: - image_processor_dict["do_reduce_labels"] = image_processor_dict.pop("reduce_labels") - return super().from_dict(image_processor_dict, **kwargs) - def reduce_label(self, labels: list["torch.Tensor"]): for idx in range(len(labels)): label = labels[idx] diff --git a/src/transformers/models/bert/modeling_bert.py b/src/transformers/models/bert/modeling_bert.py index c92f02a5f8..97387cdfc0 100755 --- a/src/transformers/models/bert/modeling_bert.py +++ b/src/transformers/models/bert/modeling_bert.py @@ -46,7 +46,6 @@ from ...modeling_outputs import ( from ...modeling_utils import PreTrainedModel from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer from ...utils import ModelOutput, auto_docstring, get_torch_version, logging -from ...utils.deprecation import deprecate_kwarg from .configuration_bert import BertConfig @@ -218,14 +217,12 @@ class BertSelfAttention(nn.Module): self.is_decoder = config.is_decoder self.layer_idx = layer_idx - @deprecate_kwarg("encoder_attention_mask", version="4.55.0") def forward( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.FloatTensor] = None, head_mask: Optional[torch.FloatTensor] = None, encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, past_key_value: Optional[Cache] = None, output_attentions: Optional[bool] = False, cache_position: Optional[torch.Tensor] = None, @@ -236,13 +233,7 @@ class BertSelfAttention(nn.Module): 1, 2 ) - # If this is instantiated as a cross-attention module, the keys - # and values come from an encoder; the attention mask needs to be - # such that the encoder's padding tokens are not attended to. is_cross_attention = encoder_hidden_states is not None - if is_cross_attention and encoder_attention_mask is not None: - attention_mask = encoder_attention_mask - if past_key_value is not None: if isinstance(past_key_value, EncoderDecoderCache): is_updated = past_key_value.is_updated.get(self.layer_idx) @@ -336,14 +327,12 @@ class BertSdpaSelfAttention(BertSelfAttention): self.require_contiguous_qkv = version.parse(get_torch_version()) < version.parse("2.2.0") # Adapted from BertSelfAttention - @deprecate_kwarg("encoder_attention_mask", version="4.55.0") def forward( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.Tensor] = None, head_mask: Optional[torch.FloatTensor] = None, encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, past_key_value: Optional[Cache] = None, output_attentions: Optional[bool] = False, cache_position: Optional[torch.Tensor] = None, @@ -362,7 +351,6 @@ class BertSdpaSelfAttention(BertSelfAttention): attention_mask, head_mask, encoder_hidden_states, - encoder_attention_mask, past_key_value, output_attentions, cache_position, @@ -374,12 +362,7 @@ class BertSdpaSelfAttention(BertSelfAttention): self.query(hidden_states).view(bsz, -1, self.num_attention_heads, self.attention_head_size).transpose(1, 2) ) - # If this is instantiated as a cross-attention module, the keys and values come from an encoder; the attention - # mask needs to be such that the encoder's padding tokens are not attended to. is_cross_attention = encoder_hidden_states is not None - if is_cross_attention and encoder_attention_mask is not None: - attention_mask = encoder_attention_mask - current_states = encoder_hidden_states if is_cross_attention else hidden_states if past_key_value is not None: if isinstance(past_key_value, EncoderDecoderCache): @@ -499,14 +482,12 @@ class BertAttention(nn.Module): self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads self.pruned_heads = self.pruned_heads.union(heads) - @deprecate_kwarg("encoder_attention_mask", version="4.55.0") def forward( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.FloatTensor] = None, head_mask: Optional[torch.FloatTensor] = None, encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, past_key_value: Optional[Cache] = None, output_attentions: Optional[bool] = False, cache_position: Optional[torch.Tensor] = None, @@ -516,7 +497,6 @@ class BertAttention(nn.Module): attention_mask=attention_mask, head_mask=head_mask, encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, past_key_value=past_key_value, output_attentions=output_attentions, cache_position=cache_position, diff --git a/src/transformers/models/bert_generation/modeling_bert_generation.py b/src/transformers/models/bert_generation/modeling_bert_generation.py index 0a4d56c3f3..41f4b97221 100755 --- a/src/transformers/models/bert_generation/modeling_bert_generation.py +++ b/src/transformers/models/bert_generation/modeling_bert_generation.py @@ -29,7 +29,6 @@ from ...modeling_outputs import BaseModelOutputWithPastAndCrossAttentions, Causa from ...modeling_utils import PreTrainedModel from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer from ...utils import auto_docstring, logging -from ...utils.deprecation import deprecate_kwarg from .configuration_bert_generation import BertGenerationConfig @@ -80,14 +79,12 @@ class BertGenerationSelfAttention(nn.Module): self.is_decoder = config.is_decoder self.layer_idx = layer_idx - @deprecate_kwarg("encoder_attention_mask", version="4.55.0") def forward( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.FloatTensor] = None, head_mask: Optional[torch.FloatTensor] = None, encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, past_key_value: Optional[Cache] = None, output_attentions: Optional[bool] = False, cache_position: Optional[torch.Tensor] = None, @@ -98,13 +95,7 @@ class BertGenerationSelfAttention(nn.Module): 1, 2 ) - # If this is instantiated as a cross-attention module, the keys - # and values come from an encoder; the attention mask needs to be - # such that the encoder's padding tokens are not attended to. is_cross_attention = encoder_hidden_states is not None - if is_cross_attention and encoder_attention_mask is not None: - attention_mask = encoder_attention_mask - if past_key_value is not None: if isinstance(past_key_value, EncoderDecoderCache): is_updated = past_key_value.is_updated.get(self.layer_idx) @@ -226,14 +217,12 @@ class BertGenerationAttention(nn.Module): self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads self.pruned_heads = self.pruned_heads.union(heads) - @deprecate_kwarg("encoder_attention_mask", version="4.55.0") def forward( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.FloatTensor] = None, head_mask: Optional[torch.FloatTensor] = None, encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, past_key_value: Optional[Cache] = None, output_attentions: Optional[bool] = False, cache_position: Optional[torch.Tensor] = None, @@ -243,7 +232,6 @@ class BertGenerationAttention(nn.Module): attention_mask=attention_mask, head_mask=head_mask, encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, past_key_value=past_key_value, output_attentions=output_attentions, cache_position=cache_position, diff --git a/src/transformers/models/blip_2/modeling_blip_2.py b/src/transformers/models/blip_2/modeling_blip_2.py index b19ae2f8dc..d4f3f2373b 100644 --- a/src/transformers/models/blip_2/modeling_blip_2.py +++ b/src/transformers/models/blip_2/modeling_blip_2.py @@ -24,7 +24,6 @@ from torch import nn from torch.nn import CrossEntropyLoss from ...activations import ACT2FN -from ...cache_utils import Cache from ...generation import GenerationMixin from ...modeling_layers import GradientCheckpointingLayer from ...modeling_outputs import ( @@ -37,7 +36,6 @@ from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel from ...processing_utils import Unpack from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer from ...utils import ModelOutput, TransformersKwargs, auto_docstring, logging, torch_int -from ...utils.deprecation import deprecate_kwarg from ..auto import AutoModelForCausalLM, AutoModelForSeq2SeqLM from .configuration_blip_2 import Blip2Config, Blip2QFormerConfig, Blip2VisionConfig @@ -642,7 +640,6 @@ class Blip2QFormerMultiHeadAttention(nn.Module): x = x.view(*new_x_shape) return x.permute(0, 2, 1, 3) - @deprecate_kwarg("past_key_value", version="4.55.0") def forward( self, hidden_states, @@ -650,7 +647,6 @@ class Blip2QFormerMultiHeadAttention(nn.Module): head_mask=None, encoder_hidden_states=None, encoder_attention_mask=None, - past_key_value=None, output_attentions=False, ): # If this is instantiated as a cross-attention module, the keys @@ -767,7 +763,6 @@ class Blip2QFormerAttention(nn.Module): self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads self.pruned_heads = self.pruned_heads.union(heads) - @deprecate_kwarg("past_key_value", version="4.55.0") def forward( self, hidden_states: torch.Tensor, @@ -775,7 +770,6 @@ class Blip2QFormerAttention(nn.Module): head_mask: Optional[torch.FloatTensor] = None, encoder_hidden_states: Optional[torch.FloatTensor] = None, encoder_attention_mask: Optional[torch.FloatTensor] = None, - past_key_value: Optional[Cache] = None, output_attentions: Optional[bool] = False, ) -> tuple[torch.Tensor]: self_outputs = self.attention( @@ -844,7 +838,6 @@ class Blip2QFormerLayer(GradientCheckpointingLayer): self.intermediate_query = Blip2QFormerIntermediate(config) self.output_query = Blip2QFormerOutput(config) - @deprecate_kwarg("past_key_value", version="4.55.0") def forward( self, hidden_states, @@ -852,7 +845,6 @@ class Blip2QFormerLayer(GradientCheckpointingLayer): head_mask=None, encoder_hidden_states=None, encoder_attention_mask=None, - past_key_value=None, output_attentions=False, query_length=0, ): @@ -929,8 +921,6 @@ class Blip2QFormerEncoder(nn.Module): ) self.gradient_checkpointing = False - @deprecate_kwarg("past_key_value", version="4.55.0") - @deprecate_kwarg("use_cache", version="4.55.0") def forward( self, hidden_states, @@ -938,8 +928,6 @@ class Blip2QFormerEncoder(nn.Module): head_mask=None, encoder_hidden_states=None, encoder_attention_mask=None, - past_key_values=None, - use_cache=None, output_attentions=False, output_hidden_states=False, return_dict=True, @@ -1119,8 +1107,6 @@ class Blip2QFormerModel(Blip2PreTrainedModel): extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 return extended_attention_mask - @deprecate_kwarg("past_key_value", version="4.55.0") - @deprecate_kwarg("use_cache", version="4.55.0") @auto_docstring def forward( self, @@ -1130,8 +1116,6 @@ class Blip2QFormerModel(Blip2PreTrainedModel): head_mask: Optional[torch.FloatTensor] = None, encoder_hidden_states: Optional[torch.FloatTensor] = None, encoder_attention_mask: Optional[torch.FloatTensor] = None, - past_key_values: Optional[Cache] = None, - use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, diff --git a/src/transformers/models/bridgetower/modeling_bridgetower.py b/src/transformers/models/bridgetower/modeling_bridgetower.py index d82e913299..644b8b0a3b 100644 --- a/src/transformers/models/bridgetower/modeling_bridgetower.py +++ b/src/transformers/models/bridgetower/modeling_bridgetower.py @@ -37,7 +37,6 @@ from ...modeling_outputs import ( from ...modeling_utils import PreTrainedModel, apply_chunking_to_forward from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer from ...utils import auto_docstring, logging, torch_int -from ...utils.deprecation import deprecate_kwarg from .configuration_bridgetower import BridgeTowerConfig, BridgeTowerTextConfig, BridgeTowerVisionConfig @@ -430,14 +429,12 @@ class BridgeTowerSelfAttention(nn.Module): self.is_decoder = config.is_decoder self.layer_idx = layer_idx - @deprecate_kwarg("encoder_attention_mask", version="4.55.0") def forward( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.FloatTensor] = None, head_mask: Optional[torch.FloatTensor] = None, encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, past_key_value: Optional[Cache] = None, output_attentions: Optional[bool] = False, cache_position: Optional[torch.Tensor] = None, @@ -448,13 +445,7 @@ class BridgeTowerSelfAttention(nn.Module): 1, 2 ) - # If this is instantiated as a cross-attention module, the keys - # and values come from an encoder; the attention mask needs to be - # such that the encoder's padding tokens are not attended to. is_cross_attention = encoder_hidden_states is not None - if is_cross_attention and encoder_attention_mask is not None: - attention_mask = encoder_attention_mask - if past_key_value is not None: if isinstance(past_key_value, EncoderDecoderCache): is_updated = past_key_value.is_updated.get(self.layer_idx) @@ -576,14 +567,12 @@ class BridgeTowerAttention(nn.Module): self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads self.pruned_heads = self.pruned_heads.union(heads) - @deprecate_kwarg("encoder_attention_mask", version="4.55.0") def forward( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.FloatTensor] = None, head_mask: Optional[torch.FloatTensor] = None, encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, past_key_value: Optional[Cache] = None, output_attentions: Optional[bool] = False, cache_position: Optional[torch.Tensor] = None, @@ -593,7 +582,6 @@ class BridgeTowerAttention(nn.Module): attention_mask=attention_mask, head_mask=head_mask, encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, past_key_value=past_key_value, output_attentions=output_attentions, cache_position=cache_position, diff --git a/src/transformers/models/bros/modeling_bros.py b/src/transformers/models/bros/modeling_bros.py index 68fa2185ff..f12b47081d 100755 --- a/src/transformers/models/bros/modeling_bros.py +++ b/src/transformers/models/bros/modeling_bros.py @@ -33,7 +33,6 @@ from ...modeling_outputs import ( from ...modeling_utils import PreTrainedModel from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer from ...utils import ModelOutput, auto_docstring, can_return_tuple, logging -from ...utils.deprecation import deprecate_kwarg from .configuration_bros import BrosConfig @@ -208,7 +207,6 @@ class BrosSelfAttention(nn.Module): self.is_decoder = config.is_decoder - @deprecate_kwarg("past_key_value", version="4.54.0") def forward( self, hidden_states: torch.Tensor, @@ -217,7 +215,6 @@ class BrosSelfAttention(nn.Module): head_mask: Optional[torch.Tensor] = None, encoder_hidden_states: Optional[torch.Tensor] = None, encoder_attention_mask: Optional[torch.Tensor] = None, - past_key_value: Optional[tuple[tuple[torch.FloatTensor]]] = None, output_attentions: Optional[torch.Tensor] = False, ) -> tuple[torch.Tensor]: hidden_shape = (hidden_states.shape[0], -1, self.num_attention_heads, self.attention_head_size) @@ -336,7 +333,6 @@ class BrosAttention(nn.Module): self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads self.pruned_heads = self.pruned_heads.union(heads) - @deprecate_kwarg("past_key_value", version="4.54.0") def forward( self, hidden_states: torch.Tensor, @@ -345,7 +341,6 @@ class BrosAttention(nn.Module): head_mask: Optional[torch.Tensor] = None, encoder_hidden_states: Optional[torch.Tensor] = None, encoder_attention_mask: Optional[torch.Tensor] = None, - past_key_value: Optional[tuple[tuple[torch.FloatTensor]]] = None, output_attentions: Optional[bool] = False, ) -> tuple[torch.Tensor]: self_outputs = self.self( @@ -407,7 +402,6 @@ class BrosLayer(GradientCheckpointingLayer): self.intermediate = BrosIntermediate(config) self.output = BrosOutput(config) - @deprecate_kwarg("past_key_value", version="4.54.0") def forward( self, hidden_states: torch.Tensor, @@ -416,7 +410,6 @@ class BrosLayer(GradientCheckpointingLayer): head_mask: Optional[torch.FloatTensor] = None, encoder_hidden_states: Optional[torch.FloatTensor] = None, encoder_attention_mask: Optional[torch.FloatTensor] = None, - past_key_value: Optional[tuple[tuple[torch.FloatTensor]]] = None, output_attentions: Optional[bool] = False, ) -> tuple[torch.Tensor]: self_attention_outputs = self.attention( @@ -477,8 +470,6 @@ class BrosEncoder(nn.Module): self.config = config self.layer = nn.ModuleList([BrosLayer(config) for _ in range(config.num_hidden_layers)]) - @deprecate_kwarg("past_key_values", version="4.54.0") - @deprecate_kwarg("use_cache", version="4.54.0") @can_return_tuple def forward( self, @@ -488,8 +479,6 @@ class BrosEncoder(nn.Module): head_mask: Optional[torch.FloatTensor] = None, encoder_hidden_states: Optional[torch.FloatTensor] = None, encoder_attention_mask: Optional[torch.FloatTensor] = None, - past_key_values: Optional[tuple[tuple[torch.FloatTensor]]] = None, - use_cache: Optional[bool] = None, output_attentions: Optional[bool] = False, output_hidden_states: Optional[bool] = False, return_dict: Optional[bool] = True, @@ -638,8 +627,6 @@ class BrosModel(BrosPreTrainedModel): for layer, heads in heads_to_prune.items(): self.encoder.layer[layer].attention.prune_heads(heads) - @deprecate_kwarg("past_key_values", version="4.54.0") - @deprecate_kwarg("use_cache", version="4.54.0") @can_return_tuple @auto_docstring def forward( @@ -653,8 +640,6 @@ class BrosModel(BrosPreTrainedModel): inputs_embeds: Optional[torch.Tensor] = None, encoder_hidden_states: Optional[torch.Tensor] = None, encoder_attention_mask: Optional[torch.Tensor] = None, - past_key_values: Optional[list[torch.FloatTensor]] = None, - use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, diff --git a/src/transformers/models/camembert/modeling_camembert.py b/src/transformers/models/camembert/modeling_camembert.py index c0271d2f27..e729505186 100644 --- a/src/transformers/models/camembert/modeling_camembert.py +++ b/src/transformers/models/camembert/modeling_camembert.py @@ -42,7 +42,6 @@ from ...modeling_outputs import ( from ...modeling_utils import PreTrainedModel from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer from ...utils import auto_docstring, get_torch_version, logging -from ...utils.deprecation import deprecate_kwarg from .configuration_camembert import CamembertConfig @@ -168,14 +167,12 @@ class CamembertSelfAttention(nn.Module): self.is_decoder = config.is_decoder self.layer_idx = layer_idx - @deprecate_kwarg("encoder_attention_mask", version="4.55.0") def forward( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.FloatTensor] = None, head_mask: Optional[torch.FloatTensor] = None, encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, past_key_value: Optional[Cache] = None, output_attentions: Optional[bool] = False, cache_position: Optional[torch.Tensor] = None, @@ -186,13 +183,7 @@ class CamembertSelfAttention(nn.Module): 1, 2 ) - # If this is instantiated as a cross-attention module, the keys - # and values come from an encoder; the attention mask needs to be - # such that the encoder's padding tokens are not attended to. is_cross_attention = encoder_hidden_states is not None - if is_cross_attention and encoder_attention_mask is not None: - attention_mask = encoder_attention_mask - if past_key_value is not None: if isinstance(past_key_value, EncoderDecoderCache): is_updated = past_key_value.is_updated.get(self.layer_idx) @@ -287,14 +278,12 @@ class CamembertSdpaSelfAttention(CamembertSelfAttention): self.require_contiguous_qkv = version.parse(get_torch_version()) < version.parse("2.2.0") # Adapted from CamembertSelfAttention - @deprecate_kwarg("encoder_attention_mask", version="4.55.0") def forward( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.Tensor] = None, head_mask: Optional[torch.FloatTensor] = None, encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, past_key_value: Optional[Cache] = None, output_attentions: Optional[bool] = False, cache_position: Optional[torch.Tensor] = None, @@ -313,7 +302,6 @@ class CamembertSdpaSelfAttention(CamembertSelfAttention): attention_mask, head_mask, encoder_hidden_states, - encoder_attention_mask, past_key_value, output_attentions, cache_position, @@ -325,12 +313,7 @@ class CamembertSdpaSelfAttention(CamembertSelfAttention): self.query(hidden_states).view(bsz, -1, self.num_attention_heads, self.attention_head_size).transpose(1, 2) ) - # If this is instantiated as a cross-attention module, the keys and values come from an encoder; the attention - # mask needs to be such that the encoder's padding tokens are not attended to. is_cross_attention = encoder_hidden_states is not None - if is_cross_attention and encoder_attention_mask is not None: - attention_mask = encoder_attention_mask - current_states = encoder_hidden_states if is_cross_attention else hidden_states if past_key_value is not None: if isinstance(past_key_value, EncoderDecoderCache): @@ -452,14 +435,12 @@ class CamembertAttention(nn.Module): self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads self.pruned_heads = self.pruned_heads.union(heads) - @deprecate_kwarg("encoder_attention_mask", version="4.55.0") def forward( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.FloatTensor] = None, head_mask: Optional[torch.FloatTensor] = None, encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, past_key_value: Optional[Cache] = None, output_attentions: Optional[bool] = False, cache_position: Optional[torch.Tensor] = None, @@ -469,7 +450,6 @@ class CamembertAttention(nn.Module): attention_mask=attention_mask, head_mask=head_mask, encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, past_key_value=past_key_value, output_attentions=output_attentions, cache_position=cache_position, diff --git a/src/transformers/models/chinese_clip/modeling_chinese_clip.py b/src/transformers/models/chinese_clip/modeling_chinese_clip.py index cf134e22c6..6d2452242d 100644 --- a/src/transformers/models/chinese_clip/modeling_chinese_clip.py +++ b/src/transformers/models/chinese_clip/modeling_chinese_clip.py @@ -31,7 +31,6 @@ from ...modeling_outputs import ( from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer from ...utils import ModelOutput, auto_docstring, can_return_tuple, logging, torch_int -from ...utils.deprecation import deprecate_kwarg from .configuration_chinese_clip import ChineseCLIPConfig, ChineseCLIPTextConfig, ChineseCLIPVisionConfig @@ -288,17 +287,11 @@ class ChineseCLIPTextSelfAttention(nn.Module): self.attention_dropout = config.attention_probs_dropout_prob self.scaling = self.attention_head_size**-0.5 - @deprecate_kwarg("encoder_hidden_states", version="4.54.0") - @deprecate_kwarg("encoder_attention_mask", version="4.54.0") - @deprecate_kwarg("past_key_value", version="4.54.0") def forward( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.FloatTensor] = None, head_mask: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - past_key_value: Optional[tuple[tuple[torch.FloatTensor]]] = None, output_attentions: Optional[bool] = False, **kwargs, ) -> tuple[torch.Tensor]: @@ -371,17 +364,11 @@ class ChineseCLIPTextAttention(nn.Module): self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads self.pruned_heads = self.pruned_heads.union(heads) - @deprecate_kwarg("encoder_hidden_states", version="4.54.0") - @deprecate_kwarg("encoder_attention_mask", version="4.54.0") - @deprecate_kwarg("past_key_value", version="4.54.0") def forward( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.FloatTensor] = None, head_mask: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - past_key_value: Optional[tuple[tuple[torch.FloatTensor]]] = None, output_attentions: Optional[bool] = False, **kwargs, ) -> tuple[torch.Tensor]: @@ -509,17 +496,11 @@ class ChineseCLIPTextLayer(GradientCheckpointingLayer): self.intermediate = ChineseCLIPTextIntermediate(config) self.output = ChineseCLIPTextOutput(config) - @deprecate_kwarg("encoder_hidden_states", version="4.54.0") - @deprecate_kwarg("encoder_attention_mask", version="4.54.0") - @deprecate_kwarg("past_key_value", version="4.54.0") def forward( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.FloatTensor] = None, head_mask: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - past_key_value: Optional[tuple[tuple[torch.FloatTensor]]] = None, output_attentions: Optional[bool] = False, **kwargs, ) -> tuple[torch.Tensor]: @@ -667,20 +648,12 @@ class ChineseCLIPTextEncoder(nn.Module): self.layer = nn.ModuleList([ChineseCLIPTextLayer(config) for i in range(config.num_hidden_layers)]) self.gradient_checkpointing = False - @deprecate_kwarg("encoder_hidden_states", version="4.54.0") - @deprecate_kwarg("encoder_attention_mask", version="4.54.0") - @deprecate_kwarg("past_key_values", version="4.54.0") - @deprecate_kwarg("use_cache", version="4.54.0") @can_return_tuple def forward( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.FloatTensor] = None, head_mask: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - past_key_values: Optional[tuple[tuple[torch.FloatTensor]]] = None, - use_cache: Optional[bool] = None, output_attentions: Optional[bool] = False, output_hidden_states: Optional[bool] = False, return_dict: Optional[bool] = True, diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index 87b45900f4..cf52cff806 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -33,7 +33,6 @@ from ...modeling_outputs import ( from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, meshgrid, prune_linear_layer from ...utils import ModelOutput, auto_docstring, can_return_tuple, logging, torch_int -from ...utils.deprecation import deprecate_kwarg from .configuration_clap import ClapAudioConfig, ClapConfig, ClapTextConfig @@ -1137,17 +1136,11 @@ class ClapTextSelfAttention(nn.Module): self.attention_dropout = config.attention_probs_dropout_prob self.scaling = self.attention_head_size**-0.5 - @deprecate_kwarg("encoder_hidden_states", version="4.54.0") - @deprecate_kwarg("encoder_attention_mask", version="4.54.0") - @deprecate_kwarg("past_key_value", version="4.54.0") def forward( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.FloatTensor] = None, head_mask: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - past_key_value: Optional[tuple[tuple[torch.FloatTensor]]] = None, output_attentions: Optional[bool] = False, **kwargs, ) -> tuple[torch.Tensor]: @@ -1220,17 +1213,11 @@ class ClapTextAttention(nn.Module): self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads self.pruned_heads = self.pruned_heads.union(heads) - @deprecate_kwarg("encoder_hidden_states", version="4.54.0") - @deprecate_kwarg("encoder_attention_mask", version="4.54.0") - @deprecate_kwarg("past_key_value", version="4.54.0") def forward( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.FloatTensor] = None, head_mask: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - past_key_value: Optional[tuple[tuple[torch.FloatTensor]]] = None, output_attentions: Optional[bool] = False, **kwargs, ) -> tuple[torch.Tensor]: @@ -1287,17 +1274,11 @@ class ClapTextLayer(GradientCheckpointingLayer): self.intermediate = ClapTextIntermediate(config) self.output = ClapTextOutput(config) - @deprecate_kwarg("encoder_hidden_states", version="4.54.0") - @deprecate_kwarg("encoder_attention_mask", version="4.54.0") - @deprecate_kwarg("past_key_value", version="4.54.0") def forward( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.FloatTensor] = None, head_mask: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - past_key_value: Optional[tuple[tuple[torch.FloatTensor]]] = None, output_attentions: Optional[bool] = False, **kwargs, ) -> tuple[torch.Tensor]: @@ -1332,20 +1313,12 @@ class ClapTextEncoder(nn.Module): self.layer = nn.ModuleList([ClapTextLayer(config) for i in range(config.num_hidden_layers)]) self.gradient_checkpointing = False - @deprecate_kwarg("encoder_hidden_states", version="4.54.0") - @deprecate_kwarg("encoder_attention_mask", version="4.54.0") - @deprecate_kwarg("past_key_values", version="4.54.0") - @deprecate_kwarg("use_cache", version="4.54.0") @can_return_tuple def forward( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.FloatTensor] = None, head_mask: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - past_key_values: Optional[tuple[tuple[torch.FloatTensor]]] = None, - use_cache: Optional[bool] = None, output_attentions: Optional[bool] = False, output_hidden_states: Optional[bool] = False, return_dict: Optional[bool] = True, @@ -1526,10 +1499,6 @@ class ClapTextModel(ClapPreTrainedModel): def set_input_embeddings(self, value): self.embeddings.word_embeddings = value - @deprecate_kwarg("encoder_hidden_states", version="4.54.0") - @deprecate_kwarg("encoder_attention_mask", version="4.54.0") - @deprecate_kwarg("past_key_values", version="4.54.0") - @deprecate_kwarg("use_cache", version="4.54.0") @can_return_tuple @auto_docstring def forward( @@ -1540,10 +1509,6 @@ class ClapTextModel(ClapPreTrainedModel): position_ids: Optional[torch.Tensor] = None, head_mask: Optional[torch.Tensor] = None, inputs_embeds: Optional[torch.Tensor] = None, - encoder_hidden_states: Optional[torch.Tensor] = None, - encoder_attention_mask: Optional[torch.Tensor] = None, - past_key_values: Optional[list[torch.FloatTensor]] = None, - use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, diff --git a/src/transformers/models/cohere2/modeling_cohere2.py b/src/transformers/models/cohere2/modeling_cohere2.py index 88c3afe607..4472251a9d 100644 --- a/src/transformers/models/cohere2/modeling_cohere2.py +++ b/src/transformers/models/cohere2/modeling_cohere2.py @@ -35,7 +35,6 @@ from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel from ...processing_utils import Unpack from ...utils import TransformersKwargs, auto_docstring, can_return_tuple -from ...utils.deprecation import deprecate_kwarg from ...utils.generic import check_model_inputs from .configuration_cohere2 import Cohere2Config @@ -264,7 +263,6 @@ class Cohere2DecoderLayer(GradientCheckpointingLayer): self.input_layernorm = Cohere2LayerNorm(hidden_size=(config.hidden_size), eps=config.layer_norm_eps) self.attention_type = config.layer_types[layer_idx] - @deprecate_kwarg("last_cache_position", version="4.53.0") def forward( self, hidden_states: torch.Tensor, diff --git a/src/transformers/models/cohere2/modular_cohere2.py b/src/transformers/models/cohere2/modular_cohere2.py index 369b2ecae3..f0fa8f12ac 100644 --- a/src/transformers/models/cohere2/modular_cohere2.py +++ b/src/transformers/models/cohere2/modular_cohere2.py @@ -28,7 +28,6 @@ from ...modeling_rope_utils import rope_config_validation from ...modeling_utils import ALL_ATTENTION_FUNCTIONS from ...processing_utils import Unpack from ...utils import TransformersKwargs, logging -from ...utils.deprecation import deprecate_kwarg from ..cohere.modeling_cohere import ( CohereAttention, CohereDecoderLayer, @@ -348,7 +347,6 @@ class Cohere2DecoderLayer(CohereDecoderLayer): super().__init__(config, layer_idx) self.attention_type = config.layer_types[layer_idx] - @deprecate_kwarg("last_cache_position", version="4.53.0") def forward( self, hidden_states: torch.Tensor, diff --git a/src/transformers/models/data2vec/modeling_data2vec_audio.py b/src/transformers/models/data2vec/modeling_data2vec_audio.py index 622e28b008..c29bf8f6c1 100755 --- a/src/transformers/models/data2vec/modeling_data2vec_audio.py +++ b/src/transformers/models/data2vec/modeling_data2vec_audio.py @@ -45,7 +45,6 @@ from ...modeling_outputs import ( from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel from ...processing_utils import Unpack from ...utils import auto_docstring, is_peft_available, is_torch_flex_attn_available -from ...utils.deprecation import deprecate_kwarg from .configuration_data2vec_audio import Data2VecAudioConfig @@ -241,7 +240,6 @@ class Data2VecAudioAttention(nn.Module): self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias) self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) - @deprecate_kwarg("past_key_value", version="4.54.0") def forward( self, hidden_states: torch.Tensor, diff --git a/src/transformers/models/data2vec/modeling_data2vec_text.py b/src/transformers/models/data2vec/modeling_data2vec_text.py index 32afbfc320..8e0f308faf 100644 --- a/src/transformers/models/data2vec/modeling_data2vec_text.py +++ b/src/transformers/models/data2vec/modeling_data2vec_text.py @@ -39,7 +39,6 @@ from ...modeling_outputs import ( from ...modeling_utils import PreTrainedModel from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer from ...utils import auto_docstring, logging -from ...utils.deprecation import deprecate_kwarg from .configuration_data2vec_text import Data2VecTextConfig @@ -168,14 +167,12 @@ class Data2VecTextSelfAttention(nn.Module): self.is_decoder = config.is_decoder self.layer_idx = layer_idx - @deprecate_kwarg("encoder_attention_mask", version="4.55.0") def forward( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.FloatTensor] = None, head_mask: Optional[torch.FloatTensor] = None, encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, past_key_value: Optional[Cache] = None, output_attentions: Optional[bool] = False, cache_position: Optional[torch.Tensor] = None, @@ -186,13 +183,7 @@ class Data2VecTextSelfAttention(nn.Module): 1, 2 ) - # If this is instantiated as a cross-attention module, the keys - # and values come from an encoder; the attention mask needs to be - # such that the encoder's padding tokens are not attended to. is_cross_attention = encoder_hidden_states is not None - if is_cross_attention and encoder_attention_mask is not None: - attention_mask = encoder_attention_mask - if past_key_value is not None: if isinstance(past_key_value, EncoderDecoderCache): is_updated = past_key_value.is_updated.get(self.layer_idx) @@ -329,14 +320,12 @@ class Data2VecTextAttention(nn.Module): self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads self.pruned_heads = self.pruned_heads.union(heads) - @deprecate_kwarg("encoder_attention_mask", version="4.55.0") def forward( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.FloatTensor] = None, head_mask: Optional[torch.FloatTensor] = None, encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, past_key_value: Optional[Cache] = None, output_attentions: Optional[bool] = False, cache_position: Optional[torch.Tensor] = None, @@ -346,7 +335,6 @@ class Data2VecTextAttention(nn.Module): attention_mask=attention_mask, head_mask=head_mask, encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, past_key_value=past_key_value, output_attentions=output_attentions, cache_position=cache_position, diff --git a/src/transformers/models/decision_transformer/modeling_decision_transformer.py b/src/transformers/models/decision_transformer/modeling_decision_transformer.py index 6a492e937a..540b6136bb 100755 --- a/src/transformers/models/decision_transformer/modeling_decision_transformer.py +++ b/src/transformers/models/decision_transformer/modeling_decision_transformer.py @@ -34,7 +34,6 @@ from ...utils import ( auto_docstring, logging, ) -from ...utils.deprecation import deprecate_kwarg from .configuration_decision_transformer import DecisionTransformerConfig @@ -256,7 +255,6 @@ class DecisionTransformerGPT2Attention(nn.Module): return attn_output, attn_weights - @deprecate_kwarg("layer_past", new_name="past_key_value", version="4.53.0", raise_if_both_names=True) def forward( self, hidden_states: Optional[tuple[torch.FloatTensor]], @@ -380,7 +378,6 @@ class DecisionTransformerGPT2Block(GradientCheckpointingLayer): self.mlp = DecisionTransformerGPT2MLP(inner_dim, config) - @deprecate_kwarg("layer_past", new_name="past_key_value", version="4.53.0", raise_if_both_names=True) def forward( self, hidden_states: Optional[tuple[torch.FloatTensor]], diff --git a/src/transformers/models/dpt/modular_dpt.py b/src/transformers/models/dpt/modular_dpt.py index 46cefe530f..584181701b 100644 --- a/src/transformers/models/dpt/modular_dpt.py +++ b/src/transformers/models/dpt/modular_dpt.py @@ -139,9 +139,6 @@ class DPTImageProcessorFast(BeitImageProcessorFast): valid_kwargs = DPTFastImageProcessorKwargs - def from_dict(): - raise NotImplementedError("No need to override this method") - def resize( self, image: "torch.Tensor", diff --git a/src/transformers/models/electra/modeling_electra.py b/src/transformers/models/electra/modeling_electra.py index ef7e8051e6..67dbe02c20 100644 --- a/src/transformers/models/electra/modeling_electra.py +++ b/src/transformers/models/electra/modeling_electra.py @@ -41,7 +41,6 @@ from ...modeling_outputs import ( from ...modeling_utils import PreTrainedModel from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer from ...utils import ModelOutput, auto_docstring, logging -from ...utils.deprecation import deprecate_kwarg from .configuration_electra import ElectraConfig @@ -225,14 +224,12 @@ class ElectraSelfAttention(nn.Module): self.is_decoder = config.is_decoder self.layer_idx = layer_idx - @deprecate_kwarg("encoder_attention_mask", version="4.55.0") def forward( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.FloatTensor] = None, head_mask: Optional[torch.FloatTensor] = None, encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, past_key_value: Optional[Cache] = None, output_attentions: Optional[bool] = False, cache_position: Optional[torch.Tensor] = None, @@ -243,13 +240,7 @@ class ElectraSelfAttention(nn.Module): 1, 2 ) - # If this is instantiated as a cross-attention module, the keys - # and values come from an encoder; the attention mask needs to be - # such that the encoder's padding tokens are not attended to. is_cross_attention = encoder_hidden_states is not None - if is_cross_attention and encoder_attention_mask is not None: - attention_mask = encoder_attention_mask - if past_key_value is not None: if isinstance(past_key_value, EncoderDecoderCache): is_updated = past_key_value.is_updated.get(self.layer_idx) @@ -386,14 +377,12 @@ class ElectraAttention(nn.Module): self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads self.pruned_heads = self.pruned_heads.union(heads) - @deprecate_kwarg("encoder_attention_mask", version="4.55.0") def forward( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.FloatTensor] = None, head_mask: Optional[torch.FloatTensor] = None, encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, past_key_value: Optional[Cache] = None, output_attentions: Optional[bool] = False, cache_position: Optional[torch.Tensor] = None, @@ -403,7 +392,6 @@ class ElectraAttention(nn.Module): attention_mask=attention_mask, head_mask=head_mask, encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, past_key_value=past_key_value, output_attentions=output_attentions, cache_position=cache_position, diff --git a/src/transformers/models/ernie/modeling_ernie.py b/src/transformers/models/ernie/modeling_ernie.py index d291db4e6b..f2aaa89723 100644 --- a/src/transformers/models/ernie/modeling_ernie.py +++ b/src/transformers/models/ernie/modeling_ernie.py @@ -42,7 +42,6 @@ from ...modeling_outputs import ( from ...modeling_utils import PreTrainedModel from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer from ...utils import ModelOutput, auto_docstring, logging -from ...utils.deprecation import deprecate_kwarg from .configuration_ernie import ErnieConfig @@ -154,14 +153,12 @@ class ErnieSelfAttention(nn.Module): self.is_decoder = config.is_decoder self.layer_idx = layer_idx - @deprecate_kwarg("encoder_attention_mask", version="4.55.0") def forward( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.FloatTensor] = None, head_mask: Optional[torch.FloatTensor] = None, encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, past_key_value: Optional[Cache] = None, output_attentions: Optional[bool] = False, cache_position: Optional[torch.Tensor] = None, @@ -172,13 +169,7 @@ class ErnieSelfAttention(nn.Module): 1, 2 ) - # If this is instantiated as a cross-attention module, the keys - # and values come from an encoder; the attention mask needs to be - # such that the encoder's padding tokens are not attended to. is_cross_attention = encoder_hidden_states is not None - if is_cross_attention and encoder_attention_mask is not None: - attention_mask = encoder_attention_mask - if past_key_value is not None: if isinstance(past_key_value, EncoderDecoderCache): is_updated = past_key_value.is_updated.get(self.layer_idx) @@ -315,14 +306,12 @@ class ErnieAttention(nn.Module): self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads self.pruned_heads = self.pruned_heads.union(heads) - @deprecate_kwarg("encoder_attention_mask", version="4.55.0") def forward( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.FloatTensor] = None, head_mask: Optional[torch.FloatTensor] = None, encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, past_key_value: Optional[Cache] = None, output_attentions: Optional[bool] = False, cache_position: Optional[torch.Tensor] = None, @@ -332,7 +321,6 @@ class ErnieAttention(nn.Module): attention_mask=attention_mask, head_mask=head_mask, encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, past_key_value=past_key_value, output_attentions=output_attentions, cache_position=cache_position, diff --git a/src/transformers/models/esm/modeling_esm.py b/src/transformers/models/esm/modeling_esm.py index 71772e4ffa..9acc3625bd 100755 --- a/src/transformers/models/esm/modeling_esm.py +++ b/src/transformers/models/esm/modeling_esm.py @@ -34,7 +34,6 @@ from ...modeling_outputs import ( ) from ...modeling_utils import PreTrainedModel, find_pruneable_heads_and_indices, prune_linear_layer from ...utils import auto_docstring, can_return_tuple, logging -from ...utils.deprecation import deprecate_kwarg from .configuration_esm import EsmConfig @@ -287,7 +286,6 @@ class EsmSelfAttention(nn.Module): self.is_decoder = config.is_decoder self.layer_idx = layer_idx - @deprecate_kwarg("past_key_value", version="4.54.0") def forward( self, hidden_states: torch.Tensor, @@ -295,7 +293,6 @@ class EsmSelfAttention(nn.Module): head_mask: Optional[torch.FloatTensor] = None, encoder_hidden_states: Optional[torch.FloatTensor] = None, encoder_attention_mask: Optional[torch.FloatTensor] = None, - past_key_value: Optional[tuple[tuple[torch.FloatTensor]]] = None, output_attentions: Optional[bool] = False, ) -> tuple[torch.Tensor]: hidden_shape = (hidden_states.shape[0], -1, self.num_attention_heads, self.attention_head_size) @@ -400,7 +397,6 @@ class EsmFlashAttention2(EsmSelfAttention): self._flash_attn_uses_top_left_mask = flash_attn_supports_top_left_mask() self.dropout_prob = config.attention_probs_dropout_prob - @deprecate_kwarg("past_key_value", version="4.54.0") def forward( self, hidden_states: torch.Tensor, @@ -408,7 +404,6 @@ class EsmFlashAttention2(EsmSelfAttention): head_mask: Optional[torch.FloatTensor] = None, encoder_hidden_states: Optional[torch.FloatTensor] = None, encoder_attention_mask: Optional[torch.FloatTensor] = None, - past_key_value: Optional[tuple[tuple[torch.FloatTensor]]] = None, output_attentions: Optional[bool] = False, ) -> tuple[torch.Tensor]: # Flash attention doesn't support output_attentions or cross attention @@ -530,7 +525,6 @@ class EsmAttention(nn.Module): self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads self.pruned_heads = self.pruned_heads.union(heads) - @deprecate_kwarg("past_key_value", version="4.54.0") def forward( self, hidden_states, @@ -538,7 +532,6 @@ class EsmAttention(nn.Module): head_mask=None, encoder_hidden_states=None, encoder_attention_mask=None, - past_key_value=None, output_attentions=False, cache_position=None, ): @@ -596,7 +589,6 @@ class EsmLayer(GradientCheckpointingLayer): self.output = EsmOutput(config) self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) - @deprecate_kwarg("past_key_value", version="4.54.0") def forward( self, hidden_states, @@ -604,7 +596,6 @@ class EsmLayer(GradientCheckpointingLayer): head_mask=None, encoder_hidden_states=None, encoder_attention_mask=None, - past_key_value=None, output_attentions=False, cache_position=None, ): @@ -664,8 +655,6 @@ class EsmEncoder(nn.Module): self.emb_layer_norm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.gradient_checkpointing = False - @deprecate_kwarg("past_key_value", version="4.54.0") - @deprecate_kwarg("use_cache", version="4.54.0") @can_return_tuple def forward( self, @@ -674,8 +663,6 @@ class EsmEncoder(nn.Module): head_mask=None, encoder_hidden_states=None, encoder_attention_mask=None, - past_key_values=None, - use_cache=None, output_attentions=False, output_hidden_states=False, return_dict=True, @@ -818,8 +805,6 @@ class EsmModel(EsmPreTrainedModel): for layer, heads in heads_to_prune.items(): self.encoder.layer[layer].attention.prune_heads(heads) - @deprecate_kwarg("past_key_values", version="4.54.0") - @deprecate_kwarg("use_cache", version="4.54.0") @can_return_tuple @auto_docstring def forward( @@ -831,8 +816,6 @@ class EsmModel(EsmPreTrainedModel): inputs_embeds: Optional[torch.Tensor] = None, encoder_hidden_states: Optional[torch.Tensor] = None, encoder_attention_mask: Optional[torch.Tensor] = None, - past_key_values: Optional[list[torch.FloatTensor]] = None, - use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, diff --git a/src/transformers/models/evolla/modeling_evolla.py b/src/transformers/models/evolla/modeling_evolla.py index 8f91f60056..d65653adbc 100644 --- a/src/transformers/models/evolla/modeling_evolla.py +++ b/src/transformers/models/evolla/modeling_evolla.py @@ -52,7 +52,6 @@ from ...modeling_utils import ( ) from ...processing_utils import Unpack from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging -from ...utils.deprecation import deprecate_kwarg from ...utils.generic import check_model_inputs from .configuration_evolla import EvollaConfig, SaProtConfig @@ -265,7 +264,6 @@ class EvollaSaProtSelfAttention(nn.Module): self.is_decoder = config.is_decoder self.layer_idx = layer_idx - @deprecate_kwarg("past_key_value", version="4.54.0") def forward( self, hidden_states: torch.Tensor, @@ -273,7 +271,6 @@ class EvollaSaProtSelfAttention(nn.Module): head_mask: Optional[torch.FloatTensor] = None, encoder_hidden_states: Optional[torch.FloatTensor] = None, encoder_attention_mask: Optional[torch.FloatTensor] = None, - past_key_value: Optional[tuple[tuple[torch.FloatTensor]]] = None, output_attentions: Optional[bool] = False, ) -> tuple[torch.Tensor]: hidden_shape = (hidden_states.shape[0], -1, self.num_attention_heads, self.attention_head_size) @@ -378,7 +375,6 @@ class EvollaSaProtFlashAttention2(EvollaSaProtSelfAttention): self._flash_attn_uses_top_left_mask = flash_attn_supports_top_left_mask() self.dropout_prob = config.attention_probs_dropout_prob - @deprecate_kwarg("past_key_value", version="4.54.0") def forward( self, hidden_states: torch.Tensor, @@ -386,7 +382,6 @@ class EvollaSaProtFlashAttention2(EvollaSaProtSelfAttention): head_mask: Optional[torch.FloatTensor] = None, encoder_hidden_states: Optional[torch.FloatTensor] = None, encoder_attention_mask: Optional[torch.FloatTensor] = None, - past_key_value: Optional[tuple[tuple[torch.FloatTensor]]] = None, output_attentions: Optional[bool] = False, ) -> tuple[torch.Tensor]: # Flash attention doesn't support output_attentions or cross attention @@ -508,7 +503,6 @@ class EvollaSaProtAttention(nn.Module): self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads self.pruned_heads = self.pruned_heads.union(heads) - @deprecate_kwarg("past_key_value", version="4.54.0") def forward( self, hidden_states, @@ -516,7 +510,6 @@ class EvollaSaProtAttention(nn.Module): head_mask=None, encoder_hidden_states=None, encoder_attention_mask=None, - past_key_value=None, output_attentions=False, cache_position=None, ): @@ -581,7 +574,6 @@ class EvollaSaProtLayer(GradientCheckpointingLayer): self.output = EvollaSaProtOutput(config) self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) - @deprecate_kwarg("past_key_value", version="4.54.0") def forward( self, hidden_states, @@ -589,7 +581,6 @@ class EvollaSaProtLayer(GradientCheckpointingLayer): head_mask=None, encoder_hidden_states=None, encoder_attention_mask=None, - past_key_value=None, output_attentions=False, cache_position=None, ): @@ -649,8 +640,6 @@ class EvollaSaProtEncoder(nn.Module): self.emb_layer_norm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.gradient_checkpointing = False - @deprecate_kwarg("past_key_value", version="4.54.0") - @deprecate_kwarg("use_cache", version="4.54.0") @can_return_tuple def forward( self, @@ -659,8 +648,6 @@ class EvollaSaProtEncoder(nn.Module): head_mask=None, encoder_hidden_states=None, encoder_attention_mask=None, - past_key_values=None, - use_cache=None, output_attentions=False, output_hidden_states=False, return_dict=True, diff --git a/src/transformers/models/gemma2/modeling_gemma2.py b/src/transformers/models/gemma2/modeling_gemma2.py index 2ce51042ed..a76e390012 100644 --- a/src/transformers/models/gemma2/modeling_gemma2.py +++ b/src/transformers/models/gemma2/modeling_gemma2.py @@ -39,7 +39,6 @@ from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel from ...processing_utils import Unpack from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging -from ...utils.deprecation import deprecate_kwarg from ...utils.generic import check_model_inputs from .configuration_gemma2 import Gemma2Config @@ -252,7 +251,6 @@ class Gemma2DecoderLayer(GradientCheckpointingLayer): self.pre_feedforward_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) self.post_feedforward_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - @deprecate_kwarg("last_cache_position", version="4.53.0") def forward( self, hidden_states: torch.Tensor, diff --git a/src/transformers/models/gemma2/modular_gemma2.py b/src/transformers/models/gemma2/modular_gemma2.py index 38d7c8cd09..1f22987e67 100644 --- a/src/transformers/models/gemma2/modular_gemma2.py +++ b/src/transformers/models/gemma2/modular_gemma2.py @@ -29,7 +29,6 @@ from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast from ...modeling_utils import ALL_ATTENTION_FUNCTIONS from ...processing_utils import Unpack from ...utils import TransformersKwargs, logging -from ...utils.deprecation import deprecate_kwarg from ..gemma.modeling_gemma import ( GemmaAttention, GemmaForCausalLM, @@ -317,7 +316,6 @@ class Gemma2DecoderLayer(GradientCheckpointingLayer): self.pre_feedforward_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) self.post_feedforward_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - @deprecate_kwarg("last_cache_position", version="4.53.0") def forward( self, hidden_states: torch.Tensor, diff --git a/src/transformers/models/gemma3/modeling_gemma3.py b/src/transformers/models/gemma3/modeling_gemma3.py index 307b671844..76eb0b697b 100644 --- a/src/transformers/models/gemma3/modeling_gemma3.py +++ b/src/transformers/models/gemma3/modeling_gemma3.py @@ -46,7 +46,6 @@ from ...utils import ( is_torchdynamo_compiling, logging, ) -from ...utils.deprecation import deprecate_kwarg from ...utils.generic import check_model_inputs from ..auto import AutoModel from .configuration_gemma3 import Gemma3Config, Gemma3TextConfig @@ -366,7 +365,6 @@ class Gemma3DecoderLayer(GradientCheckpointingLayer): self.pre_feedforward_layernorm = Gemma3RMSNorm(self.hidden_size, eps=config.rms_norm_eps) self.post_feedforward_layernorm = Gemma3RMSNorm(self.hidden_size, eps=config.rms_norm_eps) - @deprecate_kwarg("last_cache_position", version="4.53.0") def forward( self, hidden_states: torch.Tensor, diff --git a/src/transformers/models/gemma3/modular_gemma3.py b/src/transformers/models/gemma3/modular_gemma3.py index 5bf158c00f..c2ad52f108 100644 --- a/src/transformers/models/gemma3/modular_gemma3.py +++ b/src/transformers/models/gemma3/modular_gemma3.py @@ -32,7 +32,6 @@ from ...modeling_rope_utils import rope_config_validation from ...modeling_utils import ALL_ATTENTION_FUNCTIONS from ...processing_utils import Unpack from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, is_torchdynamo_compiling, logging -from ...utils.deprecation import deprecate_kwarg from ..gemma2.configuration_gemma2 import Gemma2Config from ..gemma2.modeling_gemma2 import ( Gemma2Attention, @@ -464,7 +463,6 @@ class Gemma3DecoderLayer(GradientCheckpointingLayer): self.pre_feedforward_layernorm = Gemma3RMSNorm(self.hidden_size, eps=config.rms_norm_eps) self.post_feedforward_layernorm = Gemma3RMSNorm(self.hidden_size, eps=config.rms_norm_eps) - @deprecate_kwarg("last_cache_position", version="4.53.0") def forward( self, hidden_states: torch.Tensor, diff --git a/src/transformers/models/gemma3n/modeling_gemma3n.py b/src/transformers/models/gemma3n/modeling_gemma3n.py index 3c304bbcf6..8a00675487 100644 --- a/src/transformers/models/gemma3n/modeling_gemma3n.py +++ b/src/transformers/models/gemma3n/modeling_gemma3n.py @@ -47,7 +47,6 @@ from ...utils import ( is_torchdynamo_compiling, logging, ) -from ...utils.deprecation import deprecate_kwarg from ..auto import AutoModel from .configuration_gemma3n import Gemma3nAudioConfig, Gemma3nConfig, Gemma3nTextConfig, Gemma3nVisionConfig @@ -1408,7 +1407,6 @@ class Gemma3nTextDecoderLayer(GradientCheckpointingLayer): self.per_layer_projection = nn.Linear(self.hidden_size_per_layer_input, self.hidden_size, bias=False) self.post_per_layer_input_norm = Gemma3nRMSNorm(self.hidden_size, eps=config.rms_norm_eps) - @deprecate_kwarg("last_cache_position", version="4.53.0") def forward( self, hidden_states: torch.Tensor, diff --git a/src/transformers/models/gpt2/modeling_gpt2.py b/src/transformers/models/gpt2/modeling_gpt2.py index 80442af911..c3155b17ea 100644 --- a/src/transformers/models/gpt2/modeling_gpt2.py +++ b/src/transformers/models/gpt2/modeling_gpt2.py @@ -46,7 +46,6 @@ from ...utils import ( auto_docstring, logging, ) -from ...utils.deprecation import deprecate_kwarg from ...utils.model_parallel_utils import assert_device_map, get_device_map from .configuration_gpt2 import GPT2Config @@ -266,7 +265,6 @@ class GPT2Attention(nn.Module): return attn_output, attn_weights - @deprecate_kwarg("layer_past", new_name="past_key_value", version="4.53.0", raise_if_both_names=True) def forward( self, hidden_states: Optional[tuple[torch.FloatTensor]], @@ -385,7 +383,6 @@ class GPT2Block(GradientCheckpointingLayer): self.mlp = GPT2MLP(inner_dim, config) - @deprecate_kwarg("layer_past", new_name="past_key_value", version="4.53.0", raise_if_both_names=True) def forward( self, hidden_states: Optional[tuple[torch.FloatTensor]], diff --git a/src/transformers/models/grounding_dino/processing_grounding_dino.py b/src/transformers/models/grounding_dino/processing_grounding_dino.py index 5d97185d94..3bd0b98806 100644 --- a/src/transformers/models/grounding_dino/processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/processing_grounding_dino.py @@ -26,7 +26,6 @@ from ...image_utils import AnnotationFormat, ImageInput from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput from ...utils import TensorType, is_torch_available -from ...utils.deprecation import deprecate_kwarg if is_torch_available(): @@ -239,7 +238,6 @@ class GroundingDinoProcessor(ProcessorMixin): image_processor_input_names = self.image_processor.model_input_names return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names)) - @deprecate_kwarg("box_threshold", new_name="threshold", version="4.51.0") def post_process_grounded_object_detection( self, outputs: "GroundingDinoObjectDetectionOutput", diff --git a/src/transformers/models/hubert/modeling_hubert.py b/src/transformers/models/hubert/modeling_hubert.py index 63df33beeb..19f3302c5a 100755 --- a/src/transformers/models/hubert/modeling_hubert.py +++ b/src/transformers/models/hubert/modeling_hubert.py @@ -37,7 +37,6 @@ from ...modeling_outputs import BaseModelOutput, CausalLMOutput, SequenceClassif from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel from ...processing_utils import Unpack from ...utils import auto_docstring, is_torch_flex_attn_available, logging -from ...utils.deprecation import deprecate_kwarg from .configuration_hubert import HubertConfig @@ -301,7 +300,6 @@ class HubertAttention(nn.Module): self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias) self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) - @deprecate_kwarg("past_key_value", version="4.54.0") def forward( self, hidden_states: torch.Tensor, diff --git a/src/transformers/models/instructblip/modeling_instructblip.py b/src/transformers/models/instructblip/modeling_instructblip.py index bcafeeec1e..55d7d32f0f 100644 --- a/src/transformers/models/instructblip/modeling_instructblip.py +++ b/src/transformers/models/instructblip/modeling_instructblip.py @@ -23,7 +23,6 @@ import torch.utils.checkpoint from torch import nn from ...activations import ACT2FN -from ...cache_utils import Cache from ...generation import GenerationMixin from ...modeling_flash_attention_utils import FlashAttentionKwargs from ...modeling_layers import GradientCheckpointingLayer @@ -37,7 +36,6 @@ from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel from ...processing_utils import Unpack from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer from ...utils import ModelOutput, TransformersKwargs, auto_docstring, can_return_tuple, logging, torch_int -from ...utils.deprecation import deprecate_kwarg from ..auto import AutoModel, AutoModelForCausalLM, AutoModelForSeq2SeqLM from .configuration_instructblip import InstructBlipConfig, InstructBlipQFormerConfig, InstructBlipVisionConfig @@ -558,7 +556,6 @@ class InstructBlipQFormerMultiHeadAttention(nn.Module): x = x.view(*new_x_shape) return x.permute(0, 2, 1, 3) - @deprecate_kwarg("past_key_value", version="4.55.0") def forward( self, hidden_states, @@ -566,7 +563,6 @@ class InstructBlipQFormerMultiHeadAttention(nn.Module): head_mask=None, encoder_hidden_states=None, encoder_attention_mask=None, - past_key_value=None, output_attentions=False, ): # If this is instantiated as a cross-attention module, the keys @@ -679,7 +675,6 @@ class InstructBlipQFormerAttention(nn.Module): self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads self.pruned_heads = self.pruned_heads.union(heads) - @deprecate_kwarg("past_key_value", version="4.55.0") def forward( self, hidden_states: torch.Tensor, @@ -687,7 +682,6 @@ class InstructBlipQFormerAttention(nn.Module): head_mask: Optional[torch.FloatTensor] = None, encoder_hidden_states: Optional[torch.FloatTensor] = None, encoder_attention_mask: Optional[torch.FloatTensor] = None, - past_key_value: Optional[Cache] = None, output_attentions: Optional[bool] = False, ) -> tuple[torch.Tensor]: self_outputs = self.attention( @@ -755,7 +749,6 @@ class InstructBlipQFormerLayer(GradientCheckpointingLayer): self.intermediate_query = InstructBlipQFormerIntermediate(config) self.output_query = InstructBlipQFormerOutput(config) - @deprecate_kwarg("past_key_value", version="4.55.0") def forward( self, hidden_states, @@ -763,7 +756,6 @@ class InstructBlipQFormerLayer(GradientCheckpointingLayer): head_mask=None, encoder_hidden_states=None, encoder_attention_mask=None, - past_key_value=None, output_attentions=False, query_length=0, ): @@ -841,8 +833,6 @@ class InstructBlipQFormerEncoder(nn.Module): ) self.gradient_checkpointing = False - @deprecate_kwarg("past_key_value", version="4.55.0") - @deprecate_kwarg("use_cache", version="4.55.0") def forward( self, hidden_states, @@ -850,8 +840,6 @@ class InstructBlipQFormerEncoder(nn.Module): head_mask=None, encoder_hidden_states=None, encoder_attention_mask=None, - past_key_values=None, - use_cache=None, output_attentions=False, output_hidden_states=False, return_dict=True, @@ -1035,8 +1023,6 @@ class InstructBlipQFormerModel(InstructBlipPreTrainedModel): extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 return extended_attention_mask - @deprecate_kwarg("past_key_value", version="4.55.0") - @deprecate_kwarg("use_cache", version="4.55.0") def forward( self, input_ids: torch.LongTensor, @@ -1046,8 +1032,6 @@ class InstructBlipQFormerModel(InstructBlipPreTrainedModel): head_mask: Optional[torch.FloatTensor] = None, encoder_hidden_states: Optional[torch.FloatTensor] = None, encoder_attention_mask: Optional[torch.FloatTensor] = None, - past_key_values: Optional[Cache] = None, - use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, diff --git a/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py b/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py index 8e098183e2..6f5b0f8f04 100644 --- a/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py +++ b/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py @@ -27,7 +27,6 @@ import torch from torch import nn from ...activations import ACT2FN -from ...cache_utils import Cache from ...generation import GenerationMixin from ...modeling_flash_attention_utils import FlashAttentionKwargs from ...modeling_layers import GradientCheckpointingLayer @@ -41,7 +40,6 @@ from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel from ...processing_utils import Unpack from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer from ...utils import ModelOutput, TransformersKwargs, auto_docstring, can_return_tuple, logging, torch_int -from ...utils.deprecation import deprecate_kwarg from ..auto import AutoModel, AutoModelForCausalLM, AutoModelForSeq2SeqLM from .configuration_instructblipvideo import ( InstructBlipVideoConfig, @@ -423,7 +421,6 @@ class InstructBlipVideoQFormerMultiHeadAttention(nn.Module): x = x.view(*new_x_shape) return x.permute(0, 2, 1, 3) - @deprecate_kwarg("past_key_value", version="4.55.0") def forward( self, hidden_states, @@ -431,7 +428,6 @@ class InstructBlipVideoQFormerMultiHeadAttention(nn.Module): head_mask=None, encoder_hidden_states=None, encoder_attention_mask=None, - past_key_value=None, output_attentions=False, ): # If this is instantiated as a cross-attention module, the keys @@ -542,7 +538,6 @@ class InstructBlipVideoQFormerAttention(nn.Module): self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads self.pruned_heads = self.pruned_heads.union(heads) - @deprecate_kwarg("past_key_value", version="4.55.0") def forward( self, hidden_states: torch.Tensor, @@ -550,7 +545,6 @@ class InstructBlipVideoQFormerAttention(nn.Module): head_mask: Optional[torch.FloatTensor] = None, encoder_hidden_states: Optional[torch.FloatTensor] = None, encoder_attention_mask: Optional[torch.FloatTensor] = None, - past_key_value: Optional[Cache] = None, output_attentions: Optional[bool] = False, ) -> tuple[torch.Tensor]: self_outputs = self.attention( @@ -616,7 +610,6 @@ class InstructBlipVideoQFormerLayer(GradientCheckpointingLayer): self.intermediate_query = InstructBlipVideoQFormerIntermediate(config) self.output_query = InstructBlipVideoQFormerOutput(config) - @deprecate_kwarg("past_key_value", version="4.55.0") def forward( self, hidden_states, @@ -624,7 +617,6 @@ class InstructBlipVideoQFormerLayer(GradientCheckpointingLayer): head_mask=None, encoder_hidden_states=None, encoder_attention_mask=None, - past_key_value=None, output_attentions=False, query_length=0, ): @@ -701,8 +693,6 @@ class InstructBlipVideoQFormerEncoder(nn.Module): ) self.gradient_checkpointing = False - @deprecate_kwarg("past_key_value", version="4.55.0") - @deprecate_kwarg("use_cache", version="4.55.0") def forward( self, hidden_states, @@ -710,8 +700,6 @@ class InstructBlipVideoQFormerEncoder(nn.Module): head_mask=None, encoder_hidden_states=None, encoder_attention_mask=None, - past_key_values=None, - use_cache=None, output_attentions=False, output_hidden_states=False, return_dict=True, @@ -996,8 +984,6 @@ class InstructBlipVideoQFormerModel(InstructBlipVideoPreTrainedModel): extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 return extended_attention_mask - @deprecate_kwarg("past_key_value", version="4.55.0") - @deprecate_kwarg("use_cache", version="4.55.0") def forward( self, input_ids: torch.LongTensor, @@ -1007,8 +993,6 @@ class InstructBlipVideoQFormerModel(InstructBlipVideoPreTrainedModel): head_mask: Optional[torch.FloatTensor] = None, encoder_hidden_states: Optional[torch.FloatTensor] = None, encoder_attention_mask: Optional[torch.FloatTensor] = None, - past_key_values: Optional[Cache] = None, - use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, diff --git a/src/transformers/models/layoutlm/modeling_layoutlm.py b/src/transformers/models/layoutlm/modeling_layoutlm.py index 22f8745f5b..b3b79ef99d 100644 --- a/src/transformers/models/layoutlm/modeling_layoutlm.py +++ b/src/transformers/models/layoutlm/modeling_layoutlm.py @@ -34,7 +34,6 @@ from ...modeling_outputs import ( from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer from ...utils import auto_docstring, can_return_tuple, logging -from ...utils.deprecation import deprecate_kwarg from .configuration_layoutlm import LayoutLMConfig @@ -171,17 +170,11 @@ class LayoutLMSelfAttention(nn.Module): self.attention_dropout = config.attention_probs_dropout_prob self.scaling = self.attention_head_size**-0.5 - @deprecate_kwarg("encoder_hidden_states", version="4.54.0") - @deprecate_kwarg("encoder_attention_mask", version="4.54.0") - @deprecate_kwarg("past_key_value", version="4.54.0") def forward( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.FloatTensor] = None, head_mask: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - past_key_value: Optional[tuple[tuple[torch.FloatTensor]]] = None, output_attentions: Optional[bool] = False, **kwargs, ) -> tuple[torch.Tensor]: @@ -254,17 +247,11 @@ class LayoutLMAttention(nn.Module): self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads self.pruned_heads = self.pruned_heads.union(heads) - @deprecate_kwarg("encoder_hidden_states", version="4.54.0") - @deprecate_kwarg("encoder_attention_mask", version="4.54.0") - @deprecate_kwarg("past_key_value", version="4.54.0") def forward( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.FloatTensor] = None, head_mask: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - past_key_value: Optional[tuple[tuple[torch.FloatTensor]]] = None, output_attentions: Optional[bool] = False, **kwargs, ) -> tuple[torch.Tensor]: @@ -321,17 +308,11 @@ class LayoutLMLayer(GradientCheckpointingLayer): self.intermediate = LayoutLMIntermediate(config) self.output = LayoutLMOutput(config) - @deprecate_kwarg("encoder_hidden_states", version="4.54.0") - @deprecate_kwarg("encoder_attention_mask", version="4.54.0") - @deprecate_kwarg("past_key_value", version="4.54.0") def forward( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.FloatTensor] = None, head_mask: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - past_key_value: Optional[tuple[tuple[torch.FloatTensor]]] = None, output_attentions: Optional[bool] = False, **kwargs, ) -> tuple[torch.Tensor]: @@ -366,20 +347,12 @@ class LayoutLMEncoder(nn.Module): self.layer = nn.ModuleList([LayoutLMLayer(config) for i in range(config.num_hidden_layers)]) self.gradient_checkpointing = False - @deprecate_kwarg("encoder_hidden_states", version="4.54.0") - @deprecate_kwarg("encoder_attention_mask", version="4.54.0") - @deprecate_kwarg("past_key_values", version="4.54.0") - @deprecate_kwarg("use_cache", version="4.54.0") @can_return_tuple def forward( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.FloatTensor] = None, head_mask: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - past_key_values: Optional[tuple[tuple[torch.FloatTensor]]] = None, - use_cache: Optional[bool] = None, output_attentions: Optional[bool] = False, output_hidden_states: Optional[bool] = False, return_dict: Optional[bool] = True, @@ -537,8 +510,6 @@ class LayoutLMModel(LayoutLMPreTrainedModel): for layer, heads in heads_to_prune.items(): self.encoder.layer[layer].attention.prune_heads(heads) - @deprecate_kwarg("encoder_hidden_states", version="4.54.0") - @deprecate_kwarg("encoder_attention_mask", version="4.54.0") @can_return_tuple @auto_docstring def forward( @@ -550,8 +521,6 @@ class LayoutLMModel(LayoutLMPreTrainedModel): position_ids: Optional[torch.LongTensor] = None, head_mask: Optional[torch.FloatTensor] = None, inputs_embeds: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, @@ -684,8 +653,6 @@ class LayoutLMForMaskedLM(LayoutLMPreTrainedModel): self.cls.predictions.decoder = new_embeddings self.cls.predictions.bias = new_embeddings.bias - @deprecate_kwarg("encoder_hidden_states", version="4.54.0") - @deprecate_kwarg("encoder_attention_mask", version="4.54.0") @can_return_tuple @auto_docstring def forward( @@ -698,8 +665,6 @@ class LayoutLMForMaskedLM(LayoutLMPreTrainedModel): head_mask: Optional[torch.FloatTensor] = None, inputs_embeds: Optional[torch.FloatTensor] = None, labels: Optional[torch.LongTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, diff --git a/src/transformers/models/markuplm/modeling_markuplm.py b/src/transformers/models/markuplm/modeling_markuplm.py index 3bf6553859..0dd845ecff 100755 --- a/src/transformers/models/markuplm/modeling_markuplm.py +++ b/src/transformers/models/markuplm/modeling_markuplm.py @@ -40,7 +40,6 @@ from ...modeling_utils import ( prune_linear_layer, ) from ...utils import auto_docstring, can_return_tuple, logging -from ...utils.deprecation import deprecate_kwarg from .configuration_markuplm import MarkupLMConfig @@ -378,17 +377,11 @@ class MarkupLMSelfAttention(nn.Module): self.attention_dropout = config.attention_probs_dropout_prob self.scaling = self.attention_head_size**-0.5 - @deprecate_kwarg("encoder_hidden_states", version="4.54.0") - @deprecate_kwarg("encoder_attention_mask", version="4.54.0") - @deprecate_kwarg("past_key_value", version="4.54.0") def forward( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.FloatTensor] = None, head_mask: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - past_key_value: Optional[tuple[tuple[torch.FloatTensor]]] = None, output_attentions: Optional[bool] = False, **kwargs, ) -> tuple[torch.Tensor]: @@ -446,17 +439,11 @@ class MarkupLMAttention(nn.Module): self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads self.pruned_heads = self.pruned_heads.union(heads) - @deprecate_kwarg("encoder_hidden_states", version="4.54.0") - @deprecate_kwarg("encoder_attention_mask", version="4.54.0") - @deprecate_kwarg("past_key_value", version="4.54.0") def forward( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.FloatTensor] = None, head_mask: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - past_key_value: Optional[tuple[tuple[torch.FloatTensor]]] = None, output_attentions: Optional[bool] = False, **kwargs, ) -> tuple[torch.Tensor]: @@ -482,17 +469,11 @@ class MarkupLMLayer(GradientCheckpointingLayer): self.intermediate = MarkupLMIntermediate(config) self.output = MarkupLMOutput(config) - @deprecate_kwarg("encoder_hidden_states", version="4.54.0") - @deprecate_kwarg("encoder_attention_mask", version="4.54.0") - @deprecate_kwarg("past_key_value", version="4.54.0") def forward( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.FloatTensor] = None, head_mask: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - past_key_value: Optional[tuple[tuple[torch.FloatTensor]]] = None, output_attentions: Optional[bool] = False, **kwargs, ) -> tuple[torch.Tensor]: @@ -527,20 +508,12 @@ class MarkupLMEncoder(nn.Module): self.layer = nn.ModuleList([MarkupLMLayer(config) for i in range(config.num_hidden_layers)]) self.gradient_checkpointing = False - @deprecate_kwarg("encoder_hidden_states", version="4.54.0") - @deprecate_kwarg("encoder_attention_mask", version="4.54.0") - @deprecate_kwarg("past_key_values", version="4.54.0") - @deprecate_kwarg("use_cache", version="4.54.0") @can_return_tuple def forward( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.FloatTensor] = None, head_mask: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - past_key_values: Optional[tuple[tuple[torch.FloatTensor]]] = None, - use_cache: Optional[bool] = None, output_attentions: Optional[bool] = False, output_hidden_states: Optional[bool] = False, return_dict: Optional[bool] = True, diff --git a/src/transformers/models/mask2former/image_processing_mask2former.py b/src/transformers/models/mask2former/image_processing_mask2former.py index 20610e7c77..2ccc3bb67b 100644 --- a/src/transformers/models/mask2former/image_processing_mask2former.py +++ b/src/transformers/models/mask2former/image_processing_mask2former.py @@ -50,7 +50,6 @@ from ...utils import ( is_torch_tensor, logging, ) -from ...utils.deprecation import deprecate_kwarg logger = logging.get_logger(__name__) @@ -442,9 +441,6 @@ class Mask2FormerImageProcessor(BaseImageProcessor): model_input_names = ["pixel_values", "pixel_mask"] - @deprecate_kwarg("reduce_labels", new_name="do_reduce_labels", version="4.44.0") - @deprecate_kwarg("size_divisibility", new_name="size_divisor", version="4.41.0") - @deprecate_kwarg("max_size", version="4.27.0", warn_if_greater_or_equal_version=True) @filter_out_non_signature_kwargs(extra=["max_size", *INIT_SERVICE_KWARGS]) def __init__( self, @@ -486,21 +482,6 @@ class Mask2FormerImageProcessor(BaseImageProcessor): self.num_labels = num_labels self.pad_size = pad_size - @classmethod - def from_dict(cls, image_processor_dict: dict[str, Any], **kwargs): - """ - Overrides the `from_dict` method from the base class to make sure parameters are updated if image processor is - created using from_dict and kwargs e.g. `Mask2FormerImageProcessor.from_pretrained(checkpoint, max_size=800)` - """ - image_processor_dict = image_processor_dict.copy() - if "max_size" in kwargs: - image_processor_dict["max_size"] = kwargs.pop("max_size") - if "size_divisibility" in kwargs: - image_processor_dict["size_divisor"] = kwargs.pop("size_divisibility") - if "reduce_labels" in image_processor_dict: - image_processor_dict["do_reduce_labels"] = image_processor_dict.pop("reduce_labels") - return super().from_dict(image_processor_dict, **kwargs) - # Copied from transformers.models.maskformer.image_processing_maskformer.MaskFormerImageProcessor.to_dict def to_dict(self) -> dict[str, Any]: """ @@ -511,7 +492,6 @@ class Mask2FormerImageProcessor(BaseImageProcessor): image_processor_dict.pop("_max_size", None) return image_processor_dict - @deprecate_kwarg("max_size", version="4.27.0", warn_if_greater_or_equal_version=True) # Copied from transformers.models.maskformer.image_processing_maskformer.MaskFormerImageProcessor.resize with get_maskformer_resize_output_image_size->get_mask2former_resize_output_image_size def resize( self, @@ -722,7 +702,6 @@ class Mask2FormerImageProcessor(BaseImageProcessor): segmentation_map = segmentation_map.squeeze(0) return segmentation_map - @deprecate_kwarg("reduce_labels", new_name="do_reduce_labels", version="4.44.0") @filter_out_non_signature_kwargs() def preprocess( self, diff --git a/src/transformers/models/mask2former/image_processing_mask2former_fast.py b/src/transformers/models/mask2former/image_processing_mask2former_fast.py index 1f61e9b0cd..0b942a3264 100644 --- a/src/transformers/models/mask2former/image_processing_mask2former_fast.py +++ b/src/transformers/models/mask2former/image_processing_mask2former_fast.py @@ -47,7 +47,6 @@ from ...utils import ( is_torchvision_v2_available, logging, ) -from ...utils.deprecation import deprecate_kwarg from .image_processing_mask2former import ( compute_segments, convert_segmentation_to_rle, @@ -155,9 +154,6 @@ class Mask2FormerImageProcessorFast(BaseImageProcessorFast): do_reduce_labels = False valid_kwargs = Mask2FormerFastImageProcessorKwargs - @deprecate_kwarg("reduce_labels", new_name="do_reduce_labels", version="4.44.0") - @deprecate_kwarg("size_divisibility", new_name="size_divisor", version="4.41.0") - @deprecate_kwarg("max_size", version="4.27.0", warn_if_greater_or_equal_version=True) def __init__(self, **kwargs: Unpack[Mask2FormerFastImageProcessorKwargs]) -> None: if "pad_and_return_pixel_mask" in kwargs: kwargs["do_pad"] = kwargs.pop("pad_and_return_pixel_mask") @@ -175,21 +171,6 @@ class Mask2FormerImageProcessorFast(BaseImageProcessorFast): super().__init__(**kwargs) - @classmethod - def from_dict(cls, image_processor_dict: dict[str, Any], **kwargs): - """ - Overrides the `from_dict` method from the base class to make sure parameters are updated if image processor is - created using from_dict and kwargs e.g. `Mask2FormerImageProcessor.from_pretrained(checkpoint, max_size=800)` - """ - image_processor_dict = image_processor_dict.copy() - if "max_size" in kwargs: - image_processor_dict["max_size"] = kwargs.pop("max_size") - if "size_divisibility" in kwargs: - image_processor_dict["size_divisor"] = kwargs.pop("size_divisibility") - if "reduce_labels" in image_processor_dict: - image_processor_dict["do_reduce_labels"] = image_processor_dict.pop("reduce_labels") - return super().from_dict(image_processor_dict, **kwargs) - def to_dict(self) -> dict[str, Any]: """ Serializes this instance to a Python dictionary. This method calls the superclass method and then removes the diff --git a/src/transformers/models/maskformer/image_processing_maskformer.py b/src/transformers/models/maskformer/image_processing_maskformer.py index 52dc99b4c3..8896dad734 100644 --- a/src/transformers/models/maskformer/image_processing_maskformer.py +++ b/src/transformers/models/maskformer/image_processing_maskformer.py @@ -51,7 +51,6 @@ from ...utils import ( is_torch_tensor, logging, ) -from ...utils.deprecation import deprecate_kwarg from ...utils.import_utils import requires @@ -448,9 +447,6 @@ class MaskFormerImageProcessor(BaseImageProcessor): model_input_names = ["pixel_values", "pixel_mask"] - @deprecate_kwarg("reduce_labels", new_name="do_reduce_labels", version="4.44.0") - @deprecate_kwarg("size_divisibility", new_name="size_divisor", version="4.41.0") - @deprecate_kwarg("max_size", version="4.27.0", warn_if_greater_or_equal_version=True) @filter_out_non_signature_kwargs(extra=["max_size", *INIT_SERVICE_KWARGS]) def __init__( self, @@ -492,21 +488,6 @@ class MaskFormerImageProcessor(BaseImageProcessor): self.num_labels = num_labels self.pad_size = pad_size - @classmethod - def from_dict(cls, image_processor_dict: dict[str, Any], **kwargs): - """ - Overrides the `from_dict` method from the base class to make sure parameters are updated if image processor is - created using from_dict and kwargs e.g. `MaskFormerImageProcessor.from_pretrained(checkpoint, max_size=800)` - """ - image_processor_dict = image_processor_dict.copy() - if "max_size" in kwargs: - image_processor_dict["max_size"] = kwargs.pop("max_size") - if "size_divisibility" in kwargs: - image_processor_dict["size_divisor"] = kwargs.pop("size_divisibility") - if "reduce_labels" in image_processor_dict: - image_processor_dict["do_reduce_labels"] = image_processor_dict.pop("reduce_labels") - return super().from_dict(image_processor_dict, **kwargs) - def to_dict(self) -> dict[str, Any]: """ Serializes this instance to a Python dictionary. This method calls the superclass method and then removes the @@ -516,7 +497,6 @@ class MaskFormerImageProcessor(BaseImageProcessor): image_processor_dict.pop("_max_size", None) return image_processor_dict - @deprecate_kwarg("max_size", version="4.27.0", warn_if_greater_or_equal_version=True) def resize( self, image: np.ndarray, @@ -725,7 +705,6 @@ class MaskFormerImageProcessor(BaseImageProcessor): segmentation_map = segmentation_map.squeeze(0) return segmentation_map - @deprecate_kwarg("reduce_labels", new_name="do_reduce_labels", version="4.44.0") @filter_out_non_signature_kwargs() def preprocess( self, diff --git a/src/transformers/models/maskformer/image_processing_maskformer_fast.py b/src/transformers/models/maskformer/image_processing_maskformer_fast.py index bdd13afbe6..5a9961fa65 100644 --- a/src/transformers/models/maskformer/image_processing_maskformer_fast.py +++ b/src/transformers/models/maskformer/image_processing_maskformer_fast.py @@ -44,7 +44,6 @@ from ...utils import ( is_torchvision_v2_available, logging, ) -from ...utils.deprecation import deprecate_kwarg from .image_processing_maskformer import ( compute_segments, convert_segmentation_to_rle, @@ -156,9 +155,6 @@ class MaskFormerImageProcessorFast(BaseImageProcessorFast): do_reduce_labels = False valid_kwargs = MaskFormerFastImageProcessorKwargs - @deprecate_kwarg("reduce_labels", new_name="do_reduce_labels", version="4.44.0") - @deprecate_kwarg("size_divisibility", new_name="size_divisor", version="4.41.0") - @deprecate_kwarg("max_size", version="4.27.0", warn_if_greater_or_equal_version=True) def __init__(self, **kwargs: Unpack[MaskFormerFastImageProcessorKwargs]) -> None: if "pad_and_return_pixel_mask" in kwargs: kwargs["do_pad"] = kwargs.pop("pad_and_return_pixel_mask") @@ -176,21 +172,6 @@ class MaskFormerImageProcessorFast(BaseImageProcessorFast): super().__init__(**kwargs) - @classmethod - def from_dict(cls, image_processor_dict: dict[str, Any], **kwargs): - """ - Overrides the `from_dict` method from the base class to make sure parameters are updated if image processor is - created using from_dict and kwargs e.g. `MaskFormerImageProcessor.from_pretrained(checkpoint, max_size=800)` - """ - image_processor_dict = image_processor_dict.copy() - if "max_size" in kwargs: - image_processor_dict["max_size"] = kwargs.pop("max_size") - if "size_divisibility" in kwargs: - image_processor_dict["size_divisor"] = kwargs.pop("size_divisibility") - if "reduce_labels" in image_processor_dict: - image_processor_dict["do_reduce_labels"] = image_processor_dict.pop("reduce_labels") - return super().from_dict(image_processor_dict, **kwargs) - def to_dict(self) -> dict[str, Any]: """ Serializes this instance to a Python dictionary. This method calls the superclass method and then removes the diff --git a/src/transformers/models/megatron_bert/modeling_megatron_bert.py b/src/transformers/models/megatron_bert/modeling_megatron_bert.py index d55734c267..0f90ec3f6a 100755 --- a/src/transformers/models/megatron_bert/modeling_megatron_bert.py +++ b/src/transformers/models/megatron_bert/modeling_megatron_bert.py @@ -44,7 +44,6 @@ from ...modeling_outputs import ( from ...modeling_utils import PreTrainedModel from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer from ...utils import ModelOutput, auto_docstring, logging -from ...utils.deprecation import deprecate_kwarg from .configuration_megatron_bert import MegatronBertConfig @@ -207,14 +206,12 @@ class MegatronBertSelfAttention(nn.Module): self.is_decoder = config.is_decoder self.layer_idx = layer_idx - @deprecate_kwarg("encoder_attention_mask", version="4.55.0") def forward( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.FloatTensor] = None, head_mask: Optional[torch.FloatTensor] = None, encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, past_key_value: Optional[Cache] = None, output_attentions: Optional[bool] = False, cache_position: Optional[torch.Tensor] = None, @@ -225,13 +222,7 @@ class MegatronBertSelfAttention(nn.Module): 1, 2 ) - # If this is instantiated as a cross-attention module, the keys - # and values come from an encoder; the attention mask needs to be - # such that the encoder's padding tokens are not attended to. is_cross_attention = encoder_hidden_states is not None - if is_cross_attention and encoder_attention_mask is not None: - attention_mask = encoder_attention_mask - if past_key_value is not None: if isinstance(past_key_value, EncoderDecoderCache): is_updated = past_key_value.is_updated.get(self.layer_idx) diff --git a/src/transformers/models/omdet_turbo/processing_omdet_turbo.py b/src/transformers/models/omdet_turbo/processing_omdet_turbo.py index 4e526b4f20..5185bee97c 100644 --- a/src/transformers/models/omdet_turbo/processing_omdet_turbo.py +++ b/src/transformers/models/omdet_turbo/processing_omdet_turbo.py @@ -29,7 +29,6 @@ from ...utils import ( is_torch_available, is_torchvision_available, ) -from ...utils.deprecation import deprecate_kwarg from ...utils.import_utils import requires @@ -319,8 +318,6 @@ class OmDetTurboProcessor(ProcessorMixin): ) return height, width - @deprecate_kwarg("score_threshold", new_name="threshold", version="4.51.0") - @deprecate_kwarg("classes", new_name="text_labels", version="4.51.0") def post_process_grounded_object_detection( self, outputs: "OmDetTurboObjectDetectionOutput", diff --git a/src/transformers/models/oneformer/image_processing_oneformer.py b/src/transformers/models/oneformer/image_processing_oneformer.py index 4268edb13c..5a66918af1 100644 --- a/src/transformers/models/oneformer/image_processing_oneformer.py +++ b/src/transformers/models/oneformer/image_processing_oneformer.py @@ -53,7 +53,6 @@ from ...utils import ( is_torch_tensor, logging, ) -from ...utils.deprecation import deprecate_kwarg logger = logging.get_logger(__name__) @@ -425,8 +424,6 @@ class OneFormerImageProcessor(BaseImageProcessor): model_input_names = ["pixel_values", "pixel_mask", "task_inputs"] - @deprecate_kwarg("reduce_labels", new_name="do_reduce_labels", version="4.44.0") - @deprecate_kwarg("max_size", version="4.27.0", warn_if_greater_or_equal_version=True) @filter_out_non_signature_kwargs(extra=["max_size", "metadata", *INIT_SERVICE_KWARGS]) def __init__( self, @@ -473,16 +470,6 @@ class OneFormerImageProcessor(BaseImageProcessor): self.num_text = num_text self.num_labels = num_labels - @classmethod - def from_dict(cls, image_processor_dict: dict[str, Any], **kwargs): - """ - Overrides the `from_dict` method from the base class to save support of deprecated `reduce_labels` in old configs - """ - image_processor_dict = image_processor_dict.copy() - if "reduce_labels" in image_processor_dict: - image_processor_dict["do_reduce_labels"] = image_processor_dict.pop("reduce_labels") - return super().from_dict(image_processor_dict, **kwargs) - # Copied from transformers.models.maskformer.image_processing_maskformer.MaskFormerImageProcessor.to_dict def to_dict(self) -> dict[str, Any]: """ @@ -493,7 +480,6 @@ class OneFormerImageProcessor(BaseImageProcessor): image_processor_dict.pop("_max_size", None) return image_processor_dict - @deprecate_kwarg("max_size", version="4.27.0", warn_if_greater_or_equal_version=True) @filter_out_non_signature_kwargs(extra=["max_size"]) def resize( self, diff --git a/src/transformers/models/patchtsmixer/modeling_patchtsmixer.py b/src/transformers/models/patchtsmixer/modeling_patchtsmixer.py index 981575f42a..fcafe3ebac 100644 --- a/src/transformers/models/patchtsmixer/modeling_patchtsmixer.py +++ b/src/transformers/models/patchtsmixer/modeling_patchtsmixer.py @@ -29,7 +29,6 @@ from ...modeling_utils import ALL_ATTENTION_FUNCTIONS from ...processing_utils import Unpack from ...time_series_utils import NegativeBinomialOutput, NormalOutput, StudentTOutput from ...utils import auto_docstring, logging -from ...utils.deprecation import deprecate_kwarg from .configuration_patchtsmixer import PatchTSMixerConfig @@ -304,7 +303,6 @@ class PatchTSMixerAttention(nn.Module): self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias) self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) - @deprecate_kwarg("past_key_value", version="4.54.0") def forward( self, hidden_states: torch.Tensor, diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 3f4b0c95e4..9a22657cbf 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -28,7 +28,6 @@ from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel from ...processing_utils import Unpack from ...time_series_utils import NegativeBinomialOutput, NormalOutput, StudentTOutput from ...utils import ModelOutput, auto_docstring, logging -from ...utils.deprecation import deprecate_kwarg from .configuration_patchtst import PatchTSTConfig @@ -101,7 +100,6 @@ class PatchTSTAttention(nn.Module): self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias) self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) - @deprecate_kwarg("past_key_value", version="4.54.0") def forward( self, hidden_states: torch.Tensor, diff --git a/src/transformers/models/qwen2_audio/processing_qwen2_audio.py b/src/transformers/models/qwen2_audio/processing_qwen2_audio.py index cbe7482c74..9a892e6838 100644 --- a/src/transformers/models/qwen2_audio/processing_qwen2_audio.py +++ b/src/transformers/models/qwen2_audio/processing_qwen2_audio.py @@ -16,7 +16,6 @@ Processor class for Qwen2Audio. """ -import warnings from typing import Union import numpy as np @@ -24,7 +23,6 @@ import numpy as np from ...feature_extraction_utils import BatchFeature from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils_base import PreTokenizedInput, TextInput -from ...utils.deprecation import deprecate_kwarg class Qwen2AudioProcessorKwargs(ProcessingKwargs, total=False): @@ -80,12 +78,10 @@ class Qwen2AudioProcessor(ProcessorMixin): self.audio_eos_token = tokenizer.audio_eos_token if hasattr(tokenizer, "audio_eos_token") else audio_eos_token super().__init__(feature_extractor, tokenizer, chat_template=chat_template) - @deprecate_kwarg("audios", version="4.54.0", new_name="audio") def __call__( self, text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None, audio: Union[np.ndarray, list[np.ndarray]] = None, - audios=None, # kept for BC **kwargs: Unpack[Qwen2AudioProcessorKwargs], ) -> BatchFeature: """ @@ -103,16 +99,6 @@ class Qwen2AudioProcessor(ProcessorMixin): audio (`np.ndarray`, `list[np.ndarray]`): The audio or batch of audios to be prepared. Each audio can be a NumPy array. """ - - # Handle BC when user passes deprecated keyword argument - if audios is not None and audio is None: - audio = audios - warnings.warn( - "You may have used the keyword argument for the `audio` inputs. It is strongly recommended to pass inputs with keyword arguments " - "with keys `audio` and `text`. From transformers v4.55 `audio` will be the only acceptable keyword argument.", - FutureWarning, - ) - if text is None: raise ValueError("You need to specify `text` input to process.") elif isinstance(text, str): diff --git a/src/transformers/models/rembert/modeling_rembert.py b/src/transformers/models/rembert/modeling_rembert.py index c8a81319f3..bd27f3eeec 100755 --- a/src/transformers/models/rembert/modeling_rembert.py +++ b/src/transformers/models/rembert/modeling_rembert.py @@ -40,7 +40,6 @@ from ...modeling_outputs import ( from ...modeling_utils import PreTrainedModel from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer from ...utils import auto_docstring, logging -from ...utils.deprecation import deprecate_kwarg from .configuration_rembert import RemBertConfig @@ -222,14 +221,12 @@ class RemBertSelfAttention(nn.Module): self.is_decoder = config.is_decoder self.layer_idx = layer_idx - @deprecate_kwarg("encoder_attention_mask", version="4.55.0") def forward( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.FloatTensor] = None, head_mask: Optional[torch.FloatTensor] = None, encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, past_key_value: Optional[Cache] = None, output_attentions: bool = False, cache_position: Optional[torch.Tensor] = None, @@ -241,13 +238,7 @@ class RemBertSelfAttention(nn.Module): .transpose(1, 2) ) - # If this is instantiated as a cross-attention module, the keys - # and values come from an encoder; the attention mask needs to be - # such that the encoder's padding tokens are not attended to. is_cross_attention = encoder_hidden_states is not None - if is_cross_attention and encoder_attention_mask is not None: - attention_mask = encoder_attention_mask - if past_key_value is not None: if isinstance(past_key_value, EncoderDecoderCache): is_updated = past_key_value.is_updated.get(self.layer_idx) @@ -355,7 +346,6 @@ class RemBertAttention(nn.Module): self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads self.pruned_heads = self.pruned_heads.union(heads) - @deprecate_kwarg("encoder_attention_mask", version="4.55.0") # Copied from transformers.models.bert.modeling_bert.BertAttention.forward def forward( self, @@ -363,7 +353,6 @@ class RemBertAttention(nn.Module): attention_mask: Optional[torch.FloatTensor] = None, head_mask: Optional[torch.FloatTensor] = None, encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, past_key_value: Optional[Cache] = None, output_attentions: Optional[bool] = False, cache_position: Optional[torch.Tensor] = None, @@ -373,7 +362,6 @@ class RemBertAttention(nn.Module): attention_mask=attention_mask, head_mask=head_mask, encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, past_key_value=past_key_value, output_attentions=output_attentions, cache_position=cache_position, diff --git a/src/transformers/models/roberta/modeling_roberta.py b/src/transformers/models/roberta/modeling_roberta.py index 998685ccc1..f9fc9c3b58 100644 --- a/src/transformers/models/roberta/modeling_roberta.py +++ b/src/transformers/models/roberta/modeling_roberta.py @@ -42,7 +42,6 @@ from ...modeling_outputs import ( from ...modeling_utils import PreTrainedModel from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer from ...utils import auto_docstring, get_torch_version, logging -from ...utils.deprecation import deprecate_kwarg from .configuration_roberta import RobertaConfig @@ -167,14 +166,12 @@ class RobertaSelfAttention(nn.Module): self.is_decoder = config.is_decoder self.layer_idx = layer_idx - @deprecate_kwarg("encoder_attention_mask", version="4.55.0") def forward( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.FloatTensor] = None, head_mask: Optional[torch.FloatTensor] = None, encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, past_key_value: Optional[Cache] = None, output_attentions: Optional[bool] = False, cache_position: Optional[torch.Tensor] = None, @@ -185,13 +182,7 @@ class RobertaSelfAttention(nn.Module): 1, 2 ) - # If this is instantiated as a cross-attention module, the keys - # and values come from an encoder; the attention mask needs to be - # such that the encoder's padding tokens are not attended to. is_cross_attention = encoder_hidden_states is not None - if is_cross_attention and encoder_attention_mask is not None: - attention_mask = encoder_attention_mask - if past_key_value is not None: if isinstance(past_key_value, EncoderDecoderCache): is_updated = past_key_value.is_updated.get(self.layer_idx) @@ -286,14 +277,12 @@ class RobertaSdpaSelfAttention(RobertaSelfAttention): self.require_contiguous_qkv = version.parse(get_torch_version()) < version.parse("2.2.0") # Adapted from RobertaSelfAttention - @deprecate_kwarg("encoder_attention_mask", version="4.55.0") def forward( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.Tensor] = None, head_mask: Optional[torch.FloatTensor] = None, encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, past_key_value: Optional[Cache] = None, output_attentions: Optional[bool] = False, cache_position: Optional[torch.Tensor] = None, @@ -312,7 +301,6 @@ class RobertaSdpaSelfAttention(RobertaSelfAttention): attention_mask, head_mask, encoder_hidden_states, - encoder_attention_mask, past_key_value, output_attentions, cache_position, @@ -324,12 +312,7 @@ class RobertaSdpaSelfAttention(RobertaSelfAttention): self.query(hidden_states).view(bsz, -1, self.num_attention_heads, self.attention_head_size).transpose(1, 2) ) - # If this is instantiated as a cross-attention module, the keys and values come from an encoder; the attention - # mask needs to be such that the encoder's padding tokens are not attended to. is_cross_attention = encoder_hidden_states is not None - if is_cross_attention and encoder_attention_mask is not None: - attention_mask = encoder_attention_mask - current_states = encoder_hidden_states if is_cross_attention else hidden_states if past_key_value is not None: if isinstance(past_key_value, EncoderDecoderCache): @@ -451,14 +434,12 @@ class RobertaAttention(nn.Module): self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads self.pruned_heads = self.pruned_heads.union(heads) - @deprecate_kwarg("encoder_attention_mask", version="4.55.0") def forward( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.FloatTensor] = None, head_mask: Optional[torch.FloatTensor] = None, encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, past_key_value: Optional[Cache] = None, output_attentions: Optional[bool] = False, cache_position: Optional[torch.Tensor] = None, @@ -468,7 +449,6 @@ class RobertaAttention(nn.Module): attention_mask=attention_mask, head_mask=head_mask, encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, past_key_value=past_key_value, output_attentions=output_attentions, cache_position=cache_position, diff --git a/src/transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py b/src/transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py index d778a42703..a809d19905 100644 --- a/src/transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +++ b/src/transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py @@ -40,7 +40,6 @@ from ...modeling_outputs import ( from ...modeling_utils import PreTrainedModel from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer from ...utils import auto_docstring, logging -from ...utils.deprecation import deprecate_kwarg from .configuration_roberta_prelayernorm import RobertaPreLayerNormConfig @@ -166,14 +165,12 @@ class RobertaPreLayerNormSelfAttention(nn.Module): self.is_decoder = config.is_decoder self.layer_idx = layer_idx - @deprecate_kwarg("encoder_attention_mask", version="4.55.0") def forward( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.FloatTensor] = None, head_mask: Optional[torch.FloatTensor] = None, encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, past_key_value: Optional[Cache] = None, output_attentions: Optional[bool] = False, cache_position: Optional[torch.Tensor] = None, @@ -184,13 +181,7 @@ class RobertaPreLayerNormSelfAttention(nn.Module): 1, 2 ) - # If this is instantiated as a cross-attention module, the keys - # and values come from an encoder; the attention mask needs to be - # such that the encoder's padding tokens are not attended to. is_cross_attention = encoder_hidden_states is not None - if is_cross_attention and encoder_attention_mask is not None: - attention_mask = encoder_attention_mask - if past_key_value is not None: if isinstance(past_key_value, EncoderDecoderCache): is_updated = past_key_value.is_updated.get(self.layer_idx) @@ -319,14 +310,12 @@ class RobertaPreLayerNormAttention(nn.Module): self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads self.pruned_heads = self.pruned_heads.union(heads) - @deprecate_kwarg("encoder_attention_mask", version="4.55.0") def forward( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.FloatTensor] = None, head_mask: Optional[torch.FloatTensor] = None, encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, past_key_value: Optional[Cache] = None, output_attentions: Optional[bool] = False, cache_position: Optional[torch.Tensor] = None, @@ -337,7 +326,6 @@ class RobertaPreLayerNormAttention(nn.Module): attention_mask, head_mask, encoder_hidden_states, - encoder_attention_mask, past_key_value, output_attentions, cache_position, diff --git a/src/transformers/models/roc_bert/modeling_roc_bert.py b/src/transformers/models/roc_bert/modeling_roc_bert.py index 38111c817c..be5c725269 100644 --- a/src/transformers/models/roc_bert/modeling_roc_bert.py +++ b/src/transformers/models/roc_bert/modeling_roc_bert.py @@ -40,7 +40,6 @@ from ...modeling_outputs import ( from ...modeling_utils import PreTrainedModel from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer from ...utils import auto_docstring, logging -from ...utils.deprecation import deprecate_kwarg from .configuration_roc_bert import RoCBertConfig @@ -281,14 +280,12 @@ class RoCBertSelfAttention(nn.Module): self.is_decoder = config.is_decoder self.layer_idx = layer_idx - @deprecate_kwarg("encoder_attention_mask", version="4.55.0") def forward( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.FloatTensor] = None, head_mask: Optional[torch.FloatTensor] = None, encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, past_key_value: Optional[Cache] = None, output_attentions: Optional[bool] = False, cache_position: Optional[torch.Tensor] = None, @@ -299,13 +296,7 @@ class RoCBertSelfAttention(nn.Module): 1, 2 ) - # If this is instantiated as a cross-attention module, the keys - # and values come from an encoder; the attention mask needs to be - # such that the encoder's padding tokens are not attended to. is_cross_attention = encoder_hidden_states is not None - if is_cross_attention and encoder_attention_mask is not None: - attention_mask = encoder_attention_mask - if past_key_value is not None: if isinstance(past_key_value, EncoderDecoderCache): is_updated = past_key_value.is_updated.get(self.layer_idx) @@ -442,14 +433,12 @@ class RoCBertAttention(nn.Module): self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads self.pruned_heads = self.pruned_heads.union(heads) - @deprecate_kwarg("encoder_attention_mask", version="4.55.0") def forward( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.FloatTensor] = None, head_mask: Optional[torch.FloatTensor] = None, encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, past_key_value: Optional[Cache] = None, output_attentions: Optional[bool] = False, cache_position: Optional[torch.Tensor] = None, @@ -459,7 +448,6 @@ class RoCBertAttention(nn.Module): attention_mask=attention_mask, head_mask=head_mask, encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, past_key_value=past_key_value, output_attentions=output_attentions, cache_position=cache_position, diff --git a/src/transformers/models/segformer/image_processing_segformer.py b/src/transformers/models/segformer/image_processing_segformer.py index aa62ce1bc1..9cb05f8118 100644 --- a/src/transformers/models/segformer/image_processing_segformer.py +++ b/src/transformers/models/segformer/image_processing_segformer.py @@ -14,7 +14,7 @@ # limitations under the License. """Image processor class for Segformer.""" -from typing import Any, Optional, Union +from typing import Optional, Union import numpy as np @@ -41,7 +41,6 @@ from ...utils import ( is_vision_available, logging, ) -from ...utils.deprecation import deprecate_kwarg from ...utils.import_utils import requires @@ -94,7 +93,6 @@ class SegformerImageProcessor(BaseImageProcessor): model_input_names = ["pixel_values"] - @deprecate_kwarg("reduce_labels", new_name="do_reduce_labels", version="4.41.0") @filter_out_non_signature_kwargs(extra=INIT_SERVICE_KWARGS) def __init__( self, @@ -122,16 +120,6 @@ class SegformerImageProcessor(BaseImageProcessor): self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD self.do_reduce_labels = do_reduce_labels - @classmethod - def from_dict(cls, image_processor_dict: dict[str, Any], **kwargs): - """ - Overrides the `from_dict` method from the base class to save support of deprecated `reduce_labels` in old configs - """ - image_processor_dict = image_processor_dict.copy() - if "reduce_labels" in image_processor_dict: - image_processor_dict["do_reduce_labels"] = image_processor_dict.pop("reduce_labels") - return super().from_dict(image_processor_dict, **kwargs) - # Copied from transformers.models.vit.image_processing_vit.ViTImageProcessor.resize def resize( self, @@ -304,7 +292,6 @@ class SegformerImageProcessor(BaseImageProcessor): """ return super().__call__(images, segmentation_maps=segmentation_maps, **kwargs) - @deprecate_kwarg("reduce_labels", new_name="do_reduce_labels", version="4.41.0") @filter_out_non_signature_kwargs() def preprocess( self, diff --git a/src/transformers/models/sew/modeling_sew.py b/src/transformers/models/sew/modeling_sew.py index 97dd63a548..3b985bce20 100644 --- a/src/transformers/models/sew/modeling_sew.py +++ b/src/transformers/models/sew/modeling_sew.py @@ -37,7 +37,6 @@ from ...modeling_outputs import BaseModelOutput, CausalLMOutput, SequenceClassif from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel from ...processing_utils import Unpack from ...utils import auto_docstring, logging -from ...utils.deprecation import deprecate_kwarg from .configuration_sew import SEWConfig @@ -294,7 +293,6 @@ class SEWAttention(nn.Module): self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias) self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) - @deprecate_kwarg("past_key_value", version="4.54.0") def forward( self, hidden_states: torch.Tensor, diff --git a/src/transformers/models/splinter/modeling_splinter.py b/src/transformers/models/splinter/modeling_splinter.py index a5d68a5a52..905c7a27ad 100755 --- a/src/transformers/models/splinter/modeling_splinter.py +++ b/src/transformers/models/splinter/modeling_splinter.py @@ -32,7 +32,6 @@ from ...utils import ( can_return_tuple, logging, ) -from ...utils.deprecation import deprecate_kwarg from .configuration_splinter import SplinterConfig @@ -143,17 +142,11 @@ class SplinterSelfAttention(nn.Module): self.attention_dropout = config.attention_probs_dropout_prob self.scaling = self.attention_head_size**-0.5 - @deprecate_kwarg("encoder_hidden_states", version="4.54.0") - @deprecate_kwarg("encoder_attention_mask", version="4.54.0") - @deprecate_kwarg("past_key_value", version="4.54.0") def forward( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.FloatTensor] = None, head_mask: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - past_key_value: Optional[tuple[tuple[torch.FloatTensor]]] = None, output_attentions: Optional[bool] = False, **kwargs, ) -> tuple[torch.Tensor]: @@ -226,17 +219,11 @@ class SplinterAttention(nn.Module): self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads self.pruned_heads = self.pruned_heads.union(heads) - @deprecate_kwarg("encoder_hidden_states", version="4.54.0") - @deprecate_kwarg("encoder_attention_mask", version="4.54.0") - @deprecate_kwarg("past_key_value", version="4.54.0") def forward( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.FloatTensor] = None, head_mask: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - past_key_value: Optional[tuple[tuple[torch.FloatTensor]]] = None, output_attentions: Optional[bool] = False, **kwargs, ) -> tuple[torch.Tensor]: @@ -293,17 +280,11 @@ class SplinterLayer(GradientCheckpointingLayer): self.intermediate = SplinterIntermediate(config) self.output = SplinterOutput(config) - @deprecate_kwarg("encoder_hidden_states", version="4.54.0") - @deprecate_kwarg("encoder_attention_mask", version="4.54.0") - @deprecate_kwarg("past_key_value", version="4.54.0") def forward( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.FloatTensor] = None, head_mask: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - past_key_value: Optional[tuple[tuple[torch.FloatTensor]]] = None, output_attentions: Optional[bool] = False, **kwargs, ) -> tuple[torch.Tensor]: @@ -338,20 +319,12 @@ class SplinterEncoder(nn.Module): self.layer = nn.ModuleList([SplinterLayer(config) for i in range(config.num_hidden_layers)]) self.gradient_checkpointing = False - @deprecate_kwarg("encoder_hidden_states", version="4.54.0") - @deprecate_kwarg("encoder_attention_mask", version="4.54.0") - @deprecate_kwarg("past_key_values", version="4.54.0") - @deprecate_kwarg("use_cache", version="4.54.0") @can_return_tuple def forward( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.FloatTensor] = None, head_mask: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - past_key_values: Optional[tuple[tuple[torch.FloatTensor]]] = None, - use_cache: Optional[bool] = None, output_attentions: Optional[bool] = False, output_hidden_states: Optional[bool] = False, return_dict: Optional[bool] = True, @@ -443,10 +416,6 @@ class SplinterModel(SplinterPreTrainedModel): for layer, heads in heads_to_prune.items(): self.encoder.layer[layer].attention.prune_heads(heads) - @deprecate_kwarg("encoder_hidden_states", version="4.54.0") - @deprecate_kwarg("encoder_attention_mask", version="4.54.0") - @deprecate_kwarg("past_key_values", version="4.54.0") - @deprecate_kwarg("use_cache", version="4.54.0") @can_return_tuple @auto_docstring def forward( @@ -457,10 +426,6 @@ class SplinterModel(SplinterPreTrainedModel): position_ids: Optional[torch.Tensor] = None, head_mask: Optional[torch.Tensor] = None, inputs_embeds: Optional[torch.Tensor] = None, - encoder_hidden_states: Optional[torch.Tensor] = None, - encoder_attention_mask: Optional[torch.Tensor] = None, - past_key_values: Optional[list[torch.FloatTensor]] = None, - use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, diff --git a/src/transformers/models/tapas/modeling_tapas.py b/src/transformers/models/tapas/modeling_tapas.py index a4c0f68fbc..e04daf76b2 100644 --- a/src/transformers/models/tapas/modeling_tapas.py +++ b/src/transformers/models/tapas/modeling_tapas.py @@ -32,7 +32,6 @@ from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, Mas from ...modeling_utils import PreTrainedModel from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer from ...utils import ModelOutput, auto_docstring, logging -from ...utils.deprecation import deprecate_kwarg from .configuration_tapas import TapasConfig @@ -301,14 +300,12 @@ class TapasSelfAttention(nn.Module): self.is_decoder = config.is_decoder self.layer_idx = layer_idx - @deprecate_kwarg("encoder_attention_mask", version="4.55.0") def forward( self, hidden_states, attention_mask=None, head_mask=None, encoder_hidden_states=None, - encoder_attention_mask=None, past_key_value=None, output_attentions=False, cache_position=None, @@ -320,13 +317,7 @@ class TapasSelfAttention(nn.Module): .transpose(1, 2) ) - # If this is instantiated as a cross-attention module, the keys - # and values come from an encoder; the attention mask needs to be - # such that the encoder's padding tokens are not attended to. is_cross_attention = encoder_hidden_states is not None - if is_cross_attention and encoder_attention_mask is not None: - attention_mask = encoder_attention_mask - if past_key_value is not None: if isinstance(past_key_value, EncoderDecoderCache): is_updated = past_key_value.is_updated.get(self.layer_idx) @@ -436,7 +427,6 @@ class TapasAttention(nn.Module): self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads self.pruned_heads = self.pruned_heads.union(heads) - @deprecate_kwarg("encoder_attention_mask", version="4.55.0") # Copied from transformers.models.bert.modeling_bert.BertAttention.forward def forward( self, @@ -444,7 +434,6 @@ class TapasAttention(nn.Module): attention_mask: Optional[torch.FloatTensor] = None, head_mask: Optional[torch.FloatTensor] = None, encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, past_key_value: Optional[Cache] = None, output_attentions: Optional[bool] = False, cache_position: Optional[torch.Tensor] = None, @@ -454,7 +443,6 @@ class TapasAttention(nn.Module): attention_mask=attention_mask, head_mask=head_mask, encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, past_key_value=past_key_value, output_attentions=output_attentions, cache_position=cache_position, diff --git a/src/transformers/models/unispeech/modeling_unispeech.py b/src/transformers/models/unispeech/modeling_unispeech.py index 46aaecbbcc..233de0d14c 100755 --- a/src/transformers/models/unispeech/modeling_unispeech.py +++ b/src/transformers/models/unispeech/modeling_unispeech.py @@ -45,7 +45,6 @@ from ...modeling_outputs import ( from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel from ...processing_utils import Unpack from ...utils import auto_docstring, is_torch_flex_attn_available, logging -from ...utils.deprecation import deprecate_kwarg from .configuration_unispeech import UniSpeechConfig @@ -333,7 +332,6 @@ class UniSpeechAttention(nn.Module): self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias) self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) - @deprecate_kwarg("past_key_value", version="4.54.0") def forward( self, hidden_states: torch.Tensor, diff --git a/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py b/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py index c32a8a35a1..c7fc63849c 100755 --- a/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py +++ b/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py @@ -47,7 +47,6 @@ from ...modeling_outputs import ( from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel from ...processing_utils import Unpack from ...utils import auto_docstring, is_peft_available, is_torch_flex_attn_available, logging -from ...utils.deprecation import deprecate_kwarg from .configuration_unispeech_sat import UniSpeechSatConfig @@ -338,7 +337,6 @@ class UniSpeechSatAttention(nn.Module): self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias) self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) - @deprecate_kwarg("past_key_value", version="4.54.0") def forward( self, hidden_states: torch.Tensor, diff --git a/src/transformers/models/wav2vec2/modeling_wav2vec2.py b/src/transformers/models/wav2vec2/modeling_wav2vec2.py index ab3f38d644..2c352da180 100755 --- a/src/transformers/models/wav2vec2/modeling_wav2vec2.py +++ b/src/transformers/models/wav2vec2/modeling_wav2vec2.py @@ -55,7 +55,6 @@ from ...utils import ( is_torch_flex_attn_available, logging, ) -from ...utils.deprecation import deprecate_kwarg from .configuration_wav2vec2 import Wav2Vec2Config @@ -525,7 +524,6 @@ class Wav2Vec2Attention(nn.Module): self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias) self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) - @deprecate_kwarg("past_key_value", version="4.54.0") def forward( self, hidden_states: torch.Tensor, diff --git a/src/transformers/models/xlm_roberta/modeling_xlm_roberta.py b/src/transformers/models/xlm_roberta/modeling_xlm_roberta.py index 57aa4385bb..4ff6ea8cf2 100644 --- a/src/transformers/models/xlm_roberta/modeling_xlm_roberta.py +++ b/src/transformers/models/xlm_roberta/modeling_xlm_roberta.py @@ -42,7 +42,6 @@ from ...modeling_outputs import ( from ...modeling_utils import PreTrainedModel from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer from ...utils import auto_docstring, get_torch_version, logging -from ...utils.deprecation import deprecate_kwarg from .configuration_xlm_roberta import XLMRobertaConfig @@ -168,14 +167,12 @@ class XLMRobertaSelfAttention(nn.Module): self.is_decoder = config.is_decoder self.layer_idx = layer_idx - @deprecate_kwarg("encoder_attention_mask", version="4.55.0") def forward( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.FloatTensor] = None, head_mask: Optional[torch.FloatTensor] = None, encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, past_key_value: Optional[Cache] = None, output_attentions: Optional[bool] = False, cache_position: Optional[torch.Tensor] = None, @@ -186,13 +183,7 @@ class XLMRobertaSelfAttention(nn.Module): 1, 2 ) - # If this is instantiated as a cross-attention module, the keys - # and values come from an encoder; the attention mask needs to be - # such that the encoder's padding tokens are not attended to. is_cross_attention = encoder_hidden_states is not None - if is_cross_attention and encoder_attention_mask is not None: - attention_mask = encoder_attention_mask - if past_key_value is not None: if isinstance(past_key_value, EncoderDecoderCache): is_updated = past_key_value.is_updated.get(self.layer_idx) @@ -287,14 +278,12 @@ class XLMRobertaSdpaSelfAttention(XLMRobertaSelfAttention): self.require_contiguous_qkv = version.parse(get_torch_version()) < version.parse("2.2.0") # Adapted from XLMRobertaSelfAttention - @deprecate_kwarg("encoder_attention_mask", version="4.55.0") def forward( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.Tensor] = None, head_mask: Optional[torch.FloatTensor] = None, encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, past_key_value: Optional[Cache] = None, output_attentions: Optional[bool] = False, cache_position: Optional[torch.Tensor] = None, @@ -313,7 +302,6 @@ class XLMRobertaSdpaSelfAttention(XLMRobertaSelfAttention): attention_mask, head_mask, encoder_hidden_states, - encoder_attention_mask, past_key_value, output_attentions, cache_position, @@ -325,12 +313,7 @@ class XLMRobertaSdpaSelfAttention(XLMRobertaSelfAttention): self.query(hidden_states).view(bsz, -1, self.num_attention_heads, self.attention_head_size).transpose(1, 2) ) - # If this is instantiated as a cross-attention module, the keys and values come from an encoder; the attention - # mask needs to be such that the encoder's padding tokens are not attended to. is_cross_attention = encoder_hidden_states is not None - if is_cross_attention and encoder_attention_mask is not None: - attention_mask = encoder_attention_mask - current_states = encoder_hidden_states if is_cross_attention else hidden_states if past_key_value is not None: if isinstance(past_key_value, EncoderDecoderCache): @@ -452,14 +435,12 @@ class XLMRobertaAttention(nn.Module): self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads self.pruned_heads = self.pruned_heads.union(heads) - @deprecate_kwarg("encoder_attention_mask", version="4.55.0") def forward( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.FloatTensor] = None, head_mask: Optional[torch.FloatTensor] = None, encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, past_key_value: Optional[Cache] = None, output_attentions: Optional[bool] = False, cache_position: Optional[torch.Tensor] = None, @@ -469,7 +450,6 @@ class XLMRobertaAttention(nn.Module): attention_mask=attention_mask, head_mask=head_mask, encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, past_key_value=past_key_value, output_attentions=output_attentions, cache_position=cache_position, diff --git a/src/transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py b/src/transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py index 10f1df128a..69c623934e 100644 --- a/src/transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +++ b/src/transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py @@ -41,7 +41,6 @@ from ...modeling_outputs import ( from ...modeling_utils import PreTrainedModel from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer from ...utils import auto_docstring, get_torch_version, logging -from ...utils.deprecation import deprecate_kwarg from .configuration_xlm_roberta_xl import XLMRobertaXLConfig @@ -165,14 +164,12 @@ class XLMRobertaXLSelfAttention(nn.Module): self.is_decoder = config.is_decoder self.layer_idx = layer_idx - @deprecate_kwarg("encoder_attention_mask", version="4.55.0") def forward( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.FloatTensor] = None, head_mask: Optional[torch.FloatTensor] = None, encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, past_key_value: Optional[Cache] = None, output_attentions: Optional[bool] = False, cache_position: Optional[torch.Tensor] = None, @@ -183,13 +180,7 @@ class XLMRobertaXLSelfAttention(nn.Module): 1, 2 ) - # If this is instantiated as a cross-attention module, the keys - # and values come from an encoder; the attention mask needs to be - # such that the encoder's padding tokens are not attended to. is_cross_attention = encoder_hidden_states is not None - if is_cross_attention and encoder_attention_mask is not None: - attention_mask = encoder_attention_mask - if past_key_value is not None: if isinstance(past_key_value, EncoderDecoderCache): is_updated = past_key_value.is_updated.get(self.layer_idx) @@ -284,14 +275,12 @@ class XLMRobertaXLSdpaSelfAttention(XLMRobertaXLSelfAttention): self.require_contiguous_qkv = version.parse(get_torch_version()) < version.parse("2.2.0") # Adapted from XLMRobertaXLSelfAttention - @deprecate_kwarg("encoder_attention_mask", version="4.55.0") def forward( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.Tensor] = None, head_mask: Optional[torch.FloatTensor] = None, encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, past_key_value: Optional[Cache] = None, output_attentions: Optional[bool] = False, cache_position: Optional[torch.Tensor] = None, @@ -310,7 +299,6 @@ class XLMRobertaXLSdpaSelfAttention(XLMRobertaXLSelfAttention): attention_mask, head_mask, encoder_hidden_states, - encoder_attention_mask, past_key_value, output_attentions, cache_position, @@ -322,12 +310,7 @@ class XLMRobertaXLSdpaSelfAttention(XLMRobertaXLSelfAttention): self.query(hidden_states).view(bsz, -1, self.num_attention_heads, self.attention_head_size).transpose(1, 2) ) - # If this is instantiated as a cross-attention module, the keys and values come from an encoder; the attention - # mask needs to be such that the encoder's padding tokens are not attended to. is_cross_attention = encoder_hidden_states is not None - if is_cross_attention and encoder_attention_mask is not None: - attention_mask = encoder_attention_mask - current_states = encoder_hidden_states if is_cross_attention else hidden_states if past_key_value is not None: if isinstance(past_key_value, EncoderDecoderCache): @@ -447,14 +430,12 @@ class XLMRobertaXLAttention(nn.Module): self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads self.pruned_heads = self.pruned_heads.union(heads) - @deprecate_kwarg("encoder_attention_mask", version="4.55.0") def forward( self, hidden_states, attention_mask=None, head_mask=None, encoder_hidden_states=None, - encoder_attention_mask=None, past_key_value=None, output_attentions=False, cache_position=None, @@ -465,7 +446,6 @@ class XLMRobertaXLAttention(nn.Module): attention_mask, head_mask, encoder_hidden_states, - encoder_attention_mask, past_key_value, output_attentions, cache_position, diff --git a/src/transformers/models/xmod/modeling_xmod.py b/src/transformers/models/xmod/modeling_xmod.py index 10baf5b6e3..11566a99d2 100644 --- a/src/transformers/models/xmod/modeling_xmod.py +++ b/src/transformers/models/xmod/modeling_xmod.py @@ -39,7 +39,6 @@ from ...modeling_outputs import ( from ...modeling_utils import PreTrainedModel from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer from ...utils import auto_docstring, logging -from ...utils.deprecation import deprecate_kwarg from .configuration_xmod import XmodConfig @@ -165,14 +164,12 @@ class XmodSelfAttention(nn.Module): self.is_decoder = config.is_decoder self.layer_idx = layer_idx - @deprecate_kwarg("encoder_attention_mask", version="4.55.0") def forward( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.FloatTensor] = None, head_mask: Optional[torch.FloatTensor] = None, encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, past_key_value: Optional[Cache] = None, output_attentions: Optional[bool] = False, cache_position: Optional[torch.Tensor] = None, @@ -183,13 +180,7 @@ class XmodSelfAttention(nn.Module): 1, 2 ) - # If this is instantiated as a cross-attention module, the keys - # and values come from an encoder; the attention mask needs to be - # such that the encoder's padding tokens are not attended to. is_cross_attention = encoder_hidden_states is not None - if is_cross_attention and encoder_attention_mask is not None: - attention_mask = encoder_attention_mask - if past_key_value is not None: if isinstance(past_key_value, EncoderDecoderCache): is_updated = past_key_value.is_updated.get(self.layer_idx) @@ -318,14 +309,12 @@ class XmodAttention(nn.Module): self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads self.pruned_heads = self.pruned_heads.union(heads) - @deprecate_kwarg("encoder_attention_mask", version="4.55.0") def forward( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.FloatTensor] = None, head_mask: Optional[torch.FloatTensor] = None, encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, past_key_value: Optional[Cache] = None, output_attentions: Optional[bool] = False, cache_position: Optional[torch.Tensor] = None, @@ -338,7 +327,6 @@ class XmodAttention(nn.Module): attention_mask, head_mask, encoder_hidden_states, - encoder_attention_mask, past_key_value, output_attentions, cache_position, diff --git a/tests/models/mask2former/test_image_processing_mask2former.py b/tests/models/mask2former/test_image_processing_mask2former.py index c2685df19f..526f481eb9 100644 --- a/tests/models/mask2former/test_image_processing_mask2former.py +++ b/tests/models/mask2former/test_image_processing_mask2former.py @@ -191,18 +191,6 @@ class Mask2FormerImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase self.assertTrue(hasattr(image_processing, "ignore_index")) self.assertTrue(hasattr(image_processing, "num_labels")) - def test_image_processor_from_dict_with_kwargs(self): - for image_processing_class in self.image_processor_list: - image_processor = image_processing_class.from_dict(self.image_processor_dict) - self.assertEqual(image_processor.size, {"shortest_edge": 32, "longest_edge": 1333}) - self.assertEqual(image_processor.size_divisor, 0) - - image_processor = image_processing_class.from_dict( - self.image_processor_dict, size=42, max_size=84, size_divisibility=8 - ) - self.assertEqual(image_processor.size, {"shortest_edge": 42, "longest_edge": 84}) - self.assertEqual(image_processor.size_divisor, 8) - def comm_get_image_processing_inputs( self, image_processor_tester, @@ -568,20 +556,6 @@ class Mask2FormerImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase num_segments_fused = max([el["id"] for el in el_fused]) self.assertEqual(num_segments_fused, expected_num_segments) - def test_removed_deprecated_kwargs(self): - image_processor_dict = dict(self.image_processor_dict) - image_processor_dict.pop("do_reduce_labels", None) - image_processor_dict["reduce_labels"] = True - - # test we are able to create the image processor with the deprecated kwargs - for image_processing_class in self.image_processor_list: - image_processor = image_processing_class(**image_processor_dict) - self.assertEqual(image_processor.do_reduce_labels, True) - - # test we still support reduce_labels with config - image_processor = image_processing_class.from_dict(image_processor_dict) - self.assertEqual(image_processor.do_reduce_labels, True) - def test_slow_fast_equivalence(self): if not self.test_slow_image_processor or not self.test_fast_image_processor: self.skipTest(reason="Skipping slow/fast equivalence test") diff --git a/tests/models/maskformer/test_image_processing_maskformer.py b/tests/models/maskformer/test_image_processing_maskformer.py index 01a9f0d086..4479783723 100644 --- a/tests/models/maskformer/test_image_processing_maskformer.py +++ b/tests/models/maskformer/test_image_processing_maskformer.py @@ -187,18 +187,6 @@ class MaskFormerImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase) self.assertTrue(hasattr(image_processing, "ignore_index")) self.assertTrue(hasattr(image_processing, "num_labels")) - def test_image_processor_from_dict_with_kwargs(self): - for image_processing_class in self.image_processor_list: - image_processor = image_processing_class.from_dict(self.image_processor_dict) - self.assertEqual(image_processor.size, {"shortest_edge": 32, "longest_edge": 1333}) - self.assertEqual(image_processor.size_divisor, 0) - - image_processor = image_processing_class.from_dict( - self.image_processor_dict, size=42, max_size=84, size_divisibility=8 - ) - self.assertEqual(image_processor.size, {"shortest_edge": 42, "longest_edge": 84}) - self.assertEqual(image_processor.size_divisor, 8) - def comm_get_image_processing_inputs( self, with_segmentation_maps=False, is_instance_map=False, segmentation_type="np" ): @@ -556,19 +544,6 @@ class MaskFormerImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase) num_segments_fused = max([el["id"] for el in el_fused]) self.assertEqual(num_segments_fused, expected_num_segments) - def test_removed_deprecated_kwargs(self): - image_processor_dict = dict(self.image_processor_dict) - image_processor_dict.pop("do_reduce_labels", None) - image_processor_dict["reduce_labels"] = True - - # test we are able to create the image processor with the deprecated kwargs - image_processor = self.image_processing_class(**image_processor_dict) - self.assertEqual(image_processor.do_reduce_labels, True) - - # test we still support reduce_labels with config - image_processor = self.image_processing_class.from_dict(image_processor_dict) - self.assertEqual(image_processor.do_reduce_labels, True) - def test_slow_fast_equivalence(self): if not self.test_slow_image_processor or not self.test_fast_image_processor: self.skipTest(reason="Skipping slow/fast equivalence test") diff --git a/tests/models/oneformer/test_image_processing_oneformer.py b/tests/models/oneformer/test_image_processing_oneformer.py index ecdb4bca1f..d201c70409 100644 --- a/tests/models/oneformer/test_image_processing_oneformer.py +++ b/tests/models/oneformer/test_image_processing_oneformer.py @@ -377,20 +377,6 @@ class OneFormerImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): self.assertEqual(image_processor.metadata, metadata) - def test_removed_deprecated_kwargs(self): - image_processor_dict = dict(self.image_processor_dict) - image_processor_dict.pop("do_reduce_labels", None) - image_processor_dict["reduce_labels"] = True - # Only test for OneFormerImageProcessor - image_processing_class = self.image_processing_class - # test we are able to create the image processor with the deprecated kwargs - image_processor = image_processing_class(**image_processor_dict) - self.assertEqual(image_processor.do_reduce_labels, True) - - # test we still support reduce_labels with config - image_processor = image_processing_class.from_dict(image_processor_dict) - self.assertEqual(image_processor.do_reduce_labels, True) - def test_slow_fast_equivalence(self): if not self.test_slow_image_processor or not self.test_fast_image_processor: self.skipTest(reason="Skipping slow/fast equivalence test") diff --git a/tests/models/segformer/test_image_processing_segformer.py b/tests/models/segformer/test_image_processing_segformer.py index f03d9c4fd6..3bf1eb1f6c 100644 --- a/tests/models/segformer/test_image_processing_segformer.py +++ b/tests/models/segformer/test_image_processing_segformer.py @@ -247,16 +247,3 @@ class SegformerImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): encoding = image_processing(image, map, return_tensors="pt") self.assertTrue(encoding["labels"].min().item() >= 0) self.assertTrue(encoding["labels"].max().item() <= 255) - - def test_removed_deprecated_kwargs(self): - image_processor_dict = dict(self.image_processor_dict) - image_processor_dict.pop("do_reduce_labels", None) - image_processor_dict["reduce_labels"] = True - - # test we are able to create the image processor with the deprecated kwargs - image_processor = self.image_processing_class(**image_processor_dict) - self.assertEqual(image_processor.do_reduce_labels, True) - - # test we still support reduce_labels with config - image_processor = self.image_processing_class.from_dict(image_processor_dict) - self.assertEqual(image_processor.do_reduce_labels, True)