From de77f5b1ec37ea214cda08a1ca33a841ce097d75 Mon Sep 17 00:00:00 2001 From: cyyever Date: Thu, 27 Mar 2025 22:46:32 +0800 Subject: [PATCH] Fix typing for None valued variables (#37004) Fix typing for None-able variables --- src/transformers/model_debugging_utils.py | 3 +- src/transformers/modeling_outputs.py | 112 +++++++++--------- .../bamba/convert_mamba_ssm_checkpoint.py | 4 +- .../models/bark/processing_bark.py | 2 +- .../models/big_bird/modeling_flax_big_bird.py | 6 +- .../models/bit/image_processing_bit.py | 2 +- .../chameleon/image_processing_chameleon.py | 2 +- .../image_processing_chinese_clip.py | 2 +- .../models/clap/feature_extraction_clap.py | 4 +- .../models/clip/image_processing_clip.py | 2 +- .../models/clipseg/modeling_clipseg.py | 2 +- src/transformers/models/dac/modeling_dac.py | 6 +- .../deberta_v2/tokenization_deberta_v2.py | 2 +- .../image_processing_efficientformer.py | 2 +- .../graphormer/configuration_graphormer.py | 6 +- .../transfo_xl/tokenization_transfo_xl.py | 2 +- .../deprecated/tvlt/image_processing_tvlt.py | 2 +- .../deprecated/van/convert_van_to_pytorch.py | 4 +- .../vit_hybrid/image_processing_vit_hybrid.py | 2 +- .../models/dpt/image_processing_dpt.py | 18 +-- .../modeling_encoder_decoder.py | 4 +- .../modeling_tf_encoder_decoder.py | 4 +- .../models/esm/configuration_esm.py | 2 +- .../models/flava/image_processing_flava.py | 2 +- .../models/gemma3/image_processing_gemma3.py | 8 +- .../models/idefics/modeling_idefics.py | 6 +- .../models/idefics2/processing_idefics2.py | 4 +- .../models/idefics3/processing_idefics3.py | 4 +- .../levit/convert_levit_timm_to_pytorch.py | 3 +- .../llava_next/image_processing_llava_next.py | 4 +- .../image_processing_llava_next_video.py | 4 +- .../image_processing_mask2former.py | 4 +- .../mask2former/modeling_mask2former.py | 10 +- .../maskformer/image_processing_maskformer.py | 4 +- src/transformers/models/mimi/modeling_mimi.py | 2 +- .../models/moonshine/modeling_moonshine.py | 2 +- .../models/moonshine/modular_moonshine.py | 2 +- .../models/moshi/modeling_moshi.py | 4 +- .../models/musicgen/modeling_musicgen.py | 16 +-- .../modeling_musicgen_melody.py | 16 +-- .../models/nougat/tokenization_nougat_fast.py | 4 +- .../oneformer/image_processing_oneformer.py | 2 +- src/transformers/models/opt/modeling_opt.py | 4 +- .../poolformer/image_processing_poolformer.py | 2 +- .../image_processing_prompt_depth_anything.py | 2 +- .../qwen2_vl/image_processing_qwen2_vl.py | 20 ++-- .../image_processing_qwen2_vl_fast.py | 10 +- src/transformers/models/rag/modeling_rag.py | 4 +- .../models/rag/modeling_tf_rag.py | 4 +- .../models/rag/tokenization_rag.py | 2 +- .../convert_regnet_seer_10b_to_pytorch.py | 4 +- .../regnet/convert_regnet_to_pytorch.py | 4 +- .../resnet/convert_resnet_to_pytorch.py | 4 +- .../models/roc_bert/tokenization_roc_bert.py | 4 +- .../models/sam/image_processing_sam.py | 4 +- .../models/segformer/modeling_tf_segformer.py | 4 +- .../models/siglip/processing_siglip.py | 2 +- .../models/smolvlm/processing_smolvlm.py | 4 +- .../modeling_speech_encoder_decoder.py | 4 +- .../models/tapas/tokenization_tapas.py | 8 +- .../textnet/image_processing_textnet.py | 4 +- .../models/trocr/modeling_trocr.py | 4 +- .../image_processing_video_llava.py | 4 +- .../modeling_tf_vision_encoder_decoder.py | 4 +- .../modeling_vision_encoder_decoder.py | 4 +- .../modeling_flax_vision_text_dual_encoder.py | 4 +- .../modeling_tf_vision_text_dual_encoder.py | 4 +- .../modeling_vision_text_dual_encoder.py | 4 +- .../models/vitpose/configuration_vitpose.py | 8 +- .../models/whisper/modeling_whisper.py | 2 +- .../models/whisper/tokenization_whisper.py | 6 +- .../whisper/tokenization_whisper_fast.py | 4 +- .../models/zamba/modeling_zamba.py | 4 +- .../models/zamba2/modeling_zamba2.py | 14 +-- .../models/zamba2/modular_zamba2.py | 12 +- .../zoedepth/image_processing_zoedepth.py | 4 +- src/transformers/onnx/config.py | 6 +- src/transformers/onnx/convert.py | 10 +- src/transformers/tokenization_utils_base.py | 20 ++-- src/transformers/trainer.py | 12 +- 80 files changed, 271 insertions(+), 249 deletions(-) diff --git a/src/transformers/model_debugging_utils.py b/src/transformers/model_debugging_utils.py index d45586aee1..2589856a19 100644 --- a/src/transformers/model_debugging_utils.py +++ b/src/transformers/model_debugging_utils.py @@ -19,6 +19,7 @@ import json import os import re from contextlib import contextmanager +from typing import Optional from transformers.utils.import_utils import export @@ -284,7 +285,7 @@ def model_addition_debugger(cls): @export(backends=("torch",)) @contextmanager -def model_addition_debugger_context(model, debug_path: str = None): +def model_addition_debugger_context(model, debug_path: Optional[str] = None): """ # Model addition debugger - context manager for model adders This context manager is a power user tool intended for model adders. diff --git a/src/transformers/modeling_outputs.py b/src/transformers/modeling_outputs.py index 7328e05186..60a3642f87 100755 --- a/src/transformers/modeling_outputs.py +++ b/src/transformers/modeling_outputs.py @@ -42,7 +42,7 @@ class BaseModelOutput(ModelOutput): heads. """ - last_hidden_state: torch.FloatTensor = None + last_hidden_state: Optional[torch.FloatTensor] = None hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None attentions: Optional[Tuple[torch.FloatTensor, ...]] = None @@ -62,7 +62,7 @@ class BaseModelOutputWithNoAttention(ModelOutput): Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. """ - last_hidden_state: torch.FloatTensor = None + last_hidden_state: Optional[torch.FloatTensor] = None hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None @@ -92,8 +92,8 @@ class BaseModelOutputWithPooling(ModelOutput): heads. """ - last_hidden_state: torch.FloatTensor = None - pooler_output: torch.FloatTensor = None + last_hidden_state: Optional[torch.FloatTensor] = None + pooler_output: Optional[torch.FloatTensor] = None hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None attentions: Optional[Tuple[torch.FloatTensor, ...]] = None @@ -115,8 +115,8 @@ class BaseModelOutputWithPoolingAndNoAttention(ModelOutput): Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. """ - last_hidden_state: torch.FloatTensor = None - pooler_output: torch.FloatTensor = None + last_hidden_state: Optional[torch.FloatTensor] = None + pooler_output: Optional[torch.FloatTensor] = None hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None @@ -153,7 +153,7 @@ class BaseModelOutputWithPast(ModelOutput): heads. """ - last_hidden_state: torch.FloatTensor = None + last_hidden_state: Optional[torch.FloatTensor] = None past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None attentions: Optional[Tuple[torch.FloatTensor, ...]] = None @@ -186,7 +186,7 @@ class BaseModelOutputWithCrossAttentions(ModelOutput): weighted average in the cross-attention heads. """ - last_hidden_state: torch.FloatTensor = None + last_hidden_state: Optional[torch.FloatTensor] = None hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None attentions: Optional[Tuple[torch.FloatTensor, ...]] = None cross_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None @@ -233,8 +233,8 @@ class BaseModelOutputWithPoolingAndCrossAttentions(ModelOutput): input) to speed up sequential decoding. """ - last_hidden_state: torch.FloatTensor = None - pooler_output: torch.FloatTensor = None + last_hidden_state: Optional[torch.FloatTensor] = None + pooler_output: Optional[torch.FloatTensor] = None hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None attentions: Optional[Tuple[torch.FloatTensor, ...]] = None @@ -280,7 +280,7 @@ class BaseModelOutputWithPastAndCrossAttentions(ModelOutput): weighted average in the cross-attention heads. """ - last_hidden_state: torch.FloatTensor = None + last_hidden_state: Optional[torch.FloatTensor] = None past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None attentions: Optional[Tuple[torch.FloatTensor, ...]] = None @@ -327,12 +327,12 @@ class MoECausalLMOutputWithPast(ModelOutput): """ loss: Optional[torch.FloatTensor] = None - logits: torch.FloatTensor = None + logits: Optional[torch.FloatTensor] = None past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None attentions: Optional[Tuple[torch.FloatTensor, ...]] = None - z_loss: torch.FloatTensor = None - aux_loss: torch.FloatTensor = None + z_loss: Optional[torch.FloatTensor] = None + aux_loss: Optional[torch.FloatTensor] = None router_logits: Optional[Tuple[torch.FloatTensor]] = None @@ -362,7 +362,7 @@ class MoEModelOutput(ModelOutput): loss and the z_loss for Mixture of Experts models. """ - last_hidden_state: torch.FloatTensor = None + last_hidden_state: Optional[torch.FloatTensor] = None hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None attentions: Optional[Tuple[torch.FloatTensor, ...]] = None router_probs: Optional[Tuple[torch.FloatTensor]] = None @@ -403,7 +403,7 @@ class MoeModelOutputWithPast(ModelOutput): loss for Mixture of Experts models. """ - last_hidden_state: torch.FloatTensor = None + last_hidden_state: Optional[torch.FloatTensor] = None past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None attentions: Optional[Tuple[torch.FloatTensor, ...]] = None @@ -452,7 +452,7 @@ class MoeCausalLMOutputWithPast(ModelOutput): loss: Optional[torch.FloatTensor] = None aux_loss: Optional[torch.FloatTensor] = None - logits: torch.FloatTensor = None + logits: Optional[torch.FloatTensor] = None past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None attentions: Optional[Tuple[torch.FloatTensor, ...]] = None @@ -504,7 +504,7 @@ class MoEModelOutputWithPastAndCrossAttentions(ModelOutput): loss and the z_loss for Mixture of Experts models. """ - last_hidden_state: torch.FloatTensor = None + last_hidden_state: Optional[torch.FloatTensor] = None past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None attentions: Optional[Tuple[torch.FloatTensor, ...]] = None @@ -563,7 +563,7 @@ class Seq2SeqModelOutput(ModelOutput): self-attention heads. """ - last_hidden_state: torch.FloatTensor = None + last_hidden_state: Optional[torch.FloatTensor] = None past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None decoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None decoder_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None @@ -633,7 +633,7 @@ class Seq2SeqMoEModelOutput(ModelOutput): modules. """ - last_hidden_state: torch.FloatTensor = None + last_hidden_state: Optional[torch.FloatTensor] = None past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None decoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None decoder_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None @@ -669,7 +669,7 @@ class CausalLMOutput(ModelOutput): """ loss: Optional[torch.FloatTensor] = None - logits: torch.FloatTensor = None + logits: Optional[torch.FloatTensor] = None hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None attentions: Optional[Tuple[torch.FloatTensor, ...]] = None @@ -704,7 +704,7 @@ class CausalLMOutputWithPast(ModelOutput): """ loss: Optional[torch.FloatTensor] = None - logits: torch.FloatTensor = None + logits: Optional[torch.FloatTensor] = None past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None attentions: Optional[Tuple[torch.FloatTensor, ...]] = None @@ -747,7 +747,7 @@ class CausalLMOutputWithCrossAttentions(ModelOutput): """ loss: Optional[torch.FloatTensor] = None - logits: torch.FloatTensor = None + logits: Optional[torch.FloatTensor] = None past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None attentions: Optional[Tuple[torch.FloatTensor, ...]] = None @@ -784,7 +784,7 @@ class SequenceClassifierOutputWithPast(ModelOutput): """ loss: Optional[torch.FloatTensor] = None - logits: torch.FloatTensor = None + logits: Optional[torch.FloatTensor] = None past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None attentions: Optional[Tuple[torch.FloatTensor, ...]] = None @@ -814,7 +814,7 @@ class MaskedLMOutput(ModelOutput): """ loss: Optional[torch.FloatTensor] = None - logits: torch.FloatTensor = None + logits: Optional[torch.FloatTensor] = None hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None attentions: Optional[Tuple[torch.FloatTensor, ...]] = None @@ -869,7 +869,7 @@ class Seq2SeqLMOutput(ModelOutput): """ loss: Optional[torch.FloatTensor] = None - logits: torch.FloatTensor = None + logits: Optional[torch.FloatTensor] = None past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None decoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None decoder_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None @@ -938,11 +938,11 @@ class Seq2SeqMoEOutput(ModelOutput): """ loss: Optional[torch.FloatTensor] = None - logits: torch.FloatTensor = None - encoder_z_loss: torch.FloatTensor = None - decoder_z_loss: torch.FloatTensor = None - encoder_aux_loss: torch.FloatTensor = None - decoder_aux_loss: torch.FloatTensor = None + logits: Optional[torch.FloatTensor] = None + encoder_z_loss: Optional[torch.FloatTensor] = None + decoder_z_loss: Optional[torch.FloatTensor] = None + encoder_aux_loss: Optional[torch.FloatTensor] = None + decoder_aux_loss: Optional[torch.FloatTensor] = None past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None decoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None decoder_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None @@ -979,7 +979,7 @@ class NextSentencePredictorOutput(ModelOutput): """ loss: Optional[torch.FloatTensor] = None - logits: torch.FloatTensor = None + logits: Optional[torch.FloatTensor] = None hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None attentions: Optional[Tuple[torch.FloatTensor, ...]] = None @@ -1008,7 +1008,7 @@ class SequenceClassifierOutput(ModelOutput): """ loss: Optional[torch.FloatTensor] = None - logits: torch.FloatTensor = None + logits: Optional[torch.FloatTensor] = None hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None attentions: Optional[Tuple[torch.FloatTensor, ...]] = None @@ -1063,7 +1063,7 @@ class Seq2SeqSequenceClassifierOutput(ModelOutput): """ loss: Optional[torch.FloatTensor] = None - logits: torch.FloatTensor = None + logits: Optional[torch.FloatTensor] = None past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None decoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None decoder_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None @@ -1099,7 +1099,7 @@ class MultipleChoiceModelOutput(ModelOutput): """ loss: Optional[torch.FloatTensor] = None - logits: torch.FloatTensor = None + logits: Optional[torch.FloatTensor] = None hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None attentions: Optional[Tuple[torch.FloatTensor, ...]] = None @@ -1128,7 +1128,7 @@ class TokenClassifierOutput(ModelOutput): """ loss: Optional[torch.FloatTensor] = None - logits: torch.FloatTensor = None + logits: Optional[torch.FloatTensor] = None hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None attentions: Optional[Tuple[torch.FloatTensor, ...]] = None @@ -1159,8 +1159,8 @@ class QuestionAnsweringModelOutput(ModelOutput): """ loss: Optional[torch.FloatTensor] = None - start_logits: torch.FloatTensor = None - end_logits: torch.FloatTensor = None + start_logits: Optional[torch.FloatTensor] = None + end_logits: Optional[torch.FloatTensor] = None hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None attentions: Optional[Tuple[torch.FloatTensor, ...]] = None @@ -1217,8 +1217,8 @@ class Seq2SeqQuestionAnsweringModelOutput(ModelOutput): """ loss: Optional[torch.FloatTensor] = None - start_logits: torch.FloatTensor = None - end_logits: torch.FloatTensor = None + start_logits: Optional[torch.FloatTensor] = None + end_logits: Optional[torch.FloatTensor] = None past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None decoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None decoder_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None @@ -1261,7 +1261,7 @@ class SemanticSegmenterOutput(ModelOutput): """ loss: Optional[torch.FloatTensor] = None - logits: torch.FloatTensor = None + logits: Optional[torch.FloatTensor] = None hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None attentions: Optional[Tuple[torch.FloatTensor, ...]] = None @@ -1289,7 +1289,7 @@ class ImageClassifierOutput(ModelOutput): """ loss: Optional[torch.FloatTensor] = None - logits: torch.FloatTensor = None + logits: Optional[torch.FloatTensor] = None hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None attentions: Optional[Tuple[torch.FloatTensor, ...]] = None @@ -1311,7 +1311,7 @@ class ImageClassifierOutputWithNoAttention(ModelOutput): """ loss: Optional[torch.FloatTensor] = None - logits: torch.FloatTensor = None + logits: Optional[torch.FloatTensor] = None hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None @@ -1340,7 +1340,7 @@ class DepthEstimatorOutput(ModelOutput): """ loss: Optional[torch.FloatTensor] = None - predicted_depth: torch.FloatTensor = None + predicted_depth: Optional[torch.FloatTensor] = None hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None attentions: Optional[Tuple[torch.FloatTensor, ...]] = None @@ -1368,7 +1368,7 @@ class ImageSuperResolutionOutput(ModelOutput): """ loss: Optional[torch.FloatTensor] = None - reconstruction: torch.FloatTensor = None + reconstruction: Optional[torch.FloatTensor] = None hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None attentions: Optional[Tuple[torch.FloatTensor, ...]] = None @@ -1396,8 +1396,8 @@ class Wav2Vec2BaseModelOutput(ModelOutput): heads. """ - last_hidden_state: torch.FloatTensor = None - extract_features: torch.FloatTensor = None + last_hidden_state: Optional[torch.FloatTensor] = None + extract_features: Optional[torch.FloatTensor] = None hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None attentions: Optional[Tuple[torch.FloatTensor, ...]] = None @@ -1428,8 +1428,8 @@ class XVectorOutput(ModelOutput): """ loss: Optional[torch.FloatTensor] = None - logits: torch.FloatTensor = None - embeddings: torch.FloatTensor = None + logits: Optional[torch.FloatTensor] = None + embeddings: Optional[torch.FloatTensor] = None hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None attentions: Optional[Tuple[torch.FloatTensor, ...]] = None @@ -1456,7 +1456,7 @@ class BackboneOutput(ModelOutput): heads. """ - feature_maps: Tuple[torch.FloatTensor] = None + feature_maps: Optional[Tuple[torch.FloatTensor]] = None hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None attentions: Optional[Tuple[torch.FloatTensor, ...]] = None @@ -1491,8 +1491,8 @@ class BaseModelOutputWithPoolingAndProjection(ModelOutput): Text embeddings before the projection layer, used to mimic the last hidden state of the teacher encoder. """ - last_hidden_state: torch.FloatTensor = None - pooler_output: torch.FloatTensor = None + last_hidden_state: Optional[torch.FloatTensor] = None + pooler_output: Optional[torch.FloatTensor] = None hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None attentions: Optional[Tuple[torch.FloatTensor, ...]] = None projection_state: Optional[Tuple[torch.FloatTensor]] = None @@ -1548,7 +1548,7 @@ class Seq2SeqSpectrogramOutput(ModelOutput): """ loss: Optional[torch.FloatTensor] = None - spectrogram: torch.FloatTensor = None + spectrogram: Optional[torch.FloatTensor] = None past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None decoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None decoder_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None @@ -1617,7 +1617,7 @@ class Seq2SeqTSModelOutput(ModelOutput): Static features of each time series' in a batch which are copied to the covariates at inference time. """ - last_hidden_state: torch.FloatTensor = None + last_hidden_state: Optional[torch.FloatTensor] = None past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None decoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None decoder_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None @@ -1713,7 +1713,7 @@ class SampleTSPredictionOutput(ModelOutput): Sampled values from the chosen distribution. """ - sequences: torch.FloatTensor = None + sequences: Optional[torch.FloatTensor] = None @dataclass @@ -1739,7 +1739,7 @@ class MaskedImageModelingOutput(ModelOutput): """ loss: Optional[torch.FloatTensor] = None - reconstruction: torch.FloatTensor = None + reconstruction: Optional[torch.FloatTensor] = None hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None attentions: Optional[Tuple[torch.FloatTensor, ...]] = None diff --git a/src/transformers/models/bamba/convert_mamba_ssm_checkpoint.py b/src/transformers/models/bamba/convert_mamba_ssm_checkpoint.py index a7b8cfc782..92ddcc88d4 100644 --- a/src/transformers/models/bamba/convert_mamba_ssm_checkpoint.py +++ b/src/transformers/models/bamba/convert_mamba_ssm_checkpoint.py @@ -19,7 +19,7 @@ import json import os import re from os import path -from typing import Dict, Union +from typing import Dict, Optional, Union import torch from huggingface_hub import split_torch_state_dict_into_shards @@ -172,7 +172,7 @@ def convert_mamba_ssm_checkpoint_file_to_huggingface_model_file( mamba_ssm_checkpoint_path: str, precision: str, output_dir: str, - tokenizer_path: str = None, + tokenizer_path: Optional[str] = None, save_model: Union[bool, str] = True, ) -> None: # load tokenizer if provided, this will be used to set the diff --git a/src/transformers/models/bark/processing_bark.py b/src/transformers/models/bark/processing_bark.py index 7d1bf21178..e50b25b18f 100644 --- a/src/transformers/models/bark/processing_bark.py +++ b/src/transformers/models/bark/processing_bark.py @@ -175,7 +175,7 @@ class BarkProcessor(ProcessorMixin): super().save_pretrained(save_directory, push_to_hub, **kwargs) - def _load_voice_preset(self, voice_preset: str = None, **kwargs): + def _load_voice_preset(self, voice_preset: Optional[str] = None, **kwargs): voice_preset_paths = self.speaker_embeddings[voice_preset] voice_preset_dict = {} diff --git a/src/transformers/models/big_bird/modeling_flax_big_bird.py b/src/transformers/models/big_bird/modeling_flax_big_bird.py index 5afda9c1ee..6cb31b2eae 100644 --- a/src/transformers/models/big_bird/modeling_flax_big_bird.py +++ b/src/transformers/models/big_bird/modeling_flax_big_bird.py @@ -412,7 +412,7 @@ class FlaxBigBirdSelfAttention(nn.Module): class FlaxBigBirdBlockSparseAttention(nn.Module): config: BigBirdConfig - block_sparse_seed: int = None + block_sparse_seed: Optional[int] = None dtype: jnp.dtype = jnp.float32 def setup(self): @@ -1262,7 +1262,7 @@ class FlaxBigBirdSelfOutput(nn.Module): class FlaxBigBirdAttention(nn.Module): config: BigBirdConfig - layer_id: int = None + layer_id: Optional[int] = None causal: bool = False dtype: jnp.dtype = jnp.float32 @@ -1362,7 +1362,7 @@ class FlaxBigBirdOutput(nn.Module): class FlaxBigBirdLayer(nn.Module): config: BigBirdConfig - layer_id: int = None + layer_id: Optional[int] = None dtype: jnp.dtype = jnp.float32 # the dtype of the computation def setup(self): diff --git a/src/transformers/models/bit/image_processing_bit.py b/src/transformers/models/bit/image_processing_bit.py index 0250bc3576..abc0954c6a 100644 --- a/src/transformers/models/bit/image_processing_bit.py +++ b/src/transformers/models/bit/image_processing_bit.py @@ -180,7 +180,7 @@ class BitImageProcessor(BaseImageProcessor): size: Dict[str, int] = None, resample: PILImageResampling = None, do_center_crop: bool = None, - crop_size: int = None, + crop_size: Optional[int] = None, do_rescale: bool = None, rescale_factor: float = None, do_normalize: bool = None, diff --git a/src/transformers/models/chameleon/image_processing_chameleon.py b/src/transformers/models/chameleon/image_processing_chameleon.py index c9d110ad22..e4b8f87b55 100644 --- a/src/transformers/models/chameleon/image_processing_chameleon.py +++ b/src/transformers/models/chameleon/image_processing_chameleon.py @@ -176,7 +176,7 @@ class ChameleonImageProcessor(BaseImageProcessor): size: Dict[str, int] = None, resample: PILImageResampling = None, do_center_crop: bool = None, - crop_size: int = None, + crop_size: Optional[int] = None, do_rescale: bool = None, rescale_factor: float = None, do_normalize: bool = None, diff --git a/src/transformers/models/chinese_clip/image_processing_chinese_clip.py b/src/transformers/models/chinese_clip/image_processing_chinese_clip.py index e07c87dc34..476feaef0d 100644 --- a/src/transformers/models/chinese_clip/image_processing_chinese_clip.py +++ b/src/transformers/models/chinese_clip/image_processing_chinese_clip.py @@ -169,7 +169,7 @@ class ChineseCLIPImageProcessor(BaseImageProcessor): size: Dict[str, int] = None, resample: PILImageResampling = None, do_center_crop: bool = None, - crop_size: int = None, + crop_size: Optional[int] = None, do_rescale: bool = None, rescale_factor: float = None, do_normalize: bool = None, diff --git a/src/transformers/models/clap/feature_extraction_clap.py b/src/transformers/models/clap/feature_extraction_clap.py index 6a407cdb87..c4a4428f7b 100644 --- a/src/transformers/models/clap/feature_extraction_clap.py +++ b/src/transformers/models/clap/feature_extraction_clap.py @@ -92,7 +92,7 @@ class ClapFeatureExtractor(SequenceFeatureExtractor): return_attention_mask=False, # pad inputs to max length with silence token (zero) and no attention mask frequency_min: float = 0, frequency_max: float = 14_000, - top_db: int = None, + top_db: Optional[int] = None, truncation: str = "fusion", padding: str = "repeatpad", **kwargs, @@ -258,7 +258,7 @@ class ClapFeatureExtractor(SequenceFeatureExtractor): def __call__( self, raw_speech: Union[np.ndarray, List[float], List[np.ndarray], List[List[float]]], - truncation: str = None, + truncation: Optional[str] = None, padding: Optional[str] = None, max_length: Optional[int] = None, sampling_rate: Optional[int] = None, diff --git a/src/transformers/models/clip/image_processing_clip.py b/src/transformers/models/clip/image_processing_clip.py index 2155b306bc..4a42c8f9ac 100644 --- a/src/transformers/models/clip/image_processing_clip.py +++ b/src/transformers/models/clip/image_processing_clip.py @@ -204,7 +204,7 @@ class CLIPImageProcessor(BaseImageProcessor): size: Dict[str, int] = None, resample: PILImageResampling = None, do_center_crop: bool = None, - crop_size: int = None, + crop_size: Optional[int] = None, do_rescale: bool = None, rescale_factor: float = None, do_normalize: bool = None, diff --git a/src/transformers/models/clipseg/modeling_clipseg.py b/src/transformers/models/clipseg/modeling_clipseg.py index b806eea5e6..6aebd11dbe 100644 --- a/src/transformers/models/clipseg/modeling_clipseg.py +++ b/src/transformers/models/clipseg/modeling_clipseg.py @@ -1360,7 +1360,7 @@ class CLIPSegForImageSegmentation(CLIPSegPreTrainedModel): def get_conditional_embeddings( self, - batch_size: int = None, + batch_size: Optional[int] = None, input_ids: Optional[torch.Tensor] = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.Tensor] = None, diff --git a/src/transformers/models/dac/modeling_dac.py b/src/transformers/models/dac/modeling_dac.py index f05f5d35bf..48e213693b 100644 --- a/src/transformers/models/dac/modeling_dac.py +++ b/src/transformers/models/dac/modeling_dac.py @@ -287,7 +287,7 @@ class DacResidualVectorQuantize(nn.Module): self.quantizers = nn.ModuleList([DacVectorQuantize(config) for i in range(config.n_codebooks)]) self.quantizer_dropout = quantizer_dropout - def forward(self, hidden_state, n_quantizers: int = None): + def forward(self, hidden_state, n_quantizers: Optional[int] = None): """ Quantizes the input tensor using a fixed set of codebooks and returns corresponding codebook vectors. Args: @@ -608,7 +608,7 @@ class DacModel(DacPreTrainedModel): def encode( self, input_values: torch.Tensor, - n_quantizers: int = None, + n_quantizers: Optional[int] = None, return_dict: Optional[bool] = None, ): """ @@ -681,7 +681,7 @@ class DacModel(DacPreTrainedModel): def forward( self, input_values: torch.Tensor, - n_quantizers: int = None, + n_quantizers: Optional[int] = None, return_dict: Optional[bool] = None, ): """ diff --git a/src/transformers/models/deberta_v2/tokenization_deberta_v2.py b/src/transformers/models/deberta_v2/tokenization_deberta_v2.py index 4440cc2e1c..e87c855be5 100644 --- a/src/transformers/models/deberta_v2/tokenization_deberta_v2.py +++ b/src/transformers/models/deberta_v2/tokenization_deberta_v2.py @@ -462,7 +462,7 @@ class SPMTokenizer: return ["".join(x) for x in output] - def save_pretrained(self, path: str, filename_prefix: str = None): + def save_pretrained(self, path: str, filename_prefix: Optional[str] = None): filename = VOCAB_FILES_NAMES[list(VOCAB_FILES_NAMES.keys())[0]] if filename_prefix is not None: filename = filename_prefix + "-" + filename diff --git a/src/transformers/models/deprecated/efficientformer/image_processing_efficientformer.py b/src/transformers/models/deprecated/efficientformer/image_processing_efficientformer.py index d1503f661d..f99ac6c324 100644 --- a/src/transformers/models/deprecated/efficientformer/image_processing_efficientformer.py +++ b/src/transformers/models/deprecated/efficientformer/image_processing_efficientformer.py @@ -182,7 +182,7 @@ class EfficientFormerImageProcessor(BaseImageProcessor): size: Dict[str, int] = None, resample: PILImageResampling = None, do_center_crop: bool = None, - crop_size: int = None, + crop_size: Optional[int] = None, do_rescale: Optional[bool] = None, rescale_factor: Optional[float] = None, do_normalize: Optional[bool] = None, diff --git a/src/transformers/models/deprecated/graphormer/configuration_graphormer.py b/src/transformers/models/deprecated/graphormer/configuration_graphormer.py index 058ef9d03a..e82eaa75b9 100644 --- a/src/transformers/models/deprecated/graphormer/configuration_graphormer.py +++ b/src/transformers/models/deprecated/graphormer/configuration_graphormer.py @@ -14,6 +14,8 @@ # limitations under the License. """Graphormer model configuration""" +from typing import Optional + from ....configuration_utils import PretrainedConfig from ....utils import logging @@ -159,8 +161,8 @@ class GraphormerConfig(PretrainedConfig): traceable: bool = False, q_noise: float = 0.0, qn_block_size: int = 8, - kdim: int = None, - vdim: int = None, + kdim: Optional[int] = None, + vdim: Optional[int] = None, bias: bool = True, self_attention: bool = True, pad_token_id=0, diff --git a/src/transformers/models/deprecated/transfo_xl/tokenization_transfo_xl.py b/src/transformers/models/deprecated/transfo_xl/tokenization_transfo_xl.py index 53dec63cfc..ac4b6d7a13 100644 --- a/src/transformers/models/deprecated/transfo_xl/tokenization_transfo_xl.py +++ b/src/transformers/models/deprecated/transfo_xl/tokenization_transfo_xl.py @@ -162,7 +162,7 @@ class TransfoXLTokenizer(PreTrainedTokenizer): lower_case=False, delimiter=None, vocab_file=None, - pretrained_vocab_file: str = None, + pretrained_vocab_file: Optional[str] = None, never_split=None, unk_token="", eos_token="", diff --git a/src/transformers/models/deprecated/tvlt/image_processing_tvlt.py b/src/transformers/models/deprecated/tvlt/image_processing_tvlt.py index 46c6b0c7ca..bde5830a51 100644 --- a/src/transformers/models/deprecated/tvlt/image_processing_tvlt.py +++ b/src/transformers/models/deprecated/tvlt/image_processing_tvlt.py @@ -280,7 +280,7 @@ class TvltImageProcessor(BaseImageProcessor): do_resize: bool = None, size: Dict[str, int] = None, patch_size: List[int] = None, - num_frames: int = None, + num_frames: Optional[int] = None, resample: PILImageResampling = None, do_center_crop: bool = None, crop_size: Dict[str, int] = None, diff --git a/src/transformers/models/deprecated/van/convert_van_to_pytorch.py b/src/transformers/models/deprecated/van/convert_van_to_pytorch.py index 51466e77ba..466b14f6ba 100644 --- a/src/transformers/models/deprecated/van/convert_van_to_pytorch.py +++ b/src/transformers/models/deprecated/van/convert_van_to_pytorch.py @@ -22,7 +22,7 @@ import sys from dataclasses import dataclass, field from functools import partial from pathlib import Path -from typing import List +from typing import List, Optional import torch import torch.nn as nn @@ -163,7 +163,7 @@ def convert_weight_and_push( print(f"Pushed {checkpoint_name}") -def convert_weights_and_push(save_directory: Path, model_name: str = None, push_to_hub: bool = True): +def convert_weights_and_push(save_directory: Path, model_name: Optional[str] = None, push_to_hub: bool = True): filename = "imagenet-1k-id2label.json" num_labels = 1000 diff --git a/src/transformers/models/deprecated/vit_hybrid/image_processing_vit_hybrid.py b/src/transformers/models/deprecated/vit_hybrid/image_processing_vit_hybrid.py index 0424fd058e..2dbb1d0202 100644 --- a/src/transformers/models/deprecated/vit_hybrid/image_processing_vit_hybrid.py +++ b/src/transformers/models/deprecated/vit_hybrid/image_processing_vit_hybrid.py @@ -196,7 +196,7 @@ class ViTHybridImageProcessor(BaseImageProcessor): size: Dict[str, int] = None, resample: PILImageResampling = None, do_center_crop: bool = None, - crop_size: int = None, + crop_size: Optional[int] = None, do_rescale: bool = None, rescale_factor: float = None, do_normalize: bool = None, diff --git a/src/transformers/models/dpt/image_processing_dpt.py b/src/transformers/models/dpt/image_processing_dpt.py index 3c2162409c..72d77edf9a 100644 --- a/src/transformers/models/dpt/image_processing_dpt.py +++ b/src/transformers/models/dpt/image_processing_dpt.py @@ -161,7 +161,7 @@ class DPTImageProcessor(BaseImageProcessor): image_mean: Optional[Union[float, List[float]]] = None, image_std: Optional[Union[float, List[float]]] = None, do_pad: bool = False, - size_divisor: int = None, + size_divisor: Optional[int] = None, do_reduce_labels: bool = False, **kwargs, ) -> None: @@ -299,14 +299,14 @@ class DPTImageProcessor(BaseImageProcessor): size: Dict[str, int] = None, resample: PILImageResampling = None, keep_aspect_ratio: bool = None, - ensure_multiple_of: int = None, + ensure_multiple_of: Optional[int] = None, do_rescale: bool = None, rescale_factor: float = None, do_normalize: bool = None, image_mean: Optional[Union[float, List[float]]] = None, image_std: Optional[Union[float, List[float]]] = None, do_pad: bool = None, - size_divisor: int = None, + size_divisor: Optional[int] = None, input_data_format: Optional[Union[str, ChannelDimension]] = None, ): if do_reduce_labels: @@ -340,14 +340,14 @@ class DPTImageProcessor(BaseImageProcessor): size: Dict[str, int] = None, resample: PILImageResampling = None, keep_aspect_ratio: bool = None, - ensure_multiple_of: int = None, + ensure_multiple_of: Optional[int] = None, do_rescale: bool = None, rescale_factor: float = None, do_normalize: bool = None, image_mean: Optional[Union[float, List[float]]] = None, image_std: Optional[Union[float, List[float]]] = None, do_pad: bool = None, - size_divisor: int = None, + size_divisor: Optional[int] = None, data_format: Optional[Union[str, ChannelDimension]] = None, input_data_format: Optional[Union[str, ChannelDimension]] = None, ) -> np.ndarray: @@ -391,7 +391,7 @@ class DPTImageProcessor(BaseImageProcessor): size: Dict[str, int] = None, resample: PILImageResampling = None, keep_aspect_ratio: bool = None, - ensure_multiple_of: int = None, + ensure_multiple_of: Optional[int] = None, do_reduce_labels: bool = None, input_data_format: Optional[Union[str, ChannelDimension]] = None, ): @@ -437,9 +437,9 @@ class DPTImageProcessor(BaseImageProcessor): images: ImageInput, segmentation_maps: Optional[ImageInput] = None, do_resize: bool = None, - size: int = None, + size: Optional[int] = None, keep_aspect_ratio: bool = None, - ensure_multiple_of: int = None, + ensure_multiple_of: Optional[int] = None, resample: PILImageResampling = None, do_rescale: bool = None, rescale_factor: float = None, @@ -447,7 +447,7 @@ class DPTImageProcessor(BaseImageProcessor): image_mean: Optional[Union[float, List[float]]] = None, image_std: Optional[Union[float, List[float]]] = None, do_pad: bool = None, - size_divisor: int = None, + size_divisor: Optional[int] = None, do_reduce_labels: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, data_format: ChannelDimension = ChannelDimension.FIRST, diff --git a/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py b/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py index decc4f8df0..2cba3bd8a2 100644 --- a/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py +++ b/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py @@ -398,8 +398,8 @@ class EncoderDecoderModel(PreTrainedModel, GenerationMixin): @classmethod def from_encoder_decoder_pretrained( cls, - encoder_pretrained_model_name_or_path: str = None, - decoder_pretrained_model_name_or_path: str = None, + encoder_pretrained_model_name_or_path: Optional[str] = None, + decoder_pretrained_model_name_or_path: Optional[str] = None, *model_args, **kwargs, ) -> PreTrainedModel: diff --git a/src/transformers/models/encoder_decoder/modeling_tf_encoder_decoder.py b/src/transformers/models/encoder_decoder/modeling_tf_encoder_decoder.py index 66009fc3ef..a5abafc361 100644 --- a/src/transformers/models/encoder_decoder/modeling_tf_encoder_decoder.py +++ b/src/transformers/models/encoder_decoder/modeling_tf_encoder_decoder.py @@ -311,8 +311,8 @@ class TFEncoderDecoderModel(TFPreTrainedModel, TFCausalLanguageModelingLoss): @classmethod def from_encoder_decoder_pretrained( cls, - encoder_pretrained_model_name_or_path: str = None, - decoder_pretrained_model_name_or_path: str = None, + encoder_pretrained_model_name_or_path: Optional[str] = None, + decoder_pretrained_model_name_or_path: Optional[str] = None, *model_args, **kwargs, ) -> TFPreTrainedModel: diff --git a/src/transformers/models/esm/configuration_esm.py b/src/transformers/models/esm/configuration_esm.py index c0a31e6958..ac56bc8d78 100644 --- a/src/transformers/models/esm/configuration_esm.py +++ b/src/transformers/models/esm/configuration_esm.py @@ -172,7 +172,7 @@ class EsmConfig(PretrainedConfig): @dataclass class EsmFoldConfig: - esm_type: str = None + esm_type: Optional[str] = None fp16_esm: bool = True use_esm_attn_map: bool = False esm_ablate_pairwise: bool = False diff --git a/src/transformers/models/flava/image_processing_flava.py b/src/transformers/models/flava/image_processing_flava.py index 7ed2302ef5..960c8189ae 100644 --- a/src/transformers/models/flava/image_processing_flava.py +++ b/src/transformers/models/flava/image_processing_flava.py @@ -249,7 +249,7 @@ class FlavaImageProcessor(BaseImageProcessor): codebook_size: bool = None, codebook_resample: int = PILImageResampling.LANCZOS, codebook_do_center_crop: bool = True, - codebook_crop_size: int = None, + codebook_crop_size: Optional[int] = None, codebook_do_rescale: bool = True, codebook_rescale_factor: Union[int, float] = 1 / 255, codebook_do_map_pixels: bool = True, diff --git a/src/transformers/models/gemma3/image_processing_gemma3.py b/src/transformers/models/gemma3/image_processing_gemma3.py index ddf29a3331..e8d6e87243 100644 --- a/src/transformers/models/gemma3/image_processing_gemma3.py +++ b/src/transformers/models/gemma3/image_processing_gemma3.py @@ -104,8 +104,8 @@ class Gemma3ImageProcessor(BaseImageProcessor): image_std: Optional[Union[float, List[float]]] = None, do_convert_rgb: bool = None, do_pan_and_scan: bool = None, - pan_and_scan_min_crop_size: int = None, - pan_and_scan_max_num_crops: int = None, + pan_and_scan_min_crop_size: Optional[int] = None, + pan_and_scan_max_num_crops: Optional[int] = None, pan_and_scan_min_ratio_to_activate: float = None, **kwargs, ) -> None: @@ -253,8 +253,8 @@ class Gemma3ImageProcessor(BaseImageProcessor): input_data_format: Optional[Union[str, ChannelDimension]] = None, do_convert_rgb: bool = None, do_pan_and_scan: bool = None, - pan_and_scan_min_crop_size: int = None, - pan_and_scan_max_num_crops: int = None, + pan_and_scan_min_crop_size: Optional[int] = None, + pan_and_scan_max_num_crops: Optional[int] = None, pan_and_scan_min_ratio_to_activate: float = None, ) -> PIL.Image.Image: """ diff --git a/src/transformers/models/idefics/modeling_idefics.py b/src/transformers/models/idefics/modeling_idefics.py index da18b35d48..4255a1ebb3 100644 --- a/src/transformers/models/idefics/modeling_idefics.py +++ b/src/transformers/models/idefics/modeling_idefics.py @@ -509,7 +509,7 @@ class IdeficsAttention(nn.Module): is_cross_attention: bool = False, config: PretrainedConfig = None, qk_layer_norms: bool = False, - layer_idx: int = None, + layer_idx: Optional[int] = None, ): super().__init__() self.hidden_size = hidden_size @@ -675,7 +675,7 @@ class IdeficsAttention(nn.Module): # this was adapted from LlamaDecoderLayer class IdeficsDecoderLayer(nn.Module): - def __init__(self, config: IdeficsConfig, layer_idx: int = None): + def __init__(self, config: IdeficsConfig, layer_idx: Optional[int] = None): super().__init__() self.hidden_size = config.hidden_size self.self_attn = IdeficsAttention( @@ -754,7 +754,7 @@ class IdeficsDecoderLayer(nn.Module): class IdeficsGatedCrossAttentionLayer(nn.Module): - def __init__(self, config: IdeficsConfig, layer_idx: int = None): + def __init__(self, config: IdeficsConfig, layer_idx: Optional[int] = None): super().__init__() self.hidden_size = config.hidden_size self.cross_attn = IdeficsAttention( diff --git a/src/transformers/models/idefics2/processing_idefics2.py b/src/transformers/models/idefics2/processing_idefics2.py index 9502f1e958..c69945e282 100644 --- a/src/transformers/models/idefics2/processing_idefics2.py +++ b/src/transformers/models/idefics2/processing_idefics2.py @@ -89,7 +89,9 @@ class Idefics2Processor(ProcessorMixin): image_processor_class = "Idefics2ImageProcessor" tokenizer_class = "AutoTokenizer" - def __init__(self, image_processor, tokenizer=None, image_seq_len: int = 64, chat_template: str = None, **kwargs): + def __init__( + self, image_processor, tokenizer=None, image_seq_len: int = 64, chat_template: Optional[str] = None, **kwargs + ): if image_processor is None: raise ValueError("You need to specify an `image_processor`.") if tokenizer is None: diff --git a/src/transformers/models/idefics3/processing_idefics3.py b/src/transformers/models/idefics3/processing_idefics3.py index 6501fca6b6..0f1cf7b248 100644 --- a/src/transformers/models/idefics3/processing_idefics3.py +++ b/src/transformers/models/idefics3/processing_idefics3.py @@ -133,7 +133,9 @@ class Idefics3Processor(ProcessorMixin): image_processor_class = "Idefics3ImageProcessor" tokenizer_class = "AutoTokenizer" - def __init__(self, image_processor, tokenizer=None, image_seq_len: int = 169, chat_template: str = None, **kwargs): + def __init__( + self, image_processor, tokenizer=None, image_seq_len: int = 169, chat_template: Optional[str] = None, **kwargs + ): if image_processor is None: raise ValueError("You need to specify an `image_processor`.") if tokenizer is None: diff --git a/src/transformers/models/levit/convert_levit_timm_to_pytorch.py b/src/transformers/models/levit/convert_levit_timm_to_pytorch.py index afef3f73de..0d5731bf7b 100644 --- a/src/transformers/models/levit/convert_levit_timm_to_pytorch.py +++ b/src/transformers/models/levit/convert_levit_timm_to_pytorch.py @@ -19,6 +19,7 @@ import json from collections import OrderedDict from functools import partial from pathlib import Path +from typing import Optional import timm import torch @@ -79,7 +80,7 @@ def convert_weight_and_push( print(f"Pushed {checkpoint_name}") -def convert_weights_and_push(save_directory: Path, model_name: str = None, push_to_hub: bool = True): +def convert_weights_and_push(save_directory: Path, model_name: Optional[str] = None, push_to_hub: bool = True): filename = "imagenet-1k-id2label.json" num_labels = 1000 expected_shape = (1, num_labels) diff --git a/src/transformers/models/llava_next/image_processing_llava_next.py b/src/transformers/models/llava_next/image_processing_llava_next.py index d338775df6..ff8b36c452 100644 --- a/src/transformers/models/llava_next/image_processing_llava_next.py +++ b/src/transformers/models/llava_next/image_processing_llava_next.py @@ -333,7 +333,7 @@ class LlavaNextImageProcessor(BaseImageProcessor): size: Dict[str, int] = None, resample: PILImageResampling = None, do_center_crop: bool = None, - crop_size: int = None, + crop_size: Optional[int] = None, do_rescale: bool = None, rescale_factor: float = None, do_normalize: bool = None, @@ -563,7 +563,7 @@ class LlavaNextImageProcessor(BaseImageProcessor): image_grid_pinpoints: List = None, resample: PILImageResampling = None, do_center_crop: bool = None, - crop_size: int = None, + crop_size: Optional[int] = None, do_rescale: bool = None, rescale_factor: float = None, do_normalize: bool = None, diff --git a/src/transformers/models/llava_next_video/image_processing_llava_next_video.py b/src/transformers/models/llava_next_video/image_processing_llava_next_video.py index 3ec8d9db06..9aa09e9673 100644 --- a/src/transformers/models/llava_next_video/image_processing_llava_next_video.py +++ b/src/transformers/models/llava_next_video/image_processing_llava_next_video.py @@ -183,7 +183,7 @@ class LlavaNextVideoImageProcessor(BaseImageProcessor): size: Dict[str, int] = None, resample: PILImageResampling = None, do_center_crop: bool = None, - crop_size: int = None, + crop_size: Optional[int] = None, do_rescale: bool = None, rescale_factor: float = None, do_normalize: bool = None, @@ -283,7 +283,7 @@ class LlavaNextVideoImageProcessor(BaseImageProcessor): size: Dict[str, int] = None, resample: PILImageResampling = None, do_center_crop: bool = None, - crop_size: int = None, + crop_size: Optional[int] = None, do_rescale: bool = None, rescale_factor: float = None, do_normalize: bool = None, diff --git a/src/transformers/models/mask2former/image_processing_mask2former.py b/src/transformers/models/mask2former/image_processing_mask2former.py index 6797fb48bc..b8ab958e61 100644 --- a/src/transformers/models/mask2former/image_processing_mask2former.py +++ b/src/transformers/models/mask2former/image_processing_mask2former.py @@ -577,7 +577,7 @@ class Mask2FormerImageProcessor(BaseImageProcessor): image: ImageInput, do_resize: bool = None, size: Dict[str, int] = None, - size_divisor: int = None, + size_divisor: Optional[int] = None, resample: PILImageResampling = None, do_rescale: bool = None, rescale_factor: float = None, @@ -601,7 +601,7 @@ class Mask2FormerImageProcessor(BaseImageProcessor): image: ImageInput, do_resize: bool = None, size: Dict[str, int] = None, - size_divisor: int = None, + size_divisor: Optional[int] = None, resample: PILImageResampling = None, do_rescale: bool = None, rescale_factor: float = None, diff --git a/src/transformers/models/mask2former/modeling_mask2former.py b/src/transformers/models/mask2former/modeling_mask2former.py index ca9377e282..6d35d69f5f 100644 --- a/src/transformers/models/mask2former/modeling_mask2former.py +++ b/src/transformers/models/mask2former/modeling_mask2former.py @@ -1592,7 +1592,7 @@ class Mask2FormerMaskedAttentionDecoderLayer(nn.Module): def forward_post( self, hidden_states: torch.Tensor, - level_index: int = None, + level_index: Optional[int] = None, attention_mask: Optional[torch.Tensor] = None, position_embeddings: Optional[torch.Tensor] = None, query_position_embeddings: Optional[torch.Tensor] = None, @@ -1651,7 +1651,7 @@ class Mask2FormerMaskedAttentionDecoderLayer(nn.Module): def forward_pre( self, hidden_states: torch.Tensor, - level_index: int = None, + level_index: Optional[int] = None, attention_mask: Optional[torch.Tensor] = None, position_embeddings: Optional[torch.Tensor] = None, query_position_embeddings: Optional[torch.Tensor] = None, @@ -1712,7 +1712,7 @@ class Mask2FormerMaskedAttentionDecoderLayer(nn.Module): def forward( self, hidden_states: torch.Tensor, - level_index: int = None, + level_index: Optional[int] = None, attention_mask: Optional[torch.Tensor] = None, position_embeddings: Optional[torch.Tensor] = None, query_position_embeddings: Optional[torch.Tensor] = None, @@ -2013,7 +2013,9 @@ class Mask2FormerMaskPredictor(nn.Module): self.mask_embedder = Mask2FormerMLPPredictionHead(self.hidden_size, self.hidden_size, mask_feature_size) - def forward(self, outputs: torch.Tensor, pixel_embeddings: torch.Tensor, attention_mask_target_size: int = None): + def forward( + self, outputs: torch.Tensor, pixel_embeddings: torch.Tensor, attention_mask_target_size: Optional[int] = None + ): mask_embeddings = self.mask_embedder(outputs.transpose(0, 1)) is_tracing = torch.jit.is_tracing() or isinstance(outputs, torch.fx.Proxy) or is_torchdynamo_compiling() diff --git a/src/transformers/models/maskformer/image_processing_maskformer.py b/src/transformers/models/maskformer/image_processing_maskformer.py index 87a3063a37..532bbaffdd 100644 --- a/src/transformers/models/maskformer/image_processing_maskformer.py +++ b/src/transformers/models/maskformer/image_processing_maskformer.py @@ -578,7 +578,7 @@ class MaskFormerImageProcessor(BaseImageProcessor): image: ImageInput, do_resize: bool = None, size: Dict[str, int] = None, - size_divisor: int = None, + size_divisor: Optional[int] = None, resample: PILImageResampling = None, do_rescale: bool = None, rescale_factor: float = None, @@ -602,7 +602,7 @@ class MaskFormerImageProcessor(BaseImageProcessor): image: ImageInput, do_resize: bool = None, size: Dict[str, int] = None, - size_divisor: int = None, + size_divisor: Optional[int] = None, resample: PILImageResampling = None, do_rescale: bool = None, rescale_factor: float = None, diff --git a/src/transformers/models/mimi/modeling_mimi.py b/src/transformers/models/mimi/modeling_mimi.py index 4539cfd0bd..b77203ca0e 100644 --- a/src/transformers/models/mimi/modeling_mimi.py +++ b/src/transformers/models/mimi/modeling_mimi.py @@ -1316,7 +1316,7 @@ class MimiVectorQuantization(nn.Module): class MimiResidualVectorQuantizer(nn.Module): """Residual Vector Quantizer.""" - def __init__(self, config: MimiConfig, num_quantizers: int = None): + def __init__(self, config: MimiConfig, num_quantizers: Optional[int] = None): super().__init__() self.codebook_size = config.codebook_size self.frame_rate = config.frame_rate diff --git a/src/transformers/models/moonshine/modeling_moonshine.py b/src/transformers/models/moonshine/modeling_moonshine.py index fc4770fd8f..e2ccedb0c6 100644 --- a/src/transformers/models/moonshine/modeling_moonshine.py +++ b/src/transformers/models/moonshine/modeling_moonshine.py @@ -437,7 +437,7 @@ class MoonshineEncoderLayer(nn.Module): class MoonshineDecoderLayer(nn.Module): - def __init__(self, config: MoonshineConfig, layer_idx: int = None): + def __init__(self, config: MoonshineConfig, layer_idx: Optional[int] = None): super().__init__() self.hidden_size = config.hidden_size diff --git a/src/transformers/models/moonshine/modular_moonshine.py b/src/transformers/models/moonshine/modular_moonshine.py index 24fa4f0a1e..db071b526e 100644 --- a/src/transformers/models/moonshine/modular_moonshine.py +++ b/src/transformers/models/moonshine/modular_moonshine.py @@ -427,7 +427,7 @@ class MoonshineEncoderLayer(LlamaDecoderLayer): class MoonshineDecoderLayer(nn.Module): - def __init__(self, config: MoonshineConfig, layer_idx: int = None): + def __init__(self, config: MoonshineConfig, layer_idx: Optional[int] = None): super().__init__() self.hidden_size = config.hidden_size diff --git a/src/transformers/models/moshi/modeling_moshi.py b/src/transformers/models/moshi/modeling_moshi.py index 3d686ba34d..193f43054c 100644 --- a/src/transformers/models/moshi/modeling_moshi.py +++ b/src/transformers/models/moshi/modeling_moshi.py @@ -420,7 +420,7 @@ class MoshiGatingMLP(nn.Module): self.fc1 = MoshiFlexibleLinear(hidden_size, ffn_dim, num_layers) self.fc2 = MoshiFlexibleLinear(ffn_dim // 2, hidden_size, num_layers) - def forward(self, hidden_states: torch.Tensor, layer_idx: int = None) -> torch.Tensor: + def forward(self, hidden_states: torch.Tensor, layer_idx: Optional[int] = None) -> torch.Tensor: hidden_states = self.fc1(hidden_states) if layer_idx is None else self.fc1(hidden_states, layer_idx) batch_size, sequence_length, _ = hidden_states.shape @@ -2644,7 +2644,7 @@ class MoshiForConditionalGeneration(MoshiPreTrainedModel, GenerationMixin): return input_ids def build_delay_pattern_mask( - self, input_ids: torch.LongTensor, bos_token_id: int, pad_token_id: int, max_length: int = None + self, input_ids: torch.LongTensor, bos_token_id: int, pad_token_id: int, max_length: Optional[int] = None ): """Build a delayed pattern mask to the input_ids. Each codebook, except the first one, is offset by one, giving a delayed pattern mask at the start of sequence and end of sequence. Take the example where there diff --git a/src/transformers/models/musicgen/modeling_musicgen.py b/src/transformers/models/musicgen/modeling_musicgen.py index bd62e6add8..e424845245 100644 --- a/src/transformers/models/musicgen/modeling_musicgen.py +++ b/src/transformers/models/musicgen/modeling_musicgen.py @@ -1377,7 +1377,9 @@ class MusicgenForCausalLM(MusicgenPreTrainedModel, GenerationMixin): "use_cache": use_cache, } - def build_delay_pattern_mask(self, input_ids: torch.LongTensor, pad_token_id: int, max_length: int = None): + def build_delay_pattern_mask( + self, input_ids: torch.LongTensor, pad_token_id: int, max_length: Optional[int] = None + ): """Build a delayed pattern mask to the input_ids. Each codebook is offset by the previous codebook by one, giving a delayed pattern mask at the start of sequence and end of sequence. Take the example where there are 4 codebooks and a max sequence length of 8, we have the delayed pattern mask of shape `(codebooks, @@ -1828,9 +1830,9 @@ class MusicgenForConditionalGeneration(PreTrainedModel, GenerationMixin): @classmethod def from_sub_models_pretrained( cls, - text_encoder_pretrained_model_name_or_path: str = None, - audio_encoder_pretrained_model_name_or_path: str = None, - decoder_pretrained_model_name_or_path: str = None, + text_encoder_pretrained_model_name_or_path: Optional[str] = None, + audio_encoder_pretrained_model_name_or_path: Optional[str] = None, + decoder_pretrained_model_name_or_path: Optional[str] = None, *model_args, **kwargs, ) -> PreTrainedModel: @@ -2232,8 +2234,8 @@ class MusicgenForConditionalGeneration(PreTrainedModel, GenerationMixin): batch_size: int, model_input_name: str, model_kwargs: Dict[str, torch.Tensor], - decoder_start_token_id: int = None, - bos_token_id: int = None, + decoder_start_token_id: Optional[int] = None, + bos_token_id: Optional[int] = None, device: torch.device = None, ) -> Tuple[torch.LongTensor, Dict[str, torch.Tensor]]: """Prepares `decoder_input_ids` for generation with encoder-decoder models""" @@ -2454,7 +2456,7 @@ class MusicgenForConditionalGeneration(PreTrainedModel, GenerationMixin): return torch.ones((batch_size, 1), dtype=torch.long, device=self.device) * bos_token_id def _get_decoder_start_token_id( - self, decoder_start_token_id: Union[int, List[int]] = None, bos_token_id: int = None + self, decoder_start_token_id: Union[int, List[int]] = None, bos_token_id: Optional[int] = None ) -> int: decoder_start_token_id = ( decoder_start_token_id diff --git a/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py b/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py index 29d2da1540..ec6074f48a 100644 --- a/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py +++ b/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py @@ -1297,7 +1297,9 @@ class MusicgenMelodyForCausalLM(MusicgenMelodyPreTrainedModel, GenerationMixin): "use_cache": use_cache, } - def build_delay_pattern_mask(self, input_ids: torch.LongTensor, pad_token_id: int, max_length: int = None): + def build_delay_pattern_mask( + self, input_ids: torch.LongTensor, pad_token_id: int, max_length: Optional[int] = None + ): """Build a delayed pattern mask to the input_ids. Each codebook is offset by the previous codebook by one, giving a delayed pattern mask at the start of sequence and end of sequence. Take the example where there are 4 codebooks and a max sequence length of 8, we have the delayed pattern mask of shape `(codebooks, @@ -1706,9 +1708,9 @@ class MusicgenMelodyForConditionalGeneration(PreTrainedModel, GenerationMixin): # Copied from transformers.models.musicgen.modeling_musicgen.MusicgenForConditionalGeneration.from_sub_models_pretrained with Musicgen->MusicgenMelody, musicgen-small->musicgen-melody def from_sub_models_pretrained( cls, - text_encoder_pretrained_model_name_or_path: str = None, - audio_encoder_pretrained_model_name_or_path: str = None, - decoder_pretrained_model_name_or_path: str = None, + text_encoder_pretrained_model_name_or_path: Optional[str] = None, + audio_encoder_pretrained_model_name_or_path: Optional[str] = None, + decoder_pretrained_model_name_or_path: Optional[str] = None, *model_args, **kwargs, ) -> PreTrainedModel: @@ -2112,8 +2114,8 @@ class MusicgenMelodyForConditionalGeneration(PreTrainedModel, GenerationMixin): batch_size: int, model_input_name: str, model_kwargs: Dict[str, torch.Tensor], - decoder_start_token_id: int = None, - bos_token_id: int = None, + decoder_start_token_id: Optional[int] = None, + bos_token_id: Optional[int] = None, device: torch.device = None, ) -> Tuple[torch.LongTensor, Dict[str, torch.Tensor]]: """Prepares `decoder_input_ids` for generation with encoder-decoder models""" @@ -2304,7 +2306,7 @@ class MusicgenMelodyForConditionalGeneration(PreTrainedModel, GenerationMixin): # Copied from transformers.models.musicgen.modeling_musicgen.MusicgenForConditionalGeneration._get_decoder_start_token_id def _get_decoder_start_token_id( - self, decoder_start_token_id: Union[int, List[int]] = None, bos_token_id: int = None + self, decoder_start_token_id: Union[int, List[int]] = None, bos_token_id: Optional[int] = None ) -> int: decoder_start_token_id = ( decoder_start_token_id diff --git a/src/transformers/models/nougat/tokenization_nougat_fast.py b/src/transformers/models/nougat/tokenization_nougat_fast.py index b4bc76f613..e5dc6ed164 100644 --- a/src/transformers/models/nougat/tokenization_nougat_fast.py +++ b/src/transformers/models/nougat/tokenization_nougat_fast.py @@ -19,7 +19,7 @@ Fast tokenizer class for Nougat. import re from functools import partial from multiprocessing import Pool -from typing import List, Union +from typing import List, Optional, Union import numpy as np @@ -584,7 +584,7 @@ class NougatTokenizerFast(PreTrainedTokenizerFast): self, generation: Union[str, List[str]], fix_markdown: bool = True, - num_workers: int = None, + num_workers: Optional[int] = None, ) -> Union[str, List[str]]: """ Postprocess a generated text or a list of generated texts. diff --git a/src/transformers/models/oneformer/image_processing_oneformer.py b/src/transformers/models/oneformer/image_processing_oneformer.py index f931ab3109..cbce106f33 100644 --- a/src/transformers/models/oneformer/image_processing_oneformer.py +++ b/src/transformers/models/oneformer/image_processing_oneformer.py @@ -440,7 +440,7 @@ class OneFormerImageProcessor(BaseImageProcessor): ignore_index: Optional[int] = None, do_reduce_labels: bool = False, repo_path: Optional[str] = "shi-labs/oneformer_demo", - class_info_file: str = None, + class_info_file: Optional[str] = None, num_text: Optional[int] = None, num_labels: Optional[int] = None, **kwargs, diff --git a/src/transformers/models/opt/modeling_opt.py b/src/transformers/models/opt/modeling_opt.py index c13f43f696..5ab6efd7dd 100644 --- a/src/transformers/models/opt/modeling_opt.py +++ b/src/transformers/models/opt/modeling_opt.py @@ -105,7 +105,7 @@ class OPTAttention(nn.Module): def __init__( self, config: OPTConfig, - layer_idx: int = None, + layer_idx: Optional[int] = None, **kwargs, ): super().__init__() @@ -369,7 +369,7 @@ OPT_ATTENTION_CLASSES = { class OPTDecoderLayer(nn.Module): - def __init__(self, config: OPTConfig, layer_idx: int = None): + def __init__(self, config: OPTConfig, layer_idx: Optional[int] = None): super().__init__() self.embed_dim = config.hidden_size diff --git a/src/transformers/models/poolformer/image_processing_poolformer.py b/src/transformers/models/poolformer/image_processing_poolformer.py index 624f1a6f40..61061ec1f5 100644 --- a/src/transformers/models/poolformer/image_processing_poolformer.py +++ b/src/transformers/models/poolformer/image_processing_poolformer.py @@ -215,7 +215,7 @@ class PoolFormerImageProcessor(BaseImageProcessor): images: ImageInput, do_resize: bool = None, size: Dict[str, int] = None, - crop_pct: int = None, + crop_pct: Optional[int] = None, resample: PILImageResampling = None, do_center_crop: bool = None, crop_size: Dict[str, int] = None, diff --git a/src/transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything.py b/src/transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything.py index b4fdea0d4c..898475835b 100644 --- a/src/transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything.py +++ b/src/transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything.py @@ -152,7 +152,7 @@ class PromptDepthAnythingImageProcessor(BaseImageProcessor): image_mean: Optional[Union[float, List[float]]] = None, image_std: Optional[Union[float, List[float]]] = None, do_pad: bool = False, - size_divisor: int = None, + size_divisor: Optional[int] = None, prompt_scale_to_meter: float = 0.001, # default unit is mm **kwargs, ): diff --git a/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py b/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py index 671cd86170..81136cb00a 100644 --- a/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py +++ b/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py @@ -132,8 +132,8 @@ class Qwen2VLImageProcessor(BaseImageProcessor): image_mean: Optional[Union[float, List[float]]] = None, image_std: Optional[Union[float, List[float]]] = None, do_convert_rgb: bool = True, - min_pixels: int = None, - max_pixels: int = None, + min_pixels: Optional[int] = None, + max_pixels: Optional[int] = None, patch_size: int = 14, temporal_patch_size: int = 2, merge_size: int = 2, @@ -177,9 +177,9 @@ class Qwen2VLImageProcessor(BaseImageProcessor): do_normalize: bool = None, image_mean: Optional[Union[float, List[float]]] = None, image_std: Optional[Union[float, List[float]]] = None, - patch_size: int = None, - temporal_patch_size: int = None, - merge_size: int = None, + patch_size: Optional[int] = None, + temporal_patch_size: Optional[int] = None, + merge_size: Optional[int] = None, do_convert_rgb: bool = None, data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, input_data_format: Optional[Union[str, ChannelDimension]] = None, @@ -304,17 +304,17 @@ class Qwen2VLImageProcessor(BaseImageProcessor): videos: VideoInput = None, do_resize: bool = None, size: Dict[str, int] = None, - min_pixels: int = None, - max_pixels: int = None, + min_pixels: Optional[int] = None, + max_pixels: Optional[int] = None, resample: PILImageResampling = None, do_rescale: bool = None, rescale_factor: float = None, do_normalize: bool = None, image_mean: Optional[Union[float, List[float]]] = None, image_std: Optional[Union[float, List[float]]] = None, - patch_size: int = None, - temporal_patch_size: int = None, - merge_size: int = None, + patch_size: Optional[int] = None, + temporal_patch_size: Optional[int] = None, + merge_size: Optional[int] = None, do_convert_rgb: bool = None, return_tensors: Optional[Union[str, TensorType]] = None, data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, diff --git a/src/transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py b/src/transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py index 8d92cb0845..21084b1dd3 100644 --- a/src/transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py +++ b/src/transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py @@ -263,11 +263,11 @@ class Qwen2VLImageProcessorFast(BaseImageProcessorFast): do_normalize: bool = None, image_mean: Optional[Union[float, List[float]]] = None, image_std: Optional[Union[float, List[float]]] = None, - min_pixels: int = None, - max_pixels: int = None, - patch_size: int = None, - temporal_patch_size: int = None, - merge_size: int = None, + min_pixels: Optional[int] = None, + max_pixels: Optional[int] = None, + patch_size: Optional[int] = None, + temporal_patch_size: Optional[int] = None, + merge_size: Optional[int] = None, do_convert_rgb: bool = None, return_tensors: Optional[Union[str, TensorType]] = None, data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, diff --git a/src/transformers/models/rag/modeling_rag.py b/src/transformers/models/rag/modeling_rag.py index b6faa95454..9d6664e1eb 100644 --- a/src/transformers/models/rag/modeling_rag.py +++ b/src/transformers/models/rag/modeling_rag.py @@ -245,8 +245,8 @@ class RagPreTrainedModel(PreTrainedModel): @classmethod def from_pretrained_question_encoder_generator( cls, - question_encoder_pretrained_model_name_or_path: str = None, - generator_pretrained_model_name_or_path: str = None, + question_encoder_pretrained_model_name_or_path: Optional[str] = None, + generator_pretrained_model_name_or_path: Optional[str] = None, retriever: RagRetriever = None, **kwargs, ) -> PreTrainedModel: diff --git a/src/transformers/models/rag/modeling_tf_rag.py b/src/transformers/models/rag/modeling_tf_rag.py index babc839610..5c27ad4aaf 100644 --- a/src/transformers/models/rag/modeling_tf_rag.py +++ b/src/transformers/models/rag/modeling_tf_rag.py @@ -232,8 +232,8 @@ class TFRagPreTrainedModel(TFPreTrainedModel): @classmethod def from_pretrained_question_encoder_generator( cls, - question_encoder_pretrained_model_name_or_path: str = None, - generator_pretrained_model_name_or_path: str = None, + question_encoder_pretrained_model_name_or_path: Optional[str] = None, + generator_pretrained_model_name_or_path: Optional[str] = None, retriever: RagRetriever = None, *model_args, **kwargs, diff --git a/src/transformers/models/rag/tokenization_rag.py b/src/transformers/models/rag/tokenization_rag.py index 4d0a994e76..428b5f6644 100644 --- a/src/transformers/models/rag/tokenization_rag.py +++ b/src/transformers/models/rag/tokenization_rag.py @@ -81,7 +81,7 @@ class RagTokenizer: max_length: Optional[int] = None, max_target_length: Optional[int] = None, padding: str = "longest", - return_tensors: str = None, + return_tensors: Optional[str] = None, truncation: bool = True, **kwargs, ) -> BatchEncoding: diff --git a/src/transformers/models/regnet/convert_regnet_seer_10b_to_pytorch.py b/src/transformers/models/regnet/convert_regnet_seer_10b_to_pytorch.py index 280b52a1b9..95bd5b8542 100644 --- a/src/transformers/models/regnet/convert_regnet_seer_10b_to_pytorch.py +++ b/src/transformers/models/regnet/convert_regnet_seer_10b_to_pytorch.py @@ -25,7 +25,7 @@ from dataclasses import dataclass, field from functools import partial from pathlib import Path from pprint import pprint -from typing import Dict, List, Tuple +from typing import Dict, List, Optional, Tuple import torch import torch.nn as nn @@ -159,7 +159,7 @@ def get_from_to_our_keys(model_name: str) -> Dict[str, str]: return from_to_ours_keys -def convert_weights_and_push(save_directory: Path, model_name: str = None, push_to_hub: bool = True): +def convert_weights_and_push(save_directory: Path, model_name: Optional[str] = None, push_to_hub: bool = True): filename = "imagenet-1k-id2label.json" num_labels = 1000 diff --git a/src/transformers/models/regnet/convert_regnet_to_pytorch.py b/src/transformers/models/regnet/convert_regnet_to_pytorch.py index 38158b682c..9544400416 100644 --- a/src/transformers/models/regnet/convert_regnet_to_pytorch.py +++ b/src/transformers/models/regnet/convert_regnet_to_pytorch.py @@ -19,7 +19,7 @@ import json from dataclasses import dataclass, field from functools import partial from pathlib import Path -from typing import Callable, Dict, List, Tuple +from typing import Callable, Dict, List, Optional, Tuple import timm import torch @@ -218,7 +218,7 @@ def convert_weight_and_push( print(f"Pushed {name}") -def convert_weights_and_push(save_directory: Path, model_name: str = None, push_to_hub: bool = True): +def convert_weights_and_push(save_directory: Path, model_name: Optional[str] = None, push_to_hub: bool = True): filename = "imagenet-1k-id2label.json" num_labels = 1000 expected_shape = (1, num_labels) diff --git a/src/transformers/models/resnet/convert_resnet_to_pytorch.py b/src/transformers/models/resnet/convert_resnet_to_pytorch.py index feceb74d16..4909f1dc67 100644 --- a/src/transformers/models/resnet/convert_resnet_to_pytorch.py +++ b/src/transformers/models/resnet/convert_resnet_to_pytorch.py @@ -19,7 +19,7 @@ import json from dataclasses import dataclass, field from functools import partial from pathlib import Path -from typing import List +from typing import List, Optional import timm import torch @@ -122,7 +122,7 @@ def convert_weight_and_push(name: str, config: ResNetConfig, save_directory: Pat print(f"Pushed {checkpoint_name}") -def convert_weights_and_push(save_directory: Path, model_name: str = None, push_to_hub: bool = True): +def convert_weights_and_push(save_directory: Path, model_name: Optional[str] = None, push_to_hub: bool = True): filename = "imagenet-1k-id2label.json" num_labels = 1000 expected_shape = (1, num_labels) diff --git a/src/transformers/models/roc_bert/tokenization_roc_bert.py b/src/transformers/models/roc_bert/tokenization_roc_bert.py index 0d2e9bb6b9..f0f38a48e5 100644 --- a/src/transformers/models/roc_bert/tokenization_roc_bert.py +++ b/src/transformers/models/roc_bert/tokenization_roc_bert.py @@ -770,8 +770,8 @@ class RoCBertTokenizer(PreTrainedTokenizer): self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, - cls_token_id: int = None, - sep_token_id: int = None, + cls_token_id: Optional[int] = None, + sep_token_id: Optional[int] = None, ) -> List[int]: """ Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and diff --git a/src/transformers/models/sam/image_processing_sam.py b/src/transformers/models/sam/image_processing_sam.py index 903a0cbf13..c80a8a290b 100644 --- a/src/transformers/models/sam/image_processing_sam.py +++ b/src/transformers/models/sam/image_processing_sam.py @@ -127,8 +127,8 @@ class SamImageProcessor(BaseImageProcessor): image_mean: Optional[Union[float, List[float]]] = None, image_std: Optional[Union[float, List[float]]] = None, do_pad: bool = True, - pad_size: int = None, - mask_pad_size: int = None, + pad_size: Optional[int] = None, + mask_pad_size: Optional[int] = None, do_convert_rgb: bool = True, **kwargs, ) -> None: diff --git a/src/transformers/models/segformer/modeling_tf_segformer.py b/src/transformers/models/segformer/modeling_tf_segformer.py index 9c65a3b532..bad72cdf2b 100644 --- a/src/transformers/models/segformer/modeling_tf_segformer.py +++ b/src/transformers/models/segformer/modeling_tf_segformer.py @@ -325,8 +325,8 @@ class TFSegformerMixFFN(keras.layers.Layer): self, config: SegformerConfig, in_features: int, - hidden_features: int = None, - out_features: int = None, + hidden_features: Optional[int] = None, + out_features: Optional[int] = None, **kwargs, ): super().__init__(**kwargs) diff --git a/src/transformers/models/siglip/processing_siglip.py b/src/transformers/models/siglip/processing_siglip.py index 4ff30ffb23..0d66fb9d5f 100644 --- a/src/transformers/models/siglip/processing_siglip.py +++ b/src/transformers/models/siglip/processing_siglip.py @@ -52,7 +52,7 @@ class SiglipProcessor(ProcessorMixin): images: ImageInput = None, padding: Union[bool, str, PaddingStrategy] = False, truncation: Union[bool, str, TruncationStrategy] = None, - max_length: int = None, + max_length: Optional[int] = None, return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH, ) -> BatchFeature: """ diff --git a/src/transformers/models/smolvlm/processing_smolvlm.py b/src/transformers/models/smolvlm/processing_smolvlm.py index f5a1ff57a9..ddc72894af 100644 --- a/src/transformers/models/smolvlm/processing_smolvlm.py +++ b/src/transformers/models/smolvlm/processing_smolvlm.py @@ -141,7 +141,9 @@ class SmolVLMProcessor(ProcessorMixin): image_processor_class = "SmolVLMImageProcessor" tokenizer_class = "AutoTokenizer" - def __init__(self, image_processor, tokenizer=None, image_seq_len: int = 169, chat_template: str = None, **kwargs): + def __init__( + self, image_processor, tokenizer=None, image_seq_len: int = 169, chat_template: Optional[str] = None, **kwargs + ): self.fake_image_token = getattr(tokenizer, "fake_image_token", "") self.image_token = getattr(tokenizer, "image_token", "") self.end_of_utterance_token = getattr(tokenizer, "end_of_utterance_token", "") diff --git a/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py b/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py index 9fa099d192..4375f56a87 100644 --- a/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py +++ b/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py @@ -291,8 +291,8 @@ class SpeechEncoderDecoderModel(PreTrainedModel, GenerationMixin): @classmethod def from_encoder_decoder_pretrained( cls, - encoder_pretrained_model_name_or_path: str = None, - decoder_pretrained_model_name_or_path: str = None, + encoder_pretrained_model_name_or_path: Optional[str] = None, + decoder_pretrained_model_name_or_path: Optional[str] = None, *model_args, **kwargs, ) -> PreTrainedModel: diff --git a/src/transformers/models/tapas/tokenization_tapas.py b/src/transformers/models/tapas/tokenization_tapas.py index 0e536aee27..b290b4990d 100644 --- a/src/transformers/models/tapas/tokenization_tapas.py +++ b/src/transformers/models/tapas/tokenization_tapas.py @@ -247,8 +247,8 @@ class TapasTokenizer(PreTrainedTokenizer): tokenize_chinese_chars=True, strip_accents=None, cell_trim_length: int = -1, - max_column_id: int = None, - max_row_id: int = None, + max_column_id: Optional[int] = None, + max_row_id: Optional[int] = None, strip_column_names: bool = False, update_answer_coordinates: bool = False, min_question_length=None, @@ -2242,8 +2242,8 @@ class NumericValue: @dataclass class NumericValueSpan: - begin_index: int = None - end_index: int = None + begin_index: Optional[int] = None + end_index: Optional[int] = None values: List[NumericValue] = None diff --git a/src/transformers/models/textnet/image_processing_textnet.py b/src/transformers/models/textnet/image_processing_textnet.py index 945ebe63fb..1f56d60449 100644 --- a/src/transformers/models/textnet/image_processing_textnet.py +++ b/src/transformers/models/textnet/image_processing_textnet.py @@ -205,10 +205,10 @@ class TextNetImageProcessor(BaseImageProcessor): images: ImageInput, do_resize: bool = None, size: Dict[str, int] = None, - size_divisor: int = None, + size_divisor: Optional[int] = None, resample: PILImageResampling = None, do_center_crop: bool = None, - crop_size: int = None, + crop_size: Optional[int] = None, do_rescale: bool = None, rescale_factor: float = None, do_normalize: bool = None, diff --git a/src/transformers/models/trocr/modeling_trocr.py b/src/transformers/models/trocr/modeling_trocr.py index 2a745516c4..fcc0de5b6d 100644 --- a/src/transformers/models/trocr/modeling_trocr.py +++ b/src/transformers/models/trocr/modeling_trocr.py @@ -144,8 +144,8 @@ class TrOCRAttention(nn.Module): config, embed_dim: int, num_heads: int, - kdim: int = None, - vdim: int = None, + kdim: Optional[int] = None, + vdim: Optional[int] = None, dropout: float = 0.0, is_decoder: bool = False, bias: bool = True, diff --git a/src/transformers/models/video_llava/image_processing_video_llava.py b/src/transformers/models/video_llava/image_processing_video_llava.py index dbb1054857..c572551579 100644 --- a/src/transformers/models/video_llava/image_processing_video_llava.py +++ b/src/transformers/models/video_llava/image_processing_video_llava.py @@ -178,7 +178,7 @@ class VideoLlavaImageProcessor(BaseImageProcessor): size: Dict[str, int] = None, resample: PILImageResampling = None, do_center_crop: bool = None, - crop_size: int = None, + crop_size: Optional[int] = None, do_rescale: bool = None, rescale_factor: float = None, do_normalize: bool = None, @@ -332,7 +332,7 @@ class VideoLlavaImageProcessor(BaseImageProcessor): image_mean: Optional[Union[float, List[float]]] = None, image_std: Optional[Union[float, List[float]]] = None, do_center_crop: bool = None, - crop_size: int = None, + crop_size: Optional[int] = None, do_convert_rgb: bool = None, data_format: ChannelDimension = ChannelDimension.FIRST, input_data_format: Optional[Union[str, ChannelDimension]] = None, diff --git a/src/transformers/models/vision_encoder_decoder/modeling_tf_vision_encoder_decoder.py b/src/transformers/models/vision_encoder_decoder/modeling_tf_vision_encoder_decoder.py index 9a027f0478..b909d13665 100644 --- a/src/transformers/models/vision_encoder_decoder/modeling_tf_vision_encoder_decoder.py +++ b/src/transformers/models/vision_encoder_decoder/modeling_tf_vision_encoder_decoder.py @@ -309,8 +309,8 @@ class TFVisionEncoderDecoderModel(TFPreTrainedModel, TFCausalLanguageModelingLos @classmethod def from_encoder_decoder_pretrained( cls, - encoder_pretrained_model_name_or_path: str = None, - decoder_pretrained_model_name_or_path: str = None, + encoder_pretrained_model_name_or_path: Optional[str] = None, + decoder_pretrained_model_name_or_path: Optional[str] = None, *model_args, **kwargs, ) -> TFPreTrainedModel: diff --git a/src/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py b/src/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py index 6084c00bd1..9b18306713 100644 --- a/src/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py +++ b/src/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py @@ -380,8 +380,8 @@ class VisionEncoderDecoderModel(PreTrainedModel, GenerationMixin): @classmethod def from_encoder_decoder_pretrained( cls, - encoder_pretrained_model_name_or_path: str = None, - decoder_pretrained_model_name_or_path: str = None, + encoder_pretrained_model_name_or_path: Optional[str] = None, + decoder_pretrained_model_name_or_path: Optional[str] = None, *model_args, **kwargs, ) -> PreTrainedModel: diff --git a/src/transformers/models/vision_text_dual_encoder/modeling_flax_vision_text_dual_encoder.py b/src/transformers/models/vision_text_dual_encoder/modeling_flax_vision_text_dual_encoder.py index e259041cd9..b12327d8ca 100644 --- a/src/transformers/models/vision_text_dual_encoder/modeling_flax_vision_text_dual_encoder.py +++ b/src/transformers/models/vision_text_dual_encoder/modeling_flax_vision_text_dual_encoder.py @@ -414,8 +414,8 @@ class FlaxVisionTextDualEncoderModel(FlaxPreTrainedModel): @classmethod def from_vision_text_pretrained( cls, - vision_model_name_or_path: str = None, - text_model_name_or_path: str = None, + vision_model_name_or_path: Optional[str] = None, + text_model_name_or_path: Optional[str] = None, *model_args, **kwargs, ) -> FlaxPreTrainedModel: diff --git a/src/transformers/models/vision_text_dual_encoder/modeling_tf_vision_text_dual_encoder.py b/src/transformers/models/vision_text_dual_encoder/modeling_tf_vision_text_dual_encoder.py index bb1808aece..ca88d2fec9 100644 --- a/src/transformers/models/vision_text_dual_encoder/modeling_tf_vision_text_dual_encoder.py +++ b/src/transformers/models/vision_text_dual_encoder/modeling_tf_vision_text_dual_encoder.py @@ -465,8 +465,8 @@ class TFVisionTextDualEncoderModel(TFPreTrainedModel): @classmethod def from_vision_text_pretrained( cls, - vision_model_name_or_path: str = None, - text_model_name_or_path: str = None, + vision_model_name_or_path: Optional[str] = None, + text_model_name_or_path: Optional[str] = None, *model_args, **kwargs, ) -> TFPreTrainedModel: diff --git a/src/transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py b/src/transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py index d7cceb5d2f..a5d3cad601 100755 --- a/src/transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +++ b/src/transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py @@ -417,8 +417,8 @@ class VisionTextDualEncoderModel(PreTrainedModel): @classmethod def from_vision_text_pretrained( cls, - vision_model_name_or_path: str = None, - text_model_name_or_path: str = None, + vision_model_name_or_path: Optional[str] = None, + text_model_name_or_path: Optional[str] = None, *model_args, **kwargs, ) -> PreTrainedModel: diff --git a/src/transformers/models/vitpose/configuration_vitpose.py b/src/transformers/models/vitpose/configuration_vitpose.py index 763c1f1bd7..aba8fec7ae 100644 --- a/src/transformers/models/vitpose/configuration_vitpose.py +++ b/src/transformers/models/vitpose/configuration_vitpose.py @@ -14,6 +14,8 @@ # limitations under the License. """VitPose model configuration""" +from typing import Optional + from ...configuration_utils import PretrainedConfig from ...utils import logging from ...utils.backbone_utils import verify_backbone_config_arguments @@ -75,11 +77,11 @@ class VitPoseConfig(PretrainedConfig): def __init__( self, - backbone_config: PretrainedConfig = None, - backbone: str = None, + backbone_config: Optional[PretrainedConfig] = None, + backbone: Optional[str] = None, use_pretrained_backbone: bool = False, use_timm_backbone: bool = False, - backbone_kwargs: dict = None, + backbone_kwargs: Optional[dict] = None, initializer_range: float = 0.02, scale_factor: int = 4, use_simple_decoder: bool = True, diff --git a/src/transformers/models/whisper/modeling_whisper.py b/src/transformers/models/whisper/modeling_whisper.py index 727a68f857..be0465d6c6 100644 --- a/src/transformers/models/whisper/modeling_whisper.py +++ b/src/transformers/models/whisper/modeling_whisper.py @@ -652,7 +652,7 @@ class WhisperEncoderLayer(nn.Module): class WhisperDecoderLayer(nn.Module): - def __init__(self, config: WhisperConfig, layer_idx: int = None): + def __init__(self, config: WhisperConfig, layer_idx: Optional[int] = None): super().__init__() self.embed_dim = config.d_model diff --git a/src/transformers/models/whisper/tokenization_whisper.py b/src/transformers/models/whisper/tokenization_whisper.py index 7983799ad8..b5dbb49a36 100644 --- a/src/transformers/models/whisper/tokenization_whisper.py +++ b/src/transformers/models/whisper/tokenization_whisper.py @@ -377,7 +377,9 @@ class WhisperTokenizer(PreTrainedTokenizer): self.cache[token] = word return word - def set_prefix_tokens(self, language: str = None, task: str = None, predict_timestamps: bool = None): + def set_prefix_tokens( + self, language: Optional[str] = None, task: Optional[str] = None, predict_timestamps: bool = None + ): """ Override the prefix tokens appended to the start of the label sequence. This method can be used standalone to update the prefix tokens as required when fine-tuning. Example: @@ -1276,7 +1278,7 @@ def _collate_word_timestamps(tokenizer, tokens, token_timestamps, language, retu def _combine_tokens_into_words( tokenizer, tokens: List[int], - language: str = None, + language: Optional[str] = None, prepend_punctuations: str = "\"'“¡¿([{-", append_punctuations: str = "\"'.。,,!!??::”)]}、", ): diff --git a/src/transformers/models/whisper/tokenization_whisper_fast.py b/src/transformers/models/whisper/tokenization_whisper_fast.py index 9a2c652544..5b25def5e4 100644 --- a/src/transformers/models/whisper/tokenization_whisper_fast.py +++ b/src/transformers/models/whisper/tokenization_whisper_fast.py @@ -451,7 +451,9 @@ class WhisperTokenizerFast(PreTrainedTokenizerFast): return tuple(files) + (normalizer_file,) - def set_prefix_tokens(self, language: str = None, task: str = None, predict_timestamps: bool = None): + def set_prefix_tokens( + self, language: Optional[str] = None, task: Optional[str] = None, predict_timestamps: bool = None + ): """ Override the prefix tokens appended to the start of the label sequence. This method can be used standalone to update the prefix tokens as required when fine-tuning. Example: diff --git a/src/transformers/models/zamba/modeling_zamba.py b/src/transformers/models/zamba/modeling_zamba.py index 2a7a13780e..7033102eee 100644 --- a/src/transformers/models/zamba/modeling_zamba.py +++ b/src/transformers/models/zamba/modeling_zamba.py @@ -679,7 +679,7 @@ class ZambaMambaDecoderLayer(nn.Module): self, hidden_states: torch.Tensor, original_hidden_states: Optional[torch.Tensor] = None, - layer_idx: int = None, + layer_idx: Optional[int] = None, attention_mask: Optional[torch.Tensor] = None, causal_mask: Optional[torch.Tensor] = None, past_key_value: Optional[ZambaHybridDynamicCache] = None, @@ -747,7 +747,7 @@ class ZambaHybridLayer(nn.Module): self, hidden_states: torch.Tensor, original_hidden_states: Optional[torch.Tensor] = None, - layer_idx: int = None, + layer_idx: Optional[int] = None, attention_mask: Optional[torch.Tensor] = None, causal_mask: Optional[torch.Tensor] = None, past_key_value: Optional[ZambaHybridDynamicCache] = None, diff --git a/src/transformers/models/zamba2/modeling_zamba2.py b/src/transformers/models/zamba2/modeling_zamba2.py index 772500b42c..05502d0e4e 100644 --- a/src/transformers/models/zamba2/modeling_zamba2.py +++ b/src/transformers/models/zamba2/modeling_zamba2.py @@ -385,8 +385,8 @@ class Zamba2Attention(nn.Module): self, config: Zamba2Config, layer_idx: Optional[int] = None, - num_fwd_mem_blocks: int = None, - block_id: int = None, + num_fwd_mem_blocks: Optional[int] = None, + block_id: Optional[int] = None, ): super().__init__() self.config = config @@ -560,7 +560,7 @@ class Zamba2MambaMixer(nn.Module): and is why Mamba is called **selective** state spaces) """ - def __init__(self, config: Zamba2Config, layer_idx: int = None): + def __init__(self, config: Zamba2Config, layer_idx: Optional[int] = None): super().__init__() self.config = config self.hidden_size = config.hidden_size @@ -983,7 +983,7 @@ class Zamba2MambaMixer(nn.Module): class Zamba2MLP(nn.Module): - def __init__(self, config: Zamba2Config, num_fwd_mem_blocks=None, block_id: int = None): + def __init__(self, config: Zamba2Config, num_fwd_mem_blocks=None, block_id: Optional[int] = None): """ This MLP layer contributes to tied transformer blocks aimed to increasing compute without increasing model size. Because this layer is tied, un-tied adapter modules (formally same as LoRA, but used in the base model) are added to the up and gate projectors to increase expressivity with a small memory overhead. @@ -1025,7 +1025,7 @@ class Zamba2MLP(nn.Module): class Zamba2AttentionDecoderLayer(nn.Module): - def __init__(self, config: Zamba2Config, block_id: int = None, layer_idx: Optional[int] = None): + def __init__(self, config: Zamba2Config, block_id: Optional[int] = None, layer_idx: Optional[int] = None): super().__init__() self.block_id = block_id num_gs = len(config.hybrid_layer_ids) @@ -1099,7 +1099,7 @@ class Zamba2MambaDecoderLayer(nn.Module): self, hidden_states: torch.Tensor, original_hidden_states: Optional[torch.Tensor] = None, - layer_idx: int = None, + layer_idx: Optional[int] = None, attention_mask: Optional[torch.Tensor] = None, causal_mask: Optional[torch.Tensor] = None, past_key_value: Optional[Zamba2HybridDynamicCache] = None, @@ -1169,7 +1169,7 @@ class Zamba2HybridLayer(nn.Module): self, hidden_states: torch.Tensor, original_hidden_states: Optional[torch.Tensor] = None, - layer_idx: int = None, + layer_idx: Optional[int] = None, attention_mask: Optional[torch.Tensor] = None, causal_mask: Optional[torch.Tensor] = None, past_key_value: Optional[Zamba2HybridDynamicCache] = None, diff --git a/src/transformers/models/zamba2/modular_zamba2.py b/src/transformers/models/zamba2/modular_zamba2.py index f2074b76f3..625cdb0bf5 100644 --- a/src/transformers/models/zamba2/modular_zamba2.py +++ b/src/transformers/models/zamba2/modular_zamba2.py @@ -199,8 +199,8 @@ class Zamba2Attention(ZambaAttention): self, config: Zamba2Config, layer_idx: Optional[int] = None, - num_fwd_mem_blocks: int = None, - block_id: int = None, + num_fwd_mem_blocks: Optional[int] = None, + block_id: Optional[int] = None, ): super().__init__(config, layer_idx) self.num_fwd_mem_blocks = num_fwd_mem_blocks @@ -302,7 +302,7 @@ class Zamba2MambaMixer(nn.Module): and is why Mamba is called **selective** state spaces) """ - def __init__(self, config: Zamba2Config, layer_idx: int = None): + def __init__(self, config: Zamba2Config, layer_idx: Optional[int] = None): super().__init__() self.config = config self.hidden_size = config.hidden_size @@ -725,7 +725,7 @@ class Zamba2MambaMixer(nn.Module): class Zamba2MLP(nn.Module): - def __init__(self, config: Zamba2Config, num_fwd_mem_blocks=None, block_id: int = None): + def __init__(self, config: Zamba2Config, num_fwd_mem_blocks=None, block_id: Optional[int] = None): """ This MLP layer contributes to tied transformer blocks aimed to increasing compute without increasing model size. Because this layer is tied, un-tied adapter modules (formally same as LoRA, but used in the base model) are added to the up and gate projectors to increase expressivity with a small memory overhead. @@ -767,7 +767,7 @@ class Zamba2MLP(nn.Module): class Zamba2AttentionDecoderLayer(ZambaAttentionDecoderLayer): - def __init__(self, config: Zamba2Config, block_id: int = None, layer_idx: Optional[int] = None): + def __init__(self, config: Zamba2Config, block_id: Optional[int] = None, layer_idx: Optional[int] = None): self.block_id = block_id num_gs = len(config.hybrid_layer_ids) super().__init__(config, layer_idx) @@ -847,7 +847,7 @@ class Zamba2HybridLayer(ZambaHybridLayer): self, hidden_states: torch.Tensor, original_hidden_states: Optional[torch.Tensor] = None, - layer_idx: int = None, + layer_idx: Optional[int] = None, attention_mask: Optional[torch.Tensor] = None, causal_mask: Optional[torch.Tensor] = None, past_key_value: Optional[Zamba2HybridDynamicCache] = None, diff --git a/src/transformers/models/zoedepth/image_processing_zoedepth.py b/src/transformers/models/zoedepth/image_processing_zoedepth.py index f0457d00d9..c93e29c583 100644 --- a/src/transformers/models/zoedepth/image_processing_zoedepth.py +++ b/src/transformers/models/zoedepth/image_processing_zoedepth.py @@ -305,9 +305,9 @@ class ZoeDepthImageProcessor(BaseImageProcessor): image_mean: Optional[Union[float, List[float]]] = None, image_std: Optional[Union[float, List[float]]] = None, do_resize: bool = None, - size: int = None, + size: Optional[int] = None, keep_aspect_ratio: bool = None, - ensure_multiple_of: int = None, + ensure_multiple_of: Optional[int] = None, resample: PILImageResampling = None, return_tensors: Optional[Union[str, TensorType]] = None, data_format: ChannelDimension = ChannelDimension.FIRST, diff --git a/src/transformers/onnx/config.py b/src/transformers/onnx/config.py index 1e510b5483..460ee93299 100644 --- a/src/transformers/onnx/config.py +++ b/src/transformers/onnx/config.py @@ -291,7 +291,7 @@ class OnnxConfig(ABC): sampling_rate: int = 22050, time_duration: float = 5.0, frequency: int = 220, - tokenizer: "PreTrainedTokenizerBase" = None, + tokenizer: Optional["PreTrainedTokenizerBase"] = None, ) -> Mapping[str, Any]: """ Generate inputs to provide to the ONNX exporter for the specific framework @@ -445,7 +445,7 @@ class OnnxConfigWithPast(OnnxConfig, ABC): self, config: "PretrainedConfig", task: str = "default", - patching_specs: List[PatchingSpec] = None, + patching_specs: Optional[list[PatchingSpec]] = None, use_past: bool = False, ): super().__init__(config, task=task, patching_specs=patching_specs) @@ -639,7 +639,7 @@ class OnnxSeq2SeqConfigWithPast(OnnxConfigWithPast): def generate_dummy_inputs( self, - tokenizer: "PreTrainedTokenizerBase", + tokenizer: Optional["PreTrainedTokenizerBase"], batch_size: int = -1, seq_length: int = -1, is_pair: bool = False, diff --git a/src/transformers/onnx/convert.py b/src/transformers/onnx/convert.py index a73b6b927d..58bc51f8e8 100644 --- a/src/transformers/onnx/convert.py +++ b/src/transformers/onnx/convert.py @@ -16,7 +16,7 @@ import warnings from inspect import signature from itertools import chain from pathlib import Path -from typing import TYPE_CHECKING, Iterable, List, Tuple, Union +from typing import TYPE_CHECKING, Iterable, List, Optional, Tuple, Union import numpy as np from packaging.version import Version, parse @@ -85,7 +85,7 @@ def export_pytorch( config: OnnxConfig, opset: int, output: Path, - tokenizer: "PreTrainedTokenizer" = None, + tokenizer: Optional["PreTrainedTokenizer"] = None, device: str = "cpu", ) -> Tuple[List[str], List[str]]: """ @@ -188,7 +188,7 @@ def export_tensorflow( config: OnnxConfig, opset: int, output: Path, - tokenizer: "PreTrainedTokenizer" = None, + tokenizer: Optional["PreTrainedTokenizer"] = None, ) -> Tuple[List[str], List[str]]: """ Export a TensorFlow model to an ONNX Intermediate Representation (IR) @@ -254,7 +254,7 @@ def export( config: OnnxConfig, opset: int, output: Path, - tokenizer: "PreTrainedTokenizer" = None, + tokenizer: Optional["PreTrainedTokenizer"] = None, device: str = "cpu", ) -> Tuple[List[str], List[str]]: """ @@ -321,7 +321,7 @@ def validate_model_outputs( onnx_model: Path, onnx_named_outputs: List[str], atol: float, - tokenizer: "PreTrainedTokenizer" = None, + tokenizer: Optional["PreTrainedTokenizer"] = None, ): from onnxruntime import InferenceSession, SessionOptions diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index 5875658ca5..6a88acf711 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -531,7 +531,7 @@ class BatchEncoding(UserDict): span = self._encodings[batch_index].word_to_tokens(word_index, sequence_index) return TokenSpan(*span) if span is not None else None - def token_to_chars(self, batch_or_token_index: int, token_index: Optional[int] = None) -> CharSpan: + def token_to_chars(self, batch_or_token_index: int, token_index: Optional[int] = None) -> Optional[CharSpan]: """ Get the character span corresponding to an encoded token in a sequence of the batch. @@ -2629,7 +2629,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin): text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None, add_special_tokens: bool = True, padding: Union[bool, str, PaddingStrategy] = False, - truncation: Union[bool, str, TruncationStrategy] = None, + truncation: Union[bool, str, TruncationStrategy, None] = None, max_length: Optional[int] = None, stride: int = 0, padding_side: Optional[str] = None, @@ -2810,15 +2810,15 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin): @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING) def __call__( self, - text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None, + text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput], None] = None, text_pair: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None, - text_target: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None, + text_target: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput], None] = None, text_pair_target: Optional[ Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] ] = None, add_special_tokens: bool = True, padding: Union[bool, str, PaddingStrategy] = False, - truncation: Union[bool, str, TruncationStrategy] = None, + truncation: Union[bool, str, TruncationStrategy, None] = None, max_length: Optional[int] = None, stride: int = 0, is_split_into_words: bool = False, @@ -2905,7 +2905,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin): text_pair: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None, add_special_tokens: bool = True, padding: Union[bool, str, PaddingStrategy] = False, - truncation: Union[bool, str, TruncationStrategy] = None, + truncation: Union[bool, str, TruncationStrategy, None] = None, max_length: Optional[int] = None, stride: int = 0, is_split_into_words: bool = False, @@ -3131,7 +3131,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin): ], add_special_tokens: bool = True, padding: Union[bool, str, PaddingStrategy] = False, - truncation: Union[bool, str, TruncationStrategy] = None, + truncation: Union[bool, str, TruncationStrategy, None] = None, max_length: Optional[int] = None, stride: int = 0, is_split_into_words: bool = False, @@ -3807,7 +3807,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin): self, sequences: Union[List[int], List[List[int]], "np.ndarray", "torch.Tensor", "tf.Tensor"], skip_special_tokens: bool = False, - clean_up_tokenization_spaces: bool = None, + clean_up_tokenization_spaces: Optional[bool] = None, **kwargs, ) -> List[str]: """ @@ -3841,7 +3841,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin): self, token_ids: Union[int, List[int], "np.ndarray", "torch.Tensor", "tf.Tensor"], skip_special_tokens: bool = False, - clean_up_tokenization_spaces: bool = None, + clean_up_tokenization_spaces: Optional[bool] = None, **kwargs, ) -> str: """ @@ -3878,7 +3878,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin): self, token_ids: Union[int, List[int]], skip_special_tokens: bool = False, - clean_up_tokenization_spaces: bool = None, + clean_up_tokenization_spaces: Optional[bool] = None, **kwargs, ) -> str: raise NotImplementedError diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index 189f09d3de..17ad614504 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -414,7 +414,7 @@ class Trainer: @deprecate_kwarg("tokenizer", new_name="processing_class", version="5.0.0", raise_if_both_names=True) def __init__( self, - model: Union[PreTrainedModel, nn.Module] = None, + model: Union[PreTrainedModel, nn.Module, None] = None, args: TrainingArguments = None, data_collator: Optional[DataCollator] = None, train_dataset: Optional[Union[Dataset, IterableDataset, "datasets.Dataset"]] = None, @@ -2139,7 +2139,7 @@ class Trainer: def train( self, resume_from_checkpoint: Optional[Union[str, bool]] = None, - trial: Union["optuna.Trial", dict[str, Any]] = None, + trial: Union["optuna.Trial", dict[str, Any], None] = None, ignore_keys_for_eval: Optional[list[str]] = None, **kwargs, ): @@ -4920,10 +4920,10 @@ class Trainer: logger.info(f" Num examples = {num_examples}") logger.info(f" Batch size = {batch_size}") - losses_host: torch.Tensor = None - preds_host: Union[torch.Tensor, list[torch.Tensor]] = None - labels_host: Union[torch.Tensor, list[torch.Tensor]] = None - inputs_host: Union[torch.Tensor, list[torch.Tensor]] = None + losses_host: Optional[torch.Tensor] = None + preds_host: Union[torch.Tensor, list[torch.Tensor], None] = None + labels_host: Union[torch.Tensor, list[torch.Tensor], None] = None + inputs_host: Union[torch.Tensor, list[torch.Tensor], None] = None metrics: Optional[dict] = None eval_set_kwargs: dict = {}