From 232c898f9fd1eec956e7d3f9fbc78999992c5b4a Mon Sep 17 00:00:00 2001 From: "MS Kim(tony9402)" Date: Fri, 30 Jun 2023 03:17:35 +0900 Subject: [PATCH] Fix annotations (#24582) * fix annotations * fix annotations * fix annotations * fix annotations * fix annotations * fix annotations * fix annotations * fix annotations * fix annotations * fix annotations * fix annotations * fix annotations * fix annotations * fix annotations * fix annotations * fix annotations * fix annotations * fix annotations * fix annotations * fix annotations * fix annotations * fix annotations * fix annotations --- src/transformers/models/autoformer/modeling_autoformer.py | 2 +- src/transformers/models/bart/modeling_tf_bart.py | 6 +++--- .../models/bigbird_pegasus/modeling_bigbird_pegasus.py | 2 +- .../models/blenderbot/modeling_tf_blenderbot.py | 6 +++--- .../models/blenderbot_small/modeling_tf_blenderbot_small.py | 6 +++--- .../models/conditional_detr/modeling_conditional_detr.py | 2 +- src/transformers/models/deta/modeling_deta.py | 4 ++-- src/transformers/models/detr/modeling_detr.py | 6 +++--- src/transformers/models/informer/modeling_informer.py | 2 +- src/transformers/models/led/modeling_led.py | 6 +++--- src/transformers/models/led/modeling_tf_led.py | 6 +++--- src/transformers/models/marian/modeling_tf_marian.py | 6 +++--- src/transformers/models/maskformer/modeling_maskformer.py | 4 ++-- src/transformers/models/mbart/modeling_tf_mbart.py | 6 +++--- src/transformers/models/mvp/modeling_mvp.py | 2 +- src/transformers/models/nllb_moe/modeling_nllb_moe.py | 2 +- src/transformers/models/opt/modeling_tf_opt.py | 2 +- src/transformers/models/pegasus/modeling_tf_pegasus.py | 6 +++--- .../models/speech_to_text/modeling_tf_speech_to_text.py | 6 +++--- .../models/speech_to_text_2/modeling_speech_to_text_2.py | 4 ++-- .../models/table_transformer/modeling_table_transformer.py | 6 +++--- src/transformers/models/trocr/modeling_trocr.py | 4 ++-- src/transformers/models/whisper/modeling_tf_whisper.py | 6 +++--- src/transformers/models/xglm/modeling_tf_xglm.py | 4 ++-- .../modeling_tf_{{cookiecutter.lowercase_modelname}}.py | 6 +++--- .../modeling_{{cookiecutter.lowercase_modelname}}.py | 6 +++--- 26 files changed, 59 insertions(+), 59 deletions(-) diff --git a/src/transformers/models/autoformer/modeling_autoformer.py b/src/transformers/models/autoformer/modeling_autoformer.py index 85dfe45ff4..b3584ea221 100644 --- a/src/transformers/models/autoformer/modeling_autoformer.py +++ b/src/transformers/models/autoformer/modeling_autoformer.py @@ -735,7 +735,7 @@ class AutoformerEncoderLayer(nn.Module): ) -> Tuple[torch.FloatTensor, Optional[torch.FloatTensor]]: """ Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)` + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` attention_mask (`torch.FloatTensor`): attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size diff --git a/src/transformers/models/bart/modeling_tf_bart.py b/src/transformers/models/bart/modeling_tf_bart.py index e2555381f4..2199c80a21 100644 --- a/src/transformers/models/bart/modeling_tf_bart.py +++ b/src/transformers/models/bart/modeling_tf_bart.py @@ -321,7 +321,7 @@ class TFBartEncoderLayer(tf.keras.layers.Layer): ) -> tf.Tensor: """ Args: - hidden_states (`tf.Tensor`): input to the layer of shape `(seq_len, batch, embed_dim)` + hidden_states (`tf.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)` attention_mask (`tf.Tensor`): attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. layer_head_mask (`tf.Tensor`): mask for attention heads in a given layer of size @@ -394,11 +394,11 @@ class TFBartDecoderLayer(tf.keras.layers.Layer): ) -> Tuple[tf.Tensor, tf.Tensor, Tuple[Tuple[tf.Tensor]]]: """ Args: - hidden_states (`tf.Tensor`): input to the layer of shape `(seq_len, batch, embed_dim)` + hidden_states (`tf.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)` attention_mask (`tf.Tensor`): attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. encoder_hidden_states (`tf.Tensor`): - cross attention input to the layer of shape `(seq_len, batch, embed_dim)` + cross attention input to the layer of shape `(batch, seq_len, embed_dim)` encoder_attention_mask (`tf.Tensor`): encoder attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. layer_head_mask (`tf.Tensor`): mask for attention heads in a given layer of size diff --git a/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py b/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py index fe43c1e68e..a16403d709 100755 --- a/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +++ b/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py @@ -1388,7 +1388,7 @@ class BigBirdPegasusEncoderLayer(nn.Module): ): """ Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)` + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` attention_mask (`torch.FloatTensor`): attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. output_attentions (`bool`, *optional*): diff --git a/src/transformers/models/blenderbot/modeling_tf_blenderbot.py b/src/transformers/models/blenderbot/modeling_tf_blenderbot.py index d0e7455037..fdd85a7f87 100644 --- a/src/transformers/models/blenderbot/modeling_tf_blenderbot.py +++ b/src/transformers/models/blenderbot/modeling_tf_blenderbot.py @@ -317,7 +317,7 @@ class TFBlenderbotEncoderLayer(tf.keras.layers.Layer): ): """ Args: - hidden_states (`tf.Tensor`): input to the layer of shape *(seq_len, batch, embed_dim)* + hidden_states (`tf.Tensor`): input to the layer of shape *(batch, seq_len, embed_dim)* attention_mask (`tf.Tensor`): attention mask of size *(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values. layer_head_mask (`tf.Tensor`): mask for attention heads in a given layer of size @@ -391,11 +391,11 @@ class TFBlenderbotDecoderLayer(tf.keras.layers.Layer): ) -> Tuple[tf.Tensor, tf.Tensor, Tuple[Tuple[tf.Tensor]]]: """ Args: - hidden_states (`tf.Tensor`): input to the layer of shape *(seq_len, batch, embed_dim)* + hidden_states (`tf.Tensor`): input to the layer of shape *(batch, seq_len, embed_dim)* attention_mask (`tf.Tensor`): attention mask of size *(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values. encoder_hidden_states (`tf.Tensor`): - cross attention input to the layer of shape *(seq_len, batch, embed_dim)* + cross attention input to the layer of shape *(batch, seq_len, embed_dim)* encoder_attention_mask (`tf.Tensor`): encoder attention mask of size *(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values. layer_head_mask (`tf.Tensor`): mask for attention heads in a given layer of size diff --git a/src/transformers/models/blenderbot_small/modeling_tf_blenderbot_small.py b/src/transformers/models/blenderbot_small/modeling_tf_blenderbot_small.py index 2e8d2e11ca..09c49bea1b 100644 --- a/src/transformers/models/blenderbot_small/modeling_tf_blenderbot_small.py +++ b/src/transformers/models/blenderbot_small/modeling_tf_blenderbot_small.py @@ -317,7 +317,7 @@ class TFBlenderbotSmallEncoderLayer(tf.keras.layers.Layer): ) -> tf.Tensor: """ Args: - hidden_states (`tf.Tensor`): input to the layer of shape `(seq_len, batch, embed_dim)` + hidden_states (`tf.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)` attention_mask (`tf.Tensor`): attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. layer_head_mask (`tf.Tensor`): mask for attention heads in a given layer of size @@ -391,11 +391,11 @@ class TFBlenderbotSmallDecoderLayer(tf.keras.layers.Layer): ) -> Tuple[tf.Tensor, tf.Tensor, Tuple[Tuple[tf.Tensor]]]: """ Args: - hidden_states (`tf.Tensor`): input to the layer of shape `(seq_len, batch, embed_dim)` + hidden_states (`tf.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)` attention_mask (`tf.Tensor`): attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. encoder_hidden_states (`tf.Tensor`): - cross attention input to the layer of shape `(seq_len, batch, embed_dim)` + cross attention input to the layer of shape `(batch, seq_len, embed_dim)` encoder_attention_mask (`tf.Tensor`): encoder attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. layer_head_mask (`tf.Tensor`): mask for attention heads in a given layer of size diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index e42c4fc3ed..fc36732cc4 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -795,7 +795,7 @@ class ConditionalDetrEncoderLayer(nn.Module): ): """ Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)` + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` attention_mask (`torch.FloatTensor`): attention mask of size `(batch, 1, target_len, source_len)` where padding elements are indicated by very large negative values. diff --git a/src/transformers/models/deta/modeling_deta.py b/src/transformers/models/deta/modeling_deta.py index b5fe0ea8a8..07e6c535f7 100644 --- a/src/transformers/models/deta/modeling_deta.py +++ b/src/transformers/models/deta/modeling_deta.py @@ -851,7 +851,7 @@ class DetaDecoderLayer(nn.Module): """ Args: hidden_states (`torch.FloatTensor`): - Input to the layer of shape `(seq_len, batch, embed_dim)`. + Input to the layer of shape `(batch, seq_len, embed_dim)`. position_embeddings (`torch.FloatTensor`, *optional*): Position embeddings that are added to the queries and keys in the self-attention layer. reference_points (`torch.FloatTensor`, *optional*): @@ -861,7 +861,7 @@ class DetaDecoderLayer(nn.Module): level_start_index (`torch.LongTensor`, *optional*): Level start index. encoder_hidden_states (`torch.FloatTensor`): - cross attention input to the layer of shape `(seq_len, batch, embed_dim)` + cross attention input to the layer of shape `(batch, seq_len, embed_dim)` encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size `(batch, 1, target_len, source_len)` where padding elements are indicated by very large negative values. diff --git a/src/transformers/models/detr/modeling_detr.py b/src/transformers/models/detr/modeling_detr.py index 3e8925a49d..3476e1e2e7 100644 --- a/src/transformers/models/detr/modeling_detr.py +++ b/src/transformers/models/detr/modeling_detr.py @@ -642,7 +642,7 @@ class DetrEncoderLayer(nn.Module): ): """ Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)` + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` attention_mask (`torch.FloatTensor`): attention mask of size `(batch, 1, target_len, source_len)` where padding elements are indicated by very large negative values. @@ -723,7 +723,7 @@ class DetrDecoderLayer(nn.Module): ): """ Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)` + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` attention_mask (`torch.FloatTensor`): attention mask of size `(batch, 1, target_len, source_len)` where padding elements are indicated by very large negative values. @@ -734,7 +734,7 @@ class DetrDecoderLayer(nn.Module): position embeddings that are added to the queries and keys in the self-attention layer. encoder_hidden_states (`torch.FloatTensor`): - cross attention input to the layer of shape `(seq_len, batch, embed_dim)` + cross attention input to the layer of shape `(batch, seq_len, embed_dim)` encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size `(batch, 1, target_len, source_len)` where padding elements are indicated by very large negative values. diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py index 543db1d608..50919a8afa 100644 --- a/src/transformers/models/informer/modeling_informer.py +++ b/src/transformers/models/informer/modeling_informer.py @@ -738,7 +738,7 @@ class InformerEncoderLayer(nn.Module): ) -> Tuple[torch.FloatTensor, Optional[torch.FloatTensor]]: """ Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)` + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` attention_mask (`torch.FloatTensor`): attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size diff --git a/src/transformers/models/led/modeling_led.py b/src/transformers/models/led/modeling_led.py index d98c8d2967..cf9b19d28a 100755 --- a/src/transformers/models/led/modeling_led.py +++ b/src/transformers/models/led/modeling_led.py @@ -962,7 +962,7 @@ class LEDEncoderLayer(nn.Module): ): """ Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape *(seq_len, batch, embed_dim)* + hidden_states (`torch.FloatTensor`): input to the layer of shape *(batch, seq_len, embed_dim)* attention_mask (`torch.FloatTensor`): attention mask of size *(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values. layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size @@ -1040,11 +1040,11 @@ class LEDDecoderLayer(nn.Module): ): """ Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape *(seq_len, batch, embed_dim)* + hidden_states (`torch.FloatTensor`): input to the layer of shape *(batch, seq_len, embed_dim)* attention_mask (`torch.FloatTensor`): attention mask of size *(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values. encoder_hidden_states (`torch.FloatTensor`): - cross attention input to the layer of shape *(seq_len, batch, embed_dim)* + cross attention input to the layer of shape *(batch, seq_len, embed_dim)* encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size *(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values. layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size diff --git a/src/transformers/models/led/modeling_tf_led.py b/src/transformers/models/led/modeling_tf_led.py index a661f4b703..b508c8432d 100644 --- a/src/transformers/models/led/modeling_tf_led.py +++ b/src/transformers/models/led/modeling_tf_led.py @@ -1181,7 +1181,7 @@ class TFLEDEncoderLayer(tf.keras.layers.Layer): ): """ Args: - hidden_states (`tf.Tensor`): input to the layer of shape *(seq_len, batch, embed_dim)* + hidden_states (`tf.Tensor`): input to the layer of shape *(batch, seq_len, embed_dim)* attention_mask (`tf.Tensor`): attention mask of size *(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values. layer_head_mask (`tf.Tensor`): mask for attention heads in a given layer of size @@ -1256,11 +1256,11 @@ class TFLEDDecoderLayer(tf.keras.layers.Layer): ) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor, Tuple[Tuple[tf.Tensor]]]: """ Args: - hidden_states (`tf.Tensor`): input to the layer of shape *(seq_len, batch, embed_dim)* + hidden_states (`tf.Tensor`): input to the layer of shape *(batch, seq_len, embed_dim)* attention_mask (`tf.Tensor`): attention mask of size *(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values. encoder_hidden_states (`tf.Tensor`): - cross attention input to the layer of shape *(seq_len, batch, embed_dim)* + cross attention input to the layer of shape *(batch, seq_len, embed_dim)* encoder_attention_mask (`tf.Tensor`): encoder attention mask of size *(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values. layer_head_mask (`tf.Tensor`): mask for attention heads in a given layer of size diff --git a/src/transformers/models/marian/modeling_tf_marian.py b/src/transformers/models/marian/modeling_tf_marian.py index 9632ddeaac..8e0cecb99b 100644 --- a/src/transformers/models/marian/modeling_tf_marian.py +++ b/src/transformers/models/marian/modeling_tf_marian.py @@ -354,7 +354,7 @@ class TFMarianEncoderLayer(tf.keras.layers.Layer): ) -> tf.Tensor: """ Args: - hidden_states (`tf.Tensor`): input to the layer of shape `(seq_len, batch, embed_dim)` + hidden_states (`tf.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)` attention_mask (`tf.Tensor`): attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. layer_head_mask (`tf.Tensor`): mask for attention heads in a given layer of size @@ -428,11 +428,11 @@ class TFMarianDecoderLayer(tf.keras.layers.Layer): ) -> Tuple[tf.Tensor, tf.Tensor, Tuple[Tuple[tf.Tensor]]]: """ Args: - hidden_states (`tf.Tensor`): input to the layer of shape `(seq_len, batch, embed_dim)` + hidden_states (`tf.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)` attention_mask (`tf.Tensor`): attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. encoder_hidden_states (`tf.Tensor`): - cross attention input to the layer of shape `(seq_len, batch, embed_dim)` + cross attention input to the layer of shape `(batch, seq_len, embed_dim)` encoder_attention_mask (`tf.Tensor`): encoder attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. layer_head_mask (`tf.Tensor`): mask for attention heads in a given layer of size diff --git a/src/transformers/models/maskformer/modeling_maskformer.py b/src/transformers/models/maskformer/modeling_maskformer.py index 39b0b4cdd4..98f0cdb504 100644 --- a/src/transformers/models/maskformer/modeling_maskformer.py +++ b/src/transformers/models/maskformer/modeling_maskformer.py @@ -571,7 +571,7 @@ class DetrDecoderLayer(nn.Module): ): """ Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)` + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` attention_mask (`torch.FloatTensor`): attention mask of size `(batch, 1, target_len, source_len)` where padding elements are indicated by very large negative values. @@ -582,7 +582,7 @@ class DetrDecoderLayer(nn.Module): position embeddings that are added to the queries and keys in the self-attention layer. encoder_hidden_states (`torch.FloatTensor`): - cross attention input to the layer of shape `(seq_len, batch, embed_dim)` + cross attention input to the layer of shape `(batch, seq_len, embed_dim)` encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size `(batch, 1, target_len, source_len)` where padding elements are indicated by very large negative values. diff --git a/src/transformers/models/mbart/modeling_tf_mbart.py b/src/transformers/models/mbart/modeling_tf_mbart.py index b0e2d141f4..04d489ec2c 100644 --- a/src/transformers/models/mbart/modeling_tf_mbart.py +++ b/src/transformers/models/mbart/modeling_tf_mbart.py @@ -322,7 +322,7 @@ class TFMBartEncoderLayer(tf.keras.layers.Layer): ): """ Args: - hidden_states (`tf.Tensor`): input to the layer of shape *(seq_len, batch, embed_dim)* + hidden_states (`tf.Tensor`): input to the layer of shape *(batch, seq_len, embed_dim)* attention_mask (`tf.Tensor`): attention mask of size *(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values. layer_head_mask (`tf.Tensor`): mask for attention heads in a given layer of size @@ -395,11 +395,11 @@ class TFMBartDecoderLayer(tf.keras.layers.Layer): ) -> Tuple[tf.Tensor, tf.Tensor, Tuple[Tuple[tf.Tensor]]]: """ Args: - hidden_states (`tf.Tensor`): input to the layer of shape *(seq_len, batch, embed_dim)* + hidden_states (`tf.Tensor`): input to the layer of shape *(batch, seq_len, embed_dim)* attention_mask (`tf.Tensor`): attention mask of size *(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values. encoder_hidden_states (`tf.Tensor`): - cross attention input to the layer of shape *(seq_len, batch, embed_dim)* + cross attention input to the layer of shape *(batch, seq_len, embed_dim)* encoder_attention_mask (`tf.Tensor`): encoder attention mask of size *(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values. layer_head_mask (`tf.Tensor`): mask for attention heads in a given layer of size diff --git a/src/transformers/models/mvp/modeling_mvp.py b/src/transformers/models/mvp/modeling_mvp.py index 92e393b39e..8982fbf1a1 100644 --- a/src/transformers/models/mvp/modeling_mvp.py +++ b/src/transformers/models/mvp/modeling_mvp.py @@ -327,7 +327,7 @@ class MvpEncoderLayer(nn.Module): ) -> Tuple[torch.FloatTensor, Optional[torch.FloatTensor]]: """ Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)` + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` attention_mask (`torch.FloatTensor`): attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size diff --git a/src/transformers/models/nllb_moe/modeling_nllb_moe.py b/src/transformers/models/nllb_moe/modeling_nllb_moe.py index 2173145558..f3c597fa3c 100644 --- a/src/transformers/models/nllb_moe/modeling_nllb_moe.py +++ b/src/transformers/models/nllb_moe/modeling_nllb_moe.py @@ -672,7 +672,7 @@ class NllbMoeEncoderLayer(nn.Module): """ Args: hidden_states (`torch.FloatTensor`): - input to the layer of shape `(seq_len, batch, embed_dim)` + input to the layer of shape `(batch, seq_len, embed_dim)` attention_mask (`torch.FloatTensor`): attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. diff --git a/src/transformers/models/opt/modeling_tf_opt.py b/src/transformers/models/opt/modeling_tf_opt.py index bc76a30ce8..cabed90e91 100644 --- a/src/transformers/models/opt/modeling_tf_opt.py +++ b/src/transformers/models/opt/modeling_tf_opt.py @@ -303,7 +303,7 @@ class TFOPTDecoderLayer(tf.keras.layers.Layer): ) -> Tuple[tf.Tensor, tf.Tensor, Tuple[Tuple[tf.Tensor]]]: """ Args: - hidden_states (`tf.Tensor`): input to the layer of shape `(seq_len, batch, embed_dim)` + hidden_states (`tf.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)` attention_mask (`tf.Tensor`, *optional*): attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. layer_head_mask (`tf.Tensor`, *optional*): mask for attention heads in a given layer of size diff --git a/src/transformers/models/pegasus/modeling_tf_pegasus.py b/src/transformers/models/pegasus/modeling_tf_pegasus.py index 15c87b938b..52171b884c 100644 --- a/src/transformers/models/pegasus/modeling_tf_pegasus.py +++ b/src/transformers/models/pegasus/modeling_tf_pegasus.py @@ -356,7 +356,7 @@ class TFPegasusEncoderLayer(tf.keras.layers.Layer): ): """ Args: - hidden_states (`tf.Tensor`): input to the layer of shape *(seq_len, batch, embed_dim)* + hidden_states (`tf.Tensor`): input to the layer of shape *(batch, seq_len, embed_dim)* attention_mask (`tf.Tensor`): attention mask of size *(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values. layer_head_mask (`tf.Tensor`): mask for attention heads in a given layer of size @@ -430,11 +430,11 @@ class TFPegasusDecoderLayer(tf.keras.layers.Layer): ) -> Tuple[tf.Tensor, tf.Tensor, Tuple[Tuple[tf.Tensor]]]: """ Args: - hidden_states (`tf.Tensor`): input to the layer of shape *(seq_len, batch, embed_dim)* + hidden_states (`tf.Tensor`): input to the layer of shape *(batch, seq_len, embed_dim)* attention_mask (`tf.Tensor`): attention mask of size *(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values. encoder_hidden_states (`tf.Tensor`): - cross attention input to the layer of shape *(seq_len, batch, embed_dim)* + cross attention input to the layer of shape *(batch, seq_len, embed_dim)* encoder_attention_mask (`tf.Tensor`): encoder attention mask of size *(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values. layer_head_mask (`tf.Tensor`): mask for attention heads in a given layer of size diff --git a/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py b/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py index 86ec432e49..026d2241b4 100755 --- a/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py +++ b/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py @@ -400,7 +400,7 @@ class TFSpeech2TextEncoderLayer(tf.keras.layers.Layer): ): """ Args: - hidden_states (`tf.Tensor`): input to the layer of shape `(seq_len, batch, embed_dim)` + hidden_states (`tf.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)` attention_mask (`tf.Tensor`): attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. layer_head_mask (`tf.Tensor`): mask for attention heads in a given layer of size @@ -477,11 +477,11 @@ class TFSpeech2TextDecoderLayer(tf.keras.layers.Layer): ) -> Tuple[tf.Tensor, tf.Tensor, Tuple[Tuple[tf.Tensor]]]: """ Args: - hidden_states (`tf.Tensor`): input to the layer of shape `(seq_len, batch, embed_dim)` + hidden_states (`tf.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)` attention_mask (`tf.Tensor`): attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. encoder_hidden_states (`tf.Tensor`): - cross attention input to the layer of shape `(seq_len, batch, embed_dim)` + cross attention input to the layer of shape `(batch, seq_len, embed_dim)` encoder_attention_mask (`tf.Tensor`): encoder attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. layer_head_mask (`tf.Tensor`): mask for attention heads in a given layer of size diff --git a/src/transformers/models/speech_to_text_2/modeling_speech_to_text_2.py b/src/transformers/models/speech_to_text_2/modeling_speech_to_text_2.py index 822025e40a..e4270bc332 100755 --- a/src/transformers/models/speech_to_text_2/modeling_speech_to_text_2.py +++ b/src/transformers/models/speech_to_text_2/modeling_speech_to_text_2.py @@ -345,11 +345,11 @@ class Speech2Text2DecoderLayer(nn.Module): ): """ Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)` + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` attention_mask (`torch.FloatTensor`): attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. encoder_hidden_states (`torch.FloatTensor`): - cross attention input to the layer of shape `(seq_len, batch, embed_dim)` + cross attention input to the layer of shape `(batch, seq_len, embed_dim)` encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size diff --git a/src/transformers/models/table_transformer/modeling_table_transformer.py b/src/transformers/models/table_transformer/modeling_table_transformer.py index 2c4458b0ed..df2cf23b65 100644 --- a/src/transformers/models/table_transformer/modeling_table_transformer.py +++ b/src/transformers/models/table_transformer/modeling_table_transformer.py @@ -584,7 +584,7 @@ class TableTransformerEncoderLayer(nn.Module): ): """ Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)` + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` attention_mask (`torch.FloatTensor`): attention mask of size `(batch, 1, target_len, source_len)` where padding elements are indicated by very large negative values. @@ -668,7 +668,7 @@ class TableTransformerDecoderLayer(nn.Module): ): """ Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)` + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` attention_mask (`torch.FloatTensor`): attention mask of size `(batch, 1, target_len, source_len)` where padding elements are indicated by very large negative values. @@ -679,7 +679,7 @@ class TableTransformerDecoderLayer(nn.Module): position embeddings that are added to the queries and keys in the self-attention layer. encoder_hidden_states (`torch.FloatTensor`): - cross attention input to the layer of shape `(seq_len, batch, embed_dim)` + cross attention input to the layer of shape `(batch, seq_len, embed_dim)` encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size `(batch, 1, target_len, source_len)` where padding elements are indicated by very large negative values. diff --git a/src/transformers/models/trocr/modeling_trocr.py b/src/transformers/models/trocr/modeling_trocr.py index cd4e522bf5..e645230356 100644 --- a/src/transformers/models/trocr/modeling_trocr.py +++ b/src/transformers/models/trocr/modeling_trocr.py @@ -358,11 +358,11 @@ class TrOCRDecoderLayer(nn.Module): ): """ Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)` + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` attention_mask (`torch.FloatTensor`): attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. encoder_hidden_states (`torch.FloatTensor`): - cross attention input to the layer of shape `(seq_len, batch, embed_dim)` + cross attention input to the layer of shape `(batch, seq_len, embed_dim)` encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size diff --git a/src/transformers/models/whisper/modeling_tf_whisper.py b/src/transformers/models/whisper/modeling_tf_whisper.py index c36450dd12..653dd168ae 100644 --- a/src/transformers/models/whisper/modeling_tf_whisper.py +++ b/src/transformers/models/whisper/modeling_tf_whisper.py @@ -313,7 +313,7 @@ class TFWhisperEncoderLayer(tf.keras.layers.Layer): ): """ Args: - hidden_states (`tf.Tensor`): input to the layer of shape `(seq_len, batch, embed_dim)` + hidden_states (`tf.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)` attention_mask (`tf.Tensor`): attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. layer_head_mask (`tf.Tensor`): mask for attention heads in a given layer of size @@ -391,11 +391,11 @@ class TFWhisperDecoderLayer(tf.keras.layers.Layer): ) -> Tuple[tf.Tensor, tf.Tensor, Tuple[Tuple[tf.Tensor]]]: """ Args: - hidden_states (`tf.Tensor`): input to the layer of shape `(seq_len, batch, embed_dim)` + hidden_states (`tf.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)` attention_mask (`tf.Tensor`): attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. encoder_hidden_states (`tf.Tensor`): - cross attention input to the layer of shape `(seq_len, batch, embed_dim)` + cross attention input to the layer of shape `(batch, seq_len, embed_dim)` encoder_attention_mask (`tf.Tensor`): encoder attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. layer_head_mask (`tf.Tensor`): mask for attention heads in a given layer of size diff --git a/src/transformers/models/xglm/modeling_tf_xglm.py b/src/transformers/models/xglm/modeling_tf_xglm.py index 639eb912f3..b18c50b795 100644 --- a/src/transformers/models/xglm/modeling_tf_xglm.py +++ b/src/transformers/models/xglm/modeling_tf_xglm.py @@ -348,11 +348,11 @@ class TFXGLMDecoderLayer(tf.keras.layers.Layer): ) -> Tuple[tf.Tensor, tf.Tensor, Tuple[Tuple[tf.Tensor]]]: """ Args: - hidden_states (`tf.Tensor`): input to the layer of shape *(seq_len, batch, embed_dim)* + hidden_states (`tf.Tensor`): input to the layer of shape *(batch, seq_len, embed_dim)* attention_mask (`tf.Tensor`): attention mask of size *(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values. encoder_hidden_states (`tf.Tensor`): - cross attention input to the layer of shape *(seq_len, batch, embed_dim)* + cross attention input to the layer of shape *(batch, seq_len, embed_dim)* encoder_attention_mask (`tf.Tensor`): encoder attention mask of size *(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values. layer_head_mask (`tf.Tensor`): mask for attention heads in a given layer of size diff --git a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_tf_{{cookiecutter.lowercase_modelname}}.py b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_tf_{{cookiecutter.lowercase_modelname}}.py index 53057814e3..c59a78a345 100644 --- a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_tf_{{cookiecutter.lowercase_modelname}}.py +++ b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_tf_{{cookiecutter.lowercase_modelname}}.py @@ -1794,7 +1794,7 @@ class TF{{cookiecutter.camelcase_modelname}}EncoderLayer(tf.keras.layers.Layer): def call(self, hidden_states: tf.Tensor, attention_mask: tf.Tensor, layer_head_mask: tf.Tensor, training=False): """ Args: - hidden_states (`tf.Tensor`): input to the layer of shape *(seq_len, batch, embed_dim)* + hidden_states (`tf.Tensor`): input to the layer of shape *(batch, seq_len, embed_dim)* attention_mask (`tf.Tensor`): attention mask of size *(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values. layer_head_mask (`tf.Tensor`): mask for attention heads in a given layer of size @@ -1867,10 +1867,10 @@ class TF{{cookiecutter.camelcase_modelname}}DecoderLayer(tf.keras.layers.Layer): ) -> Tuple[tf.Tensor, tf.Tensor, Tuple[Tuple[tf.Tensor]]]: """ Args: - hidden_states (`tf.Tensor`): input to the layer of shape *(seq_len, batch, embed_dim)* + hidden_states (`tf.Tensor`): input to the layer of shape *(batch, seq_len, embed_dim)* attention_mask (`tf.Tensor`): attention mask of size *(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values. - encoder_hidden_states (`tf.Tensor`): cross attention input to the layer of shape *(seq_len, batch, embed_dim)* + encoder_hidden_states (`tf.Tensor`): cross attention input to the layer of shape *(batch, seq_len, embed_dim)* encoder_attention_mask (`tf.Tensor`): encoder attention mask of size *(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values. layer_head_mask (`tf.Tensor`): mask for attention heads in a given layer of size diff --git a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_{{cookiecutter.lowercase_modelname}}.py b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_{{cookiecutter.lowercase_modelname}}.py index 879100aeaa..426cb5c413 100755 --- a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_{{cookiecutter.lowercase_modelname}}.py +++ b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_{{cookiecutter.lowercase_modelname}}.py @@ -1826,7 +1826,7 @@ class {{cookiecutter.camelcase_modelname}}EncoderLayer(nn.Module): ): """ Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape *(seq_len, batch, embed_dim)* + hidden_states (`torch.FloatTensor`): input to the layer of shape *(batch, seq_len, embed_dim)* attention_mask (`torch.FloatTensor`): attention mask of size *(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values. layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size @@ -1907,10 +1907,10 @@ class {{cookiecutter.camelcase_modelname}}DecoderLayer(nn.Module): ): """ Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape *(seq_len, batch, embed_dim)* + hidden_states (`torch.FloatTensor`): input to the layer of shape *(batch, seq_len, embed_dim)* attention_mask (`torch.FloatTensor`): attention mask of size *(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values. - encoder_hidden_states (`torch.FloatTensor`): cross attention input to the layer of shape *(seq_len, batch, embed_dim)* + encoder_hidden_states (`torch.FloatTensor`): cross attention input to the layer of shape *(batch, seq_len, embed_dim)* encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size *(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values. layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size