From 850cf4af0ce281d2c3e7ebfc12e0bc24a9c40714 Mon Sep 17 00:00:00 2001 From: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Date: Mon, 26 Jun 2023 18:36:47 +0200 Subject: [PATCH] Compute `dropout_probability` only in training mode (#24486) * fix * fix * fix * fix * fix * fix --------- Co-authored-by: ydshieh --- .../models/autoformer/modeling_autoformer.py | 16 +++++++++++----- src/transformers/models/bart/modeling_bart.py | 16 +++++++++++----- .../bigbird_pegasus/modeling_bigbird_pegasus.py | 16 +++++++++++----- .../models/biogpt/modeling_biogpt.py | 7 ++++--- .../models/blenderbot/modeling_blenderbot.py | 16 +++++++++++----- .../modeling_blenderbot_small.py | 16 +++++++++++----- .../modeling_conditional_detr.py | 16 +++++++++++----- src/transformers/models/detr/modeling_detr.py | 16 +++++++++++----- .../models/flaubert/modeling_flaubert.py | 7 ++++--- src/transformers/models/fsmt/modeling_fsmt.py | 7 ++++--- .../models/informer/modeling_informer.py | 16 +++++++++++----- src/transformers/models/led/modeling_led.py | 7 ++++--- .../models/marian/modeling_marian.py | 16 +++++++++++----- .../models/maskformer/modeling_maskformer.py | 7 ++++--- src/transformers/models/mbart/modeling_mbart.py | 16 +++++++++++----- src/transformers/models/mvp/modeling_mvp.py | 16 +++++++++++----- src/transformers/models/opt/modeling_opt.py | 7 ++++--- .../models/pegasus/modeling_pegasus.py | 16 +++++++++++----- .../models/pegasus_x/modeling_pegasus_x.py | 16 +++++++++++----- .../models/plbart/modeling_plbart.py | 16 +++++++++++----- .../speech_to_text/modeling_speech_to_text.py | 16 +++++++++++----- .../modeling_speech_to_text_2.py | 7 ++++--- .../modeling_table_transformer.py | 16 +++++++++++----- .../modeling_time_series_transformer.py | 16 +++++++++++----- src/transformers/models/trocr/modeling_trocr.py | 7 ++++--- .../models/whisper/modeling_whisper.py | 16 +++++++++++----- src/transformers/models/xglm/modeling_xglm.py | 7 ++++--- .../autoformer/test_modeling_autoformer.py | 6 +++++- 28 files changed, 239 insertions(+), 118 deletions(-) diff --git a/src/transformers/models/autoformer/modeling_autoformer.py b/src/transformers/models/autoformer/modeling_autoformer.py index 01c20dc52a..85dfe45ff4 100644 --- a/src/transformers/models/autoformer/modeling_autoformer.py +++ b/src/transformers/models/autoformer/modeling_autoformer.py @@ -1197,8 +1197,13 @@ class AutoformerEncoder(AutoformerPreTrainedModel): if output_hidden_states: encoder_states = encoder_states + (hidden_states,) # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) - dropout_probability = torch.rand([]) - if self.training and (dropout_probability < self.layerdrop): # skip the layer + to_drop = False + if self.training: + dropout_probability = torch.rand([]) + if dropout_probability < self.layerdrop: # skip the layer + to_drop = True + + if to_drop: layer_outputs = (None, None) else: if self.gradient_checkpointing and self.training: @@ -1407,9 +1412,10 @@ class AutoformerDecoder(AutoformerPreTrainedModel): # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) if output_hidden_states: all_hidden_states += (hidden_states,) - dropout_probability = torch.rand([]) - if self.training and (dropout_probability < self.layerdrop): - continue + if self.training: + dropout_probability = torch.rand([]) + if dropout_probability < self.layerdrop: + continue past_key_value = past_key_values[idx] if past_key_values is not None else None diff --git a/src/transformers/models/bart/modeling_bart.py b/src/transformers/models/bart/modeling_bart.py index 51afe26301..f426956594 100755 --- a/src/transformers/models/bart/modeling_bart.py +++ b/src/transformers/models/bart/modeling_bart.py @@ -836,8 +836,13 @@ class BartEncoder(BartPretrainedModel): if output_hidden_states: encoder_states = encoder_states + (hidden_states,) # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) - dropout_probability = torch.rand([]) - if self.training and (dropout_probability < self.layerdrop): # skip the layer + to_drop = False + if self.training: + dropout_probability = torch.rand([]) + if dropout_probability < self.layerdrop: # skip the layer + to_drop = True + + if to_drop: layer_outputs = (None, None) else: if self.gradient_checkpointing and self.training: @@ -1089,9 +1094,10 @@ class BartDecoder(BartPretrainedModel): # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) if output_hidden_states: all_hidden_states += (hidden_states,) - dropout_probability = torch.rand([]) - if self.training and (dropout_probability < self.layerdrop): - continue + if self.training: + dropout_probability = torch.rand([]) + if dropout_probability < self.layerdrop: + continue past_key_value = past_key_values[idx] if past_key_values is not None else None diff --git a/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py b/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py index e529aec5ec..d7683d6fcf 100755 --- a/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +++ b/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py @@ -1932,8 +1932,13 @@ class BigBirdPegasusEncoder(BigBirdPegasusPreTrainedModel): if output_hidden_states: encoder_states = encoder_states + (hidden_states,) # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) - dropout_probability = torch.rand([]) - if self.training and (dropout_probability < self.layerdrop): # skip the layer + to_drop = False + if self.training: + dropout_probability = torch.rand([]) + if dropout_probability < self.layerdrop: # skip the layer + to_drop = True + + if to_drop: layer_outputs = (None, None) else: if self.gradient_checkpointing and self.training: @@ -2275,9 +2280,10 @@ class BigBirdPegasusDecoder(BigBirdPegasusPreTrainedModel): # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) if output_hidden_states: all_hidden_states += (hidden_states,) - dropout_probability = torch.rand([]) - if self.training and (dropout_probability < self.layerdrop): - continue + if self.training: + dropout_probability = torch.rand([]) + if dropout_probability < self.layerdrop: + continue past_key_value = past_key_values[idx] if past_key_values is not None else None diff --git a/src/transformers/models/biogpt/modeling_biogpt.py b/src/transformers/models/biogpt/modeling_biogpt.py index 40fa81de9c..3e925917cf 100755 --- a/src/transformers/models/biogpt/modeling_biogpt.py +++ b/src/transformers/models/biogpt/modeling_biogpt.py @@ -578,9 +578,10 @@ class BioGptModel(BioGptPreTrainedModel): # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) if output_hidden_states: all_hidden_states += (hidden_states,) - dropout_probability = torch.rand([]) - if self.training and (dropout_probability < self.layerdrop): - continue + if self.training: + dropout_probability = torch.rand([]) + if dropout_probability < self.layerdrop: + continue past_key_value = past_key_values[idx] if past_key_values is not None else None diff --git a/src/transformers/models/blenderbot/modeling_blenderbot.py b/src/transformers/models/blenderbot/modeling_blenderbot.py index 3fe45ee216..8e582c4fa3 100755 --- a/src/transformers/models/blenderbot/modeling_blenderbot.py +++ b/src/transformers/models/blenderbot/modeling_blenderbot.py @@ -766,8 +766,13 @@ class BlenderbotEncoder(BlenderbotPreTrainedModel): if output_hidden_states: encoder_states = encoder_states + (hidden_states,) # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) - dropout_probability = torch.rand([]) - if self.training and (dropout_probability < self.layerdrop): # skip the layer + to_drop = False + if self.training: + dropout_probability = torch.rand([]) + if dropout_probability < self.layerdrop: # skip the layer + to_drop = True + + if to_drop: layer_outputs = (None, None) else: if self.gradient_checkpointing and self.training: @@ -1018,9 +1023,10 @@ class BlenderbotDecoder(BlenderbotPreTrainedModel): # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) if output_hidden_states: all_hidden_states += (hidden_states,) - dropout_probability = torch.rand([]) - if self.training and (dropout_probability < self.layerdrop): - continue + if self.training: + dropout_probability = torch.rand([]) + if dropout_probability < self.layerdrop: + continue past_key_value = past_key_values[idx] if past_key_values is not None else None diff --git a/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py b/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py index 5365546697..890b47373e 100755 --- a/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py +++ b/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py @@ -764,8 +764,13 @@ class BlenderbotSmallEncoder(BlenderbotSmallPreTrainedModel): if output_hidden_states: encoder_states = encoder_states + (hidden_states,) # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) - dropout_probability = torch.rand([]) - if self.training and (dropout_probability < self.layerdrop): # skip the layer + to_drop = False + if self.training: + dropout_probability = torch.rand([]) + if dropout_probability < self.layerdrop: # skip the layer + to_drop = True + + if to_drop: layer_outputs = (None, None) else: if self.gradient_checkpointing and self.training: @@ -1015,9 +1020,10 @@ class BlenderbotSmallDecoder(BlenderbotSmallPreTrainedModel): # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) if output_hidden_states: all_hidden_states += (hidden_states,) - dropout_probability = torch.rand([]) - if self.training and (dropout_probability < self.layerdrop): - continue + if self.training: + dropout_probability = torch.rand([]) + if dropout_probability < self.layerdrop: + continue past_key_value = past_key_values[idx] if past_key_values is not None else None diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index 979cef5b40..e42c4fc3ed 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -1223,8 +1223,13 @@ class ConditionalDetrEncoder(ConditionalDetrPreTrainedModel): if output_hidden_states: encoder_states = encoder_states + (hidden_states,) # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) - dropout_probability = torch.rand([]) - if self.training and (dropout_probability < self.layerdrop): # skip the layer + to_drop = False + if self.training: + dropout_probability = torch.rand([]) + if dropout_probability < self.layerdrop: # skip the layer + to_drop = True + + if to_drop: layer_outputs = (None, None) else: # we add position_embeddings as extra input to the encoder_layer @@ -1377,9 +1382,10 @@ class ConditionalDetrDecoder(ConditionalDetrPreTrainedModel): # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) if output_hidden_states: all_hidden_states += (hidden_states,) - dropout_probability = torch.rand([]) - if self.training and (dropout_probability < self.layerdrop): - continue + if self.training: + dropout_probability = torch.rand([]) + if dropout_probability < self.layerdrop: + continue if idx == 0: pos_transformation = 1 else: diff --git a/src/transformers/models/detr/modeling_detr.py b/src/transformers/models/detr/modeling_detr.py index 165c98f1e6..3e8925a49d 100644 --- a/src/transformers/models/detr/modeling_detr.py +++ b/src/transformers/models/detr/modeling_detr.py @@ -978,8 +978,13 @@ class DetrEncoder(DetrPreTrainedModel): if output_hidden_states: encoder_states = encoder_states + (hidden_states,) # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) - dropout_probability = torch.rand([]) - if self.training and (dropout_probability < self.layerdrop): # skip the layer + to_drop = False + if self.training: + dropout_probability = torch.rand([]) + if dropout_probability < self.layerdrop: # skip the layer + to_drop = True + + if to_drop: layer_outputs = (None, None) else: # we add position_embeddings as extra input to the encoder_layer @@ -1117,9 +1122,10 @@ class DetrDecoder(DetrPreTrainedModel): # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) if output_hidden_states: all_hidden_states += (hidden_states,) - dropout_probability = torch.rand([]) - if self.training and (dropout_probability < self.layerdrop): - continue + if self.training: + dropout_probability = torch.rand([]) + if dropout_probability < self.layerdrop: + continue if self.gradient_checkpointing and self.training: diff --git a/src/transformers/models/flaubert/modeling_flaubert.py b/src/transformers/models/flaubert/modeling_flaubert.py index 11f6f0fb3f..1b04da2410 100644 --- a/src/transformers/models/flaubert/modeling_flaubert.py +++ b/src/transformers/models/flaubert/modeling_flaubert.py @@ -579,9 +579,10 @@ class FlaubertModel(FlaubertPreTrainedModel): attentions = () if output_attentions else None for i in range(self.n_layers): # LayerDrop - dropout_probability = torch.rand([]) - if self.training and (dropout_probability < self.layerdrop): - continue + if self.training: + dropout_probability = torch.rand([]) + if dropout_probability < self.layerdrop: + continue if output_hidden_states: hidden_states = hidden_states + (tensor,) diff --git a/src/transformers/models/fsmt/modeling_fsmt.py b/src/transformers/models/fsmt/modeling_fsmt.py index 35d34324c7..255cf91df7 100644 --- a/src/transformers/models/fsmt/modeling_fsmt.py +++ b/src/transformers/models/fsmt/modeling_fsmt.py @@ -793,9 +793,10 @@ class FSMTDecoder(nn.Module): x = x.transpose(0, 1) all_hidden_states += (x,) x = x.transpose(0, 1) - dropout_probability = torch.rand([]) - if self.training and (dropout_probability < self.layerdrop): - continue + if self.training: + dropout_probability = torch.rand([]) + if dropout_probability < self.layerdrop: + continue layer_state = past_key_values[idx] if past_key_values is not None else None diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py index 1645cacd3d..543db1d608 100644 --- a/src/transformers/models/informer/modeling_informer.py +++ b/src/transformers/models/informer/modeling_informer.py @@ -1204,8 +1204,13 @@ class InformerEncoder(InformerPreTrainedModel): if output_hidden_states: encoder_states = encoder_states + (hidden_states,) # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) - dropout_probability = torch.rand([]) - if self.training and (dropout_probability < self.layerdrop): # skip the layer + to_drop = False + if self.training: + dropout_probability = torch.rand([]) + if dropout_probability < self.layerdrop: # skip the layer + to_drop = True + + if to_drop: layer_outputs = (None, None) else: if self.gradient_checkpointing and self.training: @@ -1424,9 +1429,10 @@ class InformerDecoder(InformerPreTrainedModel): # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) if output_hidden_states: all_hidden_states += (hidden_states,) - dropout_probability = torch.rand([]) - if self.training and (dropout_probability < self.layerdrop): - continue + if self.training: + dropout_probability = torch.rand([]) + if dropout_probability < self.layerdrop: + continue past_key_value = past_key_values[idx] if past_key_values is not None else None diff --git a/src/transformers/models/led/modeling_led.py b/src/transformers/models/led/modeling_led.py index 38400590d3..8de14242bf 100755 --- a/src/transformers/models/led/modeling_led.py +++ b/src/transformers/models/led/modeling_led.py @@ -2134,9 +2134,10 @@ class LEDDecoder(LEDPreTrainedModel): # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) if output_hidden_states: all_hidden_states += (hidden_states,) - dropout_probability = torch.rand([]) - if self.training and (dropout_probability < self.layerdrop): - continue + if self.training: + dropout_probability = torch.rand([]) + if dropout_probability < self.layerdrop: + continue past_key_value = past_key_values[idx] if past_key_values is not None else None diff --git a/src/transformers/models/marian/modeling_marian.py b/src/transformers/models/marian/modeling_marian.py index c1d6a67684..1d1cbe125e 100755 --- a/src/transformers/models/marian/modeling_marian.py +++ b/src/transformers/models/marian/modeling_marian.py @@ -777,8 +777,13 @@ class MarianEncoder(MarianPreTrainedModel): if output_hidden_states: encoder_states = encoder_states + (hidden_states,) # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) - dropout_probability = torch.rand([]) - if self.training and (dropout_probability < self.layerdrop): # skip the layer + to_drop = False + if self.training: + dropout_probability = torch.rand([]) + if dropout_probability < self.layerdrop: # skip the layer + to_drop = True + + if to_drop: layer_outputs = (None, None) else: if self.gradient_checkpointing and self.training: @@ -1023,9 +1028,10 @@ class MarianDecoder(MarianPreTrainedModel): # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) if output_hidden_states: all_hidden_states += (hidden_states,) - dropout_probability = torch.rand([]) - if self.training and (dropout_probability < self.layerdrop): - continue + if self.training: + dropout_probability = torch.rand([]) + if dropout_probability < self.layerdrop: + continue past_key_value = past_key_values[idx] if past_key_values is not None else None diff --git a/src/transformers/models/maskformer/modeling_maskformer.py b/src/transformers/models/maskformer/modeling_maskformer.py index 55efe64da3..39b0b4cdd4 100644 --- a/src/transformers/models/maskformer/modeling_maskformer.py +++ b/src/transformers/models/maskformer/modeling_maskformer.py @@ -763,9 +763,10 @@ class DetrDecoder(nn.Module): # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) if output_hidden_states: all_hidden_states += (hidden_states,) - dropout_probability = torch.rand([]) - if self.training and (dropout_probability < self.layerdrop): - continue + if self.training: + dropout_probability = torch.rand([]) + if dropout_probability < self.layerdrop: + continue if self.gradient_checkpointing and self.training: diff --git a/src/transformers/models/mbart/modeling_mbart.py b/src/transformers/models/mbart/modeling_mbart.py index 8a088b68ab..7bf6b1b37e 100755 --- a/src/transformers/models/mbart/modeling_mbart.py +++ b/src/transformers/models/mbart/modeling_mbart.py @@ -818,8 +818,13 @@ class MBartEncoder(MBartPreTrainedModel): if output_hidden_states: encoder_states = encoder_states + (hidden_states,) # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) - dropout_probability = torch.rand([]) - if self.training and (dropout_probability < self.layerdrop): # skip the layer + to_drop = False + if self.training: + dropout_probability = torch.rand([]) + if dropout_probability < self.layerdrop: # skip the layer + to_drop = True + + if to_drop: layer_outputs = (None, None) else: if self.gradient_checkpointing and self.training: @@ -1073,9 +1078,10 @@ class MBartDecoder(MBartPreTrainedModel): # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) if output_hidden_states: all_hidden_states += (hidden_states,) - dropout_probability = torch.rand([]) - if self.training and (dropout_probability < self.layerdrop): - continue + if self.training: + dropout_probability = torch.rand([]) + if dropout_probability < self.layerdrop: + continue past_key_value = past_key_values[idx] if past_key_values is not None else None diff --git a/src/transformers/models/mvp/modeling_mvp.py b/src/transformers/models/mvp/modeling_mvp.py index a1fca99dad..d135ee558d 100644 --- a/src/transformers/models/mvp/modeling_mvp.py +++ b/src/transformers/models/mvp/modeling_mvp.py @@ -940,8 +940,13 @@ class MvpEncoder(MvpPreTrainedModel): if output_hidden_states: encoder_states = encoder_states + (hidden_states,) # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) - dropout_probability = torch.rand([]) - if self.training and (dropout_probability < self.layerdrop): # skip the layer + to_drop = False + if self.training: + dropout_probability = torch.rand([]) + if dropout_probability < self.layerdrop: # skip the layer + to_drop = True + + if to_drop: layer_outputs = (None, None) else: if self.gradient_checkpointing and self.training: @@ -1215,9 +1220,10 @@ class MvpDecoder(MvpPreTrainedModel): # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) if output_hidden_states: all_hidden_states += (hidden_states,) - dropout_probability = torch.rand([]) - if self.training and (dropout_probability < self.layerdrop): - continue + if self.training: + dropout_probability = torch.rand([]) + if dropout_probability < self.layerdrop: + continue past_key_value = past_key_values[idx] if past_key_values is not None else None diff --git a/src/transformers/models/opt/modeling_opt.py b/src/transformers/models/opt/modeling_opt.py index 92c616bb63..5ad783b92d 100644 --- a/src/transformers/models/opt/modeling_opt.py +++ b/src/transformers/models/opt/modeling_opt.py @@ -684,9 +684,10 @@ class OPTDecoder(OPTPreTrainedModel): if output_hidden_states: all_hidden_states += (hidden_states,) - dropout_probability = torch.rand([]) - if self.training and (dropout_probability < self.layerdrop): - continue + if self.training: + dropout_probability = torch.rand([]) + if dropout_probability < self.layerdrop: + continue past_key_value = past_key_values[idx] if past_key_values is not None else None diff --git a/src/transformers/models/pegasus/modeling_pegasus.py b/src/transformers/models/pegasus/modeling_pegasus.py index 9565ee0d91..3eac50b327 100755 --- a/src/transformers/models/pegasus/modeling_pegasus.py +++ b/src/transformers/models/pegasus/modeling_pegasus.py @@ -792,8 +792,13 @@ class PegasusEncoder(PegasusPreTrainedModel): if output_hidden_states: encoder_states = encoder_states + (hidden_states,) # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) - dropout_probability = torch.rand([]) - if self.training and (dropout_probability < self.layerdrop): # skip the layer + to_drop = False + if self.training: + dropout_probability = torch.rand([]) + if dropout_probability < self.layerdrop: # skip the layer + to_drop = True + + if to_drop: layer_outputs = (None, None) else: if self.gradient_checkpointing and self.training: @@ -1073,9 +1078,10 @@ class PegasusDecoder(PegasusPreTrainedModel): # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) if output_hidden_states: all_hidden_states += (hidden_states,) - dropout_probability = torch.rand([]) - if self.training and (dropout_probability < self.layerdrop): - continue + if self.training: + dropout_probability = torch.rand([]) + if dropout_probability < self.layerdrop: + continue past_key_value = past_key_values[idx] if past_key_values is not None else None diff --git a/src/transformers/models/pegasus_x/modeling_pegasus_x.py b/src/transformers/models/pegasus_x/modeling_pegasus_x.py index 661cb85a3b..0763aec360 100755 --- a/src/transformers/models/pegasus_x/modeling_pegasus_x.py +++ b/src/transformers/models/pegasus_x/modeling_pegasus_x.py @@ -1059,8 +1059,13 @@ class PegasusXEncoder(PegasusXPreTrainedModel): if output_hidden_states: encoder_states = encoder_states + (hidden_states,) # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) - dropout_probability = torch.rand([]) - if self.training and (dropout_probability < self.layerdrop): # skip the layer + to_drop = False + if self.training: + dropout_probability = torch.rand([]) + if dropout_probability < self.layerdrop: # skip the layer + to_drop = True + + if to_drop: layer_outputs = (None, None) else: if self.gradient_checkpointing and self.training: @@ -1314,9 +1319,10 @@ class PegasusXDecoder(PegasusXPreTrainedModel): # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) if output_hidden_states: all_hidden_states += (hidden_states,) - dropout_probability = torch.rand([]) - if self.training and (dropout_probability < self.layerdrop): - continue + if self.training: + dropout_probability = torch.rand([]) + if dropout_probability < self.layerdrop: + continue past_key_value = past_key_values[idx] if past_key_values is not None else None diff --git a/src/transformers/models/plbart/modeling_plbart.py b/src/transformers/models/plbart/modeling_plbart.py index 2a80ae3d59..30d9bd0ddc 100644 --- a/src/transformers/models/plbart/modeling_plbart.py +++ b/src/transformers/models/plbart/modeling_plbart.py @@ -797,8 +797,13 @@ class PLBartEncoder(PLBartPreTrainedModel): if output_hidden_states: encoder_states = encoder_states + (hidden_states,) # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) - dropout_probability = torch.rand([]) - if self.training and (dropout_probability < self.layerdrop): # skip the layer + to_drop = False + if self.training: + dropout_probability = torch.rand([]) + if dropout_probability < self.layerdrop: # skip the layer + to_drop = True + + if to_drop: layer_outputs = (None, None) else: if self.gradient_checkpointing and self.training: @@ -1051,9 +1056,10 @@ class PLBartDecoder(PLBartPreTrainedModel): # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) if output_hidden_states: all_hidden_states += (hidden_states,) - dropout_probability = torch.rand([]) - if self.training and (dropout_probability < self.layerdrop): - continue + if self.training: + dropout_probability = torch.rand([]) + if dropout_probability < self.layerdrop: + continue past_key_value = past_key_values[idx] if past_key_values is not None else None diff --git a/src/transformers/models/speech_to_text/modeling_speech_to_text.py b/src/transformers/models/speech_to_text/modeling_speech_to_text.py index bca2669ae1..862dcac2ce 100755 --- a/src/transformers/models/speech_to_text/modeling_speech_to_text.py +++ b/src/transformers/models/speech_to_text/modeling_speech_to_text.py @@ -807,8 +807,13 @@ class Speech2TextEncoder(Speech2TextPreTrainedModel): if output_hidden_states: encoder_states = encoder_states + (hidden_states,) # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) - dropout_probability = torch.rand([]) - if self.training and (dropout_probability < self.layerdrop): # skip the layer + to_drop = False + if self.training: + dropout_probability = torch.rand([]) + if dropout_probability < self.layerdrop: # skip the layer + to_drop = True + + if to_drop: layer_outputs = (None, None) else: if self.gradient_checkpointing and self.training: @@ -1052,9 +1057,10 @@ class Speech2TextDecoder(Speech2TextPreTrainedModel): # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) if output_hidden_states: all_hidden_states += (hidden_states,) - dropout_probability = torch.rand([]) - if self.training and (dropout_probability < self.layerdrop): - continue + if self.training: + dropout_probability = torch.rand([]) + if dropout_probability < self.layerdrop: + continue past_key_value = past_key_values[idx] if past_key_values is not None else None diff --git a/src/transformers/models/speech_to_text_2/modeling_speech_to_text_2.py b/src/transformers/models/speech_to_text_2/modeling_speech_to_text_2.py index 31e9bc34c9..a04fd82d4b 100755 --- a/src/transformers/models/speech_to_text_2/modeling_speech_to_text_2.py +++ b/src/transformers/models/speech_to_text_2/modeling_speech_to_text_2.py @@ -661,9 +661,10 @@ class Speech2Text2Decoder(Speech2Text2PreTrainedModel): # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) if output_hidden_states: all_hidden_states += (hidden_states,) - dropout_probability = torch.rand([]) - if self.training and (dropout_probability < self.layerdrop): - continue + if self.training: + dropout_probability = torch.rand([]) + if dropout_probability < self.layerdrop: + continue past_key_value = past_key_values[idx] if past_key_values is not None else None diff --git a/src/transformers/models/table_transformer/modeling_table_transformer.py b/src/transformers/models/table_transformer/modeling_table_transformer.py index d2de059470..2c4458b0ed 100644 --- a/src/transformers/models/table_transformer/modeling_table_transformer.py +++ b/src/transformers/models/table_transformer/modeling_table_transformer.py @@ -919,8 +919,13 @@ class TableTransformerEncoder(TableTransformerPreTrainedModel): if output_hidden_states: encoder_states = encoder_states + (hidden_states,) # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) - dropout_probability = torch.rand([]) - if self.training and (dropout_probability < self.layerdrop): # skip the layer + to_drop = False + if self.training: + dropout_probability = torch.rand([]) + if dropout_probability < self.layerdrop: # skip the layer + to_drop = True + + if to_drop: layer_outputs = (None, None) else: # we add position_embeddings as extra input to the encoder_layer @@ -1061,9 +1066,10 @@ class TableTransformerDecoder(TableTransformerPreTrainedModel): # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) if output_hidden_states: all_hidden_states += (hidden_states,) - dropout_probability = torch.rand([]) - if self.training and (dropout_probability < self.layerdrop): - continue + if self.training: + dropout_probability = torch.rand([]) + if dropout_probability < self.layerdrop: + continue if self.gradient_checkpointing and self.training: diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index 477a52a57c..98de5e12b4 100644 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -936,8 +936,13 @@ class TimeSeriesTransformerEncoder(TimeSeriesTransformerPreTrainedModel): if output_hidden_states: encoder_states = encoder_states + (hidden_states,) # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) - dropout_probability = torch.rand([]) - if self.training and (dropout_probability < self.layerdrop): # skip the layer + to_drop = False + if self.training: + dropout_probability = torch.rand([]) + if dropout_probability < self.layerdrop: # skip the layer + to_drop = True + + if to_drop: layer_outputs = (None, None) else: if self.gradient_checkpointing and self.training: @@ -1150,9 +1155,10 @@ class TimeSeriesTransformerDecoder(TimeSeriesTransformerPreTrainedModel): # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) if output_hidden_states: all_hidden_states += (hidden_states,) - dropout_probability = torch.rand([]) - if self.training and (dropout_probability < self.layerdrop): - continue + if self.training: + dropout_probability = torch.rand([]) + if dropout_probability < self.layerdrop: + continue past_key_value = past_key_values[idx] if past_key_values is not None else None diff --git a/src/transformers/models/trocr/modeling_trocr.py b/src/transformers/models/trocr/modeling_trocr.py index ede83af6ed..3ad4ff1bac 100644 --- a/src/transformers/models/trocr/modeling_trocr.py +++ b/src/transformers/models/trocr/modeling_trocr.py @@ -693,9 +693,10 @@ class TrOCRDecoder(TrOCRPreTrainedModel): # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) if output_hidden_states: all_hidden_states += (hidden_states,) - dropout_probability = torch.rand([]) - if self.training and (dropout_probability < self.layerdrop): - continue + if self.training: + dropout_probability = torch.rand([]) + if dropout_probability < self.layerdrop: + continue past_key_value = past_key_values[idx] if past_key_values is not None else None diff --git a/src/transformers/models/whisper/modeling_whisper.py b/src/transformers/models/whisper/modeling_whisper.py index c5e9c94d3f..cffb281083 100644 --- a/src/transformers/models/whisper/modeling_whisper.py +++ b/src/transformers/models/whisper/modeling_whisper.py @@ -915,8 +915,13 @@ class WhisperEncoder(WhisperPreTrainedModel): if output_hidden_states: encoder_states = encoder_states + (hidden_states,) # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) - dropout_probability = torch.rand([]) - if self.training and (dropout_probability < self.layerdrop): # skip the layer + to_drop = False + if self.training: + dropout_probability = torch.rand([]) + if dropout_probability < self.layerdrop: # skip the layer + to_drop = True + + if to_drop: layer_outputs = (None, None) else: if self.gradient_checkpointing and self.training: @@ -1144,9 +1149,10 @@ class WhisperDecoder(WhisperPreTrainedModel): # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) if output_hidden_states: all_hidden_states += (hidden_states,) - dropout_probability = torch.rand([]) - if self.training and (dropout_probability < self.layerdrop): - continue + if self.training: + dropout_probability = torch.rand([]) + if dropout_probability < self.layerdrop: + continue past_key_value = past_key_values[idx] if past_key_values is not None else None diff --git a/src/transformers/models/xglm/modeling_xglm.py b/src/transformers/models/xglm/modeling_xglm.py index 19ae63199c..b7172127d9 100755 --- a/src/transformers/models/xglm/modeling_xglm.py +++ b/src/transformers/models/xglm/modeling_xglm.py @@ -667,9 +667,10 @@ class XGLMModel(XGLMPreTrainedModel): # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) if output_hidden_states: all_hidden_states += (hidden_states,) - dropout_probability = torch.rand([]) - if self.training and (dropout_probability < self.layerdrop): - continue + if self.training: + dropout_probability = torch.rand([]) + if dropout_probability < self.layerdrop: + continue past_key_value = past_key_values[idx] if past_key_values is not None else None diff --git a/tests/models/autoformer/test_modeling_autoformer.py b/tests/models/autoformer/test_modeling_autoformer.py index 9f0434689c..ab62d0e395 100644 --- a/tests/models/autoformer/test_modeling_autoformer.py +++ b/tests/models/autoformer/test_modeling_autoformer.py @@ -21,7 +21,7 @@ import unittest from huggingface_hub import hf_hub_download from transformers import is_torch_available -from transformers.testing_utils import require_torch, slow, torch_device +from transformers.testing_utils import is_flaky, require_torch, slow, torch_device from ...test_configuration_common import ConfigTester from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor @@ -380,6 +380,10 @@ class AutoformerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCa [self.model_tester.num_attention_heads, encoder_seq_length, dim], ) + @is_flaky() + def test_retain_grad_hidden_states_attentions(self): + super().test_retain_grad_hidden_states_attentions() + def prepare_batch(filename="train-batch.pt"): file = hf_hub_download(repo_id="hf-internal-testing/tourism-monthly-batch", filename=filename, repo_type="dataset")