Compute dropout_probability only in training mode (#24486)
* fix
* fix
* fix
* fix
* fix
* fix

---------

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
This commit is contained in:
@@ -1197,8 +1197,13 @@ class AutoformerEncoder(AutoformerPreTrainedModel):
|
|||||||
if output_hidden_states:
|
if output_hidden_states:
|
||||||
encoder_states = encoder_states + (hidden_states,)
|
encoder_states = encoder_states + (hidden_states,)
|
||||||
# add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
|
# add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
|
||||||
|
to_drop = False
|
||||||
|
if self.training:
|
||||||
dropout_probability = torch.rand([])
|
dropout_probability = torch.rand([])
|
||||||
if self.training and (dropout_probability < self.layerdrop): # skip the layer
|
if dropout_probability < self.layerdrop: # skip the layer
|
||||||
|
to_drop = True
|
||||||
|
|
||||||
|
if to_drop:
|
||||||
layer_outputs = (None, None)
|
layer_outputs = (None, None)
|
||||||
else:
|
else:
|
||||||
if self.gradient_checkpointing and self.training:
|
if self.gradient_checkpointing and self.training:
|
||||||
@@ -1407,8 +1412,9 @@ class AutoformerDecoder(AutoformerPreTrainedModel):
|
|||||||
# add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
|
# add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
|
||||||
if output_hidden_states:
|
if output_hidden_states:
|
||||||
all_hidden_states += (hidden_states,)
|
all_hidden_states += (hidden_states,)
|
||||||
|
if self.training:
|
||||||
dropout_probability = torch.rand([])
|
dropout_probability = torch.rand([])
|
||||||
if self.training and (dropout_probability < self.layerdrop):
|
if dropout_probability < self.layerdrop:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
past_key_value = past_key_values[idx] if past_key_values is not None else None
|
past_key_value = past_key_values[idx] if past_key_values is not None else None
|
||||||
|
|||||||
@@ -836,8 +836,13 @@ class BartEncoder(BartPretrainedModel):
|
|||||||
if output_hidden_states:
|
if output_hidden_states:
|
||||||
encoder_states = encoder_states + (hidden_states,)
|
encoder_states = encoder_states + (hidden_states,)
|
||||||
# add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
|
# add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
|
||||||
|
to_drop = False
|
||||||
|
if self.training:
|
||||||
dropout_probability = torch.rand([])
|
dropout_probability = torch.rand([])
|
||||||
if self.training and (dropout_probability < self.layerdrop): # skip the layer
|
if dropout_probability < self.layerdrop: # skip the layer
|
||||||
|
to_drop = True
|
||||||
|
|
||||||
|
if to_drop:
|
||||||
layer_outputs = (None, None)
|
layer_outputs = (None, None)
|
||||||
else:
|
else:
|
||||||
if self.gradient_checkpointing and self.training:
|
if self.gradient_checkpointing and self.training:
|
||||||
@@ -1089,8 +1094,9 @@ class BartDecoder(BartPretrainedModel):
|
|||||||
# add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
|
# add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
|
||||||
if output_hidden_states:
|
if output_hidden_states:
|
||||||
all_hidden_states += (hidden_states,)
|
all_hidden_states += (hidden_states,)
|
||||||
|
if self.training:
|
||||||
dropout_probability = torch.rand([])
|
dropout_probability = torch.rand([])
|
||||||
if self.training and (dropout_probability < self.layerdrop):
|
if dropout_probability < self.layerdrop:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
past_key_value = past_key_values[idx] if past_key_values is not None else None
|
past_key_value = past_key_values[idx] if past_key_values is not None else None
|
||||||
|
|||||||
@@ -1932,8 +1932,13 @@ class BigBirdPegasusEncoder(BigBirdPegasusPreTrainedModel):
|
|||||||
if output_hidden_states:
|
if output_hidden_states:
|
||||||
encoder_states = encoder_states + (hidden_states,)
|
encoder_states = encoder_states + (hidden_states,)
|
||||||
# add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
|
# add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
|
||||||
|
to_drop = False
|
||||||
|
if self.training:
|
||||||
dropout_probability = torch.rand([])
|
dropout_probability = torch.rand([])
|
||||||
if self.training and (dropout_probability < self.layerdrop): # skip the layer
|
if dropout_probability < self.layerdrop: # skip the layer
|
||||||
|
to_drop = True
|
||||||
|
|
||||||
|
if to_drop:
|
||||||
layer_outputs = (None, None)
|
layer_outputs = (None, None)
|
||||||
else:
|
else:
|
||||||
if self.gradient_checkpointing and self.training:
|
if self.gradient_checkpointing and self.training:
|
||||||
@@ -2275,8 +2280,9 @@ class BigBirdPegasusDecoder(BigBirdPegasusPreTrainedModel):
|
|||||||
# add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
|
# add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
|
||||||
if output_hidden_states:
|
if output_hidden_states:
|
||||||
all_hidden_states += (hidden_states,)
|
all_hidden_states += (hidden_states,)
|
||||||
|
if self.training:
|
||||||
dropout_probability = torch.rand([])
|
dropout_probability = torch.rand([])
|
||||||
if self.training and (dropout_probability < self.layerdrop):
|
if dropout_probability < self.layerdrop:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
past_key_value = past_key_values[idx] if past_key_values is not None else None
|
past_key_value = past_key_values[idx] if past_key_values is not None else None
|
||||||
|
|||||||
@@ -578,8 +578,9 @@ class BioGptModel(BioGptPreTrainedModel):
|
|||||||
# add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
|
# add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
|
||||||
if output_hidden_states:
|
if output_hidden_states:
|
||||||
all_hidden_states += (hidden_states,)
|
all_hidden_states += (hidden_states,)
|
||||||
|
if self.training:
|
||||||
dropout_probability = torch.rand([])
|
dropout_probability = torch.rand([])
|
||||||
if self.training and (dropout_probability < self.layerdrop):
|
if dropout_probability < self.layerdrop:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
past_key_value = past_key_values[idx] if past_key_values is not None else None
|
past_key_value = past_key_values[idx] if past_key_values is not None else None
|
||||||
|
|||||||
@@ -766,8 +766,13 @@ class BlenderbotEncoder(BlenderbotPreTrainedModel):
|
|||||||
if output_hidden_states:
|
if output_hidden_states:
|
||||||
encoder_states = encoder_states + (hidden_states,)
|
encoder_states = encoder_states + (hidden_states,)
|
||||||
# add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
|
# add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
|
||||||
|
to_drop = False
|
||||||
|
if self.training:
|
||||||
dropout_probability = torch.rand([])
|
dropout_probability = torch.rand([])
|
||||||
if self.training and (dropout_probability < self.layerdrop): # skip the layer
|
if dropout_probability < self.layerdrop: # skip the layer
|
||||||
|
to_drop = True
|
||||||
|
|
||||||
|
if to_drop:
|
||||||
layer_outputs = (None, None)
|
layer_outputs = (None, None)
|
||||||
else:
|
else:
|
||||||
if self.gradient_checkpointing and self.training:
|
if self.gradient_checkpointing and self.training:
|
||||||
@@ -1018,8 +1023,9 @@ class BlenderbotDecoder(BlenderbotPreTrainedModel):
|
|||||||
# add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
|
# add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
|
||||||
if output_hidden_states:
|
if output_hidden_states:
|
||||||
all_hidden_states += (hidden_states,)
|
all_hidden_states += (hidden_states,)
|
||||||
|
if self.training:
|
||||||
dropout_probability = torch.rand([])
|
dropout_probability = torch.rand([])
|
||||||
if self.training and (dropout_probability < self.layerdrop):
|
if dropout_probability < self.layerdrop:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
past_key_value = past_key_values[idx] if past_key_values is not None else None
|
past_key_value = past_key_values[idx] if past_key_values is not None else None
|
||||||
|
|||||||
@@ -764,8 +764,13 @@ class BlenderbotSmallEncoder(BlenderbotSmallPreTrainedModel):
|
|||||||
if output_hidden_states:
|
if output_hidden_states:
|
||||||
encoder_states = encoder_states + (hidden_states,)
|
encoder_states = encoder_states + (hidden_states,)
|
||||||
# add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
|
# add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
|
||||||
|
to_drop = False
|
||||||
|
if self.training:
|
||||||
dropout_probability = torch.rand([])
|
dropout_probability = torch.rand([])
|
||||||
if self.training and (dropout_probability < self.layerdrop): # skip the layer
|
if dropout_probability < self.layerdrop: # skip the layer
|
||||||
|
to_drop = True
|
||||||
|
|
||||||
|
if to_drop:
|
||||||
layer_outputs = (None, None)
|
layer_outputs = (None, None)
|
||||||
else:
|
else:
|
||||||
if self.gradient_checkpointing and self.training:
|
if self.gradient_checkpointing and self.training:
|
||||||
@@ -1015,8 +1020,9 @@ class BlenderbotSmallDecoder(BlenderbotSmallPreTrainedModel):
|
|||||||
# add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
|
# add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
|
||||||
if output_hidden_states:
|
if output_hidden_states:
|
||||||
all_hidden_states += (hidden_states,)
|
all_hidden_states += (hidden_states,)
|
||||||
|
if self.training:
|
||||||
dropout_probability = torch.rand([])
|
dropout_probability = torch.rand([])
|
||||||
if self.training and (dropout_probability < self.layerdrop):
|
if dropout_probability < self.layerdrop:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
past_key_value = past_key_values[idx] if past_key_values is not None else None
|
past_key_value = past_key_values[idx] if past_key_values is not None else None
|
||||||
|
|||||||
@@ -1223,8 +1223,13 @@ class ConditionalDetrEncoder(ConditionalDetrPreTrainedModel):
|
|||||||
if output_hidden_states:
|
if output_hidden_states:
|
||||||
encoder_states = encoder_states + (hidden_states,)
|
encoder_states = encoder_states + (hidden_states,)
|
||||||
# add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
|
# add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
|
||||||
|
to_drop = False
|
||||||
|
if self.training:
|
||||||
dropout_probability = torch.rand([])
|
dropout_probability = torch.rand([])
|
||||||
if self.training and (dropout_probability < self.layerdrop): # skip the layer
|
if dropout_probability < self.layerdrop: # skip the layer
|
||||||
|
to_drop = True
|
||||||
|
|
||||||
|
if to_drop:
|
||||||
layer_outputs = (None, None)
|
layer_outputs = (None, None)
|
||||||
else:
|
else:
|
||||||
# we add position_embeddings as extra input to the encoder_layer
|
# we add position_embeddings as extra input to the encoder_layer
|
||||||
@@ -1377,8 +1382,9 @@ class ConditionalDetrDecoder(ConditionalDetrPreTrainedModel):
|
|||||||
# add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
|
# add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
|
||||||
if output_hidden_states:
|
if output_hidden_states:
|
||||||
all_hidden_states += (hidden_states,)
|
all_hidden_states += (hidden_states,)
|
||||||
|
if self.training:
|
||||||
dropout_probability = torch.rand([])
|
dropout_probability = torch.rand([])
|
||||||
if self.training and (dropout_probability < self.layerdrop):
|
if dropout_probability < self.layerdrop:
|
||||||
continue
|
continue
|
||||||
if idx == 0:
|
if idx == 0:
|
||||||
pos_transformation = 1
|
pos_transformation = 1
|
||||||
|
|||||||
@@ -978,8 +978,13 @@ class DetrEncoder(DetrPreTrainedModel):
|
|||||||
if output_hidden_states:
|
if output_hidden_states:
|
||||||
encoder_states = encoder_states + (hidden_states,)
|
encoder_states = encoder_states + (hidden_states,)
|
||||||
# add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
|
# add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
|
||||||
|
to_drop = False
|
||||||
|
if self.training:
|
||||||
dropout_probability = torch.rand([])
|
dropout_probability = torch.rand([])
|
||||||
if self.training and (dropout_probability < self.layerdrop): # skip the layer
|
if dropout_probability < self.layerdrop: # skip the layer
|
||||||
|
to_drop = True
|
||||||
|
|
||||||
|
if to_drop:
|
||||||
layer_outputs = (None, None)
|
layer_outputs = (None, None)
|
||||||
else:
|
else:
|
||||||
# we add position_embeddings as extra input to the encoder_layer
|
# we add position_embeddings as extra input to the encoder_layer
|
||||||
@@ -1117,8 +1122,9 @@ class DetrDecoder(DetrPreTrainedModel):
|
|||||||
# add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
|
# add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
|
||||||
if output_hidden_states:
|
if output_hidden_states:
|
||||||
all_hidden_states += (hidden_states,)
|
all_hidden_states += (hidden_states,)
|
||||||
|
if self.training:
|
||||||
dropout_probability = torch.rand([])
|
dropout_probability = torch.rand([])
|
||||||
if self.training and (dropout_probability < self.layerdrop):
|
if dropout_probability < self.layerdrop:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if self.gradient_checkpointing and self.training:
|
if self.gradient_checkpointing and self.training:
|
||||||
|
|||||||
@@ -579,8 +579,9 @@ class FlaubertModel(FlaubertPreTrainedModel):
|
|||||||
attentions = () if output_attentions else None
|
attentions = () if output_attentions else None
|
||||||
for i in range(self.n_layers):
|
for i in range(self.n_layers):
|
||||||
# LayerDrop
|
# LayerDrop
|
||||||
|
if self.training:
|
||||||
dropout_probability = torch.rand([])
|
dropout_probability = torch.rand([])
|
||||||
if self.training and (dropout_probability < self.layerdrop):
|
if dropout_probability < self.layerdrop:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if output_hidden_states:
|
if output_hidden_states:
|
||||||
|
|||||||
@@ -793,8 +793,9 @@ class FSMTDecoder(nn.Module):
|
|||||||
x = x.transpose(0, 1)
|
x = x.transpose(0, 1)
|
||||||
all_hidden_states += (x,)
|
all_hidden_states += (x,)
|
||||||
x = x.transpose(0, 1)
|
x = x.transpose(0, 1)
|
||||||
|
if self.training:
|
||||||
dropout_probability = torch.rand([])
|
dropout_probability = torch.rand([])
|
||||||
if self.training and (dropout_probability < self.layerdrop):
|
if dropout_probability < self.layerdrop:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
layer_state = past_key_values[idx] if past_key_values is not None else None
|
layer_state = past_key_values[idx] if past_key_values is not None else None
|
||||||
|
|||||||
@@ -1204,8 +1204,13 @@ class InformerEncoder(InformerPreTrainedModel):
|
|||||||
if output_hidden_states:
|
if output_hidden_states:
|
||||||
encoder_states = encoder_states + (hidden_states,)
|
encoder_states = encoder_states + (hidden_states,)
|
||||||
# add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
|
# add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
|
||||||
|
to_drop = False
|
||||||
|
if self.training:
|
||||||
dropout_probability = torch.rand([])
|
dropout_probability = torch.rand([])
|
||||||
if self.training and (dropout_probability < self.layerdrop): # skip the layer
|
if dropout_probability < self.layerdrop: # skip the layer
|
||||||
|
to_drop = True
|
||||||
|
|
||||||
|
if to_drop:
|
||||||
layer_outputs = (None, None)
|
layer_outputs = (None, None)
|
||||||
else:
|
else:
|
||||||
if self.gradient_checkpointing and self.training:
|
if self.gradient_checkpointing and self.training:
|
||||||
@@ -1424,8 +1429,9 @@ class InformerDecoder(InformerPreTrainedModel):
|
|||||||
# add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
|
# add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
|
||||||
if output_hidden_states:
|
if output_hidden_states:
|
||||||
all_hidden_states += (hidden_states,)
|
all_hidden_states += (hidden_states,)
|
||||||
|
if self.training:
|
||||||
dropout_probability = torch.rand([])
|
dropout_probability = torch.rand([])
|
||||||
if self.training and (dropout_probability < self.layerdrop):
|
if dropout_probability < self.layerdrop:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
past_key_value = past_key_values[idx] if past_key_values is not None else None
|
past_key_value = past_key_values[idx] if past_key_values is not None else None
|
||||||
|
|||||||
@@ -2134,8 +2134,9 @@ class LEDDecoder(LEDPreTrainedModel):
|
|||||||
# add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
|
# add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
|
||||||
if output_hidden_states:
|
if output_hidden_states:
|
||||||
all_hidden_states += (hidden_states,)
|
all_hidden_states += (hidden_states,)
|
||||||
|
if self.training:
|
||||||
dropout_probability = torch.rand([])
|
dropout_probability = torch.rand([])
|
||||||
if self.training and (dropout_probability < self.layerdrop):
|
if dropout_probability < self.layerdrop:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
past_key_value = past_key_values[idx] if past_key_values is not None else None
|
past_key_value = past_key_values[idx] if past_key_values is not None else None
|
||||||
|
|||||||
@@ -777,8 +777,13 @@ class MarianEncoder(MarianPreTrainedModel):
|
|||||||
if output_hidden_states:
|
if output_hidden_states:
|
||||||
encoder_states = encoder_states + (hidden_states,)
|
encoder_states = encoder_states + (hidden_states,)
|
||||||
# add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
|
# add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
|
||||||
|
to_drop = False
|
||||||
|
if self.training:
|
||||||
dropout_probability = torch.rand([])
|
dropout_probability = torch.rand([])
|
||||||
if self.training and (dropout_probability < self.layerdrop): # skip the layer
|
if dropout_probability < self.layerdrop: # skip the layer
|
||||||
|
to_drop = True
|
||||||
|
|
||||||
|
if to_drop:
|
||||||
layer_outputs = (None, None)
|
layer_outputs = (None, None)
|
||||||
else:
|
else:
|
||||||
if self.gradient_checkpointing and self.training:
|
if self.gradient_checkpointing and self.training:
|
||||||
@@ -1023,8 +1028,9 @@ class MarianDecoder(MarianPreTrainedModel):
|
|||||||
# add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
|
# add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
|
||||||
if output_hidden_states:
|
if output_hidden_states:
|
||||||
all_hidden_states += (hidden_states,)
|
all_hidden_states += (hidden_states,)
|
||||||
|
if self.training:
|
||||||
dropout_probability = torch.rand([])
|
dropout_probability = torch.rand([])
|
||||||
if self.training and (dropout_probability < self.layerdrop):
|
if dropout_probability < self.layerdrop:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
past_key_value = past_key_values[idx] if past_key_values is not None else None
|
past_key_value = past_key_values[idx] if past_key_values is not None else None
|
||||||
|
|||||||
@@ -763,8 +763,9 @@ class DetrDecoder(nn.Module):
|
|||||||
# add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
|
# add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
|
||||||
if output_hidden_states:
|
if output_hidden_states:
|
||||||
all_hidden_states += (hidden_states,)
|
all_hidden_states += (hidden_states,)
|
||||||
|
if self.training:
|
||||||
dropout_probability = torch.rand([])
|
dropout_probability = torch.rand([])
|
||||||
if self.training and (dropout_probability < self.layerdrop):
|
if dropout_probability < self.layerdrop:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if self.gradient_checkpointing and self.training:
|
if self.gradient_checkpointing and self.training:
|
||||||
|
|||||||
@@ -818,8 +818,13 @@ class MBartEncoder(MBartPreTrainedModel):
|
|||||||
if output_hidden_states:
|
if output_hidden_states:
|
||||||
encoder_states = encoder_states + (hidden_states,)
|
encoder_states = encoder_states + (hidden_states,)
|
||||||
# add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
|
# add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
|
||||||
|
to_drop = False
|
||||||
|
if self.training:
|
||||||
dropout_probability = torch.rand([])
|
dropout_probability = torch.rand([])
|
||||||
if self.training and (dropout_probability < self.layerdrop): # skip the layer
|
if dropout_probability < self.layerdrop: # skip the layer
|
||||||
|
to_drop = True
|
||||||
|
|
||||||
|
if to_drop:
|
||||||
layer_outputs = (None, None)
|
layer_outputs = (None, None)
|
||||||
else:
|
else:
|
||||||
if self.gradient_checkpointing and self.training:
|
if self.gradient_checkpointing and self.training:
|
||||||
@@ -1073,8 +1078,9 @@ class MBartDecoder(MBartPreTrainedModel):
|
|||||||
# add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
|
# add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
|
||||||
if output_hidden_states:
|
if output_hidden_states:
|
||||||
all_hidden_states += (hidden_states,)
|
all_hidden_states += (hidden_states,)
|
||||||
|
if self.training:
|
||||||
dropout_probability = torch.rand([])
|
dropout_probability = torch.rand([])
|
||||||
if self.training and (dropout_probability < self.layerdrop):
|
if dropout_probability < self.layerdrop:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
past_key_value = past_key_values[idx] if past_key_values is not None else None
|
past_key_value = past_key_values[idx] if past_key_values is not None else None
|
||||||
|
|||||||
@@ -940,8 +940,13 @@ class MvpEncoder(MvpPreTrainedModel):
|
|||||||
if output_hidden_states:
|
if output_hidden_states:
|
||||||
encoder_states = encoder_states + (hidden_states,)
|
encoder_states = encoder_states + (hidden_states,)
|
||||||
# add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
|
# add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
|
||||||
|
to_drop = False
|
||||||
|
if self.training:
|
||||||
dropout_probability = torch.rand([])
|
dropout_probability = torch.rand([])
|
||||||
if self.training and (dropout_probability < self.layerdrop): # skip the layer
|
if dropout_probability < self.layerdrop: # skip the layer
|
||||||
|
to_drop = True
|
||||||
|
|
||||||
|
if to_drop:
|
||||||
layer_outputs = (None, None)
|
layer_outputs = (None, None)
|
||||||
else:
|
else:
|
||||||
if self.gradient_checkpointing and self.training:
|
if self.gradient_checkpointing and self.training:
|
||||||
@@ -1215,8 +1220,9 @@ class MvpDecoder(MvpPreTrainedModel):
|
|||||||
# add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
|
# add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
|
||||||
if output_hidden_states:
|
if output_hidden_states:
|
||||||
all_hidden_states += (hidden_states,)
|
all_hidden_states += (hidden_states,)
|
||||||
|
if self.training:
|
||||||
dropout_probability = torch.rand([])
|
dropout_probability = torch.rand([])
|
||||||
if self.training and (dropout_probability < self.layerdrop):
|
if dropout_probability < self.layerdrop:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
past_key_value = past_key_values[idx] if past_key_values is not None else None
|
past_key_value = past_key_values[idx] if past_key_values is not None else None
|
||||||
|
|||||||
@@ -684,8 +684,9 @@ class OPTDecoder(OPTPreTrainedModel):
|
|||||||
if output_hidden_states:
|
if output_hidden_states:
|
||||||
all_hidden_states += (hidden_states,)
|
all_hidden_states += (hidden_states,)
|
||||||
|
|
||||||
|
if self.training:
|
||||||
dropout_probability = torch.rand([])
|
dropout_probability = torch.rand([])
|
||||||
if self.training and (dropout_probability < self.layerdrop):
|
if dropout_probability < self.layerdrop:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
past_key_value = past_key_values[idx] if past_key_values is not None else None
|
past_key_value = past_key_values[idx] if past_key_values is not None else None
|
||||||
|
|||||||
@@ -792,8 +792,13 @@ class PegasusEncoder(PegasusPreTrainedModel):
|
|||||||
if output_hidden_states:
|
if output_hidden_states:
|
||||||
encoder_states = encoder_states + (hidden_states,)
|
encoder_states = encoder_states + (hidden_states,)
|
||||||
# add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
|
# add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
|
||||||
|
to_drop = False
|
||||||
|
if self.training:
|
||||||
dropout_probability = torch.rand([])
|
dropout_probability = torch.rand([])
|
||||||
if self.training and (dropout_probability < self.layerdrop): # skip the layer
|
if dropout_probability < self.layerdrop: # skip the layer
|
||||||
|
to_drop = True
|
||||||
|
|
||||||
|
if to_drop:
|
||||||
layer_outputs = (None, None)
|
layer_outputs = (None, None)
|
||||||
else:
|
else:
|
||||||
if self.gradient_checkpointing and self.training:
|
if self.gradient_checkpointing and self.training:
|
||||||
@@ -1073,8 +1078,9 @@ class PegasusDecoder(PegasusPreTrainedModel):
|
|||||||
# add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
|
# add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
|
||||||
if output_hidden_states:
|
if output_hidden_states:
|
||||||
all_hidden_states += (hidden_states,)
|
all_hidden_states += (hidden_states,)
|
||||||
|
if self.training:
|
||||||
dropout_probability = torch.rand([])
|
dropout_probability = torch.rand([])
|
||||||
if self.training and (dropout_probability < self.layerdrop):
|
if dropout_probability < self.layerdrop:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
past_key_value = past_key_values[idx] if past_key_values is not None else None
|
past_key_value = past_key_values[idx] if past_key_values is not None else None
|
||||||
|
|||||||
@@ -1059,8 +1059,13 @@ class PegasusXEncoder(PegasusXPreTrainedModel):
|
|||||||
if output_hidden_states:
|
if output_hidden_states:
|
||||||
encoder_states = encoder_states + (hidden_states,)
|
encoder_states = encoder_states + (hidden_states,)
|
||||||
# add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
|
# add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
|
||||||
|
to_drop = False
|
||||||
|
if self.training:
|
||||||
dropout_probability = torch.rand([])
|
dropout_probability = torch.rand([])
|
||||||
if self.training and (dropout_probability < self.layerdrop): # skip the layer
|
if dropout_probability < self.layerdrop: # skip the layer
|
||||||
|
to_drop = True
|
||||||
|
|
||||||
|
if to_drop:
|
||||||
layer_outputs = (None, None)
|
layer_outputs = (None, None)
|
||||||
else:
|
else:
|
||||||
if self.gradient_checkpointing and self.training:
|
if self.gradient_checkpointing and self.training:
|
||||||
@@ -1314,8 +1319,9 @@ class PegasusXDecoder(PegasusXPreTrainedModel):
|
|||||||
# add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
|
# add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
|
||||||
if output_hidden_states:
|
if output_hidden_states:
|
||||||
all_hidden_states += (hidden_states,)
|
all_hidden_states += (hidden_states,)
|
||||||
|
if self.training:
|
||||||
dropout_probability = torch.rand([])
|
dropout_probability = torch.rand([])
|
||||||
if self.training and (dropout_probability < self.layerdrop):
|
if dropout_probability < self.layerdrop:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
past_key_value = past_key_values[idx] if past_key_values is not None else None
|
past_key_value = past_key_values[idx] if past_key_values is not None else None
|
||||||
|
|||||||
@@ -797,8 +797,13 @@ class PLBartEncoder(PLBartPreTrainedModel):
|
|||||||
if output_hidden_states:
|
if output_hidden_states:
|
||||||
encoder_states = encoder_states + (hidden_states,)
|
encoder_states = encoder_states + (hidden_states,)
|
||||||
# add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
|
# add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
|
||||||
|
to_drop = False
|
||||||
|
if self.training:
|
||||||
dropout_probability = torch.rand([])
|
dropout_probability = torch.rand([])
|
||||||
if self.training and (dropout_probability < self.layerdrop): # skip the layer
|
if dropout_probability < self.layerdrop: # skip the layer
|
||||||
|
to_drop = True
|
||||||
|
|
||||||
|
if to_drop:
|
||||||
layer_outputs = (None, None)
|
layer_outputs = (None, None)
|
||||||
else:
|
else:
|
||||||
if self.gradient_checkpointing and self.training:
|
if self.gradient_checkpointing and self.training:
|
||||||
@@ -1051,8 +1056,9 @@ class PLBartDecoder(PLBartPreTrainedModel):
|
|||||||
# add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
|
# add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
|
||||||
if output_hidden_states:
|
if output_hidden_states:
|
||||||
all_hidden_states += (hidden_states,)
|
all_hidden_states += (hidden_states,)
|
||||||
|
if self.training:
|
||||||
dropout_probability = torch.rand([])
|
dropout_probability = torch.rand([])
|
||||||
if self.training and (dropout_probability < self.layerdrop):
|
if dropout_probability < self.layerdrop:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
past_key_value = past_key_values[idx] if past_key_values is not None else None
|
past_key_value = past_key_values[idx] if past_key_values is not None else None
|
||||||
|
|||||||
@@ -807,8 +807,13 @@ class Speech2TextEncoder(Speech2TextPreTrainedModel):
|
|||||||
if output_hidden_states:
|
if output_hidden_states:
|
||||||
encoder_states = encoder_states + (hidden_states,)
|
encoder_states = encoder_states + (hidden_states,)
|
||||||
# add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
|
# add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
|
||||||
|
to_drop = False
|
||||||
|
if self.training:
|
||||||
dropout_probability = torch.rand([])
|
dropout_probability = torch.rand([])
|
||||||
if self.training and (dropout_probability < self.layerdrop): # skip the layer
|
if dropout_probability < self.layerdrop: # skip the layer
|
||||||
|
to_drop = True
|
||||||
|
|
||||||
|
if to_drop:
|
||||||
layer_outputs = (None, None)
|
layer_outputs = (None, None)
|
||||||
else:
|
else:
|
||||||
if self.gradient_checkpointing and self.training:
|
if self.gradient_checkpointing and self.training:
|
||||||
@@ -1052,8 +1057,9 @@ class Speech2TextDecoder(Speech2TextPreTrainedModel):
|
|||||||
# add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
|
# add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
|
||||||
if output_hidden_states:
|
if output_hidden_states:
|
||||||
all_hidden_states += (hidden_states,)
|
all_hidden_states += (hidden_states,)
|
||||||
|
if self.training:
|
||||||
dropout_probability = torch.rand([])
|
dropout_probability = torch.rand([])
|
||||||
if self.training and (dropout_probability < self.layerdrop):
|
if dropout_probability < self.layerdrop:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
past_key_value = past_key_values[idx] if past_key_values is not None else None
|
past_key_value = past_key_values[idx] if past_key_values is not None else None
|
||||||
|
|||||||
@@ -661,8 +661,9 @@ class Speech2Text2Decoder(Speech2Text2PreTrainedModel):
|
|||||||
# add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
|
# add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
|
||||||
if output_hidden_states:
|
if output_hidden_states:
|
||||||
all_hidden_states += (hidden_states,)
|
all_hidden_states += (hidden_states,)
|
||||||
|
if self.training:
|
||||||
dropout_probability = torch.rand([])
|
dropout_probability = torch.rand([])
|
||||||
if self.training and (dropout_probability < self.layerdrop):
|
if dropout_probability < self.layerdrop:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
past_key_value = past_key_values[idx] if past_key_values is not None else None
|
past_key_value = past_key_values[idx] if past_key_values is not None else None
|
||||||
|
|||||||
@@ -919,8 +919,13 @@ class TableTransformerEncoder(TableTransformerPreTrainedModel):
|
|||||||
if output_hidden_states:
|
if output_hidden_states:
|
||||||
encoder_states = encoder_states + (hidden_states,)
|
encoder_states = encoder_states + (hidden_states,)
|
||||||
# add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
|
# add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
|
||||||
|
to_drop = False
|
||||||
|
if self.training:
|
||||||
dropout_probability = torch.rand([])
|
dropout_probability = torch.rand([])
|
||||||
if self.training and (dropout_probability < self.layerdrop): # skip the layer
|
if dropout_probability < self.layerdrop: # skip the layer
|
||||||
|
to_drop = True
|
||||||
|
|
||||||
|
if to_drop:
|
||||||
layer_outputs = (None, None)
|
layer_outputs = (None, None)
|
||||||
else:
|
else:
|
||||||
# we add position_embeddings as extra input to the encoder_layer
|
# we add position_embeddings as extra input to the encoder_layer
|
||||||
@@ -1061,8 +1066,9 @@ class TableTransformerDecoder(TableTransformerPreTrainedModel):
|
|||||||
# add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
|
# add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
|
||||||
if output_hidden_states:
|
if output_hidden_states:
|
||||||
all_hidden_states += (hidden_states,)
|
all_hidden_states += (hidden_states,)
|
||||||
|
if self.training:
|
||||||
dropout_probability = torch.rand([])
|
dropout_probability = torch.rand([])
|
||||||
if self.training and (dropout_probability < self.layerdrop):
|
if dropout_probability < self.layerdrop:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if self.gradient_checkpointing and self.training:
|
if self.gradient_checkpointing and self.training:
|
||||||
|
|||||||
@@ -936,8 +936,13 @@ class TimeSeriesTransformerEncoder(TimeSeriesTransformerPreTrainedModel):
|
|||||||
if output_hidden_states:
|
if output_hidden_states:
|
||||||
encoder_states = encoder_states + (hidden_states,)
|
encoder_states = encoder_states + (hidden_states,)
|
||||||
# add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
|
# add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
|
||||||
|
to_drop = False
|
||||||
|
if self.training:
|
||||||
dropout_probability = torch.rand([])
|
dropout_probability = torch.rand([])
|
||||||
if self.training and (dropout_probability < self.layerdrop): # skip the layer
|
if dropout_probability < self.layerdrop: # skip the layer
|
||||||
|
to_drop = True
|
||||||
|
|
||||||
|
if to_drop:
|
||||||
layer_outputs = (None, None)
|
layer_outputs = (None, None)
|
||||||
else:
|
else:
|
||||||
if self.gradient_checkpointing and self.training:
|
if self.gradient_checkpointing and self.training:
|
||||||
@@ -1150,8 +1155,9 @@ class TimeSeriesTransformerDecoder(TimeSeriesTransformerPreTrainedModel):
|
|||||||
# add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
|
# add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
|
||||||
if output_hidden_states:
|
if output_hidden_states:
|
||||||
all_hidden_states += (hidden_states,)
|
all_hidden_states += (hidden_states,)
|
||||||
|
if self.training:
|
||||||
dropout_probability = torch.rand([])
|
dropout_probability = torch.rand([])
|
||||||
if self.training and (dropout_probability < self.layerdrop):
|
if dropout_probability < self.layerdrop:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
past_key_value = past_key_values[idx] if past_key_values is not None else None
|
past_key_value = past_key_values[idx] if past_key_values is not None else None
|
||||||
|
|||||||
@@ -693,8 +693,9 @@ class TrOCRDecoder(TrOCRPreTrainedModel):
|
|||||||
# add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
|
# add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
|
||||||
if output_hidden_states:
|
if output_hidden_states:
|
||||||
all_hidden_states += (hidden_states,)
|
all_hidden_states += (hidden_states,)
|
||||||
|
if self.training:
|
||||||
dropout_probability = torch.rand([])
|
dropout_probability = torch.rand([])
|
||||||
if self.training and (dropout_probability < self.layerdrop):
|
if dropout_probability < self.layerdrop:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
past_key_value = past_key_values[idx] if past_key_values is not None else None
|
past_key_value = past_key_values[idx] if past_key_values is not None else None
|
||||||
|
|||||||
@@ -915,8 +915,13 @@ class WhisperEncoder(WhisperPreTrainedModel):
|
|||||||
if output_hidden_states:
|
if output_hidden_states:
|
||||||
encoder_states = encoder_states + (hidden_states,)
|
encoder_states = encoder_states + (hidden_states,)
|
||||||
# add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
|
# add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
|
||||||
|
to_drop = False
|
||||||
|
if self.training:
|
||||||
dropout_probability = torch.rand([])
|
dropout_probability = torch.rand([])
|
||||||
if self.training and (dropout_probability < self.layerdrop): # skip the layer
|
if dropout_probability < self.layerdrop: # skip the layer
|
||||||
|
to_drop = True
|
||||||
|
|
||||||
|
if to_drop:
|
||||||
layer_outputs = (None, None)
|
layer_outputs = (None, None)
|
||||||
else:
|
else:
|
||||||
if self.gradient_checkpointing and self.training:
|
if self.gradient_checkpointing and self.training:
|
||||||
@@ -1144,8 +1149,9 @@ class WhisperDecoder(WhisperPreTrainedModel):
|
|||||||
# add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
|
# add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
|
||||||
if output_hidden_states:
|
if output_hidden_states:
|
||||||
all_hidden_states += (hidden_states,)
|
all_hidden_states += (hidden_states,)
|
||||||
|
if self.training:
|
||||||
dropout_probability = torch.rand([])
|
dropout_probability = torch.rand([])
|
||||||
if self.training and (dropout_probability < self.layerdrop):
|
if dropout_probability < self.layerdrop:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
past_key_value = past_key_values[idx] if past_key_values is not None else None
|
past_key_value = past_key_values[idx] if past_key_values is not None else None
|
||||||
|
|||||||
@@ -667,8 +667,9 @@ class XGLMModel(XGLMPreTrainedModel):
|
|||||||
# add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
|
# add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
|
||||||
if output_hidden_states:
|
if output_hidden_states:
|
||||||
all_hidden_states += (hidden_states,)
|
all_hidden_states += (hidden_states,)
|
||||||
|
if self.training:
|
||||||
dropout_probability = torch.rand([])
|
dropout_probability = torch.rand([])
|
||||||
if self.training and (dropout_probability < self.layerdrop):
|
if dropout_probability < self.layerdrop:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
past_key_value = past_key_values[idx] if past_key_values is not None else None
|
past_key_value = past_key_values[idx] if past_key_values is not None else None
|
||||||
|
|||||||
@@ -21,7 +21,7 @@ import unittest
|
|||||||
from huggingface_hub import hf_hub_download
|
from huggingface_hub import hf_hub_download
|
||||||
|
|
||||||
from transformers import is_torch_available
|
from transformers import is_torch_available
|
||||||
from transformers.testing_utils import require_torch, slow, torch_device
|
from transformers.testing_utils import is_flaky, require_torch, slow, torch_device
|
||||||
|
|
||||||
from ...test_configuration_common import ConfigTester
|
from ...test_configuration_common import ConfigTester
|
||||||
from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor
|
from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor
|
||||||
@@ -380,6 +380,10 @@ class AutoformerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCa
|
|||||||
[self.model_tester.num_attention_heads, encoder_seq_length, dim],
|
[self.model_tester.num_attention_heads, encoder_seq_length, dim],
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@is_flaky()
|
||||||
|
def test_retain_grad_hidden_states_attentions(self):
|
||||||
|
super().test_retain_grad_hidden_states_attentions()
|
||||||
|
|
||||||
|
|
||||||
def prepare_batch(filename="train-batch.pt"):
|
def prepare_batch(filename="train-batch.pt"):
|
||||||
file = hf_hub_download(repo_id="hf-internal-testing/tourism-monthly-batch", filename=filename, repo_type="dataset")
|
file = hf_hub_download(repo_id="hf-internal-testing/tourism-monthly-batch", filename=filename, repo_type="dataset")
|
||||||
|
|||||||
Reference in New Issue
Block a user