From a28325e25ef399e1a616d67f50ca05abba931163 Mon Sep 17 00:00:00 2001 From: Bowen Bao Date: Fri, 23 Jun 2023 05:17:21 -0700 Subject: [PATCH] Replace python random with torch.rand to enable dynamo.export (#24434) * Replace python random with torch.rand to enable dynamo.export * revert changes to flax model code * Remove unused random import * Fix torch template * Move torch.manual_seed(0) to right location --- src/transformers/models/autoformer/modeling_autoformer.py | 5 ++--- src/transformers/models/bart/modeling_bart.py | 5 ++--- .../models/bigbird_pegasus/modeling_bigbird_pegasus.py | 5 ++--- src/transformers/models/biogpt/modeling_biogpt.py | 3 +-- src/transformers/models/blenderbot/modeling_blenderbot.py | 5 ++--- .../models/blenderbot_small/modeling_blenderbot_small.py | 5 ++--- .../models/conditional_detr/modeling_conditional_detr.py | 5 ++--- src/transformers/models/data2vec/modeling_data2vec_audio.py | 2 +- src/transformers/models/detr/modeling_detr.py | 5 ++--- src/transformers/models/flaubert/modeling_flaubert.py | 3 +-- src/transformers/models/fsmt/modeling_fsmt.py | 5 ++--- src/transformers/models/hubert/modeling_hubert.py | 4 ++-- src/transformers/models/informer/modeling_informer.py | 5 ++--- src/transformers/models/led/modeling_led.py | 5 ++--- src/transformers/models/m2m_100/modeling_m2m_100.py | 5 ++--- src/transformers/models/marian/modeling_marian.py | 5 ++--- src/transformers/models/mask2former/modeling_mask2former.py | 3 +-- src/transformers/models/maskformer/modeling_maskformer.py | 3 +-- src/transformers/models/mbart/modeling_mbart.py | 5 ++--- src/transformers/models/mctct/modeling_mctct.py | 3 +-- src/transformers/models/mvp/modeling_mvp.py | 5 ++--- src/transformers/models/nllb_moe/modeling_nllb_moe.py | 5 ++--- src/transformers/models/opt/modeling_opt.py | 3 +-- src/transformers/models/pegasus/modeling_pegasus.py | 5 ++--- src/transformers/models/pegasus_x/modeling_pegasus_x.py | 5 ++--- src/transformers/models/plbart/modeling_plbart.py 
| 5 ++--- src/transformers/models/sew/modeling_sew.py | 2 +- .../models/speech_to_text/modeling_speech_to_text.py | 5 ++--- .../models/speech_to_text_2/modeling_speech_to_text_2.py | 3 +-- src/transformers/models/speecht5/modeling_speecht5.py | 5 ++--- .../models/table_transformer/modeling_table_transformer.py | 5 ++--- .../modeling_time_series_transformer.py | 5 ++--- src/transformers/models/trocr/modeling_trocr.py | 3 +-- src/transformers/models/unispeech/modeling_unispeech.py | 4 ++-- .../models/unispeech_sat/modeling_unispeech_sat.py | 4 ++-- src/transformers/models/wav2vec2/modeling_wav2vec2.py | 4 ++-- .../models/wav2vec2_conformer/modeling_wav2vec2_conformer.py | 2 +- src/transformers/models/wavlm/modeling_wavlm.py | 4 ++-- src/transformers/models/whisper/modeling_whisper.py | 5 ++--- src/transformers/models/xglm/modeling_xglm.py | 3 +-- .../modeling_{{cookiecutter.lowercase_modelname}}.py | 5 ++--- tests/generation/test_utils.py | 2 +- 42 files changed, 71 insertions(+), 104 deletions(-) diff --git a/src/transformers/models/autoformer/modeling_autoformer.py b/src/transformers/models/autoformer/modeling_autoformer.py index 70587add17..01c20dc52a 100644 --- a/src/transformers/models/autoformer/modeling_autoformer.py +++ b/src/transformers/models/autoformer/modeling_autoformer.py @@ -17,7 +17,6 @@ """ PyTorch Autoformer model.""" import math -import random from dataclasses import dataclass from typing import List, Optional, Tuple, Union @@ -1198,7 +1197,7 @@ class AutoformerEncoder(AutoformerPreTrainedModel): if output_hidden_states: encoder_states = encoder_states + (hidden_states,) # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) - dropout_probability = random.uniform(0, 1) + dropout_probability = torch.rand([]) if self.training and (dropout_probability < self.layerdrop): # skip the layer layer_outputs = (None, None) else: @@ -1408,7 +1407,7 @@ class AutoformerDecoder(AutoformerPreTrainedModel): # add LayerDrop (see 
https://arxiv.org/abs/1909.11556 for description) if output_hidden_states: all_hidden_states += (hidden_states,) - dropout_probability = random.uniform(0, 1) + dropout_probability = torch.rand([]) if self.training and (dropout_probability < self.layerdrop): continue diff --git a/src/transformers/models/bart/modeling_bart.py b/src/transformers/models/bart/modeling_bart.py index 5045244902..51afe26301 100755 --- a/src/transformers/models/bart/modeling_bart.py +++ b/src/transformers/models/bart/modeling_bart.py @@ -15,7 +15,6 @@ """ PyTorch BART model.""" import copy import math -import random import warnings from typing import List, Optional, Tuple, Union @@ -837,7 +836,7 @@ class BartEncoder(BartPretrainedModel): if output_hidden_states: encoder_states = encoder_states + (hidden_states,) # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) - dropout_probability = random.uniform(0, 1) + dropout_probability = torch.rand([]) if self.training and (dropout_probability < self.layerdrop): # skip the layer layer_outputs = (None, None) else: @@ -1090,7 +1089,7 @@ class BartDecoder(BartPretrainedModel): # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) if output_hidden_states: all_hidden_states += (hidden_states,) - dropout_probability = random.uniform(0, 1) + dropout_probability = torch.rand([]) if self.training and (dropout_probability < self.layerdrop): continue diff --git a/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py b/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py index 8d7906631d..e529aec5ec 100755 --- a/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +++ b/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py @@ -17,7 +17,6 @@ import copy import math -import random from typing import List, Optional, Tuple, Union import numpy as np @@ -1933,7 +1932,7 @@ class BigBirdPegasusEncoder(BigBirdPegasusPreTrainedModel): if output_hidden_states: encoder_states = 
encoder_states + (hidden_states,) # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) - dropout_probability = random.uniform(0, 1) + dropout_probability = torch.rand([]) if self.training and (dropout_probability < self.layerdrop): # skip the layer layer_outputs = (None, None) else: @@ -2276,7 +2275,7 @@ class BigBirdPegasusDecoder(BigBirdPegasusPreTrainedModel): # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) if output_hidden_states: all_hidden_states += (hidden_states,) - dropout_probability = random.uniform(0, 1) + dropout_probability = torch.rand([]) if self.training and (dropout_probability < self.layerdrop): continue diff --git a/src/transformers/models/biogpt/modeling_biogpt.py b/src/transformers/models/biogpt/modeling_biogpt.py index a9ecb11a61..40fa81de9c 100755 --- a/src/transformers/models/biogpt/modeling_biogpt.py +++ b/src/transformers/models/biogpt/modeling_biogpt.py @@ -16,7 +16,6 @@ import math -import random from typing import Optional, Tuple, Union import torch @@ -579,7 +578,7 @@ class BioGptModel(BioGptPreTrainedModel): # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) if output_hidden_states: all_hidden_states += (hidden_states,) - dropout_probability = random.uniform(0, 1) + dropout_probability = torch.rand([]) if self.training and (dropout_probability < self.layerdrop): continue diff --git a/src/transformers/models/blenderbot/modeling_blenderbot.py b/src/transformers/models/blenderbot/modeling_blenderbot.py index 8f2780772c..3fe45ee216 100755 --- a/src/transformers/models/blenderbot/modeling_blenderbot.py +++ b/src/transformers/models/blenderbot/modeling_blenderbot.py @@ -18,7 +18,6 @@ import copy import math import os -import random import warnings from typing import List, Optional, Tuple, Union @@ -767,7 +766,7 @@ class BlenderbotEncoder(BlenderbotPreTrainedModel): if output_hidden_states: encoder_states = encoder_states + (hidden_states,) # add LayerDrop (see 
https://arxiv.org/abs/1909.11556 for description) - dropout_probability = random.uniform(0, 1) + dropout_probability = torch.rand([]) if self.training and (dropout_probability < self.layerdrop): # skip the layer layer_outputs = (None, None) else: @@ -1019,7 +1018,7 @@ class BlenderbotDecoder(BlenderbotPreTrainedModel): # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) if output_hidden_states: all_hidden_states += (hidden_states,) - dropout_probability = random.uniform(0, 1) + dropout_probability = torch.rand([]) if self.training and (dropout_probability < self.layerdrop): continue diff --git a/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py b/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py index ef8d51a2b0..5365546697 100755 --- a/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py +++ b/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py @@ -17,7 +17,6 @@ import copy import math -import random from typing import List, Optional, Tuple, Union import torch @@ -765,7 +764,7 @@ class BlenderbotSmallEncoder(BlenderbotSmallPreTrainedModel): if output_hidden_states: encoder_states = encoder_states + (hidden_states,) # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) - dropout_probability = random.uniform(0, 1) + dropout_probability = torch.rand([]) if self.training and (dropout_probability < self.layerdrop): # skip the layer layer_outputs = (None, None) else: @@ -1016,7 +1015,7 @@ class BlenderbotSmallDecoder(BlenderbotSmallPreTrainedModel): # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) if output_hidden_states: all_hidden_states += (hidden_states,) - dropout_probability = random.uniform(0, 1) + dropout_probability = torch.rand([]) if self.training and (dropout_probability < self.layerdrop): continue diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py 
b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index 023cb27848..979cef5b40 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -16,7 +16,6 @@ import math -import random from dataclasses import dataclass from typing import Dict, List, Optional, Tuple @@ -1224,7 +1223,7 @@ class ConditionalDetrEncoder(ConditionalDetrPreTrainedModel): if output_hidden_states: encoder_states = encoder_states + (hidden_states,) # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) - dropout_probability = random.uniform(0, 1) + dropout_probability = torch.rand([]) if self.training and (dropout_probability < self.layerdrop): # skip the layer layer_outputs = (None, None) else: @@ -1378,7 +1377,7 @@ class ConditionalDetrDecoder(ConditionalDetrPreTrainedModel): # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) if output_hidden_states: all_hidden_states += (hidden_states,) - dropout_probability = random.uniform(0, 1) + dropout_probability = torch.rand([]) if self.training and (dropout_probability < self.layerdrop): continue if idx == 0: diff --git a/src/transformers/models/data2vec/modeling_data2vec_audio.py b/src/transformers/models/data2vec/modeling_data2vec_audio.py index 168f342acd..76b6b4d485 100755 --- a/src/transformers/models/data2vec/modeling_data2vec_audio.py +++ b/src/transformers/models/data2vec/modeling_data2vec_audio.py @@ -587,7 +587,7 @@ class Data2VecAudioEncoder(nn.Module): all_hidden_states = all_hidden_states + (hidden_states,) # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) - dropout_probability = np.random.uniform(0, 1) + dropout_probability = torch.rand([]) skip_the_layer = True if self.training and (dropout_probability < self.config.layerdrop) else False if not skip_the_layer or deepspeed_zero3_is_enabled: diff --git a/src/transformers/models/detr/modeling_detr.py 
b/src/transformers/models/detr/modeling_detr.py index c92c43e46d..165c98f1e6 100644 --- a/src/transformers/models/detr/modeling_detr.py +++ b/src/transformers/models/detr/modeling_detr.py @@ -16,7 +16,6 @@ import math -import random from dataclasses import dataclass from typing import Dict, List, Optional, Tuple @@ -979,7 +978,7 @@ class DetrEncoder(DetrPreTrainedModel): if output_hidden_states: encoder_states = encoder_states + (hidden_states,) # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) - dropout_probability = random.uniform(0, 1) + dropout_probability = torch.rand([]) if self.training and (dropout_probability < self.layerdrop): # skip the layer layer_outputs = (None, None) else: @@ -1118,7 +1117,7 @@ class DetrDecoder(DetrPreTrainedModel): # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) if output_hidden_states: all_hidden_states += (hidden_states,) - dropout_probability = random.uniform(0, 1) + dropout_probability = torch.rand([]) if self.training and (dropout_probability < self.layerdrop): continue diff --git a/src/transformers/models/flaubert/modeling_flaubert.py b/src/transformers/models/flaubert/modeling_flaubert.py index 38705bec09..11f6f0fb3f 100644 --- a/src/transformers/models/flaubert/modeling_flaubert.py +++ b/src/transformers/models/flaubert/modeling_flaubert.py @@ -16,7 +16,6 @@ import itertools import math -import random from dataclasses import dataclass from typing import Dict, Optional, Tuple, Union @@ -580,7 +579,7 @@ class FlaubertModel(FlaubertPreTrainedModel): attentions = () if output_attentions else None for i in range(self.n_layers): # LayerDrop - dropout_probability = random.uniform(0, 1) + dropout_probability = torch.rand([]) if self.training and (dropout_probability < self.layerdrop): continue diff --git a/src/transformers/models/fsmt/modeling_fsmt.py b/src/transformers/models/fsmt/modeling_fsmt.py index 22c3a0a248..35d34324c7 100644 --- a/src/transformers/models/fsmt/modeling_fsmt.py +++ 
b/src/transformers/models/fsmt/modeling_fsmt.py @@ -28,7 +28,6 @@ """PyTorch Fairseq model, ported from https://github.com/pytorch/fairseq/tree/master/examples/wmt19""" import math -import random from typing import Any, Dict, List, Optional, Tuple, Union import torch @@ -550,7 +549,7 @@ class FSMTEncoder(nn.Module): encoder_states += (x,) x = x.transpose(0, 1) # B x T x C -> T x B x C # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) - dropout_probability = random.uniform(0, 1) + dropout_probability = torch.rand([]) if self.training and (dropout_probability < self.layerdrop): # skip the layer attn = None else: @@ -794,7 +793,7 @@ class FSMTDecoder(nn.Module): x = x.transpose(0, 1) all_hidden_states += (x,) x = x.transpose(0, 1) - dropout_probability = random.uniform(0, 1) + dropout_probability = torch.rand([]) if self.training and (dropout_probability < self.layerdrop): continue diff --git a/src/transformers/models/hubert/modeling_hubert.py b/src/transformers/models/hubert/modeling_hubert.py index 70a8c07940..af3d4e2d0a 100755 --- a/src/transformers/models/hubert/modeling_hubert.py +++ b/src/transformers/models/hubert/modeling_hubert.py @@ -725,7 +725,7 @@ class HubertEncoder(nn.Module): all_hidden_states = all_hidden_states + (hidden_states,) # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) - dropout_probability = np.random.uniform(0, 1) + dropout_probability = torch.rand([]) skip_the_layer = True if self.training and (dropout_probability < self.config.layerdrop) else False if not skip_the_layer or deepspeed_zero3_is_enabled: @@ -814,7 +814,7 @@ class HubertEncoderStableLayerNorm(nn.Module): all_hidden_states = all_hidden_states + (hidden_states,) # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) - dropout_probability = np.random.uniform(0, 1) + dropout_probability = torch.rand([]) skip_the_layer = True if self.training and (dropout_probability < self.config.layerdrop) else False if not skip_the_layer 
or deepspeed_zero3_is_enabled: diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py index 2bf3f208a9..1645cacd3d 100644 --- a/src/transformers/models/informer/modeling_informer.py +++ b/src/transformers/models/informer/modeling_informer.py @@ -14,7 +14,6 @@ # limitations under the License. """ PyTorch Informer model.""" -import random from typing import List, Optional, Tuple, Union import numpy as np @@ -1205,7 +1204,7 @@ class InformerEncoder(InformerPreTrainedModel): if output_hidden_states: encoder_states = encoder_states + (hidden_states,) # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) - dropout_probability = random.uniform(0, 1) + dropout_probability = torch.rand([]) if self.training and (dropout_probability < self.layerdrop): # skip the layer layer_outputs = (None, None) else: @@ -1425,7 +1424,7 @@ class InformerDecoder(InformerPreTrainedModel): # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) if output_hidden_states: all_hidden_states += (hidden_states,) - dropout_probability = random.uniform(0, 1) + dropout_probability = torch.rand([]) if self.training and (dropout_probability < self.layerdrop): continue diff --git a/src/transformers/models/led/modeling_led.py b/src/transformers/models/led/modeling_led.py index a11659e389..38400590d3 100755 --- a/src/transformers/models/led/modeling_led.py +++ b/src/transformers/models/led/modeling_led.py @@ -16,7 +16,6 @@ import math -import random import warnings from dataclasses import dataclass from typing import List, Optional, Tuple, Union @@ -1871,7 +1870,7 @@ class LEDEncoder(LEDPreTrainedModel): if output_hidden_states: encoder_states = encoder_states + (hidden_states,) # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) - dropout_probability = random.uniform(0, 1) + dropout_probability = torch.rand([]) if self.training and (dropout_probability < self.layerdrop): # skip the layer 
layer_outputs = (None, None, None) @@ -2135,7 +2134,7 @@ class LEDDecoder(LEDPreTrainedModel): # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) if output_hidden_states: all_hidden_states += (hidden_states,) - dropout_probability = random.uniform(0, 1) + dropout_probability = torch.rand([]) if self.training and (dropout_probability < self.layerdrop): continue diff --git a/src/transformers/models/m2m_100/modeling_m2m_100.py b/src/transformers/models/m2m_100/modeling_m2m_100.py index f8f9e1d3a8..a9cde571f7 100755 --- a/src/transformers/models/m2m_100/modeling_m2m_100.py +++ b/src/transformers/models/m2m_100/modeling_m2m_100.py @@ -16,7 +16,6 @@ import math -import random from typing import List, Optional, Tuple, Union import torch @@ -813,7 +812,7 @@ class M2M100Encoder(M2M100PreTrainedModel): encoder_states = encoder_states + (hidden_states,) # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) - dropout_probability = random.uniform(0, 1) + dropout_probability = torch.rand([]) skip_the_layer = True if self.training and (dropout_probability < self.layerdrop) else False if not skip_the_layer or deepspeed_zero3_is_enabled: @@ -1057,7 +1056,7 @@ class M2M100Decoder(M2M100PreTrainedModel): all_hidden_states += (hidden_states,) # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) - dropout_probability = random.uniform(0, 1) + dropout_probability = torch.rand([]) skip_the_layer = True if self.training and (dropout_probability < self.layerdrop) else False if not skip_the_layer or deepspeed_zero3_is_enabled: diff --git a/src/transformers/models/marian/modeling_marian.py b/src/transformers/models/marian/modeling_marian.py index a75f833fb5..c1d6a67684 100755 --- a/src/transformers/models/marian/modeling_marian.py +++ b/src/transformers/models/marian/modeling_marian.py @@ -17,7 +17,6 @@ import copy import math -import random from typing import Dict, List, Optional, Tuple, Union import numpy as np @@ -778,7 +777,7 @@ class 
MarianEncoder(MarianPreTrainedModel): if output_hidden_states: encoder_states = encoder_states + (hidden_states,) # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) - dropout_probability = random.uniform(0, 1) + dropout_probability = torch.rand([]) if self.training and (dropout_probability < self.layerdrop): # skip the layer layer_outputs = (None, None) else: @@ -1024,7 +1023,7 @@ class MarianDecoder(MarianPreTrainedModel): # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) if output_hidden_states: all_hidden_states += (hidden_states,) - dropout_probability = random.uniform(0, 1) + dropout_probability = torch.rand([]) if self.training and (dropout_probability < self.layerdrop): continue diff --git a/src/transformers/models/mask2former/modeling_mask2former.py b/src/transformers/models/mask2former/modeling_mask2former.py index 4cb2493e58..8609106292 100644 --- a/src/transformers/models/mask2former/modeling_mask2former.py +++ b/src/transformers/models/mask2former/modeling_mask2former.py @@ -15,7 +15,6 @@ """ PyTorch Mask2Former model.""" import math -import random import warnings from dataclasses import dataclass from typing import Dict, List, Optional, Tuple @@ -1862,7 +1861,7 @@ class Mask2FormerMaskedAttentionDecoder(nn.Module): if output_hidden_states: all_hidden_states += (hidden_states,) - dropout_probability = random.uniform(0, 1) + dropout_probability = torch.rand([]) if self.training and (dropout_probability < self.layerdrop): continue diff --git a/src/transformers/models/maskformer/modeling_maskformer.py b/src/transformers/models/maskformer/modeling_maskformer.py index 830f8b23c8..55efe64da3 100644 --- a/src/transformers/models/maskformer/modeling_maskformer.py +++ b/src/transformers/models/maskformer/modeling_maskformer.py @@ -15,7 +15,6 @@ """ PyTorch MaskFormer model.""" import math -import random from dataclasses import dataclass from numbers import Number from typing import Dict, List, Optional, Tuple @@ -764,7 
+763,7 @@ class DetrDecoder(nn.Module): # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) if output_hidden_states: all_hidden_states += (hidden_states,) - dropout_probability = random.uniform(0, 1) + dropout_probability = torch.rand([]) if self.training and (dropout_probability < self.layerdrop): continue diff --git a/src/transformers/models/mbart/modeling_mbart.py b/src/transformers/models/mbart/modeling_mbart.py index 67750ab42f..8a088b68ab 100755 --- a/src/transformers/models/mbart/modeling_mbart.py +++ b/src/transformers/models/mbart/modeling_mbart.py @@ -15,7 +15,6 @@ """ PyTorch MBART model.""" import copy import math -import random from typing import List, Optional, Tuple, Union import torch @@ -819,7 +818,7 @@ class MBartEncoder(MBartPreTrainedModel): if output_hidden_states: encoder_states = encoder_states + (hidden_states,) # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) - dropout_probability = random.uniform(0, 1) + dropout_probability = torch.rand([]) if self.training and (dropout_probability < self.layerdrop): # skip the layer layer_outputs = (None, None) else: @@ -1074,7 +1073,7 @@ class MBartDecoder(MBartPreTrainedModel): # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) if output_hidden_states: all_hidden_states += (hidden_states,) - dropout_probability = random.uniform(0, 1) + dropout_probability = torch.rand([]) if self.training and (dropout_probability < self.layerdrop): continue diff --git a/src/transformers/models/mctct/modeling_mctct.py b/src/transformers/models/mctct/modeling_mctct.py index 08e280b3cc..7f2de9f952 100755 --- a/src/transformers/models/mctct/modeling_mctct.py +++ b/src/transformers/models/mctct/modeling_mctct.py @@ -16,7 +16,6 @@ import math -import random from typing import Optional, Tuple, Union import torch @@ -610,7 +609,7 @@ class MCTCTEncoder(MCTCTPreTrainedModel): encoder_states = encoder_states + (hidden_states,) # add LayerDrop (see 
https://arxiv.org/abs/1909.11556 for description) - dropout_probability = random.uniform(0, 1) + dropout_probability = torch.rand([]) skip_the_layer = True if self.training and (dropout_probability < self.config.layerdrop) else False if not skip_the_layer or deepspeed_zero3_is_enabled: diff --git a/src/transformers/models/mvp/modeling_mvp.py b/src/transformers/models/mvp/modeling_mvp.py index 6a44768d8e..a1fca99dad 100644 --- a/src/transformers/models/mvp/modeling_mvp.py +++ b/src/transformers/models/mvp/modeling_mvp.py @@ -15,7 +15,6 @@ """ PyTorch MVP model.""" import copy import math -import random from typing import List, Optional, Tuple, Union import torch @@ -941,7 +940,7 @@ class MvpEncoder(MvpPreTrainedModel): if output_hidden_states: encoder_states = encoder_states + (hidden_states,) # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) - dropout_probability = random.uniform(0, 1) + dropout_probability = torch.rand([]) if self.training and (dropout_probability < self.layerdrop): # skip the layer layer_outputs = (None, None) else: @@ -1216,7 +1215,7 @@ class MvpDecoder(MvpPreTrainedModel): # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) if output_hidden_states: all_hidden_states += (hidden_states,) - dropout_probability = random.uniform(0, 1) + dropout_probability = torch.rand([]) if self.training and (dropout_probability < self.layerdrop): continue diff --git a/src/transformers/models/nllb_moe/modeling_nllb_moe.py b/src/transformers/models/nllb_moe/modeling_nllb_moe.py index 06b61c7497..3585b1d3b6 100644 --- a/src/transformers/models/nllb_moe/modeling_nllb_moe.py +++ b/src/transformers/models/nllb_moe/modeling_nllb_moe.py @@ -16,7 +16,6 @@ import math -import random from typing import List, Optional, Tuple, Union import torch @@ -1143,7 +1142,7 @@ class NllbMoeEncoder(NllbMoePreTrainedModel): if output_hidden_states: encoder_states = encoder_states + (hidden_states,) # add LayerDrop (see 
https://arxiv.org/abs/1909.11556 for description) - dropout_probability = random.uniform(0, 1) + dropout_probability = torch.rand([]) if self.training and (dropout_probability < self.layerdrop): # skip the layer layer_outputs = (None, None, None) else: @@ -1405,7 +1404,7 @@ class NllbMoeDecoder(NllbMoePreTrainedModel): all_hidden_states += (hidden_states,) # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) - dropout_probability = random.uniform(0, 1) + dropout_probability = torch.rand([]) skip_the_layer = True if self.training and (dropout_probability < self.layerdrop) else False if not skip_the_layer or deepspeed_zero3_is_enabled: diff --git a/src/transformers/models/opt/modeling_opt.py b/src/transformers/models/opt/modeling_opt.py index bd64630c62..92c616bb63 100644 --- a/src/transformers/models/opt/modeling_opt.py +++ b/src/transformers/models/opt/modeling_opt.py @@ -13,7 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
""" PyTorch OPT model.""" -import random from typing import List, Optional, Tuple, Union import torch @@ -685,7 +684,7 @@ class OPTDecoder(OPTPreTrainedModel): if output_hidden_states: all_hidden_states += (hidden_states,) - dropout_probability = random.uniform(0, 1) + dropout_probability = torch.rand([]) if self.training and (dropout_probability < self.layerdrop): continue diff --git a/src/transformers/models/pegasus/modeling_pegasus.py b/src/transformers/models/pegasus/modeling_pegasus.py index a2bd3f3812..9565ee0d91 100755 --- a/src/transformers/models/pegasus/modeling_pegasus.py +++ b/src/transformers/models/pegasus/modeling_pegasus.py @@ -16,7 +16,6 @@ import copy import math -import random from typing import List, Optional, Tuple, Union import numpy as np @@ -793,7 +792,7 @@ class PegasusEncoder(PegasusPreTrainedModel): if output_hidden_states: encoder_states = encoder_states + (hidden_states,) # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) - dropout_probability = random.uniform(0, 1) + dropout_probability = torch.rand([]) if self.training and (dropout_probability < self.layerdrop): # skip the layer layer_outputs = (None, None) else: @@ -1074,7 +1073,7 @@ class PegasusDecoder(PegasusPreTrainedModel): # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) if output_hidden_states: all_hidden_states += (hidden_states,) - dropout_probability = random.uniform(0, 1) + dropout_probability = torch.rand([]) if self.training and (dropout_probability < self.layerdrop): continue diff --git a/src/transformers/models/pegasus_x/modeling_pegasus_x.py b/src/transformers/models/pegasus_x/modeling_pegasus_x.py index 8e380a4de5..661cb85a3b 100755 --- a/src/transformers/models/pegasus_x/modeling_pegasus_x.py +++ b/src/transformers/models/pegasus_x/modeling_pegasus_x.py @@ -16,7 +16,6 @@ import dataclasses import math -import random from typing import Optional, Tuple, Union import numpy as np @@ -1060,7 +1059,7 @@ class 
PegasusXEncoder(PegasusXPreTrainedModel): if output_hidden_states: encoder_states = encoder_states + (hidden_states,) # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) - dropout_probability = random.uniform(0, 1) + dropout_probability = torch.rand([]) if self.training and (dropout_probability < self.layerdrop): # skip the layer layer_outputs = (None, None) else: @@ -1315,7 +1314,7 @@ class PegasusXDecoder(PegasusXPreTrainedModel): # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) if output_hidden_states: all_hidden_states += (hidden_states,) - dropout_probability = random.uniform(0, 1) + dropout_probability = torch.rand([]) if self.training and (dropout_probability < self.layerdrop): continue diff --git a/src/transformers/models/plbart/modeling_plbart.py b/src/transformers/models/plbart/modeling_plbart.py index 365429360a..2a80ae3d59 100644 --- a/src/transformers/models/plbart/modeling_plbart.py +++ b/src/transformers/models/plbart/modeling_plbart.py @@ -15,7 +15,6 @@ """ PyTorch PLBART model.""" import copy import math -import random from typing import Any, Dict, List, Optional, Tuple, Union import torch @@ -798,7 +797,7 @@ class PLBartEncoder(PLBartPreTrainedModel): if output_hidden_states: encoder_states = encoder_states + (hidden_states,) # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) - dropout_probability = random.uniform(0, 1) + dropout_probability = torch.rand([]) if self.training and (dropout_probability < self.layerdrop): # skip the layer layer_outputs = (None, None) else: @@ -1052,7 +1051,7 @@ class PLBartDecoder(PLBartPreTrainedModel): # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) if output_hidden_states: all_hidden_states += (hidden_states,) - dropout_probability = random.uniform(0, 1) + dropout_probability = torch.rand([]) if self.training and (dropout_probability < self.layerdrop): continue diff --git a/src/transformers/models/sew/modeling_sew.py 
b/src/transformers/models/sew/modeling_sew.py index dd854c49f5..6b0869c87a 100644 --- a/src/transformers/models/sew/modeling_sew.py +++ b/src/transformers/models/sew/modeling_sew.py @@ -667,7 +667,7 @@ class SEWEncoder(nn.Module): all_hidden_states = all_hidden_states + (hidden_states,) # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) - dropout_probability = np.random.uniform(0, 1) + dropout_probability = torch.rand([]) skip_the_layer = True if self.training and (dropout_probability < self.config.layerdrop) else False if not skip_the_layer or deepspeed_zero3_is_enabled: diff --git a/src/transformers/models/speech_to_text/modeling_speech_to_text.py b/src/transformers/models/speech_to_text/modeling_speech_to_text.py index d8a19084eb..bca2669ae1 100755 --- a/src/transformers/models/speech_to_text/modeling_speech_to_text.py +++ b/src/transformers/models/speech_to_text/modeling_speech_to_text.py @@ -16,7 +16,6 @@ import math -import random from typing import Optional, Tuple, Union import torch @@ -808,7 +807,7 @@ class Speech2TextEncoder(Speech2TextPreTrainedModel): if output_hidden_states: encoder_states = encoder_states + (hidden_states,) # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) - dropout_probability = random.uniform(0, 1) + dropout_probability = torch.rand([]) if self.training and (dropout_probability < self.layerdrop): # skip the layer layer_outputs = (None, None) else: @@ -1053,7 +1052,7 @@ class Speech2TextDecoder(Speech2TextPreTrainedModel): # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) if output_hidden_states: all_hidden_states += (hidden_states,) - dropout_probability = random.uniform(0, 1) + dropout_probability = torch.rand([]) if self.training and (dropout_probability < self.layerdrop): continue diff --git a/src/transformers/models/speech_to_text_2/modeling_speech_to_text_2.py b/src/transformers/models/speech_to_text_2/modeling_speech_to_text_2.py index c13b04642d..31e9bc34c9 100755 
--- a/src/transformers/models/speech_to_text_2/modeling_speech_to_text_2.py +++ b/src/transformers/models/speech_to_text_2/modeling_speech_to_text_2.py @@ -17,7 +17,6 @@ import copy import math -import random from typing import Optional, Tuple, Union import torch @@ -662,7 +661,7 @@ class Speech2Text2Decoder(Speech2Text2PreTrainedModel): # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) if output_hidden_states: all_hidden_states += (hidden_states,) - dropout_probability = random.uniform(0, 1) + dropout_probability = torch.rand([]) if self.training and (dropout_probability < self.layerdrop): continue diff --git a/src/transformers/models/speecht5/modeling_speecht5.py b/src/transformers/models/speecht5/modeling_speecht5.py index 3e8ce5a23b..c91b90d63c 100644 --- a/src/transformers/models/speecht5/modeling_speecht5.py +++ b/src/transformers/models/speecht5/modeling_speecht5.py @@ -15,7 +15,6 @@ """ PyTorch SpeechT5 model.""" import math -import random import warnings from typing import List, Optional, Tuple, Union @@ -1381,7 +1380,7 @@ class SpeechT5Encoder(SpeechT5PreTrainedModel): all_hidden_states = all_hidden_states + (hidden_states,) # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) - dropout_probability = np.random.uniform(0, 1) + dropout_probability = torch.rand([]) skip_the_layer = self.training and (dropout_probability < self.layerdrop) if not skip_the_layer or deepspeed_zero3_is_enabled: @@ -1706,7 +1705,7 @@ class SpeechT5Decoder(SpeechT5PreTrainedModel): all_hidden_states = all_hidden_states + (hidden_states,) # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) - dropout_probability = random.uniform(0, 1) + dropout_probability = torch.rand([]) skip_the_layer = self.training and (dropout_probability < self.layerdrop) if skip_the_layer and not deepspeed_zero3_is_enabled: diff --git a/src/transformers/models/table_transformer/modeling_table_transformer.py 
b/src/transformers/models/table_transformer/modeling_table_transformer.py index 733ff7b9b4..d2de059470 100644 --- a/src/transformers/models/table_transformer/modeling_table_transformer.py +++ b/src/transformers/models/table_transformer/modeling_table_transformer.py @@ -16,7 +16,6 @@ import math -import random from dataclasses import dataclass from typing import Dict, List, Optional, Tuple @@ -920,7 +919,7 @@ class TableTransformerEncoder(TableTransformerPreTrainedModel): if output_hidden_states: encoder_states = encoder_states + (hidden_states,) # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) - dropout_probability = random.uniform(0, 1) + dropout_probability = torch.rand([]) if self.training and (dropout_probability < self.layerdrop): # skip the layer layer_outputs = (None, None) else: @@ -1062,7 +1061,7 @@ class TableTransformerDecoder(TableTransformerPreTrainedModel): # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) if output_hidden_states: all_hidden_states += (hidden_states,) - dropout_probability = random.uniform(0, 1) + dropout_probability = torch.rand([]) if self.training and (dropout_probability < self.layerdrop): continue diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index 8986ef6729..477a52a57c 100644 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -15,7 +15,6 @@ # limitations under the License. 
""" PyTorch Time Series Transformer model.""" -import random from typing import List, Optional, Tuple, Union import numpy as np @@ -937,7 +936,7 @@ class TimeSeriesTransformerEncoder(TimeSeriesTransformerPreTrainedModel): if output_hidden_states: encoder_states = encoder_states + (hidden_states,) # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) - dropout_probability = random.uniform(0, 1) + dropout_probability = torch.rand([]) if self.training and (dropout_probability < self.layerdrop): # skip the layer layer_outputs = (None, None) else: @@ -1151,7 +1150,7 @@ class TimeSeriesTransformerDecoder(TimeSeriesTransformerPreTrainedModel): # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) if output_hidden_states: all_hidden_states += (hidden_states,) - dropout_probability = random.uniform(0, 1) + dropout_probability = torch.rand([]) if self.training and (dropout_probability < self.layerdrop): continue diff --git a/src/transformers/models/trocr/modeling_trocr.py b/src/transformers/models/trocr/modeling_trocr.py index 6276c68a42..ede83af6ed 100644 --- a/src/transformers/models/trocr/modeling_trocr.py +++ b/src/transformers/models/trocr/modeling_trocr.py @@ -17,7 +17,6 @@ import copy import math -import random from typing import Optional, Tuple, Union import torch @@ -694,7 +693,7 @@ class TrOCRDecoder(TrOCRPreTrainedModel): # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) if output_hidden_states: all_hidden_states += (hidden_states,) - dropout_probability = random.uniform(0, 1) + dropout_probability = torch.rand([]) if self.training and (dropout_probability < self.layerdrop): continue diff --git a/src/transformers/models/unispeech/modeling_unispeech.py b/src/transformers/models/unispeech/modeling_unispeech.py index e068fa59e5..16c08bbbf3 100755 --- a/src/transformers/models/unispeech/modeling_unispeech.py +++ b/src/transformers/models/unispeech/modeling_unispeech.py @@ -761,7 +761,7 @@ class 
UniSpeechEncoder(nn.Module): all_hidden_states = all_hidden_states + (hidden_states,) # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) - dropout_probability = np.random.uniform(0, 1) + dropout_probability = torch.rand([]) skip_the_layer = True if self.training and (dropout_probability < self.config.layerdrop) else False if not skip_the_layer or deepspeed_zero3_is_enabled: @@ -850,7 +850,7 @@ class UniSpeechEncoderStableLayerNorm(nn.Module): all_hidden_states = all_hidden_states + (hidden_states,) # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) - dropout_probability = np.random.uniform(0, 1) + dropout_probability = torch.rand([]) skip_the_layer = True if self.training and (dropout_probability < self.config.layerdrop) else False if not skip_the_layer or deepspeed_zero3_is_enabled: diff --git a/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py b/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py index 2ed8a5d572..b57369ea6f 100755 --- a/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py +++ b/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py @@ -775,7 +775,7 @@ class UniSpeechSatEncoder(nn.Module): all_hidden_states = all_hidden_states + (hidden_states,) # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) - dropout_probability = np.random.uniform(0, 1) + dropout_probability = torch.rand([]) skip_the_layer = True if self.training and (dropout_probability < self.config.layerdrop) else False if not skip_the_layer or deepspeed_zero3_is_enabled: @@ -864,7 +864,7 @@ class UniSpeechSatEncoderStableLayerNorm(nn.Module): all_hidden_states = all_hidden_states + (hidden_states,) # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) - dropout_probability = np.random.uniform(0, 1) + dropout_probability = torch.rand([]) skip_the_layer = True if self.training and (dropout_probability < self.config.layerdrop) else False if not skip_the_layer or 
deepspeed_zero3_is_enabled: diff --git a/src/transformers/models/wav2vec2/modeling_wav2vec2.py b/src/transformers/models/wav2vec2/modeling_wav2vec2.py index 43ab2408bb..1c8965c960 100755 --- a/src/transformers/models/wav2vec2/modeling_wav2vec2.py +++ b/src/transformers/models/wav2vec2/modeling_wav2vec2.py @@ -797,7 +797,7 @@ class Wav2Vec2Encoder(nn.Module): all_hidden_states = all_hidden_states + (hidden_states,) # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) - dropout_probability = np.random.uniform(0, 1) + dropout_probability = torch.rand([]) skip_the_layer = True if self.training and (dropout_probability < self.config.layerdrop) else False if not skip_the_layer or deepspeed_zero3_is_enabled: @@ -885,7 +885,7 @@ class Wav2Vec2EncoderStableLayerNorm(nn.Module): all_hidden_states = all_hidden_states + (hidden_states,) # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) - dropout_probability = np.random.uniform(0, 1) + dropout_probability = torch.rand([]) skip_the_layer = True if self.training and (dropout_probability < self.config.layerdrop) else False if not skip_the_layer or deepspeed_zero3_is_enabled: diff --git a/src/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py b/src/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py index 3e37a4a504..7a757d0a51 100644 --- a/src/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +++ b/src/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py @@ -903,7 +903,7 @@ class Wav2Vec2ConformerEncoder(nn.Module): all_hidden_states = all_hidden_states + (hidden_states,) # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) - dropout_probability = np.random.uniform(0, 1) + dropout_probability = torch.rand([]) skip_the_layer = True if self.training and (dropout_probability < self.config.layerdrop) else False if not skip_the_layer or deepspeed_zero3_is_enabled: diff --git 
a/src/transformers/models/wavlm/modeling_wavlm.py b/src/transformers/models/wavlm/modeling_wavlm.py index e4072d9372..d782a47402 100755 --- a/src/transformers/models/wavlm/modeling_wavlm.py +++ b/src/transformers/models/wavlm/modeling_wavlm.py @@ -707,7 +707,7 @@ class WavLMEncoder(nn.Module): all_hidden_states = all_hidden_states + (hidden_states,) # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) - dropout_probability = np.random.uniform(0, 1) + dropout_probability = torch.rand([]) skip_the_layer = self.training and i > 0 and (dropout_probability < self.config.layerdrop) if not skip_the_layer or deepspeed_zero3_is_enabled: @@ -797,7 +797,7 @@ class WavLMEncoderStableLayerNorm(nn.Module): all_hidden_states = all_hidden_states + (hidden_states,) # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) - dropout_probability = np.random.uniform(0, 1) + dropout_probability = torch.rand([]) skip_the_layer = self.training and i > 0 and (dropout_probability < self.config.layerdrop) if not skip_the_layer or deepspeed_zero3_is_enabled: diff --git a/src/transformers/models/whisper/modeling_whisper.py b/src/transformers/models/whisper/modeling_whisper.py index 42fda344f6..c5e9c94d3f 100644 --- a/src/transformers/models/whisper/modeling_whisper.py +++ b/src/transformers/models/whisper/modeling_whisper.py @@ -15,7 +15,6 @@ """ PyTorch Whisper model.""" import math -import random from typing import Optional, Tuple, Union import numpy as np @@ -916,7 +915,7 @@ class WhisperEncoder(WhisperPreTrainedModel): if output_hidden_states: encoder_states = encoder_states + (hidden_states,) # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) - dropout_probability = random.uniform(0, 1) + dropout_probability = torch.rand([]) if self.training and (dropout_probability < self.layerdrop): # skip the layer layer_outputs = (None, None) else: @@ -1145,7 +1144,7 @@ class WhisperDecoder(WhisperPreTrainedModel): # add LayerDrop (see 
https://arxiv.org/abs/1909.11556 for description) if output_hidden_states: all_hidden_states += (hidden_states,) - dropout_probability = random.uniform(0, 1) + dropout_probability = torch.rand([]) if self.training and (dropout_probability < self.layerdrop): continue diff --git a/src/transformers/models/xglm/modeling_xglm.py b/src/transformers/models/xglm/modeling_xglm.py index 4a72b785a0..19ae63199c 100755 --- a/src/transformers/models/xglm/modeling_xglm.py +++ b/src/transformers/models/xglm/modeling_xglm.py @@ -16,7 +16,6 @@ import math -import random from typing import List, Optional, Tuple, Union import torch @@ -668,7 +667,7 @@ class XGLMModel(XGLMPreTrainedModel): # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) if output_hidden_states: all_hidden_states += (hidden_states,) - dropout_probability = random.uniform(0, 1) + dropout_probability = torch.rand([]) if self.training and (dropout_probability < self.layerdrop): continue diff --git a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_{{cookiecutter.lowercase_modelname}}.py b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_{{cookiecutter.lowercase_modelname}}.py index 4899e19598..879100aeaa 100755 --- a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_{{cookiecutter.lowercase_modelname}}.py +++ b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_{{cookiecutter.lowercase_modelname}}.py @@ -1560,7 +1560,6 @@ class {{cookiecutter.camelcase_modelname}}ForQuestionAnswering({{cookiecutter.ca {% else %} import math import copy -import random from typing import Optional, Tuple, List, Union import torch @@ -2306,7 +2305,7 @@ class {{cookiecutter.camelcase_modelname}}Encoder({{cookiecutter.camelcase_model if output_hidden_states: encoder_states = encoder_states + (hidden_states,) # add LayerDrop (see https://arxiv.org/abs/1909.11556 for 
description) - dropout_probability = random.uniform(0, 1) + dropout_probability = torch.rand([]) if self.training and (dropout_probability < self.layerdrop): # skip the layer layer_outputs = (None, None) else: @@ -2543,7 +2542,7 @@ class {{cookiecutter.camelcase_modelname}}Decoder({{cookiecutter.camelcase_model # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) if output_hidden_states: all_hidden_states += (hidden_states,) - dropout_probability = random.uniform(0, 1) + dropout_probability = torch.rand([]) if self.training and (dropout_probability < self.layerdrop): continue diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py index 4e09f21898..de38705ce4 100644 --- a/tests/generation/test_utils.py +++ b/tests/generation/test_utils.py @@ -464,6 +464,7 @@ class GenerationTesterMixin: **model_kwargs, ) # beam_search does not automatically interleave `batch_size` dim for `num_beams * num_return_sequences` + torch.manual_seed(0) kwargs = {} if model.config.is_encoder_decoder: encoder_outputs, input_ids, attention_mask = self._get_encoder_outputs( @@ -482,7 +483,6 @@ class GenerationTesterMixin: logits_processor = LogitsProcessorList() logits_processor.append(InfNanRemoveLogitsProcessor()) - torch.manual_seed(0) with torch.no_grad(): model_kwargs = {"attention_mask": attention_mask} if attention_mask is not None else {} output_beam_sample = model.beam_sample(