From d83b0e0c079f0826d186270a86622ff5f1efd9c1 Mon Sep 17 00:00:00 2001 From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Date: Thu, 18 Nov 2021 08:38:09 -0500 Subject: [PATCH] Add a post init method to all models (#14431) * Add a post init method to all models * Fix tests * Fix last tests * Fix templates * Add comment * Forgot to save --- src/transformers/modeling_utils.py | 27 +++++++------- .../models/albert/modeling_albert.py | 21 +++++++---- src/transformers/models/bart/modeling_bart.py | 15 +++++--- src/transformers/models/beit/modeling_beit.py | 12 ++++--- src/transformers/models/bert/modeling_bert.py | 27 +++++++++----- .../modeling_bert_generation.py | 6 ++-- .../models/big_bird/modeling_big_bird.py | 24 ++++++++----- .../modeling_bigbird_pegasus.py | 15 +++++--- .../models/blenderbot/modeling_blenderbot.py | 15 +++++--- .../modeling_blenderbot_small.py | 15 +++++--- .../models/canine/modeling_canine.py | 15 +++++--- src/transformers/models/clip/modeling_clip.py | 9 +++-- .../models/convbert/modeling_convbert.py | 18 ++++++---- src/transformers/models/ctrl/modeling_ctrl.py | 9 +++-- .../models/deberta/modeling_deberta.py | 15 +++++--- .../models/deberta_v2/modeling_deberta_v2.py | 15 +++++--- src/transformers/models/deit/modeling_deit.py | 9 +++-- src/transformers/models/detr/modeling_detr.py | 15 +++++--- .../models/distilbert/modeling_distilbert.py | 18 ++++++---- src/transformers/models/dpr/modeling_dpr.py | 15 +++++--- .../models/electra/modeling_electra.py | 21 +++++++---- .../models/flaubert/modeling_flaubert.py | 18 ++++++---- src/transformers/models/fnet/modeling_fnet.py | 24 ++++++++----- src/transformers/models/fsmt/modeling_fsmt.py | 3 +- .../models/funnel/modeling_funnel.py | 24 ++++++++----- src/transformers/models/gpt2/modeling_gpt2.py | 25 +++++++------ .../models/gpt_neo/modeling_gpt_neo.py | 9 +++-- src/transformers/models/gptj/modeling_gptj.py | 13 ++++--- .../models/hubert/modeling_hubert.py | 9 +++-- .../models/ibert/modeling_ibert.py | 18 ++++++---- .../models/layoutlm/modeling_layoutlm.py | 12 ++++--- .../models/layoutlmv2/modeling_layoutlmv2.py | 12 ++++--- src/transformers/models/led/modeling_led.py | 12 ++++--- .../models/longformer/modeling_longformer.py | 18 ++++++---- src/transformers/models/luke/modeling_luke.py | 12 ++++--- .../models/lxmert/modeling_lxmert.py | 9 +++-- .../models/m2m_100/modeling_m2m_100.py | 12 ++++--- .../models/marian/modeling_marian.py | 17 ++++++--- .../models/mbart/modeling_mbart.py | 20 ++++++++--- .../megatron_bert/modeling_megatron_bert.py | 27 +++++++++----- .../models/mobilebert/modeling_mobilebert.py | 24 ++++++++----- .../models/mpnet/modeling_mpnet.py | 18 ++++++---- .../models/openai/modeling_openai.py | 12 ++++--- .../models/pegasus/modeling_pegasus.py | 15 +++++--- .../models/prophetnet/modeling_prophetnet.py | 15 +++++--- .../models/reformer/modeling_reformer.py | 15 +++++--- .../models/rembert/modeling_rembert.py | 21 +++++++---- .../models/retribert/modeling_retribert.py | 3 +- .../models/roberta/modeling_roberta.py | 21 +++++++---- .../models/roformer/modeling_roformer.py | 21 +++++++---- .../models/segformer/modeling_segformer.py | 9 +++-- src/transformers/models/sew/modeling_sew.py | 9 +++-- .../models/sew_d/modeling_sew_d.py | 9 +++-- .../speech_to_text/modeling_speech_to_text.py | 12 ++++--- .../modeling_speech_to_text_2.py | 6 ++-- .../models/splinter/modeling_splinter.py | 6 ++-- .../squeezebert/modeling_squeezebert.py | 18 ++++++---- src/transformers/models/t5/modeling_t5.py | 12 ++++--- .../models/tapas/modeling_tapas.py | 12 ++++--- .../models/transfo_xl/modeling_transfo_xl.py | 9 +++-- .../models/trocr/modeling_trocr.py | 6 ++-- .../models/unispeech/modeling_unispeech.py | 12 ++++--- .../unispeech_sat/modeling_unispeech_sat.py | 12 ++++--- .../visual_bert/modeling_visual_bert.py | 18 ++++++---- src/transformers/models/vit/modeling_vit.py | 6 ++-- .../models/wav2vec2/modeling_wav2vec2.py | 15 +++++--- src/transformers/models/xlm/modeling_xlm.py | 21 +++++++---- .../models/xlnet/modeling_xlnet.py | 21 +++++++---- ...ng_{{cookiecutter.lowercase_modelname}}.py | 36 ++++++++++++------- tests/test_modeling_common.py | 8 ----- 70 files changed, 693 insertions(+), 359 deletions(-) diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index f45c11087f..9ec64ebb73 100644 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -412,17 +412,6 @@ class ModuleUtilsMixin: return 6 * self.estimate_tokens(input_dict) * self.num_parameters(exclude_embeddings=exclude_embeddings) -def gradient_checkpointing_hook(module, _): - # Hook to enable backward compatibility for gradient checkpointing. Will be removed once all models have a - # proper post_init method. - if getattr(module.config, "gradient_checkpointing", False): - module.gradient_checkpointing_enable() - # Remove the attribute now that is has been consumed, so it's no saved in the config. - delattr(module.config, "gradient_checkpointing") - # The hook will remove itself after the first execution - module._gradient_checkpointing_hook.remove() - - class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMixin): r""" Base class for all models. @@ -490,8 +479,20 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix # Save config and origin of the pretrained weights if given in model self.config = config self.name_or_path = config.name_or_path - if self.supports_gradient_checkpointing: - self._gradient_checkpointing_hook = self.register_forward_pre_hook(gradient_checkpointing_hook) + + def post_init(self): + """ + A method executed at the end of each Transformer model initialization, to execute code that needs the model's + modules properly initialized (such as weight initialization). + """ + self.init_weights() + self._backward_compatibility_gradient_checkpointing() + + def _backward_compatibility_gradient_checkpointing(self): + if self.supports_gradient_checkpointing and getattr(self.config, "gradient_checkpointing", False): + self.gradient_checkpointing_enable() + # Remove the attribute now that is has been consumed, so it's no saved in the config. + delattr(self.config, "gradient_checkpointing") @classmethod def _from_config(cls, config, **kwargs): diff --git a/src/transformers/models/albert/modeling_albert.py b/src/transformers/models/albert/modeling_albert.py index 442242ad43..2e80eaf617 100755 --- a/src/transformers/models/albert/modeling_albert.py +++ b/src/transformers/models/albert/modeling_albert.py @@ -638,7 +638,8 @@ class AlbertModel(AlbertPreTrainedModel): self.pooler = None self.pooler_activation = None - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_input_embeddings(self): return self.embeddings.word_embeddings @@ -757,7 +758,8 @@ class AlbertForPreTraining(AlbertPreTrainedModel): self.predictions = AlbertMLMHead(config) self.sop_classifier = AlbertSOPHead(config) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_output_embeddings(self): return self.predictions.decoder @@ -903,7 +905,8 @@ class AlbertForMaskedLM(AlbertPreTrainedModel): self.albert = AlbertModel(config, add_pooling_layer=False) self.predictions = AlbertMLMHead(config) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_output_embeddings(self): return self.predictions.decoder @@ -991,7 +994,8 @@ class AlbertForSequenceClassification(AlbertPreTrainedModel): self.dropout = nn.Dropout(config.classifier_dropout_prob) self.classifier = nn.Linear(config.hidden_size, self.config.num_labels) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( @@ -1097,7 +1101,8 @@ class AlbertForTokenClassification(AlbertPreTrainedModel): self.dropout = nn.Dropout(classifier_dropout_prob) self.classifier = nn.Linear(config.hidden_size, self.config.num_labels) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( @@ -1187,7 +1192,8 @@ class AlbertForQuestionAnswering(AlbertPreTrainedModel): self.albert = AlbertModel(config, add_pooling_layer=False) self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( @@ -1286,7 +1292,8 @@ class AlbertForMultipleChoice(AlbertPreTrainedModel): self.dropout = nn.Dropout(config.classifier_dropout_prob) self.classifier = nn.Linear(config.hidden_size, 1) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")) @add_code_sample_docstrings( diff --git a/src/transformers/models/bart/modeling_bart.py b/src/transformers/models/bart/modeling_bart.py index f479a9069b..743b27887f 100755 --- a/src/transformers/models/bart/modeling_bart.py +++ b/src/transformers/models/bart/modeling_bart.py @@ -699,8 +699,9 @@ class BartEncoder(BartPretrainedModel): self.layers = nn.ModuleList([BartEncoderLayer(config) for _ in range(config.encoder_layers)]) self.layernorm_embedding = nn.LayerNorm(embed_dim) - self.init_weights() self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() def get_input_embeddings(self): return self.embed_tokens @@ -870,8 +871,9 @@ class BartDecoder(BartPretrainedModel): self.layers = nn.ModuleList([BartDecoderLayer(config) for _ in range(config.decoder_layers)]) self.layernorm_embedding = nn.LayerNorm(config.d_model) - self.init_weights() self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() def get_input_embeddings(self): return self.embed_tokens @@ -1130,7 +1132,8 @@ class BartModel(BartPretrainedModel): self.encoder = BartEncoder(config, self.shared) self.decoder = BartDecoder(config, self.shared) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_input_embeddings(self): return self.shared @@ -1248,7 +1251,8 @@ class BartForConditionalGeneration(BartPretrainedModel): self.register_buffer("final_logits_bias", torch.zeros((1, self.model.shared.num_embeddings))) self.lm_head = nn.Linear(config.d_model, self.model.shared.num_embeddings, bias=False) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_encoder(self): return self.model.get_encoder() @@ -1666,7 +1670,8 @@ class BartForCausalLM(BartPretrainedModel): self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_input_embeddings(self): return self.model.decoder.embed_tokens diff --git a/src/transformers/models/beit/modeling_beit.py b/src/transformers/models/beit/modeling_beit.py index c41537db50..c6f0d89093 100755 --- a/src/transformers/models/beit/modeling_beit.py +++ b/src/transformers/models/beit/modeling_beit.py @@ -598,7 +598,8 @@ class BeitModel(BeitPreTrainedModel): ) self.pooler = BeitPooler(config) if add_pooling_layer else None - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_input_embeddings(self): return self.embeddings.patch_embeddings @@ -715,7 +716,8 @@ class BeitForMaskedImageModeling(BeitPreTrainedModel): self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.lm_head = nn.Linear(config.hidden_size, config.vocab_size) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(BEIT_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=MaskedLMOutput, config_class=_CONFIG_FOR_DOC) @@ -805,7 +807,8 @@ class BeitForImageClassification(BeitPreTrainedModel): # Classifier head self.classifier = nn.Linear(config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity() - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(BEIT_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=SequenceClassifierOutput, config_class=_CONFIG_FOR_DOC) @@ -1121,7 +1124,8 @@ class BeitForSemanticSegmentation(BeitPreTrainedModel): self.decode_head = BeitUperHead(config) self.auxiliary_head = BeitFCNHead(config) if config.use_auxiliary_head else None - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def compute_loss(self, logits, auxiliary_logits, labels): # upsample logits to the images' original size diff --git a/src/transformers/models/bert/modeling_bert.py b/src/transformers/models/bert/modeling_bert.py index a62e653b01..d48209898e 100755 --- a/src/transformers/models/bert/modeling_bert.py +++ b/src/transformers/models/bert/modeling_bert.py @@ -870,7 +870,8 @@ class BertModel(BertPreTrainedModel): self.pooler = BertPooler(config) if add_pooling_layer else None - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_input_embeddings(self): return self.embeddings.word_embeddings @@ -1037,7 +1038,8 @@ class BertForPreTraining(BertPreTrainedModel): self.bert = BertModel(config) self.cls = BertPreTrainingHeads(config) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_output_embeddings(self): return self.cls.predictions.decoder @@ -1145,7 +1147,8 @@ class BertLMHeadModel(BertPreTrainedModel): self.bert = BertModel(config, add_pooling_layer=False) self.cls = BertOnlyMLMHead(config) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_output_embeddings(self): return self.cls.predictions.decoder @@ -1294,7 +1297,8 @@ class BertForMaskedLM(BertPreTrainedModel): self.bert = BertModel(config, add_pooling_layer=False) self.cls = BertOnlyMLMHead(config) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_output_embeddings(self): return self.cls.predictions.decoder @@ -1394,7 +1398,8 @@ class BertForNextSentencePrediction(BertPreTrainedModel): self.bert = BertModel(config) self.cls = BertOnlyNSPHead(config) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=NextSentencePredictorOutput, config_class=_CONFIG_FOR_DOC) @@ -1501,7 +1506,8 @@ class BertForSequenceClassification(BertPreTrainedModel): self.dropout = nn.Dropout(classifier_dropout) self.classifier = nn.Linear(config.hidden_size, config.num_labels) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( @@ -1600,7 +1606,8 @@ class BertForMultipleChoice(BertPreTrainedModel): self.dropout = nn.Dropout(classifier_dropout) self.classifier = nn.Linear(config.hidden_size, 1) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")) @add_code_sample_docstrings( @@ -1698,7 +1705,8 @@ class BertForTokenClassification(BertPreTrainedModel): self.dropout = nn.Dropout(classifier_dropout) self.classifier = nn.Linear(config.hidden_size, config.num_labels) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( @@ -1788,7 +1796,8 @@ class BertForQuestionAnswering(BertPreTrainedModel): self.bert = BertModel(config, add_pooling_layer=False) self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( diff --git a/src/transformers/models/bert_generation/modeling_bert_generation.py b/src/transformers/models/bert_generation/modeling_bert_generation.py index ad0d5ba8b7..653e585e68 100755 --- a/src/transformers/models/bert_generation/modeling_bert_generation.py +++ b/src/transformers/models/bert_generation/modeling_bert_generation.py @@ -282,7 +282,8 @@ class BertGenerationEncoder(BertGenerationPreTrainedModel): self.embeddings = BertGenerationEmbeddings(config) self.encoder = BertEncoder(config) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_input_embeddings(self): return self.embeddings.word_embeddings @@ -456,7 +457,8 @@ class BertGenerationDecoder(BertGenerationPreTrainedModel): self.bert = BertGenerationEncoder(config) self.lm_head = BertGenerationOnlyLMHead(config) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_output_embeddings(self): return self.lm_head.decoder diff --git a/src/transformers/models/big_bird/modeling_big_bird.py b/src/transformers/models/big_bird/modeling_big_bird.py index ae415a7a79..17724e744f 100755 --- a/src/transformers/models/big_bird/modeling_big_bird.py +++ b/src/transformers/models/big_bird/modeling_big_bird.py @@ -1953,7 +1953,8 @@ class BigBirdModel(BigBirdPreTrainedModel): ) self.set_attention_type("original_full") - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_input_embeddings(self): return self.embeddings.word_embeddings @@ -2262,7 +2263,8 @@ class BigBirdForPreTraining(BigBirdPreTrainedModel): self.bert = BigBirdModel(config, add_pooling_layer=True) self.cls = BigBirdPreTrainingHeads(config) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_output_embeddings(self): return self.cls.predictions.decoder @@ -2370,7 +2372,8 @@ class BigBirdForMaskedLM(BigBirdPreTrainedModel): self.bert = BigBirdModel(config) self.cls = BigBirdOnlyMLMHead(config) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_output_embeddings(self): return self.cls.predictions.decoder @@ -2472,7 +2475,8 @@ class BigBirdForCausalLM(BigBirdPreTrainedModel): self.bert = BigBirdModel(config) self.cls = BigBirdOnlyMLMHead(config) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_output_embeddings(self): return self.cls.predictions.decoder @@ -2642,7 +2646,8 @@ class BigBirdForSequenceClassification(BigBirdPreTrainedModel): self.bert = BigBirdModel(config) self.classifier = BigBirdClassificationHead(config) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(BIG_BIRD_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( @@ -2737,7 +2742,8 @@ class BigBirdForMultipleChoice(BigBirdPreTrainedModel): self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, 1) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward( BIG_BIRD_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length") @@ -2834,7 +2840,8 @@ class BigBirdForTokenClassification(BigBirdPreTrainedModel): self.dropout = nn.Dropout(classifier_dropout) self.classifier = nn.Linear(config.hidden_size, config.num_labels) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(BIG_BIRD_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( @@ -2942,7 +2949,8 @@ class BigBirdForQuestionAnswering(BigBirdPreTrainedModel): self.bert = BigBirdModel(config, add_pooling_layer=add_pooling_layer) self.qa_classifier = BigBirdForQuestionAnsweringHead(config) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(BIG_BIRD_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( diff --git a/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py b/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py index e22621c3d7..6bd0ae93dd 100755 --- a/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +++ b/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py @@ -1775,8 +1775,9 @@ class BigBirdPegasusEncoder(BigBirdPegasusPreTrainedModel): self.layers = nn.ModuleList([BigBirdPegasusEncoderLayer(config, seed=i) for i in range(config.encoder_layers)]) self.layernorm_embedding = nn.LayerNorm(embed_dim) - self.init_weights() self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() def forward( self, @@ -2066,8 +2067,9 @@ class BigBirdPegasusDecoder(BigBirdPegasusPreTrainedModel): self.layers = nn.ModuleList([BigBirdPegasusDecoderLayer(config) for _ in range(config.decoder_layers)]) self.layernorm_embedding = nn.LayerNorm(config.d_model) - self.init_weights() self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() def get_input_embeddings(self): return self.embed_tokens @@ -2327,7 +2329,8 @@ class BigBirdPegasusModel(BigBirdPegasusPreTrainedModel): self.encoder = BigBirdPegasusEncoder(config, self.shared) self.decoder = BigBirdPegasusDecoder(config, self.shared) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_input_embeddings(self): return self.shared @@ -2447,7 +2450,8 @@ class BigBirdPegasusForConditionalGeneration(BigBirdPegasusPreTrainedModel): self.register_buffer("final_logits_bias", torch.zeros((1, self.model.shared.num_embeddings))) self.lm_head = nn.Linear(config.d_model, self.model.shared.num_embeddings, bias=False) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_encoder(self): return self.model.get_encoder() @@ -2869,7 +2873,8 @@ class BigBirdPegasusForCausalLM(BigBirdPegasusPreTrainedModel): self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_input_embeddings(self): return self.model.decoder.embed_tokens diff --git a/src/transformers/models/blenderbot/modeling_blenderbot.py b/src/transformers/models/blenderbot/modeling_blenderbot.py index 1911cd9e95..850739b036 100755 --- a/src/transformers/models/blenderbot/modeling_blenderbot.py +++ b/src/transformers/models/blenderbot/modeling_blenderbot.py @@ -656,8 +656,9 @@ class BlenderbotEncoder(BlenderbotPreTrainedModel): self.layers = nn.ModuleList([BlenderbotEncoderLayer(config) for _ in range(config.encoder_layers)]) self.layer_norm = nn.LayerNorm(config.d_model) - self.init_weights() self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() def forward( self, @@ -821,8 +822,9 @@ class BlenderbotDecoder(BlenderbotPreTrainedModel): self.layers = nn.ModuleList([BlenderbotDecoderLayer(config) for _ in range(config.decoder_layers)]) self.layer_norm = nn.LayerNorm(config.d_model) - self.init_weights() self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() def get_input_embeddings(self): return self.embed_tokens @@ -1083,7 +1085,8 @@ class BlenderbotModel(BlenderbotPreTrainedModel): self.encoder = BlenderbotEncoder(config, self.shared) self.decoder = BlenderbotDecoder(config, self.shared) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @classmethod def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], *model_args, **kwargs): @@ -1220,7 +1223,8 @@ class BlenderbotForConditionalGeneration(BlenderbotPreTrainedModel): self.register_buffer("final_logits_bias", torch.zeros((1, self.model.shared.num_embeddings))) self.lm_head = nn.Linear(config.d_model, self.model.shared.num_embeddings, bias=False) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @classmethod def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], *model_args, **kwargs): @@ -1404,7 +1408,8 @@ class BlenderbotForCausalLM(BlenderbotPreTrainedModel): self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_input_embeddings(self): return self.model.decoder.embed_tokens diff --git a/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py b/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py index 26dd44d9f0..51472a7e3b 100755 --- a/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py +++ b/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py @@ -657,8 +657,9 @@ class BlenderbotSmallEncoder(BlenderbotSmallPreTrainedModel): self.layers = nn.ModuleList([BlenderbotSmallEncoderLayer(config) for _ in range(config.encoder_layers)]) self.layernorm_embedding = nn.LayerNorm(embed_dim) - self.init_weights() self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() def forward( self, @@ -821,8 +822,9 @@ class BlenderbotSmallDecoder(BlenderbotSmallPreTrainedModel): self.layers = nn.ModuleList([BlenderbotSmallDecoderLayer(config) for _ in range(config.decoder_layers)]) self.layernorm_embedding = nn.LayerNorm(config.d_model) - self.init_weights() self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() def get_input_embeddings(self): return self.embed_tokens @@ -1081,7 +1083,8 @@ class BlenderbotSmallModel(BlenderbotSmallPreTrainedModel): self.encoder = BlenderbotSmallEncoder(config, self.shared) self.decoder = BlenderbotSmallDecoder(config, self.shared) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_input_embeddings(self): return self.shared @@ -1208,7 +1211,8 @@ class BlenderbotSmallForConditionalGeneration(BlenderbotSmallPreTrainedModel): self.register_buffer("final_logits_bias", torch.zeros((1, self.model.shared.num_embeddings))) self.lm_head = nn.Linear(config.d_model, self.model.shared.num_embeddings, bias=False) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_encoder(self): return self.model.get_encoder() @@ -1379,7 +1383,8 @@ class BlenderbotSmallForCausalLM(BlenderbotSmallPreTrainedModel): self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_input_embeddings(self): return self.model.decoder.embed_tokens diff --git a/src/transformers/models/canine/modeling_canine.py b/src/transformers/models/canine/modeling_canine.py index b461a6c045..f2ba24ec0d 100644 --- a/src/transformers/models/canine/modeling_canine.py +++ b/src/transformers/models/canine/modeling_canine.py @@ -1015,7 +1015,8 @@ class CanineModel(CaninePreTrainedModel): self.pooler = CaninePooler(config) if add_pooling_layer else None - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def _prune_heads(self, heads_to_prune): """ @@ -1273,7 +1274,8 @@ class CanineForSequenceClassification(CaninePreTrainedModel): self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, config.num_labels) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(CANINE_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( @@ -1369,7 +1371,8 @@ class CanineForMultipleChoice(CaninePreTrainedModel): self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, 1) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(CANINE_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")) @add_code_sample_docstrings( @@ -1461,7 +1464,8 @@ class CanineForTokenClassification(CaninePreTrainedModel): self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, config.num_labels) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(CANINE_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( @@ -1548,7 +1552,8 @@ class CanineForQuestionAnswering(CaninePreTrainedModel): self.canine = CanineModel(config) self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(CANINE_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( diff --git a/src/transformers/models/clip/modeling_clip.py b/src/transformers/models/clip/modeling_clip.py index dfd8596fd1..08a9d0cb9a 100755 --- a/src/transformers/models/clip/modeling_clip.py +++ b/src/transformers/models/clip/modeling_clip.py @@ -683,7 +683,8 @@ class CLIPTextModel(CLIPPreTrainedModel): def __init__(self, config: CLIPTextConfig): super().__init__(config) self.text_model = CLIPTextTransformer(config) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_input_embeddings(self) -> nn.Module: return self.text_model.embeddings.token_embedding @@ -792,7 +793,8 @@ class CLIPVisionModel(CLIPPreTrainedModel): def __init__(self, config: CLIPVisionConfig): super().__init__(config) self.vision_model = CLIPVisionTransformer(config) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_input_embeddings(self) -> nn.Module: return self.vision_model.embeddings.patch_embedding @@ -866,7 +868,8 @@ class CLIPModel(CLIPPreTrainedModel): self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False) self.logit_scale = nn.Parameter(torch.ones([]) * self.config.logit_scale_init_value) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(CLIP_TEXT_INPUTS_DOCSTRING) def get_text_features( diff --git a/src/transformers/models/convbert/modeling_convbert.py b/src/transformers/models/convbert/modeling_convbert.py index 2d4b0c57ca..bee2b2ae62 100755 --- a/src/transformers/models/convbert/modeling_convbert.py +++ b/src/transformers/models/convbert/modeling_convbert.py @@ -775,7 +775,8 @@ class ConvBertModel(ConvBertPreTrainedModel): self.encoder = ConvBertEncoder(config) self.config = config - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_input_embeddings(self): return self.embeddings.word_embeddings @@ -886,7 +887,8 @@ class ConvBertForMaskedLM(ConvBertPreTrainedModel): self.generator_predictions = ConvBertGeneratorPredictions(config) self.generator_lm_head = nn.Linear(config.embedding_size, config.vocab_size) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_output_embeddings(self): return self.generator_lm_head @@ -995,7 +997,8 @@ class ConvBertForSequenceClassification(ConvBertPreTrainedModel): self.convbert = ConvBertModel(config) self.classifier = ConvBertClassificationHead(config) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(CONVBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( @@ -1090,7 +1093,8 @@ class ConvBertForMultipleChoice(ConvBertPreTrainedModel): self.sequence_summary = SequenceSummary(config) self.classifier = nn.Linear(config.hidden_size, 1) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward( CONVBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length") @@ -1187,7 +1191,8 @@ class ConvBertForTokenClassification(ConvBertPreTrainedModel): self.dropout = nn.Dropout(classifier_dropout) self.classifier = nn.Linear(config.hidden_size, config.num_labels) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(CONVBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( @@ -1274,7 +1279,8 @@ class ConvBertForQuestionAnswering(ConvBertPreTrainedModel): self.convbert = ConvBertModel(config) self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(CONVBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( diff --git a/src/transformers/models/ctrl/modeling_ctrl.py b/src/transformers/models/ctrl/modeling_ctrl.py index 9c06e20269..58e147b4b5 100644 --- a/src/transformers/models/ctrl/modeling_ctrl.py +++ b/src/transformers/models/ctrl/modeling_ctrl.py @@ -338,7 +338,8 @@ class CTRLModel(CTRLPreTrainedModel): ) self.layernorm = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_input_embeddings(self): return self.w @@ -499,7 +500,8 @@ class CTRLLMHeadModel(CTRLPreTrainedModel): self.transformer = CTRLModel(config) self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=True) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_output_embeddings(self): return self.lm_head @@ -615,7 +617,8 @@ class CTRLForSequenceClassification(CTRLPreTrainedModel): self.transformer = CTRLModel(config) self.classifier = nn.Linear(config.n_embd, self.num_labels, bias=False) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(CTRL_INPUTS_DOCSTRING) @add_code_sample_docstrings( diff --git a/src/transformers/models/deberta/modeling_deberta.py b/src/transformers/models/deberta/modeling_deberta.py index 4a54ab6340..2d2edd8f7d 100644 --- a/src/transformers/models/deberta/modeling_deberta.py +++ b/src/transformers/models/deberta/modeling_deberta.py @@ -888,7 +888,8 @@ class DebertaModel(DebertaPreTrainedModel): self.encoder = DebertaEncoder(config) self.z_steps = 0 self.config = config - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_input_embeddings(self): return self.embeddings.word_embeddings @@ -1001,7 +1002,8 @@ class DebertaForMaskedLM(DebertaPreTrainedModel): self.deberta = DebertaModel(config) self.cls = DebertaOnlyMLMHead(config) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_output_embeddings(self): return self.cls.predictions.decoder @@ -1141,7 +1143,8 @@ class DebertaForSequenceClassification(DebertaPreTrainedModel): drop_out = self.config.hidden_dropout_prob if drop_out is None else drop_out self.dropout = StableDropout(drop_out) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_input_embeddings(self): return self.deberta.get_input_embeddings() @@ -1254,7 +1257,8 @@ class DebertaForTokenClassification(DebertaPreTrainedModel): self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, config.num_labels) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( @@ -1338,7 +1342,8 @@ class DebertaForQuestionAnswering(DebertaPreTrainedModel): self.deberta = DebertaModel(config) self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( diff --git a/src/transformers/models/deberta_v2/modeling_deberta_v2.py b/src/transformers/models/deberta_v2/modeling_deberta_v2.py index b1ec6bd011..e0c78395e6 100644 --- a/src/transformers/models/deberta_v2/modeling_deberta_v2.py +++ b/src/transformers/models/deberta_v2/modeling_deberta_v2.py @@ -996,7 +996,8 @@ class DebertaV2Model(DebertaV2PreTrainedModel): self.encoder = DebertaV2Encoder(config) self.z_steps = 0 self.config = config - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_input_embeddings(self): return self.embeddings.word_embeddings @@ -1110,7 +1111,8 @@ class DebertaV2ForMaskedLM(DebertaV2PreTrainedModel): self.deberta = DebertaV2Model(config) self.cls = DebertaV2OnlyMLMHead(config) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_output_embeddings(self): return self.cls.predictions.decoder @@ -1251,7 +1253,8 @@ class DebertaV2ForSequenceClassification(DebertaV2PreTrainedModel): drop_out = self.config.hidden_dropout_prob if drop_out is None else drop_out self.dropout = StableDropout(drop_out) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_input_embeddings(self): return self.deberta.get_input_embeddings() @@ -1365,7 +1368,8 @@ class DebertaV2ForTokenClassification(DebertaV2PreTrainedModel): self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, config.num_labels) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( @@ -1450,7 +1454,8 @@ class DebertaV2ForQuestionAnswering(DebertaV2PreTrainedModel): self.deberta = DebertaV2Model(config) self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( diff --git a/src/transformers/models/deit/modeling_deit.py b/src/transformers/models/deit/modeling_deit.py index 4a44d67fe9..e47e88b849 100644 --- a/src/transformers/models/deit/modeling_deit.py +++ b/src/transformers/models/deit/modeling_deit.py @@ -458,7 +458,8 @@ class DeiTModel(DeiTPreTrainedModel): self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.pooler = DeiTPooler(config) if add_pooling_layer else None - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_input_embeddings(self): return self.embeddings.patch_embeddings @@ -574,7 +575,8 @@ class DeiTForImageClassification(DeiTPreTrainedModel): # Classifier head self.classifier = nn.Linear(config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity() - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(DEIT_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=SequenceClassifierOutput, config_class=_CONFIG_FOR_DOC) @@ -711,7 +713,8 @@ class DeiTForImageClassificationWithTeacher(DeiTPreTrainedModel): nn.Linear(config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity() ) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(DEIT_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=DeiTForImageClassificationWithTeacherOutput, config_class=_CONFIG_FOR_DOC) diff --git a/src/transformers/models/detr/modeling_detr.py b/src/transformers/models/detr/modeling_detr.py index 70287626b2..e7771a4adb 100644 --- a/src/transformers/models/detr/modeling_detr.py +++ b/src/transformers/models/detr/modeling_detr.py @@ -894,7 +894,8 @@ class DetrEncoder(DetrPreTrainedModel): # in the original DETR, no layernorm is used at the end of the encoder, as "normalize_before" is set to False by default - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def forward( self, @@ -1001,8 +1002,9 @@ class DetrDecoder(DetrPreTrainedModel): # in DETR, the decoder uses layernorm after the last decoder layer output self.layernorm = nn.LayerNorm(config.d_model) - self.init_weights() self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() def forward( self, @@ -1179,7 +1181,8 @@ class DetrModel(DetrPreTrainedModel): self.encoder = DetrEncoder(config) self.decoder = DetrDecoder(config) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_encoder(self): return self.encoder @@ -1333,7 +1336,8 @@ class DetrForObjectDetection(DetrPreTrainedModel): input_dim=config.d_model, hidden_dim=config.d_model, output_dim=4, num_layers=3 ) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() # taken from https://github.com/facebookresearch/detr/blob/master/models/detr.py @torch.jit.unused @@ -1494,7 +1498,8 @@ class DetrForSegmentation(DetrPreTrainedModel): hidden_size, hidden_size, number_of_heads, dropout=0.0, std=config.init_xavier_std ) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(DETR_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=DetrSegmentationOutput, config_class=_CONFIG_FOR_DOC) diff --git a/src/transformers/models/distilbert/modeling_distilbert.py b/src/transformers/models/distilbert/modeling_distilbert.py index b68b7c524e..a79b452394 100755 --- a/src/transformers/models/distilbert/modeling_distilbert.py +++ b/src/transformers/models/distilbert/modeling_distilbert.py @@ -441,7 +441,8 @@ class DistilBertModel(DistilBertPreTrainedModel): self.embeddings = Embeddings(config) # Embeddings self.transformer = Transformer(config) # Encoder - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_position_embeddings(self) -> nn.Embedding: """ @@ -571,7 +572,8 @@ class DistilBertForMaskedLM(DistilBertPreTrainedModel): self.vocab_layer_norm = nn.LayerNorm(config.dim, eps=1e-12) self.vocab_projector = nn.Linear(config.dim, config.vocab_size) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() self.mlm_loss_fct = nn.CrossEntropyLoss() @@ -677,7 +679,8 @@ class DistilBertForSequenceClassification(DistilBertPreTrainedModel): self.classifier = nn.Linear(config.dim, config.num_labels) self.dropout = nn.Dropout(config.seq_classif_dropout) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_position_embeddings(self) -> nn.Embedding: """ @@ -793,7 +796,8 @@ class DistilBertForQuestionAnswering(DistilBertPreTrainedModel): assert config.num_labels == 2 self.dropout = nn.Dropout(config.qa_dropout) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_position_embeddings(self) -> nn.Embedding: """ @@ -910,7 +914,8 @@ class DistilBertForTokenClassification(DistilBertPreTrainedModel): self.dropout = nn.Dropout(config.dropout) self.classifier = nn.Linear(config.hidden_size, config.num_labels) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_position_embeddings(self) -> nn.Embedding: """ @@ -1015,7 +1020,8 @@ class DistilBertForMultipleChoice(DistilBertPreTrainedModel): self.classifier = nn.Linear(config.dim, 1) self.dropout = nn.Dropout(config.seq_classif_dropout) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_position_embeddings(self) -> nn.Embedding: """ diff --git a/src/transformers/models/dpr/modeling_dpr.py b/src/transformers/models/dpr/modeling_dpr.py index 091479af4b..6cde47678c 100644 --- a/src/transformers/models/dpr/modeling_dpr.py +++ b/src/transformers/models/dpr/modeling_dpr.py @@ -180,7 +180,8 @@ class DPREncoder(DPRPreTrainedModel): self.projection_dim = config.projection_dim if self.projection_dim > 0: self.encode_proj = nn.Linear(self.bert_model.config.hidden_size, config.projection_dim) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def forward( self, @@ -232,7 +233,8 @@ class DPRSpanPredictor(DPRPreTrainedModel): self.encoder = DPREncoder(config) self.qa_outputs = nn.Linear(self.encoder.embeddings_size, 2) self.qa_classifier = nn.Linear(self.encoder.embeddings_size, 1) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def forward( self, @@ -447,7 +449,8 @@ class DPRContextEncoder(DPRPretrainedContextEncoder): super().__init__(config) self.config = config self.ctx_encoder = DPREncoder(config) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(DPR_ENCODERS_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=DPRContextEncoderOutput, config_class=_CONFIG_FOR_DOC) @@ -525,7 +528,8 @@ class DPRQuestionEncoder(DPRPretrainedQuestionEncoder): super().__init__(config) self.config = config self.question_encoder = DPREncoder(config) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(DPR_ENCODERS_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=DPRQuestionEncoderOutput, config_class=_CONFIG_FOR_DOC) @@ -602,7 +606,8 @@ class DPRReader(DPRPretrainedReader): super().__init__(config) self.config = config self.span_predictor = DPRSpanPredictor(config) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(DPR_READER_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=DPRReaderOutput, config_class=_CONFIG_FOR_DOC) diff --git a/src/transformers/models/electra/modeling_electra.py b/src/transformers/models/electra/modeling_electra.py index eb26bcfdd9..c94a5c408b 100644 --- a/src/transformers/models/electra/modeling_electra.py +++ b/src/transformers/models/electra/modeling_electra.py @@ -817,7 +817,8 @@ class ElectraModel(ElectraPreTrainedModel): self.encoder = ElectraEncoder(config) self.config = config - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_input_embeddings(self): return self.embeddings.word_embeddings @@ -939,7 +940,8 @@ class ElectraForSequenceClassification(ElectraPreTrainedModel): self.electra = ElectraModel(config) self.classifier = ElectraClassificationHead(config) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(ELECTRA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( @@ -1033,7 +1035,8 @@ class ElectraForPreTraining(ElectraPreTrainedModel): self.electra = ElectraModel(config) self.discriminator_predictions = ElectraDiscriminatorPredictions(config) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(ELECTRA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=ElectraForPreTrainingOutput, config_class=_CONFIG_FOR_DOC) @@ -1128,7 +1131,8 @@ class ElectraForMaskedLM(ElectraPreTrainedModel): self.generator_predictions = ElectraGeneratorPredictions(config) self.generator_lm_head = nn.Linear(config.embedding_size, config.vocab_size) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_output_embeddings(self): return self.generator_lm_head @@ -1216,7 +1220,8 @@ class ElectraForTokenClassification(ElectraPreTrainedModel): ) self.dropout = nn.Dropout(classifier_dropout) self.classifier = nn.Linear(config.hidden_size, config.num_labels) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(ELECTRA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( @@ -1305,7 +1310,8 @@ class ElectraForQuestionAnswering(ElectraPreTrainedModel): self.electra = ElectraModel(config) self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(ELECTRA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( @@ -1406,7 +1412,8 @@ class ElectraForMultipleChoice(ElectraPreTrainedModel): self.sequence_summary = SequenceSummary(config) self.classifier = nn.Linear(config.hidden_size, 1) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(ELECTRA_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")) @add_code_sample_docstrings( diff --git a/src/transformers/models/flaubert/modeling_flaubert.py b/src/transformers/models/flaubert/modeling_flaubert.py index f0f14caa39..9887b639c0 100644 --- a/src/transformers/models/flaubert/modeling_flaubert.py +++ b/src/transformers/models/flaubert/modeling_flaubert.py @@ -336,7 +336,8 @@ class FlaubertWithLMHeadModel(XLMWithLMHeadModel): def __init__(self, config): super().__init__(config) self.transformer = FlaubertModel(config) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings( @@ -357,7 +358,8 @@ class FlaubertForSequenceClassification(XLMForSequenceClassification): def __init__(self, config): super().__init__(config) self.transformer = FlaubertModel(config) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings( @@ -378,7 +380,8 @@ class FlaubertForTokenClassification(XLMForTokenClassification): def __init__(self, config): super().__init__(config) self.transformer = FlaubertModel(config) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings( @@ -399,7 +402,8 @@ class FlaubertForQuestionAnsweringSimple(XLMForQuestionAnsweringSimple): def __init__(self, config): super().__init__(config) self.transformer = FlaubertModel(config) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings( @@ -420,7 +424,8 @@ class FlaubertForQuestionAnswering(XLMForQuestionAnswering): def __init__(self, config): super().__init__(config) self.transformer = FlaubertModel(config) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings( @@ -441,4 +446,5 @@ class FlaubertForMultipleChoice(XLMForMultipleChoice): def __init__(self, config): super().__init__(config) self.transformer = FlaubertModel(config) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() diff --git a/src/transformers/models/fnet/modeling_fnet.py b/src/transformers/models/fnet/modeling_fnet.py index 16ae695b34..76aa1aa504 100755 --- a/src/transformers/models/fnet/modeling_fnet.py +++ b/src/transformers/models/fnet/modeling_fnet.py @@ -535,7 +535,8 @@ class FNetModel(FNetPreTrainedModel): self.pooler = FNetPooler(config) if add_pooling_layer else None - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_input_embeddings(self): return self.embeddings.word_embeddings @@ -633,7 +634,8 @@ class FNetForPreTraining(FNetPreTrainedModel): self.fnet = FNetModel(config) self.cls = FNetPreTrainingHeads(config) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_output_embeddings(self): return self.cls.predictions.decoder @@ -723,7 +725,8 @@ class FNetForMaskedLM(FNetPreTrainedModel): self.fnet = FNetModel(config) self.cls = FNetOnlyMLMHead(config) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_output_embeddings(self): return self.cls.predictions.decoder @@ -791,7 +794,8 @@ class FNetForNextSentencePrediction(FNetPreTrainedModel): self.fnet = FNetModel(config) self.cls = FNetOnlyNSPHead(config) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(FNET_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=NextSentencePredictorOutput, config_class=_CONFIG_FOR_DOC) @@ -885,7 +889,8 @@ class FNetForSequenceClassification(FNetPreTrainedModel): self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, config.num_labels) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(FNET_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( @@ -969,7 +974,8 @@ class FNetForMultipleChoice(FNetPreTrainedModel): self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, 1) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(FNET_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")) @add_code_sample_docstrings( @@ -1050,7 +1056,8 @@ class FNetForTokenClassification(FNetPreTrainedModel): self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, config.num_labels) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(FNET_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( @@ -1119,7 +1126,8 @@ class FNetForQuestionAnswering(FNetPreTrainedModel): self.fnet = FNetModel(config) self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(FNET_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( diff --git a/src/transformers/models/fsmt/modeling_fsmt.py b/src/transformers/models/fsmt/modeling_fsmt.py index 9ddcd1453b..a1c4c1ed8c 100644 --- a/src/transformers/models/fsmt/modeling_fsmt.py +++ b/src/transformers/models/fsmt/modeling_fsmt.py @@ -1003,7 +1003,8 @@ class FSMTModel(PretrainedFSMTModel): self.encoder = FSMTEncoder(config, encoder_embed_tokens) self.decoder = FSMTDecoder(config, decoder_embed_tokens) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(FSMT_INPUTS_DOCSTRING) @add_code_sample_docstrings( diff --git a/src/transformers/models/funnel/modeling_funnel.py b/src/transformers/models/funnel/modeling_funnel.py index 7ce2e3221c..fffed242fd 100644 --- a/src/transformers/models/funnel/modeling_funnel.py +++ b/src/transformers/models/funnel/modeling_funnel.py @@ -900,7 +900,8 @@ class FunnelBaseModel(FunnelPreTrainedModel): self.embeddings = FunnelEmbeddings(config) self.encoder = FunnelEncoder(config) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_input_embeddings(self): return self.embeddings.word_embeddings @@ -977,7 +978,8 @@ class FunnelModel(FunnelPreTrainedModel): self.encoder = FunnelEncoder(config) self.decoder = FunnelDecoder(config) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_input_embeddings(self): return self.embeddings.word_embeddings @@ -1082,7 +1084,8 @@ class FunnelForPreTraining(FunnelPreTrainedModel): self.funnel = FunnelModel(config) self.discriminator_predictions = FunnelDiscriminatorPredictions(config) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(FUNNEL_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=FunnelForPreTrainingOutput, config_class=_CONFIG_FOR_DOC) @@ -1164,7 +1167,8 @@ class FunnelForMaskedLM(FunnelPreTrainedModel): self.funnel = FunnelModel(config) self.lm_head = nn.Linear(config.d_model, config.vocab_size) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_output_embeddings(self): return self.lm_head @@ -1244,7 +1248,8 @@ class FunnelForSequenceClassification(FunnelPreTrainedModel): self.funnel = FunnelBaseModel(config) self.classifier = FunnelClassificationHead(config, config.num_labels) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(FUNNEL_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( @@ -1334,7 +1339,8 @@ class FunnelForMultipleChoice(FunnelPreTrainedModel): self.funnel = FunnelBaseModel(config) self.classifier = FunnelClassificationHead(config, 1) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(FUNNEL_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")) @add_code_sample_docstrings( @@ -1420,7 +1426,8 @@ class FunnelForTokenClassification(FunnelPreTrainedModel): self.dropout = nn.Dropout(config.hidden_dropout) self.classifier = nn.Linear(config.hidden_size, config.num_labels) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(FUNNEL_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( @@ -1502,7 +1509,8 @@ class FunnelForQuestionAnswering(FunnelPreTrainedModel): self.funnel = FunnelModel(config) self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(FUNNEL_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( diff --git a/src/transformers/models/gpt2/modeling_gpt2.py b/src/transformers/models/gpt2/modeling_gpt2.py index d8f09d3e72..77ef0386ea 100644 --- a/src/transformers/models/gpt2/modeling_gpt2.py +++ b/src/transformers/models/gpt2/modeling_gpt2.py @@ -677,13 +677,14 @@ class GPT2Model(GPT2PreTrainedModel): self.h = nn.ModuleList([GPT2Block(config, layer_idx=i) for i in range(config.num_hidden_layers)]) self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon) - self.init_weights() - # Model parallel self.model_parallel = False self.device_map = None self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() + @add_start_docstrings(PARALLELIZE_DOCSTRING) def parallelize(self, device_map=None): # Check validity of device_map @@ -947,12 +948,13 @@ class GPT2LMHeadModel(GPT2PreTrainedModel): self.transformer = GPT2Model(config) self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False) - self.init_weights() - # Model parallel self.model_parallel = False self.device_map = None + # Initialize weights and apply final processing + self.post_init() + @add_start_docstrings(PARALLELIZE_DOCSTRING) def parallelize(self, device_map=None): self.device_map = ( @@ -1117,12 +1119,13 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel): self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False) self.multiple_choice_head = SequenceSummary(config) - self.init_weights() - # Model parallel self.model_parallel = False self.device_map = None + # Initialize weights and apply final processing + self.post_init() + @add_start_docstrings(PARALLELIZE_DOCSTRING) def parallelize(self, device_map=None): self.device_map = ( @@ -1330,12 +1333,13 @@ class GPT2ForSequenceClassification(GPT2PreTrainedModel): self.transformer = GPT2Model(config) self.score = nn.Linear(config.n_embd, self.num_labels, bias=False) - self.init_weights() - # Model parallel self.model_parallel = False self.device_map = None + # Initialize weights and apply final processing + self.post_init() + @add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING) @add_code_sample_docstrings( processor_class=_TOKENIZER_FOR_DOC, @@ -1461,12 +1465,13 @@ class GPT2ForTokenClassification(GPT2PreTrainedModel): self.dropout = nn.Dropout(classifier_dropout) self.classifier = nn.Linear(config.hidden_size, config.num_labels) - self.init_weights() - # Model parallel self.model_parallel = False self.device_map = None + # Initialize weights and apply final processing + self.post_init() + @add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING) @add_code_sample_docstrings( processor_class=_TOKENIZER_FOR_DOC, diff --git a/src/transformers/models/gpt_neo/modeling_gpt_neo.py b/src/transformers/models/gpt_neo/modeling_gpt_neo.py index 1be7de2f2c..9785178ce8 100755 --- a/src/transformers/models/gpt_neo/modeling_gpt_neo.py +++ b/src/transformers/models/gpt_neo/modeling_gpt_neo.py @@ -486,8 +486,9 @@ class GPTNeoModel(GPTNeoPreTrainedModel): self.h = nn.ModuleList([GPTNeoBlock(config, layer_id=i) for i in range(config.num_layers)]) self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon) - self.init_weights() self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() def get_input_embeddings(self): return self.wte @@ -675,7 +676,8 @@ class GPTNeoForCausalLM(GPTNeoPreTrainedModel): self.transformer = GPTNeoModel(config) self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_output_embeddings(self): return self.lm_head @@ -823,7 +825,8 @@ class GPTNeoForSequenceClassification(GPTNeoPreTrainedModel): self.transformer = GPTNeoModel(config) self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(GPT_NEO_INPUTS_DOCSTRING) @add_code_sample_docstrings( diff --git a/src/transformers/models/gptj/modeling_gptj.py b/src/transformers/models/gptj/modeling_gptj.py index 7c01fea81d..603619cc5a 100755 --- a/src/transformers/models/gptj/modeling_gptj.py +++ b/src/transformers/models/gptj/modeling_gptj.py @@ -444,13 +444,15 @@ class GPTJModel(GPTJPreTrainedModel): self.drop = nn.Dropout(config.embd_pdrop) self.h = nn.ModuleList([GPTJBlock(config) for _ in range(config.n_layer)]) self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon) - self.init_weights() # Model parallel self.model_parallel = False self.device_map = None self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() + @add_start_docstrings(PARALLELIZE_DOCSTRING) def parallelize(self, device_map=None): # Check validity of device_map @@ -680,12 +682,14 @@ class GPTJForCausalLM(GPTJPreTrainedModel): super().__init__(config) self.transformer = GPTJModel(config) self.lm_head = nn.Linear(config.n_embd, config.vocab_size) - self.init_weights() # Model parallel self.model_parallel = False self.device_map = None + # Initialize weights and apply final processing + self.post_init() + @add_start_docstrings(PARALLELIZE_DOCSTRING) def parallelize(self, device_map=None): self.device_map = ( @@ -855,12 +859,13 @@ class GPTJForSequenceClassification(GPTJPreTrainedModel): self.transformer = GPTJModel(config) self.score = nn.Linear(config.n_embd, self.num_labels, bias=False) - self.init_weights() - # Model parallel self.model_parallel = False self.device_map = None + # Initialize weights and apply final processing + self.post_init() + @add_start_docstrings_to_model_forward(GPTJ_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( processor_class=_TOKENIZER_FOR_DOC, diff --git a/src/transformers/models/hubert/modeling_hubert.py b/src/transformers/models/hubert/modeling_hubert.py index 07e61ec181..2f8c59257c 100755 --- a/src/transformers/models/hubert/modeling_hubert.py +++ b/src/transformers/models/hubert/modeling_hubert.py @@ -899,7 +899,8 @@ class HubertModel(HubertPreTrainedModel): else: self.encoder = HubertEncoder(config) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2Model._mask_hidden_states def _mask_hidden_states( @@ -1039,7 +1040,8 @@ class HubertForCTC(HubertPreTrainedModel): ) self.lm_head = nn.Linear(config.hidden_size, config.vocab_size) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def freeze_feature_extractor(self): """ @@ -1147,7 +1149,8 @@ class HubertForSequenceClassification(HubertPreTrainedModel): self.projector = nn.Linear(config.hidden_size, config.classifier_proj_size) self.classifier = nn.Linear(config.classifier_proj_size, config.num_labels) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def freeze_feature_extractor(self): """ diff --git a/src/transformers/models/ibert/modeling_ibert.py b/src/transformers/models/ibert/modeling_ibert.py index 8173ce1be8..6666258e70 100644 --- a/src/transformers/models/ibert/modeling_ibert.py +++ b/src/transformers/models/ibert/modeling_ibert.py @@ -754,7 +754,8 @@ class IBertModel(IBertPreTrainedModel): self.pooler = IBertPooler(config) if add_pooling_layer else None - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_input_embeddings(self): return self.embeddings.word_embeddings @@ -865,7 +866,8 @@ class IBertForMaskedLM(IBertPreTrainedModel): self.ibert = IBertModel(config, add_pooling_layer=False) self.lm_head = IBertLMHead(config) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_output_embeddings(self): return self.lm_head.decoder @@ -979,7 +981,8 @@ class IBertForSequenceClassification(IBertPreTrainedModel): self.ibert = IBertModel(config, add_pooling_layer=False) self.classifier = IBertClassificationHead(config) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(IBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( @@ -1074,7 +1077,8 @@ class IBertForMultipleChoice(IBertPreTrainedModel): self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, 1) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(IBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")) @add_code_sample_docstrings( @@ -1168,7 +1172,8 @@ class IBertForTokenClassification(IBertPreTrainedModel): self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, config.num_labels) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(IBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( @@ -1277,7 +1282,8 @@ class IBertForQuestionAnswering(IBertPreTrainedModel): self.ibert = IBertModel(config, add_pooling_layer=False) self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(IBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( diff --git a/src/transformers/models/layoutlm/modeling_layoutlm.py b/src/transformers/models/layoutlm/modeling_layoutlm.py index 541dec879a..186146e120 100644 --- a/src/transformers/models/layoutlm/modeling_layoutlm.py +++ b/src/transformers/models/layoutlm/modeling_layoutlm.py @@ -714,7 +714,8 @@ class LayoutLMModel(LayoutLMPreTrainedModel): self.encoder = LayoutLMEncoder(config) self.pooler = LayoutLMPooler(config) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_input_embeddings(self): return self.embeddings.word_embeddings @@ -856,7 +857,8 @@ class LayoutLMForMaskedLM(LayoutLMPreTrainedModel): self.layoutlm = LayoutLMModel(config) self.cls = LayoutLMOnlyMLMHead(config) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_input_embeddings(self): return self.layoutlm.embeddings.word_embeddings @@ -979,7 +981,8 @@ class LayoutLMForSequenceClassification(LayoutLMPreTrainedModel): self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, config.num_labels) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_input_embeddings(self): return self.layoutlm.embeddings.word_embeddings @@ -1109,7 +1112,8 @@ class LayoutLMForTokenClassification(LayoutLMPreTrainedModel): self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, config.num_labels) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_input_embeddings(self): return self.layoutlm.embeddings.word_embeddings diff --git a/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py b/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py index e80029a300..93706000e6 100755 --- a/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py +++ b/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py @@ -724,7 +724,8 @@ class LayoutLMv2Model(LayoutLMv2PreTrainedModel): self.encoder = LayoutLMv2Encoder(config) self.pooler = LayoutLMv2Pooler(config) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_input_embeddings(self): return self.embeddings.word_embeddings @@ -957,7 +958,8 @@ class LayoutLMv2ForSequenceClassification(LayoutLMv2PreTrainedModel): self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size * 3, config.num_labels) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_input_embeddings(self): return self.layoutlmv2.embeddings.word_embeddings @@ -1124,7 +1126,8 @@ class LayoutLMv2ForTokenClassification(LayoutLMv2PreTrainedModel): self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, config.num_labels) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_input_embeddings(self): return self.layoutlmv2.embeddings.word_embeddings @@ -1239,7 +1242,8 @@ class LayoutLMv2ForQuestionAnswering(LayoutLMv2PreTrainedModel): self.layoutlmv2 = LayoutLMv2Model(config) self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_input_embeddings(self): return self.layoutlmv2.embeddings.word_embeddings diff --git a/src/transformers/models/led/modeling_led.py b/src/transformers/models/led/modeling_led.py index 5433d0a7c7..2f15448522 100755 --- a/src/transformers/models/led/modeling_led.py +++ b/src/transformers/models/led/modeling_led.py @@ -1629,8 +1629,9 @@ class LEDEncoder(LEDPreTrainedModel): self.layers = nn.ModuleList([LEDEncoderLayer(config, i) for i in range(config.encoder_layers)]) self.layernorm_embedding = nn.LayerNorm(embed_dim) - self.init_weights() self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() def _merge_to_attention_mask(self, attention_mask: torch.Tensor, global_attention_mask: torch.Tensor): # longformer self attention expects attention mask to have 0 (no attn), 1 (local attn), 2 (global attn) @@ -1904,8 +1905,9 @@ class LEDDecoder(LEDPreTrainedModel): self.layers = nn.ModuleList([LEDDecoderLayer(config) for _ in range(config.decoder_layers)]) self.layernorm_embedding = nn.LayerNorm(config.d_model) - self.init_weights() self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() def forward( self, @@ -2156,7 +2158,8 @@ class LEDModel(LEDPreTrainedModel): self.encoder = LEDEncoder(config, self.shared) self.decoder = LEDDecoder(config, self.shared) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_input_embeddings(self): return self.shared @@ -2283,7 +2286,8 @@ class LEDForConditionalGeneration(LEDPreTrainedModel): self.register_buffer("final_logits_bias", torch.zeros((1, self.led.shared.num_embeddings))) self.lm_head = nn.Linear(config.d_model, self.led.shared.num_embeddings, bias=False) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_encoder(self): return self.led.get_encoder() diff --git a/src/transformers/models/longformer/modeling_longformer.py b/src/transformers/models/longformer/modeling_longformer.py index 672c0d948a..93bfd90dfb 100755 --- a/src/transformers/models/longformer/modeling_longformer.py +++ b/src/transformers/models/longformer/modeling_longformer.py @@ -1511,7 +1511,8 @@ class LongformerModel(LongformerPreTrainedModel): self.encoder = LongformerEncoder(config) self.pooler = LongformerPooler(config) if add_pooling_layer else None - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_input_embeddings(self): return self.embeddings.word_embeddings @@ -1713,7 +1714,8 @@ class LongformerForMaskedLM(LongformerPreTrainedModel): self.longformer = LongformerModel(config, add_pooling_layer=False) self.lm_head = LongformerLMHead(config) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_output_embeddings(self): return self.lm_head.decoder @@ -1818,7 +1820,8 @@ class LongformerForSequenceClassification(LongformerPreTrainedModel): self.longformer = LongformerModel(config, add_pooling_layer=False) self.classifier = LongformerClassificationHead(config) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(LONGFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( @@ -1943,7 +1946,8 @@ class LongformerForQuestionAnswering(LongformerPreTrainedModel): self.longformer = LongformerModel(config, add_pooling_layer=False) self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(LONGFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=LongformerQuestionAnsweringModelOutput, config_class=_CONFIG_FOR_DOC) @@ -2080,7 +2084,8 @@ class LongformerForTokenClassification(LongformerPreTrainedModel): self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, config.num_labels) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(LONGFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( @@ -2170,7 +2175,8 @@ class LongformerForMultipleChoice(LongformerPreTrainedModel): self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, 1) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward( LONGFORMER_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length") diff --git a/src/transformers/models/luke/modeling_luke.py b/src/transformers/models/luke/modeling_luke.py index 97d1f1adfd..6edd84a3ce 100644 --- a/src/transformers/models/luke/modeling_luke.py +++ b/src/transformers/models/luke/modeling_luke.py @@ -818,7 +818,8 @@ class LukeModel(LukePreTrainedModel): self.pooler = LukePooler(config) if add_pooling_layer else None - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_input_embeddings(self): return self.embeddings.word_embeddings @@ -1029,7 +1030,8 @@ class LukeForEntityClassification(LukePreTrainedModel): self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, config.num_labels) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(LUKE_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=EntityClassificationOutput, config_class=_CONFIG_FOR_DOC) @@ -1142,7 +1144,8 @@ class LukeForEntityPairClassification(LukePreTrainedModel): self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size * 2, config.num_labels, False) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(LUKE_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=EntityPairClassificationOutput, config_class=_CONFIG_FOR_DOC) @@ -1257,7 +1260,8 @@ class LukeForEntitySpanClassification(LukePreTrainedModel): self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size * 3, config.num_labels) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(LUKE_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=EntitySpanClassificationOutput, config_class=_CONFIG_FOR_DOC) diff --git a/src/transformers/models/lxmert/modeling_lxmert.py b/src/transformers/models/lxmert/modeling_lxmert.py index 1135816cc2..c78e36fddb 100644 --- a/src/transformers/models/lxmert/modeling_lxmert.py +++ b/src/transformers/models/lxmert/modeling_lxmert.py @@ -891,7 +891,8 @@ class LxmertModel(LxmertPreTrainedModel): self.embeddings = LxmertEmbeddings(config) self.encoder = LxmertEncoder(config) self.pooler = LxmertPooler(config) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_input_embeddings(self): return self.embeddings.word_embeddings @@ -1048,7 +1049,8 @@ class LxmertForPreTraining(LxmertPreTrainedModel): self.answer_head = LxmertVisualAnswerHead(config, self.num_qa_labels) # Weight initialization - self.init_weights() + # Initialize weights and apply final processing + self.post_init() # Loss functions self.loss_fcts = { @@ -1303,7 +1305,8 @@ class LxmertForQuestionAnswering(LxmertPreTrainedModel): self.answer_head = LxmertVisualAnswerHead(config, self.num_qa_labels) # Weight initialization - self.init_weights() + # Initialize weights and apply final processing + self.post_init() # Loss function self.loss = CrossEntropyLoss() diff --git a/src/transformers/models/m2m_100/modeling_m2m_100.py b/src/transformers/models/m2m_100/modeling_m2m_100.py index 1230bf01e7..4c9caadd8c 100755 --- a/src/transformers/models/m2m_100/modeling_m2m_100.py +++ b/src/transformers/models/m2m_100/modeling_m2m_100.py @@ -705,8 +705,9 @@ class M2M100Encoder(M2M100PreTrainedModel): self.layers = nn.ModuleList([M2M100EncoderLayer(config) for _ in range(config.encoder_layers)]) self.layer_norm = nn.LayerNorm(config.d_model) - self.init_weights() self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() def forward( self, @@ -870,8 +871,9 @@ class M2M100Decoder(M2M100PreTrainedModel): self.layers = nn.ModuleList([M2M100DecoderLayer(config) for _ in range(config.decoder_layers)]) self.layer_norm = nn.LayerNorm(config.d_model) - self.init_weights() self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() def forward( self, @@ -1113,7 +1115,8 @@ class M2M100Model(M2M100PreTrainedModel): self.encoder = M2M100Encoder(config, self.shared) self.decoder = M2M100Decoder(config, self.shared) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_input_embeddings(self): return self.shared @@ -1232,7 +1235,8 @@ class M2M100ForConditionalGeneration(M2M100PreTrainedModel): self.model = M2M100Model(config) self.lm_head = nn.Linear(config.d_model, self.model.shared.num_embeddings, bias=False) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_encoder(self): return self.model.get_encoder() diff --git a/src/transformers/models/marian/modeling_marian.py b/src/transformers/models/marian/modeling_marian.py index 94f0f800bd..ef4369dcb8 100755 --- a/src/transformers/models/marian/modeling_marian.py +++ b/src/transformers/models/marian/modeling_marian.py @@ -668,8 +668,10 @@ class MarianEncoder(MarianPreTrainedModel): self.padding_idx, ) self.layers = nn.ModuleList([MarianEncoderLayer(config) for _ in range(config.encoder_layers)]) - self.init_weights() + self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() def forward( self, @@ -829,8 +831,10 @@ class MarianDecoder(MarianPreTrainedModel): self.padding_idx, ) self.layers = nn.ModuleList([MarianDecoderLayer(config) for _ in range(config.decoder_layers)]) - self.init_weights() + self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() def get_input_embeddings(self): return self.embed_tokens @@ -1087,7 +1091,8 @@ class MarianModel(MarianPreTrainedModel): self.encoder = MarianEncoder(config, self.shared) self.decoder = MarianDecoder(config, self.shared) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_input_embeddings(self): return self.shared @@ -1220,7 +1225,8 @@ class MarianMTModel(MarianPreTrainedModel): self.register_buffer("final_logits_bias", torch.zeros((1, self.model.shared.num_embeddings))) self.lm_head = nn.Linear(config.d_model, self.model.shared.num_embeddings, bias=False) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_encoder(self): return self.model.get_encoder() @@ -1399,7 +1405,8 @@ class MarianForCausalLM(MarianPreTrainedModel): self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_input_embeddings(self): return self.model.decoder.embed_tokens diff --git a/src/transformers/models/mbart/modeling_mbart.py b/src/transformers/models/mbart/modeling_mbart.py index 564030fb49..82452251bf 100755 --- a/src/transformers/models/mbart/modeling_mbart.py +++ b/src/transformers/models/mbart/modeling_mbart.py @@ -696,8 +696,14 @@ class MBartEncoder(MBartPreTrainedModel): self.layernorm_embedding = nn.LayerNorm(embed_dim) self.layer_norm = nn.LayerNorm(config.d_model) - self.init_weights() self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() + + def _backward_compatibility_gradient_checkpointing(self): + # Override to not delete the attribute from the config + if self.supports_gradient_checkpointing and getattr(self.config, "gradient_checkpointing", False): + self.gradient_checkpointing_enable() def forward( self, @@ -862,8 +868,9 @@ class MBartDecoder(MBartPreTrainedModel): self.layernorm_embedding = nn.LayerNorm(config.d_model) self.layer_norm = nn.LayerNorm(config.d_model) - self.init_weights() self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() def get_input_embeddings(self): return self.embed_tokens @@ -1123,7 +1130,8 @@ class MBartModel(MBartPreTrainedModel): self.encoder = MBartEncoder(config, self.shared) self.decoder = MBartDecoder(config, self.shared) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_input_embeddings(self): return self.shared @@ -1243,7 +1251,8 @@ class MBartForConditionalGeneration(MBartPreTrainedModel): self.register_buffer("final_logits_bias", torch.zeros((1, self.model.shared.num_embeddings))) self.lm_head = nn.Linear(config.d_model, self.model.shared.num_embeddings, bias=False) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_encoder(self): return self.model.get_encoder() @@ -1664,7 +1673,8 @@ class MBartForCausalLM(MBartPreTrainedModel): self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_input_embeddings(self): return self.model.decoder.embed_tokens diff --git a/src/transformers/models/megatron_bert/modeling_megatron_bert.py b/src/transformers/models/megatron_bert/modeling_megatron_bert.py index c482b1b639..12f026f63c 100755 --- a/src/transformers/models/megatron_bert/modeling_megatron_bert.py +++ b/src/transformers/models/megatron_bert/modeling_megatron_bert.py @@ -857,7 +857,8 @@ class MegatronBertModel(MegatronBertPreTrainedModel): self.pooler = MegatronBertPooler(config) if add_pooling_layer else None - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_input_embeddings(self): return self.embeddings.word_embeddings @@ -1018,7 +1019,8 @@ class MegatronBertForPreTraining(MegatronBertPreTrainedModel): self.bert = MegatronBertModel(config) self.cls = MegatronBertPreTrainingHeads(config) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_output_embeddings(self): return self.cls.predictions.decoder @@ -1127,7 +1129,8 @@ class MegatronBertForCausalLM(MegatronBertPreTrainedModel): self.bert = MegatronBertModel(config, add_pooling_layer=False) self.cls = MegatronBertOnlyMLMHead(config) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_output_embeddings(self): return self.cls.predictions.decoder @@ -1274,7 +1277,8 @@ class MegatronBertForMaskedLM(MegatronBertPreTrainedModel): self.bert = MegatronBertModel(config, add_pooling_layer=False) self.cls = MegatronBertOnlyMLMHead(config) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_output_embeddings(self): return self.cls.predictions.decoder @@ -1375,7 +1379,8 @@ class MegatronBertForNextSentencePrediction(MegatronBertPreTrainedModel): self.bert = MegatronBertModel(config) self.cls = MegatronBertOnlyNSPHead(config) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(MEGATRON_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=NextSentencePredictorOutput, config_class=_CONFIG_FOR_DOC) @@ -1478,7 +1483,8 @@ class MegatronBertForSequenceClassification(MegatronBertPreTrainedModel): self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, config.num_labels) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(MEGATRON_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( @@ -1574,7 +1580,8 @@ class MegatronBertForMultipleChoice(MegatronBertPreTrainedModel): self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, 1) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward( MEGATRON_BERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length") @@ -1671,7 +1678,8 @@ class MegatronBertForTokenClassification(MegatronBertPreTrainedModel): self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, config.num_labels) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(MEGATRON_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( @@ -1761,7 +1769,8 @@ class MegatronBertForQuestionAnswering(MegatronBertPreTrainedModel): self.bert = MegatronBertModel(config, add_pooling_layer=False) self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(MEGATRON_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( diff --git a/src/transformers/models/mobilebert/modeling_mobilebert.py b/src/transformers/models/mobilebert/modeling_mobilebert.py index 3c85af3b1b..28c01d5521 100644 --- a/src/transformers/models/mobilebert/modeling_mobilebert.py +++ b/src/transformers/models/mobilebert/modeling_mobilebert.py @@ -799,7 +799,8 @@ class MobileBertModel(MobileBertPreTrainedModel): self.pooler = MobileBertPooler(config) if add_pooling_layer else None - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_input_embeddings(self): return self.embeddings.word_embeddings @@ -907,7 +908,8 @@ class MobileBertForPreTraining(MobileBertPreTrainedModel): self.mobilebert = MobileBertModel(config) self.cls = MobileBertPreTrainingHeads(config) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_output_embeddings(self): return self.cls.predictions.decoder @@ -1015,7 +1017,8 @@ class MobileBertForMaskedLM(MobileBertPreTrainedModel): self.cls = MobileBertOnlyMLMHead(config) self.config = config - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_output_embeddings(self): return self.cls.predictions.decoder @@ -1111,7 +1114,8 @@ class MobileBertForNextSentencePrediction(MobileBertPreTrainedModel): self.mobilebert = MobileBertModel(config) self.cls = MobileBertOnlyNSPHead(config) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(MOBILEBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=NextSentencePredictorOutput, config_class=_CONFIG_FOR_DOC) @@ -1218,7 +1222,8 @@ class MobileBertForSequenceClassification(MobileBertPreTrainedModel): self.dropout = nn.Dropout(classifier_dropout) self.classifier = nn.Linear(config.hidden_size, config.num_labels) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(MOBILEBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( @@ -1318,7 +1323,8 @@ class MobileBertForQuestionAnswering(MobileBertPreTrainedModel): self.mobilebert = MobileBertModel(config, add_pooling_layer=False) self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(MOBILEBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( @@ -1421,7 +1427,8 @@ class MobileBertForMultipleChoice(MobileBertPreTrainedModel): self.dropout = nn.Dropout(classifier_dropout) self.classifier = nn.Linear(config.hidden_size, 1) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward( MOBILEBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length") @@ -1522,7 +1529,8 @@ class MobileBertForTokenClassification(MobileBertPreTrainedModel): self.dropout = nn.Dropout(classifier_dropout) self.classifier = nn.Linear(config.hidden_size, config.num_labels) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(MOBILEBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( diff --git a/src/transformers/models/mpnet/modeling_mpnet.py b/src/transformers/models/mpnet/modeling_mpnet.py index 52cf537ed2..70e2d09a93 100644 --- a/src/transformers/models/mpnet/modeling_mpnet.py +++ b/src/transformers/models/mpnet/modeling_mpnet.py @@ -493,7 +493,8 @@ class MPNetModel(MPNetPreTrainedModel): self.encoder = MPNetEncoder(config) self.pooler = MPNetPooler(config) if add_pooling_layer else None - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_input_embeddings(self): return self.embeddings.word_embeddings @@ -583,7 +584,8 @@ class MPNetForMaskedLM(MPNetPreTrainedModel): self.mpnet = MPNetModel(config, add_pooling_layer=False) self.lm_head = MPNetLMHead(config) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_output_embeddings(self): return self.lm_head.decoder @@ -691,7 +693,8 @@ class MPNetForSequenceClassification(MPNetPreTrainedModel): self.mpnet = MPNetModel(config, add_pooling_layer=False) self.classifier = MPNetClassificationHead(config) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(MPNET_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( @@ -785,7 +788,8 @@ class MPNetForMultipleChoice(MPNetPreTrainedModel): self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, 1) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(MPNET_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")) @add_code_sample_docstrings( @@ -877,7 +881,8 @@ class MPNetForTokenClassification(MPNetPreTrainedModel): self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, config.num_labels) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(MPNET_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( @@ -985,7 +990,8 @@ class MPNetForQuestionAnswering(MPNetPreTrainedModel): self.mpnet = MPNetModel(config, add_pooling_layer=False) self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(MPNET_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( diff --git a/src/transformers/models/openai/modeling_openai.py b/src/transformers/models/openai/modeling_openai.py index 58dd1b055d..782812b7e7 100644 --- a/src/transformers/models/openai/modeling_openai.py +++ b/src/transformers/models/openai/modeling_openai.py @@ -414,7 +414,8 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel): self.h = nn.ModuleList([Block(config.n_positions, config, scale=True) for _ in range(config.n_layer)]) self.register_buffer("position_ids", torch.arange(config.n_positions)) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_input_embeddings(self): return self.tokens_embed @@ -540,7 +541,8 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel): self.transformer = OpenAIGPTModel(config) self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_output_embeddings(self): return self.lm_head @@ -629,7 +631,8 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel): self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False) self.multiple_choice_head = SequenceSummary(config) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_output_embeddings(self): return self.lm_head @@ -750,7 +753,8 @@ class OpenAIGPTForSequenceClassification(OpenAIGPTPreTrainedModel): self.transformer = OpenAIGPTModel(config) self.score = nn.Linear(config.n_embd, self.num_labels, bias=False) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(OPENAI_GPT_INPUTS_DOCSTRING) @add_code_sample_docstrings( diff --git a/src/transformers/models/pegasus/modeling_pegasus.py b/src/transformers/models/pegasus/modeling_pegasus.py index 33a1ca14cb..4929c10141 100755 --- a/src/transformers/models/pegasus/modeling_pegasus.py +++ b/src/transformers/models/pegasus/modeling_pegasus.py @@ -658,8 +658,9 @@ class PegasusEncoder(PegasusPreTrainedModel): self.layers = nn.ModuleList([PegasusEncoderLayer(config) for _ in range(config.encoder_layers)]) self.layer_norm = nn.LayerNorm(config.d_model) - self.init_weights() self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() def resize_position_embeddings(self, new_num_position_embeddings: int): """ @@ -853,8 +854,9 @@ class PegasusDecoder(PegasusPreTrainedModel): self.layers = nn.ModuleList([PegasusDecoderLayer(config) for _ in range(config.decoder_layers)]) self.layer_norm = nn.LayerNorm(config.d_model) - self.init_weights() self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() def get_input_embeddings(self): return self.embed_tokens @@ -1142,7 +1144,8 @@ class PegasusModel(PegasusPreTrainedModel): self.encoder = PegasusEncoder(config, self.shared) self.decoder = PegasusDecoder(config, self.shared) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_input_embeddings(self): return self.shared @@ -1293,7 +1296,8 @@ class PegasusForConditionalGeneration(PegasusPreTrainedModel): self.register_buffer("final_logits_bias", torch.zeros((1, self.model.shared.num_embeddings))) self.lm_head = nn.Linear(config.d_model, self.model.shared.num_embeddings, bias=False) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_encoder(self): return self.model.get_encoder() @@ -1490,7 +1494,8 @@ class PegasusForCausalLM(PegasusPreTrainedModel): self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_input_embeddings(self): return self.model.decoder.embed_tokens diff --git a/src/transformers/models/prophetnet/modeling_prophetnet.py b/src/transformers/models/prophetnet/modeling_prophetnet.py index 9f72a35f0d..a3e89aa69f 100644 --- a/src/transformers/models/prophetnet/modeling_prophetnet.py +++ b/src/transformers/models/prophetnet/modeling_prophetnet.py @@ -1266,8 +1266,9 @@ class ProphetNetEncoder(ProphetNetPreTrainedModel): self.layers = nn.ModuleList([ProphetNetEncoderLayer(config) for _ in range(config.num_encoder_layers)]) - self.init_weights() self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() def get_input_embeddings(self): return self.word_embeddings @@ -1411,8 +1412,9 @@ class ProphetNetDecoder(ProphetNetPreTrainedModel): self.layers = nn.ModuleList([ProphetNetDecoderLayer(config) for _ in range(config.num_decoder_layers)]) self.embeddings_layer_norm = LayerNorm(config.hidden_size) - self.init_weights() self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() def get_input_embeddings(self): return self.word_embeddings @@ -1765,7 +1767,8 @@ class ProphetNetModel(ProphetNetPreTrainedModel): decoder_config.is_encoder_decoder = False self.decoder = ProphetNetDecoder(decoder_config, self.word_embeddings) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_input_embeddings(self): return self.word_embeddings @@ -1882,7 +1885,8 @@ class ProphetNetForConditionalGeneration(ProphetNetPreTrainedModel): self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_output_embeddings(self): return self.lm_head @@ -2092,7 +2096,8 @@ class ProphetNetForCausalLM(ProphetNetPreTrainedModel): self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_input_embeddings(self): return self.prophetnet.decoder.word_embeddings diff --git a/src/transformers/models/reformer/modeling_reformer.py b/src/transformers/models/reformer/modeling_reformer.py index 528875b4aa..eae0b30092 100755 --- a/src/transformers/models/reformer/modeling_reformer.py +++ b/src/transformers/models/reformer/modeling_reformer.py @@ -1974,7 +1974,8 @@ class ReformerModel(ReformerPreTrainedModel): self.embeddings = ReformerEmbeddings(config) self.encoder = ReformerEncoder(config) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_input_embeddings(self): return self.embeddings.word_embeddings @@ -2188,7 +2189,8 @@ class ReformerModelWithLMHead(ReformerPreTrainedModel): self.reformer = ReformerModel(config) self.lm_head = ReformerOnlyLMHead(config) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_output_embeddings(self): return self.lm_head.decoder @@ -2303,7 +2305,8 @@ class ReformerForMaskedLM(ReformerPreTrainedModel): self.reformer = ReformerModel(config) self.lm_head = ReformerOnlyLMHead(config) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_output_embeddings(self): return self.lm_head.decoder @@ -2390,7 +2393,8 @@ class ReformerForSequenceClassification(ReformerPreTrainedModel): if config.is_decoder is True: logger.warning("You might want to disable causal masking for sequence classification") - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(REFORMER_INPUTS_DOCSTRING) @add_code_sample_docstrings( @@ -2508,7 +2512,8 @@ class ReformerForQuestionAnswering(ReformerPreTrainedModel): # 2 * config.hidden_size because we use reversible residual layers self.qa_outputs = nn.Linear(2 * config.hidden_size, config.num_labels) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(REFORMER_INPUTS_DOCSTRING) @add_code_sample_docstrings( diff --git a/src/transformers/models/rembert/modeling_rembert.py b/src/transformers/models/rembert/modeling_rembert.py index 3ebbde7fa7..bc5569e553 100755 --- a/src/transformers/models/rembert/modeling_rembert.py +++ b/src/transformers/models/rembert/modeling_rembert.py @@ -765,7 +765,8 @@ class RemBertModel(RemBertPreTrainedModel): self.pooler = RemBertPooler(config) if add_pooling_layer else None - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_input_embeddings(self): return self.embeddings.word_embeddings @@ -925,7 +926,8 @@ class RemBertForMaskedLM(RemBertPreTrainedModel): self.rembert = RemBertModel(config, add_pooling_layer=False) self.cls = RemBertOnlyMLMHead(config) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_output_embeddings(self): return self.cls.predictions.decoder @@ -1027,7 +1029,8 @@ class RemBertForCausalLM(RemBertPreTrainedModel): self.rembert = RemBertModel(config, add_pooling_layer=False) self.cls = RemBertOnlyMLMHead(config) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_output_embeddings(self): return self.cls.predictions.decoder @@ -1173,7 +1176,8 @@ class RemBertForSequenceClassification(RemBertPreTrainedModel): self.dropout = nn.Dropout(config.classifier_dropout_prob) self.classifier = nn.Linear(config.hidden_size, config.num_labels) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(REMBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( @@ -1269,7 +1273,8 @@ class RemBertForMultipleChoice(RemBertPreTrainedModel): self.dropout = nn.Dropout(config.classifier_dropout_prob) self.classifier = nn.Linear(config.hidden_size, 1) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(REMBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")) @add_code_sample_docstrings( @@ -1361,7 +1366,8 @@ class RemBertForTokenClassification(RemBertPreTrainedModel): self.dropout = nn.Dropout(config.classifier_dropout_prob) self.classifier = nn.Linear(config.hidden_size, config.num_labels) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(REMBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( @@ -1449,7 +1455,8 @@ class RemBertForQuestionAnswering(RemBertPreTrainedModel): self.rembert = RemBertModel(config, add_pooling_layer=False) self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(REMBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( diff --git a/src/transformers/models/retribert/modeling_retribert.py b/src/transformers/models/retribert/modeling_retribert.py index 08f56e13ee..2456545a22 100644 --- a/src/transformers/models/retribert/modeling_retribert.py +++ b/src/transformers/models/retribert/modeling_retribert.py @@ -99,7 +99,8 @@ class RetriBertModel(RetriBertPreTrainedModel): self.ce_loss = nn.CrossEntropyLoss(reduction="mean") - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def embed_sentences_checkpointed( self, diff --git a/src/transformers/models/roberta/modeling_roberta.py b/src/transformers/models/roberta/modeling_roberta.py index 917bb03f80..5f85a81fe8 100644 --- a/src/transformers/models/roberta/modeling_roberta.py +++ b/src/transformers/models/roberta/modeling_roberta.py @@ -723,7 +723,8 @@ class RobertaModel(RobertaPreTrainedModel): self.pooler = RobertaPooler(config) if add_pooling_layer else None - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_input_embeddings(self): return self.embeddings.word_embeddings @@ -897,7 +898,8 @@ class RobertaForCausalLM(RobertaPreTrainedModel): # The LM head weights require special treatment only when they are tied with the word embeddings self.update_keys_to_ignore(config, ["lm_head.decoder.weight"]) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_output_embeddings(self): return self.lm_head.decoder @@ -1050,7 +1052,8 @@ class RobertaForMaskedLM(RobertaPreTrainedModel): # The LM head weights require special treatment only when they are tied with the word embeddings self.update_keys_to_ignore(config, ["lm_head.decoder.weight"]) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_output_embeddings(self): return self.lm_head.decoder @@ -1169,7 +1172,8 @@ class RobertaForSequenceClassification(RobertaPreTrainedModel): self.roberta = RobertaModel(config, add_pooling_layer=False) self.classifier = RobertaClassificationHead(config) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( @@ -1265,7 +1269,8 @@ class RobertaForMultipleChoice(RobertaPreTrainedModel): self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, 1) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")) @add_code_sample_docstrings( @@ -1362,7 +1367,8 @@ class RobertaForTokenClassification(RobertaPreTrainedModel): self.dropout = nn.Dropout(classifier_dropout) self.classifier = nn.Linear(config.hidden_size, config.num_labels) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( @@ -1474,7 +1480,8 @@ class RobertaForQuestionAnswering(RobertaPreTrainedModel): self.roberta = RobertaModel(config, add_pooling_layer=False) self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( diff --git a/src/transformers/models/roformer/modeling_roformer.py b/src/transformers/models/roformer/modeling_roformer.py index 75f690390e..14e74a24f8 100644 --- a/src/transformers/models/roformer/modeling_roformer.py +++ b/src/transformers/models/roformer/modeling_roformer.py @@ -817,7 +817,8 @@ class RoFormerModel(RoFormerPreTrainedModel): self.encoder = RoFormerEncoder(config) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_input_embeddings(self): return self.embeddings.word_embeddings @@ -973,7 +974,8 @@ class RoFormerForMaskedLM(RoFormerPreTrainedModel): self.roformer = RoFormerModel(config) self.cls = RoFormerOnlyMLMHead(config) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_output_embeddings(self): return self.cls.predictions.decoder @@ -1073,7 +1075,8 @@ class RoFormerForCausalLM(RoFormerPreTrainedModel): self.roformer = RoFormerModel(config) self.cls = RoFormerOnlyMLMHead(config) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_output_embeddings(self): return self.cls.predictions.decoder @@ -1238,7 +1241,8 @@ class RoFormerForSequenceClassification(RoFormerPreTrainedModel): self.roformer = RoFormerModel(config) self.classifier = RoFormerClassificationHead(config) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(ROFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( @@ -1330,7 +1334,8 @@ class RoFormerForMultipleChoice(RoFormerPreTrainedModel): self.sequence_summary = SequenceSummary(config) self.classifier = nn.Linear(config.hidden_size, 1) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward( ROFORMER_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length") @@ -1422,7 +1427,8 @@ class RoFormerForTokenClassification(RoFormerPreTrainedModel): self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, config.num_labels) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(ROFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( @@ -1509,7 +1515,8 @@ class RoFormerForQuestionAnswering(RoFormerPreTrainedModel): self.roformer = RoFormerModel(config) self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(ROFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( diff --git a/src/transformers/models/segformer/modeling_segformer.py b/src/transformers/models/segformer/modeling_segformer.py index 52486ef377..e365febccb 100755 --- a/src/transformers/models/segformer/modeling_segformer.py +++ b/src/transformers/models/segformer/modeling_segformer.py @@ -467,7 +467,8 @@ class SegformerModel(SegformerPreTrainedModel): # hierarchical Transformer encoder self.encoder = SegformerEncoder(config) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def _prune_heads(self, heads_to_prune): """ @@ -541,7 +542,8 @@ class SegformerForImageClassification(SegformerPreTrainedModel): # Classifier head self.classifier = nn.Linear(config.hidden_sizes[-1], config.num_labels) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(SEGFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=SequenceClassifierOutput, config_class=_CONFIG_FOR_DOC) @@ -696,7 +698,8 @@ class SegformerForSemanticSegmentation(SegformerPreTrainedModel): self.segformer = SegformerModel(config) self.decode_head = SegformerDecodeHead(config) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(SEGFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=SequenceClassifierOutput, config_class=_CONFIG_FOR_DOC) diff --git a/src/transformers/models/sew/modeling_sew.py b/src/transformers/models/sew/modeling_sew.py index 55d6bf7cae..fd4cf4bf4d 100644 --- a/src/transformers/models/sew/modeling_sew.py +++ b/src/transformers/models/sew/modeling_sew.py @@ -798,7 +798,8 @@ class SEWModel(SEWPreTrainedModel): self.encoder = SEWEncoder(config) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2Model._mask_hidden_states def _mask_hidden_states( @@ -924,7 +925,8 @@ class SEWForCTC(SEWPreTrainedModel): ) self.lm_head = nn.Linear(config.hidden_size, config.vocab_size) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def freeze_feature_extractor(self): """ @@ -1032,7 +1034,8 @@ class SEWForSequenceClassification(SEWPreTrainedModel): self.projector = nn.Linear(config.hidden_size, config.classifier_proj_size) self.classifier = nn.Linear(config.classifier_proj_size, config.num_labels) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def freeze_feature_extractor(self): """ diff --git a/src/transformers/models/sew_d/modeling_sew_d.py b/src/transformers/models/sew_d/modeling_sew_d.py index facb85f07b..53f9862b47 100644 --- a/src/transformers/models/sew_d/modeling_sew_d.py +++ b/src/transformers/models/sew_d/modeling_sew_d.py @@ -1329,7 +1329,8 @@ class SEWDModel(SEWDPreTrainedModel): self.encoder = SEWDEncoder(config) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2Model._mask_hidden_states def _mask_hidden_states( @@ -1455,7 +1456,8 @@ class SEWDForCTC(SEWDPreTrainedModel): ) self.lm_head = nn.Linear(config.hidden_size, config.vocab_size) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def freeze_feature_extractor(self): """ @@ -1563,7 +1565,8 @@ class SEWDForSequenceClassification(SEWDPreTrainedModel): self.projector = nn.Linear(config.hidden_size, config.classifier_proj_size) self.classifier = nn.Linear(config.classifier_proj_size, config.num_labels) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def freeze_feature_extractor(self): """ diff --git a/src/transformers/models/speech_to_text/modeling_speech_to_text.py b/src/transformers/models/speech_to_text/modeling_speech_to_text.py index e631e75731..aead484a59 100755 --- a/src/transformers/models/speech_to_text/modeling_speech_to_text.py +++ b/src/transformers/models/speech_to_text/modeling_speech_to_text.py @@ -723,8 +723,9 @@ class Speech2TextEncoder(Speech2TextPreTrainedModel): self.layers = nn.ModuleList([Speech2TextEncoderLayer(config) for _ in range(config.encoder_layers)]) self.layer_norm = nn.LayerNorm(config.d_model) - self.init_weights() self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() def forward( self, @@ -876,8 +877,9 @@ class Speech2TextDecoder(Speech2TextPreTrainedModel): self.layer_norm = nn.LayerNorm(config.d_model) - self.init_weights() self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() def get_input_embeddings(self): return self.embed_tokens @@ -1130,7 +1132,8 @@ class Speech2TextModel(Speech2TextPreTrainedModel): self.encoder = Speech2TextEncoder(config) self.decoder = Speech2TextDecoder(config) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_input_embeddings(self): return self.decoder.embed_tokens @@ -1253,7 +1256,8 @@ class Speech2TextForConditionalGeneration(Speech2TextPreTrainedModel): self.model = Speech2TextModel(config) self.lm_head = nn.Linear(config.d_model, self.config.vocab_size, bias=False) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_encoder(self): return self.model.get_encoder() diff --git a/src/transformers/models/speech_to_text_2/modeling_speech_to_text_2.py b/src/transformers/models/speech_to_text_2/modeling_speech_to_text_2.py index 306cacd48f..a6b4e5b542 100755 --- a/src/transformers/models/speech_to_text_2/modeling_speech_to_text_2.py +++ b/src/transformers/models/speech_to_text_2/modeling_speech_to_text_2.py @@ -476,8 +476,9 @@ class Speech2Text2Decoder(Speech2Text2PreTrainedModel): self.layers = nn.ModuleList([Speech2Text2DecoderLayer(config) for _ in range(config.decoder_layers)]) - self.init_weights() self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() def get_input_embeddings(self): return self.embed_tokens @@ -751,7 +752,8 @@ class Speech2Text2ForCausalLM(Speech2Text2PreTrainedModel): self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_input_embeddings(self): return self.model.decoder.embed_tokens diff --git a/src/transformers/models/splinter/modeling_splinter.py b/src/transformers/models/splinter/modeling_splinter.py index 812a5f2070..19dab0457d 100755 --- a/src/transformers/models/splinter/modeling_splinter.py +++ b/src/transformers/models/splinter/modeling_splinter.py @@ -619,7 +619,8 @@ class SplinterModel(SplinterPreTrainedModel): self.embeddings = SplinterEmbeddings(config) self.encoder = SplinterEncoder(config) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_input_embeddings(self): return self.embeddings.word_embeddings @@ -834,7 +835,8 @@ class SplinterForQuestionAnswering(SplinterPreTrainedModel): self.splinter_qass = QuestionAwareSpanSelectionHead(config) self.question_token_id = config.question_token_id - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(SPLINTER_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( diff --git a/src/transformers/models/squeezebert/modeling_squeezebert.py b/src/transformers/models/squeezebert/modeling_squeezebert.py index 6ec972f06c..ba12c2341d 100644 --- a/src/transformers/models/squeezebert/modeling_squeezebert.py +++ b/src/transformers/models/squeezebert/modeling_squeezebert.py @@ -553,7 +553,8 @@ class SqueezeBertModel(SqueezeBertPreTrainedModel): self.encoder = SqueezeBertEncoder(config) self.pooler = SqueezeBertPooler(config) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_input_embeddings(self): return self.embeddings.word_embeddings @@ -654,7 +655,8 @@ class SqueezeBertForMaskedLM(SqueezeBertPreTrainedModel): self.transformer = SqueezeBertModel(config) self.cls = SqueezeBertOnlyMLMHead(config) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_output_embeddings(self): return self.cls.predictions.decoder @@ -739,7 +741,8 @@ class SqueezeBertForSequenceClassification(SqueezeBertPreTrainedModel): self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, self.config.num_labels) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(SQUEEZEBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( @@ -836,7 +839,8 @@ class SqueezeBertForMultipleChoice(SqueezeBertPreTrainedModel): self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, 1) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward( SQUEEZEBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length") @@ -930,7 +934,8 @@ class SqueezeBertForTokenClassification(SqueezeBertPreTrainedModel): self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, config.num_labels) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(SQUEEZEBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( @@ -1017,7 +1022,8 @@ class SqueezeBertForQuestionAnswering(SqueezeBertPreTrainedModel): self.transformer = SqueezeBertModel(config) self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(SQUEEZEBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( diff --git a/src/transformers/models/t5/modeling_t5.py b/src/transformers/models/t5/modeling_t5.py index a9f69c91f5..78ccd07236 100644 --- a/src/transformers/models/t5/modeling_t5.py +++ b/src/transformers/models/t5/modeling_t5.py @@ -814,7 +814,8 @@ class T5Stack(T5PreTrainedModel): self.final_layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) self.dropout = nn.Dropout(config.dropout_rate) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() # Model parallel self.model_parallel = False self.device_map = None @@ -1267,7 +1268,8 @@ class T5Model(T5PreTrainedModel): decoder_config.num_layers = config.num_decoder_layers self.decoder = T5Stack(decoder_config, self.shared) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() # Model parallel self.model_parallel = False @@ -1457,7 +1459,8 @@ class T5ForConditionalGeneration(T5PreTrainedModel): self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() # Model parallel self.model_parallel = False @@ -1731,7 +1734,8 @@ class T5EncoderModel(T5PreTrainedModel): encoder_config.is_encoder_decoder = False self.encoder = T5Stack(encoder_config, self.shared) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() # Model parallel self.model_parallel = False diff --git a/src/transformers/models/tapas/modeling_tapas.py b/src/transformers/models/tapas/modeling_tapas.py index e301a2eca5..7ff9081fab 100644 --- a/src/transformers/models/tapas/modeling_tapas.py +++ b/src/transformers/models/tapas/modeling_tapas.py @@ -877,7 +877,8 @@ class TapasModel(TapasPreTrainedModel): self.pooler = TapasPooler(config) if add_pooling_layer else None - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_input_embeddings(self): return self.embeddings.word_embeddings @@ -1016,7 +1017,8 @@ class TapasForMaskedLM(TapasPreTrainedModel): self.tapas = TapasModel(config, add_pooling_layer=False) self.cls = TapasOnlyMLMHead(config) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_output_embeddings(self): return self.cls.predictions.decoder @@ -1146,7 +1148,8 @@ class TapasForQuestionAnswering(TapasPreTrainedModel): if config.num_aggregation_labels > 0: self.aggregation_classifier = nn.Linear(config.hidden_size, config.num_aggregation_labels) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(TAPAS_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=TableQuestionAnsweringOutput, config_class=_CONFIG_FOR_DOC) @@ -1464,7 +1467,8 @@ class TapasForSequenceClassification(TapasPreTrainedModel): self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, config.num_labels) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(TAPAS_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=SequenceClassifierOutput, config_class=_CONFIG_FOR_DOC) diff --git a/src/transformers/models/transfo_xl/modeling_transfo_xl.py b/src/transformers/models/transfo_xl/modeling_transfo_xl.py index e18a5b50f6..fda73520c9 100644 --- a/src/transformers/models/transfo_xl/modeling_transfo_xl.py +++ b/src/transformers/models/transfo_xl/modeling_transfo_xl.py @@ -819,7 +819,8 @@ class TransfoXLModel(TransfoXLPreTrainedModel): else: # learnable embeddings and absolute embeddings raise NotImplementedError # Removed these to avoid maintaining dead code - They are not used in our pretrained checkpoint - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_input_embeddings(self): return self.word_emb @@ -1021,7 +1022,8 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel): config.vocab_size, config.d_embed, config.d_model, config.cutoffs, div_val=config.div_val ) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def tie_weights(self): """ @@ -1170,7 +1172,8 @@ class TransfoXLForSequenceClassification(TransfoXLPreTrainedModel): self.num_labels = config.num_labels self.transformer = TransfoXLModel(config) self.score = nn.Linear(config.d_embed, self.num_labels, bias=False) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(TRANSFO_XL_INPUTS_DOCSTRING) @add_code_sample_docstrings( diff --git a/src/transformers/models/trocr/modeling_trocr.py b/src/transformers/models/trocr/modeling_trocr.py index 87502901ea..5b8943a26a 100644 --- a/src/transformers/models/trocr/modeling_trocr.py +++ b/src/transformers/models/trocr/modeling_trocr.py @@ -503,8 +503,9 @@ class TrOCRDecoder(TrOCRPreTrainedModel): self.layers = nn.ModuleList([TrOCRDecoderLayer(config) for _ in range(config.decoder_layers)]) - self.init_weights() self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() def get_input_embeddings(self): return self.embed_tokens @@ -784,7 +785,8 @@ class TrOCRForCausalLM(TrOCRPreTrainedModel): self.output_projection = nn.Linear(config.hidden_size, config.vocab_size, bias=False) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_input_embeddings(self): return self.model.decoder.embed_tokens diff --git a/src/transformers/models/unispeech/modeling_unispeech.py b/src/transformers/models/unispeech/modeling_unispeech.py index 3f700ee153..cd4ff01081 100755 --- a/src/transformers/models/unispeech/modeling_unispeech.py +++ b/src/transformers/models/unispeech/modeling_unispeech.py @@ -1045,7 +1045,8 @@ class UniSpeechModel(UniSpeechPreTrainedModel): else: self.encoder = UniSpeechEncoder(config) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2Model._mask_hidden_states def _mask_hidden_states( @@ -1165,7 +1166,8 @@ class UniSpeechForPreTraining(UniSpeechPreTrainedModel): self.ctc_proj = nn.Linear(config.hidden_size, config.num_ctc_classes) self.dropout = nn.Dropout(config.final_dropout) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def set_gumbel_temperature(self, temperature: int): """ @@ -1337,7 +1339,8 @@ class UniSpeechForCTC(UniSpeechPreTrainedModel): ) self.lm_head = nn.Linear(config.hidden_size, config.vocab_size) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def freeze_feature_extractor(self): """ @@ -1445,7 +1448,8 @@ class UniSpeechForSequenceClassification(UniSpeechPreTrainedModel): self.projector = nn.Linear(config.hidden_size, config.classifier_proj_size) self.classifier = nn.Linear(config.classifier_proj_size, config.num_labels) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def freeze_feature_extractor(self): """ diff --git a/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py b/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py index ae28492064..c69faafc43 100755 --- a/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py +++ b/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py @@ -1046,7 +1046,8 @@ class UniSpeechSatModel(UniSpeechSatPreTrainedModel): else: self.encoder = UniSpeechSatEncoder(config) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2Model._mask_hidden_states def _mask_hidden_states( @@ -1171,7 +1172,8 @@ class UniSpeechSatForPreTraining(UniSpeechSatPreTrainedModel): if self.config.do_stable_layer_norm: self.layer_norm_for_extract.requires_grad = False - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def set_gumbel_temperature(self, temperature: int): """ @@ -1328,7 +1330,8 @@ class UniSpeechSatForCTC(UniSpeechSatPreTrainedModel): ) self.lm_head = nn.Linear(config.hidden_size, config.vocab_size) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def freeze_feature_extractor(self): """ @@ -1436,7 +1439,8 @@ class UniSpeechSatForSequenceClassification(UniSpeechSatPreTrainedModel): self.projector = nn.Linear(config.hidden_size, config.classifier_proj_size) self.classifier = nn.Linear(config.classifier_proj_size, config.num_labels) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def freeze_feature_extractor(self): """ diff --git a/src/transformers/models/visual_bert/modeling_visual_bert.py b/src/transformers/models/visual_bert/modeling_visual_bert.py index 6d8d51b4ab..eabca9ad4c 100755 --- a/src/transformers/models/visual_bert/modeling_visual_bert.py +++ b/src/transformers/models/visual_bert/modeling_visual_bert.py @@ -701,7 +701,8 @@ class VisualBertModel(VisualBertPreTrainedModel): if self.bypass_transformer: self.additional_layer = VisualBertLayer(config) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_input_embeddings(self): return self.embeddings.word_embeddings @@ -877,7 +878,8 @@ class VisualBertForPreTraining(VisualBertPreTrainedModel): self.visual_bert = VisualBertModel(config) self.cls = VisualBertPreTrainingHeads(config) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_output_embeddings(self): return self.cls.predictions.decoder @@ -1021,7 +1023,8 @@ class VisualBertForMultipleChoice(VisualBertPreTrainedModel): self.dropout = nn.Dropout(config.hidden_dropout_prob) self.cls = nn.Linear(config.hidden_size, 1) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward( VISUAL_BERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length") @@ -1170,7 +1173,8 @@ class VisualBertForQuestionAnswering(VisualBertPreTrainedModel): self.dropout = nn.Dropout(config.hidden_dropout_prob) self.cls = nn.Linear(config.hidden_size, config.num_labels) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(VISUAL_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=SequenceClassifierOutput, config_class=_CONFIG_FOR_DOC) @@ -1292,7 +1296,8 @@ class VisualBertForVisualReasoning(VisualBertPreTrainedModel): self.dropout = nn.Dropout(config.hidden_dropout_prob) self.cls = nn.Linear(config.hidden_size, config.num_labels) # 2 - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(VISUAL_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=SequenceClassifierOutput, config_class=_CONFIG_FOR_DOC) @@ -1448,7 +1453,8 @@ class VisualBertForRegionToPhraseAlignment(VisualBertPreTrainedModel): self.cls = VisualBertPreTrainingHeads(config) self.attention = VisualBertRegionToPhraseAttention(config) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(VISUAL_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=SequenceClassifierOutput, config_class=_CONFIG_FOR_DOC) diff --git a/src/transformers/models/vit/modeling_vit.py b/src/transformers/models/vit/modeling_vit.py index fda10a1ece..b1bc303124 100644 --- a/src/transformers/models/vit/modeling_vit.py +++ b/src/transformers/models/vit/modeling_vit.py @@ -487,7 +487,8 @@ class ViTModel(ViTPreTrainedModel): self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.pooler = ViTPooler(config) if add_pooling_layer else None - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_input_embeddings(self): return self.embeddings.patch_embeddings @@ -603,7 +604,8 @@ class ViTForImageClassification(ViTPreTrainedModel): # Classifier head self.classifier = nn.Linear(config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity() - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(VIT_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=SequenceClassifierOutput, config_class=_CONFIG_FOR_DOC) diff --git a/src/transformers/models/wav2vec2/modeling_wav2vec2.py b/src/transformers/models/wav2vec2/modeling_wav2vec2.py index 0bb456620b..00eec6933b 100755 --- a/src/transformers/models/wav2vec2/modeling_wav2vec2.py +++ b/src/transformers/models/wav2vec2/modeling_wav2vec2.py @@ -1152,7 +1152,8 @@ class Wav2Vec2Model(Wav2Vec2PreTrainedModel): self.adapter = Wav2Vec2Adapter(config) if config.add_adapter else None - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def _mask_hidden_states( self, @@ -1269,7 +1270,8 @@ class Wav2Vec2ForPreTraining(Wav2Vec2PreTrainedModel): self.quantizer = Wav2Vec2GumbelVectorQuantizer(config) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() # make sure that project_hid & project_q are initialized like normal linear layers self.project_hid = nn.Linear(config.hidden_size, config.proj_codevector_dim) @@ -1480,7 +1482,8 @@ class Wav2Vec2ForMaskedLM(Wav2Vec2PreTrainedModel): self.dropout = nn.Dropout(config.final_dropout) self.lm_head = nn.Linear(config.hidden_size, config.vocab_size) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(WAV_2_VEC_2_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=Wav2Vec2BaseModelOutput, config_class=_CONFIG_FOR_DOC) @@ -1563,7 +1566,8 @@ class Wav2Vec2ForCTC(Wav2Vec2PreTrainedModel): ) self.lm_head = nn.Linear(config.hidden_size, config.vocab_size) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def freeze_feature_extractor(self): """ @@ -1670,7 +1674,8 @@ class Wav2Vec2ForSequenceClassification(Wav2Vec2PreTrainedModel): self.projector = nn.Linear(config.hidden_size, config.classifier_proj_size) self.classifier = nn.Linear(config.classifier_proj_size, config.num_labels) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def freeze_feature_extractor(self): """ diff --git a/src/transformers/models/xlm/modeling_xlm.py b/src/transformers/models/xlm/modeling_xlm.py index 4d4b8c0c8d..c3219952c2 100755 --- a/src/transformers/models/xlm/modeling_xlm.py +++ b/src/transformers/models/xlm/modeling_xlm.py @@ -469,7 +469,8 @@ class XLMModel(XLMPreTrainedModel): if self.attentions[int(layer)].n_heads == config.n_heads: self.prune_heads({int(layer): list(map(int, heads))}) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) def get_input_embeddings(self): @@ -687,7 +688,8 @@ class XLMWithLMHeadModel(XLMPreTrainedModel): self.transformer = XLMModel(config) self.pred_layer = XLMPredLayer(config) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_output_embeddings(self): return self.pred_layer.proj @@ -785,7 +787,8 @@ class XLMForSequenceClassification(XLMPreTrainedModel): self.transformer = XLMModel(config) self.sequence_summary = SequenceSummary(config) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(XLM_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( @@ -885,7 +888,8 @@ class XLMForQuestionAnsweringSimple(XLMPreTrainedModel): self.transformer = XLMModel(config) self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(XLM_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( @@ -989,7 +993,8 @@ class XLMForQuestionAnswering(XLMPreTrainedModel): self.transformer = XLMModel(config) self.qa_outputs = SQuADHead(config) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(XLM_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=XLMForQuestionAnsweringOutput, config_class=_CONFIG_FOR_DOC) @@ -1108,7 +1113,8 @@ class XLMForTokenClassification(XLMPreTrainedModel): self.dropout = nn.Dropout(config.dropout) self.classifier = nn.Linear(config.hidden_size, config.num_labels) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(XLM_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( @@ -1201,7 +1207,8 @@ class XLMForMultipleChoice(XLMPreTrainedModel): self.sequence_summary = SequenceSummary(config) self.logits_proj = nn.Linear(config.num_labels, 1) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(XLM_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")) @add_code_sample_docstrings( diff --git a/src/transformers/models/xlnet/modeling_xlnet.py b/src/transformers/models/xlnet/modeling_xlnet.py index 70c37ad84f..10aadbdfba 100755 --- a/src/transformers/models/xlnet/modeling_xlnet.py +++ b/src/transformers/models/xlnet/modeling_xlnet.py @@ -955,7 +955,8 @@ class XLNetModel(XLNetPreTrainedModel): self.layer = nn.ModuleList([XLNetLayer(config) for _ in range(config.n_layer)]) self.dropout = nn.Dropout(config.dropout) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_input_embeddings(self): return self.word_embedding @@ -1311,7 +1312,8 @@ class XLNetLMHeadModel(XLNetPreTrainedModel): self.transformer = XLNetModel(config) self.lm_loss = nn.Linear(config.d_model, config.vocab_size, bias=True) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_output_embeddings(self): return self.lm_loss @@ -1493,7 +1495,8 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel): self.sequence_summary = SequenceSummary(config) self.logits_proj = nn.Linear(config.d_model, config.num_labels) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(XLNET_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( @@ -1600,7 +1603,8 @@ class XLNetForTokenClassification(XLNetPreTrainedModel): self.transformer = XLNetModel(config) self.classifier = nn.Linear(config.hidden_size, config.num_labels) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(XLNET_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( @@ -1697,7 +1701,8 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel): self.sequence_summary = SequenceSummary(config) self.logits_proj = nn.Linear(config.d_model, 1) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(XLNET_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")) @add_code_sample_docstrings( @@ -1800,7 +1805,8 @@ class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel): self.transformer = XLNetModel(config) self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(XLNET_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( @@ -1913,7 +1919,8 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel): self.end_logits = PoolerEndLogits(config) self.answer_class = PoolerAnswerClass(config) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward(XLNET_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=XLNetForQuestionAnsweringOutput, config_class=_CONFIG_FOR_DOC) diff --git a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_{{cookiecutter.lowercase_modelname}}.py b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_{{cookiecutter.lowercase_modelname}}.py index 10e7bc5995..7d0afd2d9c 100755 --- a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_{{cookiecutter.lowercase_modelname}}.py +++ b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_{{cookiecutter.lowercase_modelname}}.py @@ -777,7 +777,8 @@ class {{cookiecutter.camelcase_modelname}}Model({{cookiecutter.camelcase_modelna self.embeddings = {{cookiecutter.camelcase_modelname}}Embeddings(config) self.encoder = {{cookiecutter.camelcase_modelname}}Encoder(config) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_input_embeddings(self): return self.embeddings.word_embeddings @@ -943,7 +944,8 @@ class {{cookiecutter.camelcase_modelname}}ForMaskedLM({{cookiecutter.camelcase_m self.{{cookiecutter.lowercase_modelname}} = {{cookiecutter.camelcase_modelname}}Model(config) self.cls = {{cookiecutter.camelcase_modelname}}OnlyMLMHead(config) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_output_embeddings(self): return self.cls.predictions.decoder @@ -1046,7 +1048,8 @@ class {{cookiecutter.camelcase_modelname}}ForCausalLM({{cookiecutter.camelcase_m self.{{cookiecutter.lowercase_modelname}} = {{cookiecutter.camelcase_modelname}}Model(config) self.cls = {{cookiecutter.camelcase_modelname}}OnlyMLMHead(config) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_output_embeddings(self): return self.cls.predictions.decoder @@ -1217,7 +1220,8 @@ class {{cookiecutter.camelcase_modelname}}ForSequenceClassification({{cookiecutt self.{{cookiecutter.lowercase_modelname}} = {{cookiecutter.camelcase_modelname}}Model(config) self.classifier = {{cookiecutter.camelcase_modelname}}ClassificationHead(config) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward({{cookiecutter.uppercase_modelname}}_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( @@ -1309,7 +1313,8 @@ class {{cookiecutter.camelcase_modelname}}ForMultipleChoice({{cookiecutter.camel self.sequence_summary = SequenceSummary(config) self.classifier = nn.Linear(config.hidden_size, 1) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward({{cookiecutter.uppercase_modelname}}_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")) @add_code_sample_docstrings( @@ -1399,7 +1404,8 @@ class {{cookiecutter.camelcase_modelname}}ForTokenClassification({{cookiecutter. self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, config.num_labels) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward({{cookiecutter.uppercase_modelname}}_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( @@ -1486,7 +1492,8 @@ class {{cookiecutter.camelcase_modelname}}ForQuestionAnswering({{cookiecutter.ca self.{{cookiecutter.lowercase_modelname}} = {{cookiecutter.camelcase_modelname}}Model(config) self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() @add_start_docstrings_to_model_forward({{cookiecutter.uppercase_modelname}}_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( @@ -2224,8 +2231,9 @@ class {{cookiecutter.camelcase_modelname}}Encoder({{cookiecutter.camelcase_model self.layers = nn.ModuleList([{{cookiecutter.camelcase_modelname}}EncoderLayer(config) for _ in range(config.encoder_layers)]) self.layernorm_embedding = nn.LayerNorm(embed_dim) - self.init_weights() self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() def forward( self, @@ -2388,8 +2396,9 @@ class {{cookiecutter.camelcase_modelname}}Decoder({{cookiecutter.camelcase_model self.layers = nn.ModuleList([{{cookiecutter.camelcase_modelname}}DecoderLayer(config) for _ in range(config.decoder_layers)]) self.layernorm_embedding = nn.LayerNorm(config.d_model) - self.init_weights() self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() def get_input_embeddings(self): return self.embed_tokens @@ -2640,7 +2649,8 @@ class {{cookiecutter.camelcase_modelname}}Model({{cookiecutter.camelcase_modelna self.encoder = {{cookiecutter.camelcase_modelname}}Encoder(config, self.shared) self.decoder = {{cookiecutter.camelcase_modelname}}Decoder(config, self.shared) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_input_embeddings(self): return self.shared @@ -2755,7 +2765,8 @@ class {{cookiecutter.camelcase_modelname}}ForConditionalGeneration({{cookiecutte self.register_buffer("final_logits_bias", torch.zeros((1, self.model.shared.num_embeddings))) self.lm_head = nn.Linear(config.d_model, self.model.shared.num_embeddings, bias=False) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_encoder(self): return self.model.get_encoder() @@ -3170,7 +3181,8 @@ class {{cookiecutter.camelcase_modelname}}ForCausalLM({{cookiecutter.camelcase_m self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) - self.init_weights() + # Initialize weights and apply final processing + self.post_init() def get_input_embeddings(self): return self.model.decoder.embed_tokens diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index 49027d3f7e..05c980c642 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -222,14 +222,6 @@ class ModelTesterMixin: config.gradient_checkpointing = True model = model_class(config) - # Model does not have gradient checkpointing activated yet, it will be done at the first forward. - self.assertFalse(model.is_gradient_checkpointing) - - model.to(torch_device) - inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) - _ = model(**inputs) - - # Model has gradient checkpointing activated after the first forward. self.assertTrue(model.is_gradient_checkpointing) def test_gradient_checkpointing_enable_disable(self):