From 7da995c00c025c4180c7fb0357256b7f83d342ef Mon Sep 17 00:00:00 2001 From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Date: Fri, 5 Mar 2021 16:18:48 -0500 Subject: [PATCH] Fix embeddings for PyTorch 1.8 (#10549) * Fix embeddings for PyTorch 1.8 * Try with PyTorch 1.8.0 * Fix embeddings init * Fix copies * Typo * More typos --- .circleci/config.yml | 9 +++------ src/transformers/models/albert/modeling_albert.py | 8 ++++++-- src/transformers/models/bert/modeling_bert.py | 10 +++++++--- .../bert_generation/modeling_bert_generation.py | 10 +++++++--- .../models/convbert/modeling_convbert.py | 10 +++++++--- src/transformers/models/ctrl/modeling_ctrl.py | 8 ++++++-- src/transformers/models/deberta/modeling_deberta.py | 12 ++++++++---- .../models/deberta_v2/modeling_deberta_v2.py | 12 ++++++++---- .../models/distilbert/modeling_distilbert.py | 13 ++++++++----- src/transformers/models/electra/modeling_electra.py | 10 +++++++--- src/transformers/models/funnel/modeling_funnel.py | 2 ++ src/transformers/models/gpt2/modeling_gpt2.py | 8 ++++++-- src/transformers/models/ibert/modeling_ibert.py | 10 +++++++--- .../models/layoutlm/modeling_layoutlm.py | 10 +++++++--- .../models/longformer/modeling_longformer.py | 10 +++++++--- src/transformers/models/lxmert/modeling_lxmert.py | 10 +++++++--- .../models/mobilebert/modeling_mobilebert.py | 10 +++++++--- src/transformers/models/mpnet/modeling_mpnet.py | 10 +++++++--- src/transformers/models/openai/modeling_openai.py | 8 ++++++-- .../models/reformer/modeling_reformer.py | 7 ++++--- .../models/retribert/modeling_retribert.py | 10 +++++++--- src/transformers/models/roberta/modeling_roberta.py | 10 +++++++--- .../models/squeezebert/modeling_squeezebert.py | 10 +++++++--- src/transformers/models/tapas/modeling_tapas.py | 10 +++++++--- src/transformers/models/xlm/modeling_xlm.py | 4 +++- src/transformers/models/xlnet/modeling_xlnet.py | 8 ++++++-- ...modeling_{{cookiecutter.lowercase_modelname}}.py | 10 +++++++--- 27 files changed, 171 insertions(+), 78 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index b3e5175877..fe85b7aaa2 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -79,8 +79,7 @@ jobs: - v0.4-{{ checksum "setup.py" }} - run: pip install --upgrade pip - run: pip install .[sklearn,tf-cpu,torch,testing,sentencepiece] - - run: pip install tapas torch-scatter -f https://pytorch-geometric.com/whl/torch-1.7.0+cpu.html - - run: pip install -U torch==1.7.1 + - run: pip install tapas torch-scatter -f https://pytorch-geometric.com/whl/torch-1.8.0+cpu.html - save_cache: key: v0.4-{{ checksum "setup.py" }} paths: @@ -107,8 +106,7 @@ jobs: - v0.4-{{ checksum "setup.py" }} - run: pip install --upgrade pip - run: pip install .[sklearn,torch,testing,sentencepiece] - - run: pip install tapas torch-scatter -f https://pytorch-geometric.com/whl/torch-1.7.0+cpu.html - - run: pip install -U torch==1.7.1 + - run: pip install tapas torch-scatter -f https://pytorch-geometric.com/whl/torch-1.8.0+cpu.html - save_cache: key: v0.4-torch-{{ checksum "setup.py" }} paths: @@ -187,8 +185,7 @@ jobs: - v0.4-{{ checksum "setup.py" }} - run: pip install --upgrade pip - run: pip install .[sklearn,torch,testing,sentencepiece] - - run: pip install tapas torch-scatter -f https://pytorch-geometric.com/whl/torch-1.7.0+cpu.html - - run: pip install -U torch==1.7.1 + - run: pip install tapas torch-scatter -f https://pytorch-geometric.com/whl/torch-1.8.0+cpu.html - save_cache: key: v0.4-torch-{{ checksum "setup.py" }} paths: diff --git a/src/transformers/models/albert/modeling_albert.py b/src/transformers/models/albert/modeling_albert.py index 325e95e195..baceeffcce 100755 --- a/src/transformers/models/albert/modeling_albert.py +++ b/src/transformers/models/albert/modeling_albert.py @@ -490,12 +490,16 @@ class AlbertPreTrainedModel(PreTrainedModel): def _init_weights(self, module): """Initialize the weights.""" - if isinstance(module, (nn.Linear, nn.Embedding)): + if isinstance(module, nn.Linear): # Slightly different from the TF version which uses truncated_normal for initialization # cf https://github.com/pytorch/pytorch/pull/5617 module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) - if isinstance(module, (nn.Linear)) and module.bias is not None: + if module.bias is not None: module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() elif isinstance(module, nn.LayerNorm): module.bias.data.zero_() module.weight.data.fill_(1.0) diff --git a/src/transformers/models/bert/modeling_bert.py b/src/transformers/models/bert/modeling_bert.py index aaf4c55db0..88bb089bfb 100755 --- a/src/transformers/models/bert/modeling_bert.py +++ b/src/transformers/models/bert/modeling_bert.py @@ -704,15 +704,19 @@ class BertPreTrainedModel(PreTrainedModel): def _init_weights(self, module): """ Initialize the weights """ - if isinstance(module, (nn.Linear, nn.Embedding)): + if isinstance(module, nn.Linear): # Slightly different from the TF version which uses truncated_normal for initialization # cf https://github.com/pytorch/pytorch/pull/5617 module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() elif isinstance(module, nn.LayerNorm): module.bias.data.zero_() module.weight.data.fill_(1.0) - if isinstance(module, nn.Linear) and module.bias is not None: - module.bias.data.zero_() @dataclass diff --git a/src/transformers/models/bert_generation/modeling_bert_generation.py b/src/transformers/models/bert_generation/modeling_bert_generation.py index 0f89efb385..0febbdc88f 100755 --- a/src/transformers/models/bert_generation/modeling_bert_generation.py +++ b/src/transformers/models/bert_generation/modeling_bert_generation.py @@ -177,15 +177,19 @@ class BertGenerationPreTrainedModel(PreTrainedModel): def _init_weights(self, module): """ Initialize the weights """ - if isinstance(module, (nn.Linear, nn.Embedding)): + if isinstance(module, nn.Linear): # Slightly different from the TF version which uses truncated_normal for initialization # cf https://github.com/pytorch/pytorch/pull/5617 module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() elif isinstance(module, nn.LayerNorm): module.bias.data.zero_() module.weight.data.fill_(1.0) - if isinstance(module, nn.Linear) and module.bias is not None: - module.bias.data.zero_() BERT_GENERATION_START_DOCSTRING = r""" diff --git a/src/transformers/models/convbert/modeling_convbert.py b/src/transformers/models/convbert/modeling_convbert.py index ccc352361c..7b38431185 100755 --- a/src/transformers/models/convbert/modeling_convbert.py +++ b/src/transformers/models/convbert/modeling_convbert.py @@ -238,15 +238,19 @@ class ConvBertPreTrainedModel(PreTrainedModel): def _init_weights(self, module): """ Initialize the weights """ - if isinstance(module, (nn.Linear, nn.Embedding)): + if isinstance(module, nn.Linear): # Slightly different from the TF version which uses truncated_normal for initialization # cf https://github.com/pytorch/pytorch/pull/5617 module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() elif isinstance(module, nn.LayerNorm): module.bias.data.zero_() module.weight.data.fill_(1.0) - if isinstance(module, (nn.Linear)) and module.bias is not None: - module.bias.data.zero_() class SeparableConv1D(nn.Module): diff --git a/src/transformers/models/ctrl/modeling_ctrl.py b/src/transformers/models/ctrl/modeling_ctrl.py index da50095408..144da74a8c 100644 --- a/src/transformers/models/ctrl/modeling_ctrl.py +++ b/src/transformers/models/ctrl/modeling_ctrl.py @@ -221,12 +221,16 @@ class CTRLPreTrainedModel(PreTrainedModel): def _init_weights(self, module): """Initialize the weights.""" - if isinstance(module, (nn.Linear, nn.Embedding, Conv1D)): + if isinstance(module, (nn.Linear, Conv1D)): # Slightly different from the TF version which uses truncated_normal for initialization # cf https://github.com/pytorch/pytorch/pull/5617 module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) - if isinstance(module, (nn.Linear, Conv1D)) and module.bias is not None: + if module.bias is not None: module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() elif isinstance(module, nn.LayerNorm): module.bias.data.zero_() module.weight.data.fill_(1.0) diff --git a/src/transformers/models/deberta/modeling_deberta.py b/src/transformers/models/deberta/modeling_deberta.py index d83a595946..3d24b9e630 100644 --- a/src/transformers/models/deberta/modeling_deberta.py +++ b/src/transformers/models/deberta/modeling_deberta.py @@ -767,13 +767,17 @@ class DebertaPreTrainedModel(PreTrainedModel): self._register_load_state_dict_pre_hook(self._pre_load_hook) def _init_weights(self, module): - """ Initialize the weights """ - if isinstance(module, (nn.Linear, nn.Embedding)): + """Initialize the weights.""" + if isinstance(module, nn.Linear): # Slightly different from the TF version which uses truncated_normal for initialization # cf https://github.com/pytorch/pytorch/pull/5617 module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) - if isinstance(module, nn.Linear) and module.bias is not None: - module.bias.data.zero_() + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() def _pre_load_hook(self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs): """ diff --git a/src/transformers/models/deberta_v2/modeling_deberta_v2.py b/src/transformers/models/deberta_v2/modeling_deberta_v2.py index 4fd6131e45..63b2b01e59 100644 --- a/src/transformers/models/deberta_v2/modeling_deberta_v2.py +++ b/src/transformers/models/deberta_v2/modeling_deberta_v2.py @@ -886,13 +886,17 @@ class DebertaV2PreTrainedModel(PreTrainedModel): self._register_load_state_dict_pre_hook(self._pre_load_hook) def _init_weights(self, module): - """ Initialize the weights """ - if isinstance(module, (nn.Linear, nn.Embedding)): + """Initialize the weights.""" + if isinstance(module, nn.Linear): # Slightly different from the TF version which uses truncated_normal for initialization # cf https://github.com/pytorch/pytorch/pull/5617 module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) - if isinstance(module, nn.Linear) and module.bias is not None: - module.bias.data.zero_() + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() def _pre_load_hook(self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs): """ diff --git a/src/transformers/models/distilbert/modeling_distilbert.py b/src/transformers/models/distilbert/modeling_distilbert.py index e478c588b4..5a8cb719e8 100755 --- a/src/transformers/models/distilbert/modeling_distilbert.py +++ b/src/transformers/models/distilbert/modeling_distilbert.py @@ -341,16 +341,19 @@ class DistilBertPreTrainedModel(PreTrainedModel): def _init_weights(self, module): """Initialize the weights.""" - if isinstance(module, nn.Embedding): - if module.weight.requires_grad: - module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) if isinstance(module, nn.Linear): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() elif isinstance(module, nn.LayerNorm): module.bias.data.zero_() module.weight.data.fill_(1.0) - if isinstance(module, nn.Linear) and module.bias is not None: - module.bias.data.zero_() DISTILBERT_START_DOCSTRING = r""" diff --git a/src/transformers/models/electra/modeling_electra.py b/src/transformers/models/electra/modeling_electra.py index 9de0be3c49..50bec0f98e 100644 --- a/src/transformers/models/electra/modeling_electra.py +++ b/src/transformers/models/electra/modeling_electra.py @@ -653,15 +653,19 @@ class ElectraPreTrainedModel(PreTrainedModel): # Copied from transformers.models.bert.modeling_bert.BertPreTrainedModel._init_weights def _init_weights(self, module): """ Initialize the weights """ - if isinstance(module, (nn.Linear, nn.Embedding)): + if isinstance(module, nn.Linear): # Slightly different from the TF version which uses truncated_normal for initialization # cf https://github.com/pytorch/pytorch/pull/5617 module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() elif isinstance(module, nn.LayerNorm): module.bias.data.zero_() module.weight.data.fill_(1.0) - if isinstance(module, nn.Linear) and module.bias is not None: - module.bias.data.zero_() @dataclass diff --git a/src/transformers/models/funnel/modeling_funnel.py b/src/transformers/models/funnel/modeling_funnel.py index 04d2c8c332..a48f7e01b5 100644 --- a/src/transformers/models/funnel/modeling_funnel.py +++ b/src/transformers/models/funnel/modeling_funnel.py @@ -779,6 +779,8 @@ class FunnelPreTrainedModel(PreTrainedModel): elif classname == "FunnelEmbeddings": std = 1.0 if self.config.initializer_std is None else self.config.initializer_std nn.init.normal_(module.word_embeddings.weight, std=std) + if module.word_embeddings.padding_idx is not None: + module.word_embeddings.weight.data[module.padding_idx].zero_() class FunnelClassificationHead(nn.Module): diff --git a/src/transformers/models/gpt2/modeling_gpt2.py b/src/transformers/models/gpt2/modeling_gpt2.py index 60eb3d69a2..52937e0ead 100644 --- a/src/transformers/models/gpt2/modeling_gpt2.py +++ b/src/transformers/models/gpt2/modeling_gpt2.py @@ -345,12 +345,16 @@ class GPT2PreTrainedModel(PreTrainedModel): def _init_weights(self, module): """Initialize the weights.""" - if isinstance(module, (nn.Linear, nn.Embedding, Conv1D)): + if isinstance(module, (nn.Linear, Conv1D)): # Slightly different from the TF version which uses truncated_normal for initialization # cf https://github.com/pytorch/pytorch/pull/5617 module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) - if isinstance(module, (nn.Linear, Conv1D)) and module.bias is not None: + if module.bias is not None: module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() elif isinstance(module, nn.LayerNorm): module.bias.data.zero_() module.weight.data.fill_(1.0) diff --git a/src/transformers/models/ibert/modeling_ibert.py b/src/transformers/models/ibert/modeling_ibert.py index 2a854e6ff7..edc4747a5c 100644 --- a/src/transformers/models/ibert/modeling_ibert.py +++ b/src/transformers/models/ibert/modeling_ibert.py @@ -645,15 +645,19 @@ class IBertPreTrainedModel(PreTrainedModel): def _init_weights(self, module): """ Initialize the weights """ - if isinstance(module, (QuantLinear, QuantEmbedding, nn.Linear, nn.Embedding)): + if isinstance(module, (QuantLinear, nn.Linear)): # Slightly different from the TF version which uses truncated_normal for initialization # cf https://github.com/pytorch/pytorch/pull/5617 module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, (QuantEmbedding, nn.Embedding)): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() elif isinstance(module, (IntLayerNorm, nn.LayerNorm)): module.bias.data.zero_() module.weight.data.fill_(1.0) - if isinstance(module, (QuantLinear, nn.Linear)) and module.bias is not None: - module.bias.data.zero_() def resize_token_embeddings(self, new_num_tokens=None): raise NotImplementedError("`resize_token_embeddings` is not supported for I-BERT.") diff --git a/src/transformers/models/layoutlm/modeling_layoutlm.py b/src/transformers/models/layoutlm/modeling_layoutlm.py index 96046c07da..8d6d0a7d15 100644 --- a/src/transformers/models/layoutlm/modeling_layoutlm.py +++ b/src/transformers/models/layoutlm/modeling_layoutlm.py @@ -612,15 +612,19 @@ class LayoutLMPreTrainedModel(PreTrainedModel): def _init_weights(self, module): """ Initialize the weights """ - if isinstance(module, (nn.Linear, nn.Embedding)): + if isinstance(module, nn.Linear): # Slightly different from the TF version which uses truncated_normal for initialization # cf https://github.com/pytorch/pytorch/pull/5617 module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() elif isinstance(module, LayoutLMLayerNorm): module.bias.data.zero_() module.weight.data.fill_(1.0) - if isinstance(module, nn.Linear) and module.bias is not None: - module.bias.data.zero_() LAYOUTLM_START_DOCSTRING = r""" diff --git a/src/transformers/models/longformer/modeling_longformer.py b/src/transformers/models/longformer/modeling_longformer.py index df850524f8..b254e3a5fd 100755 --- a/src/transformers/models/longformer/modeling_longformer.py +++ b/src/transformers/models/longformer/modeling_longformer.py @@ -1363,15 +1363,19 @@ class LongformerPreTrainedModel(PreTrainedModel): def _init_weights(self, module): """ Initialize the weights """ - if isinstance(module, (nn.Linear, nn.Embedding)): + if isinstance(module, nn.Linear): # Slightly different from the TF version which uses truncated_normal for initialization # cf https://github.com/pytorch/pytorch/pull/5617 module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() elif isinstance(module, nn.LayerNorm): module.bias.data.zero_() module.weight.data.fill_(1.0) - if isinstance(module, nn.Linear) and module.bias is not None: - module.bias.data.zero_() LONGFORMER_START_DOCSTRING = r""" diff --git a/src/transformers/models/lxmert/modeling_lxmert.py b/src/transformers/models/lxmert/modeling_lxmert.py index d83add4585..22cc0bc839 100644 --- a/src/transformers/models/lxmert/modeling_lxmert.py +++ b/src/transformers/models/lxmert/modeling_lxmert.py @@ -783,15 +783,19 @@ class LxmertPreTrainedModel(PreTrainedModel): def _init_weights(self, module): """ Initialize the weights """ - if isinstance(module, (nn.Linear, nn.Embedding)): + if isinstance(module, nn.Linear): # Slightly different from the TF version which uses truncated_normal for initialization # cf https://github.com/pytorch/pytorch/pull/5617 module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() elif isinstance(module, nn.LayerNorm): module.bias.data.zero_() module.weight.data.fill_(1.0) - if isinstance(module, nn.Linear) and module.bias is not None: - module.bias.data.zero_() LXMERT_START_DOCSTRING = r""" diff --git a/src/transformers/models/mobilebert/modeling_mobilebert.py b/src/transformers/models/mobilebert/modeling_mobilebert.py index becfc29c16..4cfc115d04 100644 --- a/src/transformers/models/mobilebert/modeling_mobilebert.py +++ b/src/transformers/models/mobilebert/modeling_mobilebert.py @@ -670,15 +670,19 @@ class MobileBertPreTrainedModel(PreTrainedModel): def _init_weights(self, module): """ Initialize the weights """ - if isinstance(module, (nn.Linear, nn.Embedding)): + if isinstance(module, nn.Linear): # Slightly different from the TF version which uses truncated_normal for initialization # cf https://github.com/pytorch/pytorch/pull/5617 module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() elif isinstance(module, (nn.LayerNorm, NoNorm)): module.bias.data.zero_() module.weight.data.fill_(1.0) - if isinstance(module, nn.Linear) and module.bias is not None: - module.bias.data.zero_() @dataclass diff --git a/src/transformers/models/mpnet/modeling_mpnet.py b/src/transformers/models/mpnet/modeling_mpnet.py index 9f5814ba7d..d8a6f311ad 100644 --- a/src/transformers/models/mpnet/modeling_mpnet.py +++ b/src/transformers/models/mpnet/modeling_mpnet.py @@ -56,15 +56,19 @@ class MPNetPreTrainedModel(PreTrainedModel): def _init_weights(self, module): """ Initialize the weights """ - if isinstance(module, (nn.Linear, nn.Embedding)): + if isinstance(module, nn.Linear): # Slightly different from the TF version which uses truncated_normal for initialization # cf https://github.com/pytorch/pytorch/pull/5617 module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() elif isinstance(module, nn.LayerNorm): module.bias.data.zero_() module.weight.data.fill_(1.0) - if isinstance(module, nn.Linear) and module.bias is not None: - module.bias.data.zero_() class MPNetEmbeddings(nn.Module): diff --git a/src/transformers/models/openai/modeling_openai.py b/src/transformers/models/openai/modeling_openai.py index 2d41461382..69ff6e676d 100644 --- a/src/transformers/models/openai/modeling_openai.py +++ b/src/transformers/models/openai/modeling_openai.py @@ -283,12 +283,16 @@ class OpenAIGPTPreTrainedModel(PreTrainedModel): def _init_weights(self, module): """Initialize the weights.""" - if isinstance(module, (nn.Linear, nn.Embedding, Conv1D)): + if isinstance(module, (nn.Linear, Conv1D)): # Slightly different from the TF version which uses truncated_normal for initialization # cf https://github.com/pytorch/pytorch/pull/5617 module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) - if isinstance(module, (nn.Linear, Conv1D)) and module.bias is not None: + if module.bias is not None: module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() elif isinstance(module, nn.LayerNorm): module.bias.data.zero_() module.weight.data.fill_(1.0) diff --git a/src/transformers/models/reformer/modeling_reformer.py b/src/transformers/models/reformer/modeling_reformer.py index f2bb57b457..0f8caa8bef 100755 --- a/src/transformers/models/reformer/modeling_reformer.py +++ b/src/transformers/models/reformer/modeling_reformer.py @@ -1791,16 +1791,17 @@ class ReformerPreTrainedModel(PreTrainedModel): torch.nn.init.normal_(weight, std=self.config.axial_norm_std) elif isinstance(module, nn.Embedding): module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() elif isinstance(module, nn.Linear): # Slightly different from the TF version which uses truncated_normal for initialization # cf https://github.com/pytorch/pytorch/pull/5617 module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) - + if module.bias is not None: + module.bias.data.zero_() elif isinstance(module, nn.LayerNorm): module.bias.data.zero_() module.weight.data.fill_(1.0) - if isinstance(module, nn.Linear) and module.bias is not None: - module.bias.data.zero_() @dataclass diff --git a/src/transformers/models/retribert/modeling_retribert.py b/src/transformers/models/retribert/modeling_retribert.py index 2e6c23c241..0b6023e7bc 100644 --- a/src/transformers/models/retribert/modeling_retribert.py +++ b/src/transformers/models/retribert/modeling_retribert.py @@ -51,13 +51,17 @@ class RetriBertPreTrainedModel(PreTrainedModel): def _init_weights(self, module): """ Initialize the weights """ - if isinstance(module, (nn.Linear, nn.Embedding)): + if isinstance(module, nn.Linear): module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() elif isinstance(module, nn.LayerNorm): module.bias.data.zero_() module.weight.data.fill_(1.0) - if isinstance(module, nn.Linear) and module.bias is not None: - module.bias.data.zero_() RETRIBERT_START_DOCSTRING = r""" diff --git a/src/transformers/models/roberta/modeling_roberta.py b/src/transformers/models/roberta/modeling_roberta.py index 1760617044..8297a0a855 100644 --- a/src/transformers/models/roberta/modeling_roberta.py +++ b/src/transformers/models/roberta/modeling_roberta.py @@ -574,15 +574,19 @@ class RobertaPreTrainedModel(PreTrainedModel): # Copied from transformers.models.bert.modeling_bert.BertPreTrainedModel._init_weights def _init_weights(self, module): """ Initialize the weights """ - if isinstance(module, (nn.Linear, nn.Embedding)): + if isinstance(module, nn.Linear): # Slightly different from the TF version which uses truncated_normal for initialization # cf https://github.com/pytorch/pytorch/pull/5617 module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() elif isinstance(module, nn.LayerNorm): module.bias.data.zero_() module.weight.data.fill_(1.0) - if isinstance(module, nn.Linear) and module.bias is not None: - module.bias.data.zero_() ROBERTA_START_DOCSTRING = r""" diff --git a/src/transformers/models/squeezebert/modeling_squeezebert.py b/src/transformers/models/squeezebert/modeling_squeezebert.py index 072dd17dc7..d298a6d71a 100644 --- a/src/transformers/models/squeezebert/modeling_squeezebert.py +++ b/src/transformers/models/squeezebert/modeling_squeezebert.py @@ -432,15 +432,19 @@ class SqueezeBertPreTrainedModel(PreTrainedModel): def _init_weights(self, module): """ Initialize the weights """ - if isinstance(module, (nn.Linear, nn.Conv1d, nn.Embedding)): + if isinstance(module, (nn.Linear, nn.Conv1d)): # Slightly different from the TF version which uses truncated_normal for initialization # cf https://github.com/pytorch/pytorch/pull/5617 module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() elif isinstance(module, SqueezeBertLayerNorm): module.bias.data.zero_() module.weight.data.fill_(1.0) - if isinstance(module, (nn.Linear, nn.Conv1d)) and module.bias is not None: - module.bias.data.zero_() SQUEEZEBERT_START_DOCSTRING = r""" diff --git a/src/transformers/models/tapas/modeling_tapas.py b/src/transformers/models/tapas/modeling_tapas.py index b9f67d4a2c..cecdd7b4e1 100644 --- a/src/transformers/models/tapas/modeling_tapas.py +++ b/src/transformers/models/tapas/modeling_tapas.py @@ -700,15 +700,19 @@ class TapasPreTrainedModel(PreTrainedModel): # Copied from transformers.models.bert.modeling_bert.BertPreTrainedModel._init_weights def _init_weights(self, module): """ Initialize the weights """ - if isinstance(module, (nn.Linear, nn.Embedding)): + if isinstance(module, nn.Linear): # Slightly different from the TF version which uses truncated_normal for initialization # cf https://github.com/pytorch/pytorch/pull/5617 module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() elif isinstance(module, nn.LayerNorm): module.bias.data.zero_() module.weight.data.fill_(1.0) - if isinstance(module, nn.Linear) and module.bias is not None: - module.bias.data.zero_() TAPAS_START_DOCSTRING = r""" diff --git a/src/transformers/models/xlm/modeling_xlm.py b/src/transformers/models/xlm/modeling_xlm.py index 47c53a9bda..bb7dddb287 100755 --- a/src/transformers/models/xlm/modeling_xlm.py +++ b/src/transformers/models/xlm/modeling_xlm.py @@ -254,10 +254,12 @@ class XLMPreTrainedModel(PreTrainedModel): if isinstance(module, nn.Embedding): if self.config is not None and self.config.embed_init_std is not None: nn.init.normal_(module.weight, mean=0, std=self.config.embed_init_std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() if isinstance(module, nn.Linear): if self.config is not None and self.config.init_std is not None: nn.init.normal_(module.weight, mean=0, std=self.config.init_std) - if hasattr(module, "bias") and module.bias is not None: + if module.bias is not None: nn.init.constant_(module.bias, 0.0) if isinstance(module, nn.LayerNorm): module.bias.data.zero_() diff --git a/src/transformers/models/xlnet/modeling_xlnet.py b/src/transformers/models/xlnet/modeling_xlnet.py index e873996d36..74874a9f05 100755 --- a/src/transformers/models/xlnet/modeling_xlnet.py +++ b/src/transformers/models/xlnet/modeling_xlnet.py @@ -552,12 +552,16 @@ class XLNetPreTrainedModel(PreTrainedModel): def _init_weights(self, module): """Initialize the weights.""" - if isinstance(module, (nn.Linear, nn.Embedding)): + if isinstance(module, nn.Linear): # Slightly different from the TF version which uses truncated_normal for initialization # cf https://github.com/pytorch/pytorch/pull/5617 module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) - if isinstance(module, nn.Linear) and module.bias is not None: + if module.bias is not None: module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() elif isinstance(module, nn.LayerNorm): module.bias.data.zero_() module.weight.data.fill_(1.0) diff --git a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_{{cookiecutter.lowercase_modelname}}.py b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_{{cookiecutter.lowercase_modelname}}.py index b63486ec5e..9e120402d4 100755 --- a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_{{cookiecutter.lowercase_modelname}}.py +++ b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_{{cookiecutter.lowercase_modelname}}.py @@ -656,15 +656,19 @@ class {{cookiecutter.camelcase_modelname}}PreTrainedModel(PreTrainedModel): def _init_weights(self, module): """ Initialize the weights """ - if isinstance(module, (nn.Linear, nn.Embedding)): + if isinstance(module, nn.Linear): # Slightly different from the TF version which uses truncated_normal for initialization # cf https://github.com/pytorch/pytorch/pull/5617 module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() elif isinstance(module, nn.LayerNorm): module.bias.data.zero_() module.weight.data.fill_(1.0) - if isinstance(module, nn.Linear) and module.bias is not None: - module.bias.data.zero_() {{cookiecutter.uppercase_modelname}}_START_DOCSTRING = r"""