From 4f3a54bfc8fa8749f6d5b29f110148738a646fcd Mon Sep 17 00:00:00 2001
From: Lysandre <lysandre.debut@reseau.eseo.fr>
Date: Thu, 31 Oct 2019 16:37:34 +0000
Subject: [PATCH] ALBERT can load pre-trained models. Doesn't inherit from BERT
 anymore.

---
 transformers/__init__.py             |  2 +-
 transformers/configuration_albert.py |  9 ++++++
 transformers/modeling_albert.py      | 44 +++++++++++++++++++++++-----
 transformers/tokenization_albert.py  | 25 +++++++++++++---
 4 files changed, 68 insertions(+), 12 deletions(-)

diff --git a/transformers/__init__.py b/transformers/__init__.py
index 152d520e7b..bdfb1a0922 100644
--- a/transformers/__init__.py
+++ b/transformers/__init__.py
@@ -107,7 +107,7 @@ if is_torch_available():
                                 CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
     from .modeling_encoder_decoder import PreTrainedEncoderDecoder, Model2Model
 
-    from .modeling_albert import (AlbertModel, AlbertForMaskedLM)
+    from .modeling_albert import (AlbertModel, AlbertForMaskedLM, ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
 
     # Optimization
     from .optimization import (AdamW, get_constant_schedule, get_constant_schedule_with_warmup, get_cosine_schedule_with_warmup,
diff --git a/transformers/configuration_albert.py b/transformers/configuration_albert.py
index b72bbb971e..c35426768f 100644
--- a/transformers/configuration_albert.py
+++ b/transformers/configuration_albert.py
@@ -17,12 +17,21 @@
 
 from .configuration_utils import PretrainedConfig
 
+ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
+    'albert-base': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-config.json",
+    'albert-large': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-config.json",
+    'albert-xlarge': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-config.json",
+    'albert-xxlarge': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-config.json",
+}
+
 class AlbertConfig(PretrainedConfig):
     """Configuration for `AlbertModel`.
 
     The default settings match the configuration of model `albert_xxlarge`.
     """
 
+    pretrained_config_archive_map = ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
+
     def __init__(self,
                  vocab_size_or_config_json_file=30000,
                  embedding_size=128,
diff --git a/transformers/modeling_albert.py b/transformers/modeling_albert.py
index 487455e561..4da10ed1cb 100644
--- a/transformers/modeling_albert.py
+++ b/transformers/modeling_albert.py
@@ -21,6 +21,7 @@ import logging
 import torch
 import torch.nn as nn
 from torch.nn import CrossEntropyLoss
+from transformers.modeling_utils import PreTrainedModel
 from transformers.configuration_albert import AlbertConfig
 from transformers.modeling_bert import BertEmbeddings, BertPreTrainedModel, BertModel, BertSelfAttention, prune_linear_layer, ACT2FN
 from .file_utils import add_start_docstrings
@@ -274,6 +275,29 @@ class AlbertTransformer(nn.Module):
         return outputs  # last-layer hidden state, (all hidden states), (all attentions)
 
 
+
+class AlbertPreTrainedModel(PreTrainedModel):
+    """ An abstract class to handle weights initialization and
+        a simple interface for downloading and loading pretrained models.
+    """
+    config_class = AlbertConfig
+    pretrained_model_archive_map = ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP
+    base_model_prefix = "albert"
+
+    def _init_weights(self, module):
+        """ Initialize the weights.
+        """
+        if isinstance(module, (nn.Linear, nn.Embedding)):
+            # Slightly different from the TF version which uses truncated_normal for initialization
+            # cf https://github.com/pytorch/pytorch/pull/5617
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if isinstance(module, (nn.Linear)) and module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+
+
 ALBERT_START_DOCSTRING = r"""    The ALBERT model was proposed in
     `ALBERT: A Lite BERT for Self-supervised Learning of Language Representations`_
     by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut. It presents
@@ -338,7 +362,7 @@ ALBERT_INPUTS_DOCSTRING = r"""
 
 @add_start_docstrings("The bare ALBERT Model transformer outputting raw hidden-states without any specific head on top.",
                       ALBERT_START_DOCSTRING, ALBERT_INPUTS_DOCSTRING)
-class AlbertModel(BertModel):
+class AlbertModel(AlbertPreTrainedModel):
     r"""
     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
         **last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``
@@ -358,6 +382,12 @@ class AlbertModel(BertModel):
             list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
     """
+
+    config_class = AlbertConfig
+    pretrained_model_archive_map = ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP
+    load_tf_weights = load_tf_weights_in_albert
+    base_model_prefix = "albert"
+
     def __init__(self, config):
         super(AlbertModel, self).__init__(config)
 
@@ -369,6 +399,11 @@ class AlbertModel(BertModel):
 
         self.init_weights()
 
+    def _resize_token_embeddings(self, new_num_tokens):
+        old_embeddings = self.embeddings.word_embeddings
+        new_embeddings = self._get_resized_embeddings(old_embeddings, new_num_tokens)
+        self.embeddings.word_embeddings = new_embeddings
+        return self.embeddings.word_embeddings
 
     def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None):
         if attention_mask is None:
@@ -423,7 +458,7 @@ class AlbertMLMHead(nn.Module):
 
 
 @add_start_docstrings("Bert Model with a `language modeling` head on top.", ALBERT_START_DOCSTRING, ALBERT_INPUTS_DOCSTRING)
-class AlbertForMaskedLM(BertPreTrainedModel):
+class AlbertForMaskedLM(AlbertPreTrainedModel):
     r"""
         **masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
             Labels for computing the masked language modeling loss.
@@ -445,11 +480,6 @@ class AlbertForMaskedLM(BertPreTrainedModel):
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
     """
 
-    config_class = AlbertConfig
-    pretrained_model_archive_map = ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP
-    load_tf_weights = load_tf_weights_in_albert
-    base_model_prefix = "albert"
-
     def __init__(self, config):
         super(AlbertForMaskedLM, self).__init__(config)
 
diff --git a/transformers/tokenization_albert.py b/transformers/tokenization_albert.py
index 7cba99b9e4..acf67c1154 100644
--- a/transformers/tokenization_albert.py
+++ b/transformers/tokenization_albert.py
@@ -15,7 +15,7 @@
 """ Tokenization classes for ALBERT model."""
 from __future__ import (absolute_import, division, print_function,
                         unicode_literals)
-                        
+
 from .tokenization_utils import PreTrainedTokenizer
 import logging
 import unicodedata
@@ -24,8 +24,25 @@ import os
 from shutil import copyfile
 
 logger = logging.getLogger(__name__)
-
 VOCAB_FILES_NAMES = {'vocab_file': 'spiece.model'}
+
+PRETRAINED_VOCAB_FILES_MAP = {
+    'vocab_file':
+    {
+    'albert-base': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-spiece.model",
+    'albert-large': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-spiece.model",
+    'albert-xlarge': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-spiece.model",
+    'albert-xxlarge': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-spiece.model",
+    }
+}
+
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
+    'albert-base': 512,
+    'albert-large': 512,
+    'albert-xlarge': 512,
+    'albert-xxlarge': 512,
+}
+
 SPIECE_UNDERLINE = u'▁'
 
 class AlbertTokenizer(PreTrainedTokenizer):
@@ -35,8 +52,8 @@ class AlbertTokenizer(PreTrainedTokenizer):
             - requires `SentencePiece <https://github.com/google/sentencepiece>`_
     """
     vocab_files_names = VOCAB_FILES_NAMES
-    # pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
-    # max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
 
     def __init__(self, vocab_file,
                  do_lower_case=True, remove_space=True, keep_accents=False,