From 8bf7312654d40cea1a399e86f9fe8e39e1ea3a1e Mon Sep 17 00:00:00 2001
From: Jared T Nielsen <jaredtnielsen@gmail.com>
Date: Thu, 7 May 2020 17:44:51 -0600
Subject: [PATCH] Add AlbertForPreTraining and TFAlbertForPreTraining models.
 (#4057)

* Add AlbertForPreTraining and TFAlbertForPreTraining models.

* PyTorch conversion

* TensorFlow conversion

* style

Co-authored-by: Lysandre <lysandre.debut@reseau.eseo.fr>
---
 src/transformers/__init__.py                  |   2 +
 ...lbert_original_tf_checkpoint_to_pytorch.py |   4 +-
 .../convert_pytorch_checkpoint_to_tf2.py      |  10 +-
 src/transformers/modeling_albert.py           | 125 +++++++++++++++++-
 src/transformers/modeling_auto.py             |   3 +-
 src/transformers/modeling_tf_albert.py        |  68 +++++++++-
 src/transformers/modeling_tf_auto.py          |   5 +-
 tests/test_modeling_albert.py                 |  31 ++++-
 tests/test_modeling_tf_albert.py              |  29 +++-
 9 files changed, 263 insertions(+), 14 deletions(-)

diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index 78f9148531..f8cd78949b 100755
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -287,6 +287,7 @@ if is_torch_available():
     from .modeling_albert import (
         AlbertPreTrainedModel,
         AlbertModel,
+        AlbertForPreTraining,
         AlbertForMaskedLM,
         AlbertForSequenceClassification,
         AlbertForQuestionAnswering,
@@ -490,6 +491,7 @@ if is_tf_available():
         TFAlbertPreTrainedModel,
         TFAlbertMainLayer,
         TFAlbertModel,
+        TFAlbertForPreTraining,
         TFAlbertForMaskedLM,
         TFAlbertForSequenceClassification,
         TFAlbertForQuestionAnswering,
diff --git a/src/transformers/convert_albert_original_tf_checkpoint_to_pytorch.py b/src/transformers/convert_albert_original_tf_checkpoint_to_pytorch.py
index 88658d5a9f..4dd240be73 100644
--- a/src/transformers/convert_albert_original_tf_checkpoint_to_pytorch.py
+++ b/src/transformers/convert_albert_original_tf_checkpoint_to_pytorch.py
@@ -20,7 +20,7 @@ import logging
 
 import torch
 
-from transformers import AlbertConfig, AlbertForMaskedLM, load_tf_weights_in_albert
+from transformers import AlbertConfig, AlbertForPreTraining, load_tf_weights_in_albert
 
 
 logging.basicConfig(level=logging.INFO)
@@ -30,7 +30,7 @@ def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, albert_config_file, pyt
     # Initialise PyTorch model
     config = AlbertConfig.from_json_file(albert_config_file)
     print("Building PyTorch model from configuration: {}".format(str(config)))
-    model = AlbertForMaskedLM(config)
+    model = AlbertForPreTraining(config)
 
     # Load weights from tf checkpoint
     load_tf_weights_in_albert(model, config, tf_checkpoint_path)
diff --git a/src/transformers/convert_pytorch_checkpoint_to_tf2.py b/src/transformers/convert_pytorch_checkpoint_to_tf2.py
index 1699af5884..084450de21 100755
--- a/src/transformers/convert_pytorch_checkpoint_to_tf2.py
+++ b/src/transformers/convert_pytorch_checkpoint_to_tf2.py
@@ -46,7 +46,7 @@ from transformers import (
     OpenAIGPTConfig,
     RobertaConfig,
     T5Config,
-    TFAlbertForMaskedLM,
+    TFAlbertForPreTraining,
     TFBertForPreTraining,
     TFBertForQuestionAnswering,
     TFBertForSequenceClassification,
@@ -109,7 +109,7 @@ if is_torch_available():
         DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
         CTRLLMHeadModel,
         CTRL_PRETRAINED_MODEL_ARCHIVE_MAP,
-        AlbertForMaskedLM,
+        AlbertForPreTraining,
         ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
         T5ForConditionalGeneration,
         T5_PRETRAINED_MODEL_ARCHIVE_MAP,
@@ -148,7 +148,7 @@ else:
         DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
         CTRLLMHeadModel,
         CTRL_PRETRAINED_MODEL_ARCHIVE_MAP,
-        AlbertForMaskedLM,
+        AlbertForPreTraining,
         ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
         T5ForConditionalGeneration,
         T5_PRETRAINED_MODEL_ARCHIVE_MAP,
@@ -318,8 +318,8 @@ MODEL_CLASSES = {
     ),
     "albert": (
         AlbertConfig,
-        TFAlbertForMaskedLM,
-        AlbertForMaskedLM,
+        TFAlbertForPreTraining,
+        AlbertForPreTraining,
         ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
         ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
     ),
diff --git a/src/transformers/modeling_albert.py b/src/transformers/modeling_albert.py
index 161e19c976..f6cd770cde 100644
--- a/src/transformers/modeling_albert.py
+++ b/src/transformers/modeling_albert.py
@@ -111,7 +111,8 @@ def load_tf_weights_in_albert(model, config, tf_checkpoint_path):
 
         # No ALBERT model currently handles the next sentence prediction task
         if "seq_relationship" in name:
-            continue
+            name = name.replace("seq_relationship/output_", "sop_classifier/classifier/")
+            name = name.replace("weights", "weight")
 
         name = name.split("/")
 
@@ -568,6 +569,115 @@ class AlbertModel(AlbertPreTrainedModel):
         return outputs
 
 
+@add_start_docstrings(
+    """Albert Model with two heads on top as done during the pre-training: a `masked language modeling` head and
+    a `sentence order prediction (classification)` head. """,
+    ALBERT_START_DOCSTRING,
+)
+class AlbertForPreTraining(AlbertPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+
+        self.albert = AlbertModel(config)
+        self.predictions = AlbertMLMHead(config)
+        self.sop_classifier = AlbertSOPHead(config)
+
+        self.init_weights()
+        self.tie_weights()
+
+    def tie_weights(self):
+        self._tie_or_clone_weights(self.predictions.decoder, self.albert.embeddings.word_embeddings)
+
+    def get_output_embeddings(self):
+        return self.predictions.decoder
+
+    @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING)
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        masked_lm_labels=None,
+        sentence_order_label=None,
+    ):
+        r"""
+        masked_lm_labels (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`):
+            Labels for computing the masked language modeling loss.
+            Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
+            Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
+            in ``[0, ..., config.vocab_size]``
+        sentence_order_label (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`, defaults to :obj:`None`):
+            Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see :obj:`input_ids` docstring)
+            Indices should be in ``[0, 1]``.
+            ``0`` indicates original order (sequence A, then sequence B),
+            ``1`` indicates switched order (sequence B, then sequence A).
+
+    Returns:
+        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
+        loss (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
+            Total loss as the sum of the masked language modeling loss and the next sequence prediction (classification) loss.
+        prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`)
+            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+        sop_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2)`):
+            Prediction scores of the next sequence prediction (classification) head (scores of True/False
+            continuation before SoftMax).
+        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`):
+            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
+            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
+            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
+
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+            heads.
+
+
+    Examples::
+
+        from transformers import AlbertTokenizer, AlbertForPreTraining
+        import torch
+
+        tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
+        model = AlbertForPreTraining.from_pretrained('albert-base-v2')
+
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
+        outputs = model(input_ids)
+
+        prediction_scores, sop_scores = outputs[:2]
+
+        """
+
+        outputs = self.albert(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+        )
+
+        sequence_output, pooled_output = outputs[:2]
+
+        prediction_scores = self.predictions(sequence_output)
+        sop_scores = self.sop_classifier(pooled_output)
+
+        outputs = (prediction_scores, sop_scores,) + outputs[2:]  # add hidden states and attention if they are here
+
+        if masked_lm_labels is not None and sentence_order_label is not None:
+            loss_fct = CrossEntropyLoss()
+            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1))
+            sentence_order_loss = loss_fct(sop_scores.view(-1, 2), sentence_order_label.view(-1))
+            total_loss = masked_lm_loss + sentence_order_loss
+            outputs = (total_loss,) + outputs
+
+        return outputs  # (loss), prediction_scores, sop_scores, (hidden_states), (attentions)
+
+
 class AlbertMLMHead(nn.Module):
     def __init__(self, config):
         super().__init__()
@@ -592,6 +702,19 @@ class AlbertMLMHead(nn.Module):
         return prediction_scores
 
 
+class AlbertSOPHead(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+
+        self.dropout = nn.Dropout(config.classifier_dropout_prob)
+        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+
+    def forward(self, pooled_output):
+        dropout_pooled_output = self.dropout(pooled_output)
+        logits = self.classifier(dropout_pooled_output)
+        return logits
+
+
 @add_start_docstrings(
     "Albert Model with a `language modeling` head on top.", ALBERT_START_DOCSTRING,
 )
diff --git a/src/transformers/modeling_auto.py b/src/transformers/modeling_auto.py
index 82c1e25323..823e9ce59d 100644
--- a/src/transformers/modeling_auto.py
+++ b/src/transformers/modeling_auto.py
@@ -43,6 +43,7 @@ from .configuration_utils import PretrainedConfig
 from .modeling_albert import (
     ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
     AlbertForMaskedLM,
+    AlbertForPreTraining,
     AlbertForQuestionAnswering,
     AlbertForSequenceClassification,
     AlbertForTokenClassification,
@@ -189,7 +190,7 @@ MODEL_FOR_PRETRAINING_MAPPING = OrderedDict(
     [
         (T5Config, T5ForConditionalGeneration),
         (DistilBertConfig, DistilBertForMaskedLM),
-        (AlbertConfig, AlbertForMaskedLM),
+        (AlbertConfig, AlbertForPreTraining),
         (CamembertConfig, CamembertForMaskedLM),
         (XLMRobertaConfig, XLMRobertaForMaskedLM),
         (BartConfig, BartForConditionalGeneration),
diff --git a/src/transformers/modeling_tf_albert.py b/src/transformers/modeling_tf_albert.py
index 72b257bafc..5982446250 100644
--- a/src/transformers/modeling_tf_albert.py
+++ b/src/transformers/modeling_tf_albert.py
@@ -475,7 +475,6 @@ class TFAlbertMLMHead(tf.keras.layers.Layer):
         hidden_states = self.activation(hidden_states)
         hidden_states = self.LayerNorm(hidden_states)
         hidden_states = self.decoder(hidden_states, mode="linear") + self.decoder_bias
-        hidden_states = hidden_states + self.bias
         return hidden_states
 
 
@@ -718,6 +717,73 @@ class TFAlbertModel(TFAlbertPreTrainedModel):
         return outputs
 
 
+@add_start_docstrings(
+    """Albert Model with two heads on top for pre-training:
+    a `masked language modeling` head and a `sentence order prediction` (classification) head. """,
+    ALBERT_START_DOCSTRING,
+)
+class TFAlbertForPreTraining(TFAlbertPreTrainedModel):
+    def __init__(self, config, *inputs, **kwargs):
+        super().__init__(config, *inputs, **kwargs)
+        self.num_labels = config.num_labels
+
+        self.albert = TFAlbertMainLayer(config, name="albert")
+        self.predictions = TFAlbertMLMHead(config, self.albert.embeddings, name="predictions")
+        self.sop_classifier = TFAlbertSOPHead(config, name="sop_classifier")
+
+    def get_output_embeddings(self):
+        return self.albert.embeddings
+
+    @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING)
+    def call(self, inputs, **kwargs):
+        r"""
+    Return:
+        :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
+        prediction_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
+            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+        sop_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, 2)`):
+            Prediction scores of the sentence order prediction (classification) head (scores of True/False continuation before SoftMax).
+        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`):
+            tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
+            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
+            tuple of :obj:`tf.Tensor` (one for each layer) of shape
+            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+    Examples::
+        import tensorflow as tf
+        from transformers import AlbertTokenizer, TFAlbertForPreTraining
+        tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
+        model = TFAlbertForPreTraining.from_pretrained('albert-base-v2')
+        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
+        outputs = model(input_ids)
+        prediction_scores, sop_scores = outputs[:2]
+        """
+
+        outputs = self.albert(inputs, **kwargs)
+        sequence_output, pooled_output = outputs[:2]
+        prediction_scores = self.predictions(sequence_output)
+        sop_scores = self.sop_classifier(pooled_output, training=kwargs.get("training", False))
+        outputs = (prediction_scores, sop_scores) + outputs[2:]
+        return outputs
+
+
+class TFAlbertSOPHead(tf.keras.layers.Layer):
+    def __init__(self, config, **kwargs):
+        super().__init__(**kwargs)
+
+        self.dropout = tf.keras.layers.Dropout(config.classifier_dropout_prob)
+        self.classifier = tf.keras.layers.Dense(
+            config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier",
+        )
+
+    def call(self, pooled_output, training: bool):
+        dropout_pooled_output = self.dropout(pooled_output, training=training)
+        logits = self.classifier(dropout_pooled_output)
+        return logits
+
+
 @add_start_docstrings("""Albert Model with a `language modeling` head on top. """, ALBERT_START_DOCSTRING)
 class TFAlbertForMaskedLM(TFAlbertPreTrainedModel):
     def __init__(self, config, *inputs, **kwargs):
diff --git a/src/transformers/modeling_tf_auto.py b/src/transformers/modeling_tf_auto.py
index 2804368b33..c9d084346d 100644
--- a/src/transformers/modeling_tf_auto.py
+++ b/src/transformers/modeling_tf_auto.py
@@ -36,6 +36,7 @@ from .configuration_utils import PretrainedConfig
 from .modeling_tf_albert import (
     TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
     TFAlbertForMaskedLM,
+    TFAlbertForPreTraining,
     TFAlbertForQuestionAnswering,
     TFAlbertForSequenceClassification,
     TFAlbertModel,
@@ -132,7 +133,7 @@ TF_MODEL_FOR_PRETRAINING_MAPPING = OrderedDict(
     [
         (T5Config, TFT5ForConditionalGeneration),
         (DistilBertConfig, TFDistilBertForMaskedLM),
-        (AlbertConfig, TFAlbertForMaskedLM),
+        (AlbertConfig, TFAlbertForPreTraining),
         (RobertaConfig, TFRobertaForMaskedLM),
         (BertConfig, TFBertForPreTraining),
         (OpenAIGPTConfig, TFOpenAIGPTLMHeadModel),
@@ -412,7 +413,7 @@ class TFAutoModelForPreTraining(object):
         in the `pretrained_model_name_or_path` string (in the following order):
             - contains `t5`: :class:`~transformers.TFT5ModelWithLMHead` (T5 model)
             - contains `distilbert`: :class:`~transformers.TFDistilBertForMaskedLM` (DistilBERT model)
-            - contains `albert`: :class:`~transformers.TFAlbertForMaskedLM` (ALBERT model)
+            - contains `albert`: :class:`~transformers.TFAlbertForPreTraining` (ALBERT model)
             - contains `roberta`: :class:`~transformers.TFRobertaForMaskedLM` (RoBERTa model)
             - contains `bert`: :class:`~transformers.TFBertForPreTraining` (Bert model)
             - contains `openai-gpt`: :class:`~transformers.TFOpenAIGPTLMHeadModel` (OpenAI GPT model)
diff --git a/tests/test_modeling_albert.py b/tests/test_modeling_albert.py
index 3e7f17acc5..9ac64c56c6 100644
--- a/tests/test_modeling_albert.py
+++ b/tests/test_modeling_albert.py
@@ -27,6 +27,7 @@ if is_torch_available():
     from transformers import (
         AlbertConfig,
         AlbertModel,
+        AlbertForPreTraining,
         AlbertForMaskedLM,
         AlbertForSequenceClassification,
         AlbertForTokenClassification,
@@ -38,7 +39,7 @@ if is_torch_available():
 @require_torch
 class AlbertModelTest(ModelTesterMixin, unittest.TestCase):
 
-    all_model_classes = (AlbertModel, AlbertForMaskedLM) if is_torch_available() else ()
+    all_model_classes = (AlbertModel, AlbertForPreTraining, AlbertForMaskedLM) if is_torch_available() else ()
 
     class AlbertModelTester(object):
         def __init__(
@@ -151,6 +152,30 @@ class AlbertModelTest(ModelTesterMixin, unittest.TestCase):
             )
             self.parent.assertListEqual(list(result["pooled_output"].size()), [self.batch_size, self.hidden_size])
 
+        def create_and_check_albert_for_pretraining(
+            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+        ):
+            model = AlbertForPreTraining(config=config)
+            model.to(torch_device)
+            model.eval()
+            loss, prediction_scores, sop_scores = model(
+                input_ids,
+                attention_mask=input_mask,
+                token_type_ids=token_type_ids,
+                masked_lm_labels=token_labels,
+                sentence_order_label=sequence_labels,
+            )
+            result = {
+                "loss": loss,
+                "prediction_scores": prediction_scores,
+                "sop_scores": sop_scores,
+            }
+            self.parent.assertListEqual(
+                list(result["prediction_scores"].size()), [self.batch_size, self.seq_length, self.vocab_size]
+            )
+            self.parent.assertListEqual(list(result["sop_scores"].size()), [self.batch_size, config.num_labels])
+            self.check_loss_output(result)
+
         def create_and_check_albert_for_masked_lm(
             self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
         ):
@@ -252,6 +277,10 @@ class AlbertModelTest(ModelTesterMixin, unittest.TestCase):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_albert_model(*config_and_inputs)
 
+    def test_for_pretraining(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_albert_for_pretraining(*config_and_inputs)
+
     def test_for_masked_lm(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_albert_for_masked_lm(*config_and_inputs)
diff --git a/tests/test_modeling_tf_albert.py b/tests/test_modeling_tf_albert.py
index af5971494f..43beb4b709 100644
--- a/tests/test_modeling_tf_albert.py
+++ b/tests/test_modeling_tf_albert.py
@@ -26,6 +26,7 @@ from .utils import require_tf, slow
 if is_tf_available():
     from transformers.modeling_tf_albert import (
         TFAlbertModel,
+        TFAlbertForPreTraining,
         TFAlbertForMaskedLM,
         TFAlbertForSequenceClassification,
         TFAlbertForQuestionAnswering,
@@ -37,7 +38,13 @@ if is_tf_available():
 class TFAlbertModelTest(TFModelTesterMixin, unittest.TestCase):
 
     all_model_classes = (
-        (TFAlbertModel, TFAlbertForMaskedLM, TFAlbertForSequenceClassification, TFAlbertForQuestionAnswering)
+        (
+            TFAlbertModel,
+            TFAlbertForPreTraining,
+            TFAlbertForMaskedLM,
+            TFAlbertForSequenceClassification,
+            TFAlbertForQuestionAnswering,
+        )
         if is_tf_available()
         else ()
     )
@@ -153,6 +160,22 @@ class TFAlbertModelTest(TFModelTesterMixin, unittest.TestCase):
             )
             self.parent.assertListEqual(list(result["pooled_output"].shape), [self.batch_size, self.hidden_size])
 
+        def create_and_check_albert_for_pretraining(
+            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+        ):
+            config.num_labels = self.num_labels
+            model = TFAlbertForPreTraining(config=config)
+            inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
+            prediction_scores, sop_scores = model(inputs)
+            result = {
+                "prediction_scores": prediction_scores.numpy(),
+                "sop_scores": sop_scores.numpy(),
+            }
+            self.parent.assertListEqual(
+                list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size]
+            )
+            self.parent.assertListEqual(list(result["sop_scores"].shape), [self.batch_size, self.num_labels])
+
         def create_and_check_albert_for_masked_lm(
             self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
         ):
@@ -216,6 +239,10 @@ class TFAlbertModelTest(TFModelTesterMixin, unittest.TestCase):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_albert_model(*config_and_inputs)
 
+    def test_for_pretraining(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_albert_for_pretraining(*config_and_inputs)
+
     def test_for_masked_lm(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_albert_for_masked_lm(*config_and_inputs)