Adding the LXMERT pretraining model (MultiModal languageXvision) to HuggingFace's suite of models (#5793)

* added template files for LXMERT and completed the configuration_lxmert.py * added modeling, tokenization, testing, and finishing touches for lxmert [yet to be tested] * added model card for lxmert * cleaning up lxmert code * Update src/transformers/modeling_lxmert.py Co-authored-by: Lysandre Debut <lysandre@huggingface.co> * Update src/transformers/modeling_tf_lxmert.py Co-authored-by: Lysandre Debut <lysandre@huggingface.co> * Update src/transformers/modeling_tf_lxmert.py Co-authored-by: Lysandre Debut <lysandre@huggingface.co> * Update src/transformers/modeling_lxmert.py Co-authored-by: Lysandre Debut <lysandre@huggingface.co> * tested torch lxmert, changed documentation, updated outputs, and other small fixes * Update src/transformers/convert_pytorch_checkpoint_to_tf2.py Co-authored-by: Lysandre Debut <lysandre@huggingface.co> * Update src/transformers/convert_pytorch_checkpoint_to_tf2.py Co-authored-by: Lysandre Debut <lysandre@huggingface.co> * Update src/transformers/convert_pytorch_checkpoint_to_tf2.py Co-authored-by: Lysandre Debut <lysandre@huggingface.co> * renaming, other small issues, did not change TF code in this commit * added lxmert question answering model in pytorch * added capability to edit number of qa labels for lxmert * made answer optional for lxmert question answering * add option to return hidden_states for lxmert * changed default qa labels for lxmert * changed config archive path * squashing 3 commits: merged UI + testing improvements + more UI and testing * changed some variable names for lxmert * TF LXMERT * Various fixes to LXMERT * Final touches to LXMERT * AutoTokenizer order * Add LXMERT to index.rst and README.md * Merge commit test fixes + Style update * TensorFlow 2.3.0 sequential model changes variable names Remove inherited test * Update src/transformers/modeling_tf_pytorch_utils.py * Update docs/source/model_doc/lxmert.rst Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update 
docs/source/model_doc/lxmert.rst Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update src/transformers/modeling_tf_lxmert.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * added suggestions * Fixes * Final fixes for TF model * Fix docs Co-authored-by: Lysandre Debut <lysandre@huggingface.co> Co-authored-by: Lysandre <lysandre.debut@reseau.eseo.fr> Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2020-09-03 04:02:25 -04:00
parent 4ebb52afdb
commit ea2c6f1afc
23 changed files with 4798 additions and 12 deletions
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -31,6 +31,7 @@ from .configuration_encoder_decoder import EncoderDecoderConfig
 from .configuration_flaubert import FLAUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, FlaubertConfig
 from .configuration_gpt2 import GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP, GPT2Config
 from .configuration_longformer import LONGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, LongformerConfig
+from .configuration_lxmert import LXMERT_PRETRAINED_CONFIG_ARCHIVE_MAP, LxmertConfig
 from .configuration_marian import MarianConfig
 from .configuration_mbart import MBartConfig
 from .configuration_mmbt import MMBTConfig
@@ -156,6 +157,7 @@ from .tokenization_electra import ElectraTokenizer, ElectraTokenizerFast
 from .tokenization_flaubert import FlaubertTokenizer
 from .tokenization_gpt2 import GPT2Tokenizer, GPT2TokenizerFast
 from .tokenization_longformer import LongformerTokenizer, LongformerTokenizerFast
+from .tokenization_lxmert import LxmertTokenizer, LxmertTokenizerFast
 from .tokenization_mbart import MBartTokenizer
 from .tokenization_mobilebert import MobileBertTokenizer, MobileBertTokenizerFast
 from .tokenization_openai import OpenAIGPTTokenizer, OpenAIGPTTokenizerFast
@@ -343,6 +345,15 @@ if is_torch_available():
        LongformerModel,
        LongformerSelfAttention,
    )
+    from .modeling_lxmert import (
+        LxmertEncoder,
+        LxmertForPreTraining,
+        LxmertForQuestionAnswering,
+        LxmertModel,
+        LxmertPreTrainedModel,
+        LxmertVisualFeatureEncoder,
+        LxmertXLayer,
+    )
    from .modeling_marian import MarianMTModel
    from .modeling_mbart import MBartForConditionalGeneration
    from .modeling_mmbt import MMBTForClassification, MMBTModel, ModalEmbeddings
@@ -573,6 +584,14 @@ if is_tf_available():
        TFLongformerModel,
        TFLongformerSelfAttention,
    )
+    from .modeling_tf_lxmert import (
+        TF_LXMERT_PRETRAINED_MODEL_ARCHIVE_LIST,
+        TFLxmertForPreTraining,
+        TFLxmertMainLayer,
+        TFLxmertModel,
+        TFLxmertPreTrainedModel,
+        TFLxmertVisualFeatureEncoder,
+    )
    from .modeling_tf_mobilebert import (
        TF_MOBILEBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
        TFMobileBertForMaskedLM,
--- a/src/transformers/commands/convert.py
+++ b/src/transformers/commands/convert.py
@@ -155,5 +155,13 @@ class ConvertCommand(BaseTransformersCLICommand):
            )

            convert_xlm_checkpoint_to_pytorch(self._tf_checkpoint, self._pytorch_dump_output)
+        elif self._model_type == "lxmert":
+            from transformers.convert_lxmert_original_pytorch_checkpoint_to_pytorch import (
+                convert_lxmert_checkpoint_to_pytorch,
+            )
+
+            convert_lxmert_checkpoint_to_pytorch(self._tf_checkpoint, self._pytorch_dump_output)
        else:
-            raise ValueError("--model_type should be selected in the list [bert, gpt, gpt2, transfo_xl, xlnet, xlm]")
+            raise ValueError(
+                "--model_type should be selected in the list [bert, gpt, gpt2, transfo_xl, xlnet, xlm, lxmert]"
+            )
--- a/src/transformers/configuration_auto.py
+++ b/src/transformers/configuration_auto.py
@@ -28,6 +28,7 @@ from .configuration_encoder_decoder import EncoderDecoderConfig
 from .configuration_flaubert import FLAUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, FlaubertConfig
 from .configuration_gpt2 import GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP, GPT2Config
 from .configuration_longformer import LONGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, LongformerConfig
+from .configuration_lxmert import LXMERT_PRETRAINED_CONFIG_ARCHIVE_MAP, LxmertConfig
 from .configuration_marian import MarianConfig
 from .configuration_mbart import MBART_PRETRAINED_CONFIG_ARCHIVE_MAP, MBartConfig
 from .configuration_mobilebert import MobileBertConfig
@@ -66,6 +67,7 @@ ALL_PRETRAINED_CONFIG_ARCHIVE_MAP = dict(
        ELECTRA_PRETRAINED_CONFIG_ARCHIVE_MAP,
        LONGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP,
        RETRIBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        LXMERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
    ]
    for key, value, in pretrained_map.items()
 )
@@ -166,6 +168,10 @@ CONFIG_MAPPING = OrderedDict(
            "encoder-decoder",
            EncoderDecoderConfig,
        ),
+        (
+            "lxmert",
+            LxmertConfig,
+        ),
    ]
 )

--- a/src/transformers/configuration_lxmert.py
+++ b/src/transformers/configuration_lxmert.py
@@ -0,0 +1,179 @@
+# coding=utf-8
+# Copyright 2018, Hao Tan, Mohit Bansal
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" LXMERT model configuration """
+
+
+import logging
+
+from .configuration_utils import PretrainedConfig
+
+
+logger = logging.getLogger(__name__)
+
+# Keyed by pretrained model shortcut name; presumably each value is the URL of
+# that checkpoint's configuration file.
+# NOTE(review): the only entry maps to an empty string here - confirm how the
+# configuration for this checkpoint is resolved.
+LXMERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
+    "unc-nlp/lxmert-base-uncased": "",
+}
+
+
+class LxmertConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a :class:`~transformers.BertModel`.
+    It is used to instantiate an Lxmert model according to the specified arguments, defining the model
+    architecture.
+
+
+    Args:
+        vocab_size (:obj:`int`, optional, defaults to 30522):
+            Vocabulary size of the BERT model. Defines the different tokens that
+            can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.BertModel`.
+        hidden_size (:obj:`int`, optional, defaults to 768):
+            Dimensionality of the encoder layers and the pooler layer.
+        r_layers (:obj:`int`, optional, defaults to 5):
+            Number of hidden layers in the Transformer visual encoder.
+        l_layers (:obj:`int`, optional, defaults to 9):
+            Number of hidden layers in the Transformer language encoder.
+        x_layers (:obj:`int`, optional, defaults to 5):
+            Number of hidden layers in the Transformer cross modality encoder.
+        num_attention_heads (:obj:`int`, optional, defaults to 5):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        intermediate_size (:obj:`int`, optional, defaults to 3072):
+            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu"):
+            The non-linear activation function (function or string) in the encoder and pooler.
+            If string, "gelu", "relu", "swish" and "gelu_new" are supported.
+        hidden_dropout_prob (:obj:`float`, optional, defaults to 0.1):
+            The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.1):
+            The dropout ratio for the attention probabilities.
+        max_position_embeddings (:obj:`int`, optional, defaults to 512):
+            The maximum sequence length that this model might ever be used with.
+            Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
+        type_vocab_size (:obj:`int`, optional, defaults to 2):
+            The vocabulary size of the `token_type_ids` passed into :class:`~transformers.BertModel`.
+        initializer_range (:obj:`float`, optional, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
+            The epsilon used by the layer normalization layers.
+        visual_feat_dim (:obj:`int`, optional, defaults to 2048):
+            This represents the last dimension of the pooled-object features used as input for the model,
+            representing the size of each object feature itself.
+        visual_pos_dim (:obj:`int`, optional, defaults to 4):
+            This represents the number of spacial features that are mixed into the visual features.
+            The default is set to 4 because most commonly this will represent the location of a bounding box.
+            i.e. (x, y, width, height)
+        visual_loss_normalizer (:obj:`float`, optional, defaults to 1/15):
+            This represents the scaling factor in which each visual loss is multiplied by if during pretraining,
+            one decided to train with multiple vision-based loss objectives.
+        num_qa_labels (:obj:`int`, optional, defaults to 9500):
+            This represents the total number of different question answering (QA) labels there are. If using more than one dataset with QA,
+            the user will need to account for the total number of labels that all of the datasets have in total.
+        num_object_labels (:obj:`int`, optional, defaults to 1600):
+            This represents the total number of semantically unique objects that lxmert will be able to classify a pooled-object feature
+            as belonging too.
+        num_attr_labels (:obj:`int`, optional, defaults to 400):
+            This represents the total number of semantically unique attributes that lxmert will be able to classify a pooled-object feature
+            as possessing.
+        task_matched (:obj:`bool`, optional, defaults to True):
+            This task is used for sentence-image matching. If the sentence correctly describes the image the label will be 1.
+            If the sentence does not correctly describe the image, the label will be 0.
+        task_mask_lm (:obj:`bool`, optional, defaults to True):
+            This task is the defacto masked langauge modeling used in pretraining models such as BERT.
+        task_obj_predict (:obj:`bool`, optional, defaults to True):
+            This task is set to true if the user would like to perform one of the following loss objectives:
+            object predicition, atrribute predicition, feature regression
+        task_qa (:obj:`bool`, optional, defaults to True):
+            This task specifies whether or not Lxmert will calculate the question-asnwering loss objective
+        visual_obj_loss (:obj:`bool`, optional, defaults to True):
+            This task specifies whether or not Lxmert will calculate the object-prediction loss objective
+        visual_attr_loss (:obj:`bool`, optional, defaults to True):
+            This task specifies whether or not Lxmert will calculate the attribute-prediction loss objective
+        visual_feat_loss (:obj:`bool`, optional, defaults to True):
+            This task specifies whether or not Lxmert will calculate the feature-regression loss objective
+        output_attentions (:obj:`bool`, optional, defaults to False):
+                if True, the vision, langauge, and cross-modality layers will be returned
+        output_hidden_states (:obj:`bool`, optional, defaults to False):
+                if True, final cross-modality hidden states for language and vision features will be returned
+
+    """
+
+    model_type = "lxmert"
+
+    def __init__(
+        self,
+        vocab_size=30522,
+        hidden_size=768,
+        num_attention_heads=12,
+        num_labels=2,
+        num_qa_labels=9500,
+        num_object_labels=1600,
+        num_attr_labels=400,
+        intermediate_size=3072,
+        hidden_act="gelu",
+        hidden_dropout_prob=0.1,
+        attention_probs_dropout_prob=0.1,
+        max_position_embeddings=512,
+        type_vocab_size=2,
+        initializer_range=0.02,
+        layer_norm_eps=1e-12,
+        pad_token_id=0,
+        l_layers=9,
+        x_layers=5,
+        r_layers=5,
+        visual_feat_dim=2048,
+        visual_pos_dim=4,
+        visual_loss_normalizer=6.67,
+        task_matched=True,
+        task_mask_lm=True,
+        task_obj_predict=True,
+        task_qa=True,
+        visual_obj_loss=True,
+        visual_attr_loss=True,
+        visual_feat_loss=True,
+        output_attentions=False,
+        output_hidden_states=False,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.num_attention_heads = num_attention_heads
+        self.num_labels = num_labels
+        self.hidden_act = hidden_act
+        self.intermediate_size = intermediate_size
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.max_position_embeddings = max_position_embeddings
+        self.type_vocab_size = type_vocab_size
+        self.initializer_range = initializer_range
+        self.layer_norm_eps = layer_norm_eps
+        self.num_qa_labels = num_qa_labels
+        self.num_object_labels = num_object_labels
+        self.num_attr_labels = num_attr_labels
+        self.l_layers = l_layers
+        self.x_layers = x_layers
+        self.r_layers = r_layers
+        self.visual_feat_dim = visual_feat_dim
+        self.visual_pos_dim = visual_pos_dim
+        self.visual_loss_normalizer = visual_loss_normalizer
+        self.task_matched = task_matched
+        self.task_mask_lm = task_mask_lm
+        self.task_obj_predict = task_obj_predict
+        self.task_qa = task_qa
+        self.visual_obj_loss = visual_obj_loss
+        self.visual_attr_loss = visual_attr_loss
+        self.visual_feat_loss = visual_feat_loss
+        self.output_hidden_states = output_hidden_states
+        self.output_attentions = self.output_attentions
+        self.num_hidden_layers = {"vision": r_layers, "cross_encoder": x_layers, "language": l_layers}
--- a/src/transformers/convert_lxmert_original_tf_checkpoint_to_pytorch.py
+++ b/src/transformers/convert_lxmert_original_tf_checkpoint_to_pytorch.py
@@ -0,0 +1,61 @@
+# coding=utf-8
+# Copyright 2018 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Convert LXMERT checkpoint."""
+
+
+import argparse
+import logging
+
+import torch
+
+from transformers import LxmertConfig, LxmertForPreTraining, load_tf_weights_in_lxmert
+
+
+logging.basicConfig(level=logging.INFO)
+
+
+def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path):
+    # Initialise PyTorch model
+    config = LxmertConfig.from_json_file(config_file)
+    print("Building PyTorch model from configuration: {}".format(str(config)))
+    model = LxmertForPreTraining(config)
+
+    # Load weights from tf checkpoint
+    load_tf_weights_in_lxmert(model, config, tf_checkpoint_path)
+
+    # Save pytorch-model
+    print("Save PyTorch model to {}".format(pytorch_dump_path))
+    torch.save(model.state_dict(), pytorch_dump_path)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    # Required parameters
+    parser.add_argument(
+        "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path."
+    )
+    parser.add_argument(
+        "--config_file",
+        default=None,
+        type=str,
+        required=True,
+        help="The config json file corresponding to the pre-trained model. \n"
+        "This specifies the model architecture.",
+    )
+    parser.add_argument(
+        "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
+    )
+    args = parser.parse_args()
+    convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.config_file, args.pytorch_dump_path)
--- a/src/transformers/convert_pytorch_checkpoint_to_tf2.py
+++ b/src/transformers/convert_pytorch_checkpoint_to_tf2.py
@@ -27,6 +27,7 @@ from transformers import (
    ELECTRA_PRETRAINED_CONFIG_ARCHIVE_MAP,
    FLAUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
    GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP,
+    LXMERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
    OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP,
    ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP,
    T5_PRETRAINED_CONFIG_ARCHIVE_MAP,
@@ -43,6 +44,7 @@ from transformers import (
    ElectraConfig,
    FlaubertConfig,
    GPT2Config,
+    LxmertConfig,
    OpenAIGPTConfig,
    RobertaConfig,
    T5Config,
@@ -57,6 +59,8 @@ from transformers import (
    TFElectraForPreTraining,
    TFFlaubertWithLMHeadModel,
    TFGPT2LMHeadModel,
+    TFLxmertForPreTraining,
+    TFLxmertVisualFeatureEncoder,
    TFOpenAIGPTLMHeadModel,
    TFRobertaForMaskedLM,
    TFRobertaForSequenceClassification,
@@ -94,6 +98,8 @@ if is_torch_available():
        ElectraForPreTraining,
        FlaubertWithLMHeadModel,
        GPT2LMHeadModel,
+        LxmertForPreTraining,
+        LxmertVisualFeatureEncoder,
        OpenAIGPTLMHeadModel,
        RobertaForMaskedLM,
        RobertaForSequenceClassification,
@@ -204,6 +210,18 @@ MODEL_CLASSES = {
        DistilBertForQuestionAnswering,
        DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
    ),
+    "lxmert": (
+        LxmertConfig,
+        TFLxmertForPreTraining,
+        LxmertForPreTraining,
+        LXMERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
+    ),
+    "lxmert-visual-feature-encoder": (
+        LxmertConfig,
+        TFLxmertVisualFeatureEncoder,
+        LxmertVisualFeatureEncoder,
+        LXMERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
+    ),
    "ctrl": (
        CTRLConfig,
        TFCTRLLMHeadModel,
--- a/src/transformers/modeling_auto.py
+++ b/src/transformers/modeling_auto.py
@@ -31,6 +31,7 @@ from .configuration_auto import (
    FlaubertConfig,
    GPT2Config,
    LongformerConfig,
+    LxmertConfig,
    MBartConfig,
    MobileBertConfig,
    OpenAIGPTConfig,
@@ -116,6 +117,7 @@ from .modeling_longformer import (
    LongformerForTokenClassification,
    LongformerModel,
 )
+from .modeling_lxmert import LxmertForPreTraining, LxmertModel
 from .modeling_marian import MarianMTModel
 from .modeling_mbart import MBartForConditionalGeneration
 from .modeling_mobilebert import (
@@ -200,6 +202,7 @@ MODEL_MAPPING = OrderedDict(
        (CTRLConfig, CTRLModel),
        (ElectraConfig, ElectraModel),
        (ReformerConfig, ReformerModel),
+        (LxmertConfig, LxmertModel),
    ]
 )

@@ -224,6 +227,7 @@ MODEL_FOR_PRETRAINING_MAPPING = OrderedDict(
        (XLMConfig, XLMWithLMHeadModel),
        (CTRLConfig, CTRLLMHeadModel),
        (ElectraConfig, ElectraForPreTraining),
+        (LxmertConfig, LxmertForPreTraining),
    ]
 )

--- a/src/transformers/modeling_lxmert.py
+++ b/src/transformers/modeling_lxmert.py
--- a/src/transformers/modeling_tf_lxmert.py
+++ b/src/transformers/modeling_tf_lxmert.py
--- a/src/transformers/modeling_tf_mobilebert.py
+++ b/src/transformers/modeling_tf_mobilebert.py
@@ -883,7 +883,7 @@ MOBILEBERT_START_DOCSTRING = r"""

 MOBILEBERT_INPUTS_DOCSTRING = r"""
    Args:
-        input_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`{0}`):
+        input_ids (:obj:`np.ndarray` or :obj:`tf.Tensor` of shape :obj:`{0}`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using :class:`transformers.MobileBertTokenizer`.
@@ -891,28 +891,28 @@ MOBILEBERT_INPUTS_DOCSTRING = r"""
            :func:`transformers.PreTrainedTokenizer.__call__` for details.

            `What are input IDs? <../glossary.html#input-ids>`__
-        attention_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`{0}`, `optional`, defaults to :obj:`None`):
+        attention_mask (:obj:`np.ndarray` or :obj:`tf.Tensor` of shape :obj:`{0}`, `optional`, defaults to :obj:`None`):
            Mask to avoid performing attention on padding token indices.
            Mask values selected in ``[0, 1]``:
            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.

            `What are attention masks? <../glossary.html#attention-mask>`__
-        token_type_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`{0}`, `optional`, defaults to :obj:`None`):
+        token_type_ids (:obj:`np.ndarray` or :obj:`tf.Tensor` of shape :obj:`{0}`, `optional`, defaults to :obj:`None`):
            Segment token indices to indicate first and second portions of the inputs.
            Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
            corresponds to a `sentence B` token

            `What are token type IDs? <../glossary.html#token-type-ids>`__
-        position_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`{0}`, `optional`, defaults to :obj:`None`):
+        position_ids (:obj:`np.ndarray` or :obj:`tf.Tensor` of shape :obj:`{0}`, `optional`, defaults to :obj:`None`):
            Indices of positions of each input sequence tokens in the position embeddings.
            Selected in the range ``[0, config.max_position_embeddings - 1]``.

            `What are position IDs? <../glossary.html#position-ids>`__
-        head_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`):
+        head_mask (:obj:`np.ndarray` or :obj:`tf.Tensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`):
            Mask to nullify selected heads of the self-attention modules.
            Mask values selected in ``[0, 1]``:
            :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**.
-        inputs_embeds (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, embedding_dim)`, `optional`, defaults to :obj:`None`):
+        inputs_embeds (:obj:`np.ndarray` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, embedding_dim)`, `optional`, defaults to :obj:`None`):
            Optionally, instead of passing :obj:`input_ids` you can  to directly pass an embedded representation.
            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
            than the model's internal embedding lookup matrix.
--- a/src/transformers/modeling_tf_utils.py
+++ b/src/transformers/modeling_tf_utils.py
@@ -191,7 +191,7 @@ class TFSequenceClassificationLoss:
    """

    def compute_loss(self, labels, logits):
-        if shape_list(logits)[1] == 1:
+        if len(shape_list(logits)) == 1 or shape_list(logits)[1] == 1:
            loss_fn = tf.keras.losses.MeanSquaredError(reduction=tf.keras.losses.Reduction.NONE)
        else:
            loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
--- a/src/transformers/tokenization_auto.py
+++ b/src/transformers/tokenization_auto.py
@@ -29,6 +29,7 @@ from .configuration_auto import (
    FlaubertConfig,
    GPT2Config,
    LongformerConfig,
+    LxmertConfig,
    MarianConfig,
    MBartConfig,
    MobileBertConfig,
@@ -55,6 +56,7 @@ from .tokenization_electra import ElectraTokenizer, ElectraTokenizerFast
 from .tokenization_flaubert import FlaubertTokenizer
 from .tokenization_gpt2 import GPT2Tokenizer, GPT2TokenizerFast
 from .tokenization_longformer import LongformerTokenizer, LongformerTokenizerFast
+from .tokenization_lxmert import LxmertTokenizer, LxmertTokenizerFast
 from .tokenization_marian import MarianTokenizer
 from .tokenization_mbart import MBartTokenizer
 from .tokenization_mobilebert import MobileBertTokenizer, MobileBertTokenizerFast
@@ -91,6 +93,7 @@ TOKENIZER_MAPPING = OrderedDict(
        (RobertaConfig, (RobertaTokenizer, RobertaTokenizerFast)),
        (ReformerConfig, (ReformerTokenizer, None)),
        (ElectraConfig, (ElectraTokenizer, ElectraTokenizerFast)),
+        (LxmertConfig, (LxmertTokenizer, LxmertTokenizerFast)),
        (BertConfig, (BertTokenizer, BertTokenizerFast)),
        (OpenAIGPTConfig, (OpenAIGPTTokenizer, OpenAIGPTTokenizerFast)),
        (GPT2Config, (GPT2Tokenizer, GPT2TokenizerFast)),
@@ -128,6 +131,7 @@ class AutoTokenizer:
        - `xlm`: XLMTokenizer (XLM model)
        - `ctrl`: CTRLTokenizer (Salesforce CTRL model)
        - `electra`: ElectraTokenizer (Google ELECTRA model)
+        - `lxmert`: LxmertTokenizer (Lxmert model)

    This class cannot be instantiated using `__init__()` (throw an error).
    """
@@ -163,6 +167,7 @@ class AutoTokenizer:
            - `xlm`: XLMTokenizer (XLM model)
            - `ctrl`: CTRLTokenizer (Salesforce CTRL model)
            - `electra`: ElectraTokenizer (Google ELECTRA model)
+            - `lxmert`: LxmertTokenizer (Lxmert model)

        Params:
            pretrained_model_name_or_path: either:
--- a/src/transformers/tokenization_lxmert.py
+++ b/src/transformers/tokenization_lxmert.py
@@ -0,0 +1,80 @@
+# coding=utf-8
+# Copyright 2020 The Google AI Team, Stanford University and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .tokenization_bert import BertTokenizer, BertTokenizerFast
+
+
+# Maps each tokenizer `__init__` keyword argument to the file name used
+# when a tokenizer instance is serialized.
+VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
+
+# Maps each tokenizer `__init__` keyword argument to the pretrained vocabulary
+# URL of every model shortcut name. The LXMERT shortcut points at the
+# `bert-base-uncased` vocabulary file.
+PRETRAINED_VOCAB_FILES_MAP = {
+    "vocab_file": {
+        "unc-nlp/lxmert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt",
+    }
+}
+
+# Maps each model shortcut name to the maximum length of its inputs.
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
+    "unc-nlp/lxmert-base-uncased": 512,
+}
+# Maps each model shortcut name to additional keyword arguments for the
+# tokenizer `__init__`; used for checkpoint-specific configuration.
+PRETRAINED_INIT_CONFIGURATION = {
+    "unc-nlp/lxmert-base-uncased": {"do_lower_case": True},
+}
+
+
+class LxmertTokenizer(BertTokenizer):
+    r"""
+    Constructs an Lxmert tokenizer.
+    :class:`~transformers.LxmertTokenizer` is identical to :class:`~transformers.BertTokenizer` and runs end-to-end
+    tokenization: punctuation splitting + wordpiece.
+
+    Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning
+    parameters.
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
+
+
+class LxmertTokenizerFast(BertTokenizerFast):
+    r"""
+    Constructs a "Fast" Lxmert Fast tokenizer (backed by HuggingFace's `tokenizers` library).
+
+    :class:`~transformers.LxmertTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs end-to-end
+    tokenization: punctuation splitting + wordpiece.
+
+    Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning
+    parameters.
+    """
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION