From 3f94170a1048bbcff77b222a708470e482fdaff8 Mon Sep 17 00:00:00 2001
From: Lysandre Debut <lysandre@huggingface.co>
Date: Wed, 29 Jul 2020 14:26:26 -0400
Subject: [PATCH] =?UTF-8?q?[WIP]=20Test=20TF=20Flaubert=20+=20Add=20{XLM,?=
 =?UTF-8?q?=20Flaubert}{TokenClassification,=20MultipleC=E2=80=A6=20(#5614?=
 =?UTF-8?q?)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Test TF Flaubert + Add {XLM, Flaubert}{TokenClassification, MultipleChoice} models and tests

* AutoModels


Tiny tweaks

* Style

* Final changes before merge

* Re-order for simpler review

* Final fixes

* Addressing @sgugger's comments

* Test MultipleChoice
---
 src/transformers/__init__.py             |   3 +
 src/transformers/modeling_auto.py        |   5 +
 src/transformers/modeling_flaubert.py    |  20 ++
 src/transformers/modeling_tf_flaubert.py |  36 ++-
 src/transformers/modeling_tf_xlm.py      |  30 +-
 src/transformers/modeling_xlm.py         | 104 +++++++
 tests/test_modeling_common.py            |   2 +-
 tests/test_modeling_flaubert.py          |  47 ++++
 tests/test_modeling_tf_common.py         |   4 +-
 tests/test_modeling_tf_flaubert.py       | 331 ++++++++++++++++++++++-
 tests/test_modeling_tf_xlm.py            |  41 ++-
 tests/test_modeling_xlm.py               |  50 +++-
 12 files changed, 652 insertions(+), 21 deletions(-)

diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index e17b97240d..a0fc396e51 100755
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -278,6 +278,7 @@ if is_torch_available():
         XLMForTokenClassification,
         XLMForQuestionAnswering,
         XLMForQuestionAnsweringSimple,
+        XLMForMultipleChoice,
         XLM_PRETRAINED_MODEL_ARCHIVE_LIST,
     )
     from .modeling_bart import (
@@ -356,6 +357,8 @@ if is_torch_available():
         FlaubertForTokenClassification,
         FlaubertForQuestionAnswering,
         FlaubertForQuestionAnsweringSimple,
+        FlaubertForTokenClassification,
+        FlaubertForMultipleChoice,
         FLAUBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
     )
 
diff --git a/src/transformers/modeling_auto.py b/src/transformers/modeling_auto.py
index b3dc19fc1c..5f6ad671ed 100644
--- a/src/transformers/modeling_auto.py
+++ b/src/transformers/modeling_auto.py
@@ -98,6 +98,7 @@ from .modeling_electra import (
 )
 from .modeling_encoder_decoder import EncoderDecoderModel
 from .modeling_flaubert import (
+    FlaubertForMultipleChoice,
     FlaubertForQuestionAnsweringSimple,
     FlaubertForSequenceClassification,
     FlaubertForTokenClassification,
@@ -142,6 +143,7 @@ from .modeling_roberta import (
 from .modeling_t5 import T5ForConditionalGeneration, T5Model
 from .modeling_transfo_xl import TransfoXLLMHeadModel, TransfoXLModel
 from .modeling_xlm import (
+    XLMForMultipleChoice,
     XLMForQuestionAnsweringSimple,
     XLMForSequenceClassification,
     XLMForTokenClassification,
@@ -338,6 +340,7 @@ MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING = OrderedDict(
         (XLNetConfig, XLNetForTokenClassification),
         (AlbertConfig, AlbertForTokenClassification),
         (ElectraConfig, ElectraForTokenClassification),
+        (FlaubertConfig, FlaubertForTokenClassification),
     ]
 )
 
@@ -353,6 +356,8 @@ MODEL_FOR_MULTIPLE_CHOICE_MAPPING = OrderedDict(
         (MobileBertConfig, MobileBertForMultipleChoice),
         (XLNetConfig, XLNetForMultipleChoice),
         (AlbertConfig, AlbertForMultipleChoice),
+        (XLMConfig, XLMForMultipleChoice),
+        (FlaubertConfig, FlaubertForMultipleChoice),
     ]
 )
 
diff --git a/src/transformers/modeling_flaubert.py b/src/transformers/modeling_flaubert.py
index aeda892f7f..5d0ebf27fc 100644
--- a/src/transformers/modeling_flaubert.py
+++ b/src/transformers/modeling_flaubert.py
@@ -25,6 +25,7 @@ from .configuration_flaubert import FlaubertConfig
 from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
 from .modeling_outputs import BaseModelOutput
 from .modeling_xlm import (
+    XLMForMultipleChoice,
     XLMForQuestionAnswering,
     XLMForQuestionAnsweringSimple,
     XLMForSequenceClassification,
@@ -382,3 +383,22 @@ class FlaubertForQuestionAnswering(XLMForQuestionAnswering):
         super().__init__(config)
         self.transformer = FlaubertModel(config)
         self.init_weights()
+
+
+@add_start_docstrings(
+    """Flaubert Model with a multiple choice classification head on top (a linear layer on top of
+    the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """,
+    FLAUBERT_START_DOCSTRING,
+)
+class FlaubertForMultipleChoice(XLMForMultipleChoice):
+    """
+    This class overrides :class:`~transformers.XLMForMultipleChoice`. Please check the
+    superclass for the appropriate documentation alongside usage examples.
+    """
+
+    config_class = FlaubertConfig
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.transformer = FlaubertModel(config)
+        self.init_weights()
diff --git a/src/transformers/modeling_tf_flaubert.py b/src/transformers/modeling_tf_flaubert.py
index d10324de08..cf721be25c 100644
--- a/src/transformers/modeling_tf_flaubert.py
+++ b/src/transformers/modeling_tf_flaubert.py
@@ -22,7 +22,7 @@ import tensorflow as tf
 
 from .configuration_flaubert import FlaubertConfig
 from .file_utils import add_start_docstrings
-from .modeling_tf_utils import keras_serializable, shape_list
+from .modeling_tf_utils import cast_bool_to_primitive, keras_serializable, shape_list
 from .modeling_tf_xlm import (
     TFXLMForMultipleChoice,
     TFXLMForQuestionAnsweringSimple,
@@ -30,6 +30,7 @@ from .modeling_tf_xlm import (
     TFXLMForTokenClassification,
     TFXLMMainLayer,
     TFXLMModel,
+    TFXLMPredLayer,
     TFXLMWithLMHeadModel,
     get_masks,
 )
@@ -123,6 +124,8 @@ class TFFlaubertMainLayer(TFXLMMainLayer):
         super().__init__(config, *inputs, **kwargs)
         self.layerdrop = getattr(config, "layerdrop", 0.0)
         self.pre_norm = getattr(config, "pre_norm", False)
+        self.output_attentions = config.output_attentions
+        self.output_hidden_states = config.output_hidden_states
 
     def call(
         self,
@@ -135,9 +138,9 @@ class TFFlaubertMainLayer(TFXLMMainLayer):
         cache=None,
         head_mask=None,
         inputs_embeds=None,
+        output_attentions=None,
+        output_hidden_states=None,
         training=False,
-        output_attentions=False,
-        output_hidden_states=False,
     ):
         # removed: src_enc=None, src_len=None
         if isinstance(inputs, (tuple, list)):
@@ -150,7 +153,9 @@ class TFFlaubertMainLayer(TFXLMMainLayer):
             cache = inputs[6] if len(inputs) > 6 else cache
             head_mask = inputs[7] if len(inputs) > 7 else head_mask
             inputs_embeds = inputs[8] if len(inputs) > 8 else inputs_embeds
-            assert len(inputs) <= 9, "Too many inputs."
+            output_attentions = inputs[9] if len(inputs) > 9 else output_attentions
+            output_hidden_states = inputs[10] if len(inputs) > 10 else output_hidden_states
+            assert len(inputs) <= 11, "Too many inputs."
         elif isinstance(inputs, (dict, BatchEncoding)):
             input_ids = inputs.get("input_ids")
             attention_mask = inputs.get("attention_mask", attention_mask)
@@ -161,10 +166,15 @@ class TFFlaubertMainLayer(TFXLMMainLayer):
             cache = inputs.get("cache", cache)
             head_mask = inputs.get("head_mask", head_mask)
             inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
-            assert len(inputs) <= 9, "Too many inputs."
+            output_attentions = inputs.get("output_attentions", output_attentions)
+            output_hidden_states = inputs.get("output_hidden_states", output_hidden_states)
+            assert len(inputs) <= 11, "Too many inputs."
         else:
             input_ids = inputs
 
+        output_attentions = output_attentions if output_attentions is not None else self.output_attentions
+        output_hidden_states = output_hidden_states if output_hidden_states is not None else self.output_hidden_states
+
         if input_ids is not None and inputs_embeds is not None:
             raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
         elif input_ids is not None:
@@ -257,9 +267,12 @@ class TFFlaubertMainLayer(TFXLMMainLayer):
 
             # self attention
             if not self.pre_norm:
-                attn_outputs = self.attentions[i]([tensor, attn_mask, None, cache, head_mask[i]], training=training)
+                attn_outputs = self.attentions[i](
+                    [tensor, attn_mask, None, cache, head_mask[i], output_attentions], training=training
+                )
                 attn = attn_outputs[0]
-                attentions = attentions + (attn_outputs[1],)
+                if cast_bool_to_primitive(output_attentions, self.output_attentions) is True:
+                    attentions = attentions + (attn_outputs[1],)
                 attn = self.dropout(attn, training=training)
                 tensor = tensor + attn
                 tensor = self.layer_norm1[i](tensor)
@@ -269,7 +282,7 @@ class TFFlaubertMainLayer(TFXLMMainLayer):
                     [tensor_normalized, attn_mask, None, cache, head_mask[i]], training=training
                 )
                 attn = attn_outputs[0]
-                if output_attentions:
+                if cast_bool_to_primitive(output_attentions, self.output_attentions) is True:
                     attentions = attentions + (attn_outputs[1],)
                 attn = self.dropout(attn, training=training)
                 tensor = tensor + attn
@@ -292,7 +305,7 @@ class TFFlaubertMainLayer(TFXLMMainLayer):
             tensor = tensor * mask[..., tf.newaxis]
 
         # Add last hidden state
-        if output_hidden_states:
+        if cast_bool_to_primitive(output_hidden_states, self.output_hidden_states) is True:
             hidden_states = hidden_states + (tensor,)
 
         # update cache length
@@ -303,9 +316,9 @@ class TFFlaubertMainLayer(TFXLMMainLayer):
         # tensor = tensor.transpose(0, 1)
 
         outputs = (tensor,)
-        if output_hidden_states:
+        if cast_bool_to_primitive(output_hidden_states, self.output_hidden_states) is True:
             outputs = outputs + (hidden_states,)
-        if output_attentions:
+        if cast_bool_to_primitive(output_attentions, self.output_attentions) is True:
             outputs = outputs + (attentions,)
         return outputs  # outputs, (hidden_states), (attentions)
 
@@ -321,6 +334,7 @@ class TFFlaubertWithLMHeadModel(TFXLMWithLMHeadModel):
     def __init__(self, config, *inputs, **kwargs):
         super().__init__(config, *inputs, **kwargs)
         self.transformer = TFFlaubertMainLayer(config, name="transformer")
+        self.pred_layer = TFXLMPredLayer(config, self.transformer.embeddings, name="pred_layer_._proj")
 
 
 @add_start_docstrings(
diff --git a/src/transformers/modeling_tf_xlm.py b/src/transformers/modeling_tf_xlm.py
index e912891c21..7a5f029e56 100644
--- a/src/transformers/modeling_tf_xlm.py
+++ b/src/transformers/modeling_tf_xlm.py
@@ -19,6 +19,7 @@
 import itertools
 import logging
 import math
+import warnings
 
 import numpy as np
 import tensorflow as tf
@@ -827,6 +828,9 @@ class TFXLMForMultipleChoice(TFXLMPreTrainedModel, TFMultipleChoiceLoss):
 
         self.transformer = TFXLMMainLayer(config, name="transformer")
         self.sequence_summary = TFSequenceSummary(config, initializer_range=config.init_std, name="sequence_summary")
+        self.logits_proj = tf.keras.layers.Dense(
+            1, kernel_initializer=get_initializer(config.initializer_range), name="logits_proj"
+        )
 
     @property
     def dummy_inputs(self):
@@ -835,7 +839,10 @@ class TFXLMForMultipleChoice(TFXLMPreTrainedModel, TFMultipleChoiceLoss):
         Returns:
             tf.Tensor with dummy inputs
         """
-        return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)}
+        return {
+            "input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS),
+            "langs": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS),
+        }
 
     @add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
     @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlm-mlm-en-2048")
@@ -892,7 +899,7 @@ class TFXLMForMultipleChoice(TFXLMPreTrainedModel, TFMultipleChoiceLoss):
             output_attentions = inputs[9] if len(inputs) > 9 else output_attentions
             output_hidden_states = inputs[10] if len(inputs) > 10 else output_hidden_states
             labels = inputs[11] if len(inputs) > 11 else labels
-            assert len(inputs) <= 11, "Too many inputs."
+            assert len(inputs) <= 12, "Too many inputs."
         elif isinstance(inputs, (dict, BatchEncoding)):
             input_ids = inputs.get("input_ids")
             attention_mask = inputs.get("attention_mask", attention_mask)
@@ -921,17 +928,31 @@ class TFXLMForMultipleChoice(TFXLMPreTrainedModel, TFMultipleChoiceLoss):
         flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None
         flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None
         flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None
+        flat_langs = tf.reshape(langs, (-1, seq_length)) if langs is not None else None
+        flat_inputs_embeds = (
+            tf.reshape(inputs_embeds, (-1, inputs_embeds.shape[-2], inputs_embeds.shape[-1]))
+            if inputs_embeds is not None
+            else None
+        )
+
+        if lengths is not None:
+            warnings.warn(
+                "The `lengths` parameter cannot be used with the XLM multiple choice models. Please use the "
+                "attention mask instead.",
+                FutureWarning,
+            )
+            lengths = None
 
         flat_inputs = [
             flat_input_ids,
             flat_attention_mask,
-            langs,
+            flat_langs,
             flat_token_type_ids,
             flat_position_ids,
             lengths,
             cache,
             head_mask,
-            inputs_embeds,
+            flat_inputs_embeds,
             output_attentions,
             output_hidden_states,
         ]
@@ -939,6 +960,7 @@ class TFXLMForMultipleChoice(TFXLMPreTrainedModel, TFMultipleChoiceLoss):
         transformer_outputs = self.transformer(flat_inputs, training=training)
         output = transformer_outputs[0]
         logits = self.sequence_summary(output)
+        logits = self.logits_proj(logits)
         reshaped_logits = tf.reshape(logits, (-1, num_choices))
 
         outputs = (reshaped_logits,) + transformer_outputs[1:]  # add hidden states and attention if they are here
diff --git a/src/transformers/modeling_xlm.py b/src/transformers/modeling_xlm.py
index e7396df689..9a366cee6b 100644
--- a/src/transformers/modeling_xlm.py
+++ b/src/transformers/modeling_xlm.py
@@ -19,6 +19,7 @@
 import itertools
 import logging
 import math
+import warnings
 from dataclasses import dataclass
 from typing import Optional, Tuple
 
@@ -40,6 +41,7 @@ from .file_utils import (
 from .modeling_outputs import (
     BaseModelOutput,
     MaskedLMOutput,
+    MultipleChoiceModelOutput,
     QuestionAnsweringModelOutput,
     SequenceClassifierOutput,
     TokenClassifierOutput,
@@ -1122,3 +1124,105 @@ class XLMForTokenClassification(XLMPreTrainedModel):
         return TokenClassifierOutput(
             loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
         )
+
+
+@add_start_docstrings(
+    """XLM Model with a multiple choice classification head on top (a linear layer on top of
+    the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """,
+    XLM_START_DOCSTRING,
+)
+class XLMForMultipleChoice(XLMPreTrainedModel):
+    def __init__(self, config, *inputs, **kwargs):
+        super().__init__(config, *inputs, **kwargs)
+
+        self.transformer = XLMModel(config)
+        self.sequence_summary = SequenceSummary(config)
+        self.logits_proj = nn.Linear(config.num_labels, 1)
+
+        self.init_weights()
+
+    @add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(
+        tokenizer_class=_TOKENIZER_FOR_DOC,
+        checkpoint="xlm-mlm-en-2048",
+        output_type=MultipleChoiceModelOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        langs=None,
+        token_type_ids=None,
+        position_ids=None,
+        lengths=None,
+        cache=None,
+        head_mask=None,
+        inputs_embeds=None,
+        labels=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_tuple=None,
+    ):
+        r"""
+        labels (:obj:`torch.Tensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
+            Labels for computing the multiple choice classification loss.
+            Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
+            of the input tensors. (see `input_ids` above)
+        """
+        return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+        num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
+
+        input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
+        attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
+        token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
+        position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
+        langs = langs.view(-1, langs.size(-1)) if langs is not None else None
+        inputs_embeds = (
+            inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
+            if inputs_embeds is not None
+            else None
+        )
+
+        if lengths is not None:
+            warnings.warn(
+                "The `lengths` parameter cannot be used with the XLM multiple choice models. Please use the "
+                "attention mask instead.",
+                FutureWarning,
+            )
+            lengths = None
+
+        transformer_outputs = self.transformer(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            langs=langs,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            lengths=lengths,
+            cache=cache,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_tuple=return_tuple,
+        )
+        output = transformer_outputs[0]
+        logits = self.sequence_summary(output)
+        logits = self.logits_proj(logits)
+        reshaped_logits = logits.view(-1, num_choices)
+
+        loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()
+            loss = loss_fct(reshaped_logits, labels)
+
+        if return_tuple:
+            output = (reshaped_logits,) + transformer_outputs[1:]
+            return ((loss,) + output) if loss is not None else output
+
+        return MultipleChoiceModelOutput(
+            loss=loss,
+            logits=reshaped_logits,
+            hidden_states=transformer_outputs.hidden_states,
+            attentions=transformer_outputs.attentions,
+        )
diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py
index 097c387543..f6841cb844 100644
--- a/tests/test_modeling_common.py
+++ b/tests/test_modeling_common.py
@@ -66,7 +66,7 @@ class ModelTesterMixin:
         if model_class in MODEL_FOR_MULTIPLE_CHOICE_MAPPING.values():
             return {
                 k: v.unsqueeze(1).expand(-1, self.model_tester.num_choices, -1).contiguous()
-                if isinstance(v, torch.Tensor) and v.ndim != 0
+                if isinstance(v, torch.Tensor) and v.ndim > 1
                 else v
                 for k, v in inputs_dict.items()
             }
diff --git a/tests/test_modeling_flaubert.py b/tests/test_modeling_flaubert.py
index af2918cb94..d4342e2184 100644
--- a/tests/test_modeling_flaubert.py
+++ b/tests/test_modeling_flaubert.py
@@ -32,6 +32,7 @@ if is_torch_available():
         FlaubertForQuestionAnsweringSimple,
         FlaubertForSequenceClassification,
         FlaubertForTokenClassification,
+        FlaubertForMultipleChoice,
     )
     from transformers.modeling_flaubert import FLAUBERT_PRETRAINED_MODEL_ARCHIVE_LIST
 
@@ -90,6 +91,7 @@ class FlaubertModelTester(object):
             sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
             token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
             is_impossible_labels = ids_tensor([self.batch_size], 2).float()
+            choice_labels = ids_tensor([self.batch_size], self.num_choices)
 
         config = FlaubertConfig(
             vocab_size=self.vocab_size,
@@ -118,6 +120,7 @@ class FlaubertModelTester(object):
             sequence_labels,
             token_labels,
             is_impossible_labels,
+            choice_labels,
             input_mask,
         )
 
@@ -133,6 +136,7 @@ class FlaubertModelTester(object):
         sequence_labels,
         token_labels,
         is_impossible_labels,
+        choice_labels,
         input_mask,
     ):
         model = FlaubertModel(config=config)
@@ -158,6 +162,7 @@ class FlaubertModelTester(object):
         sequence_labels,
         token_labels,
         is_impossible_labels,
+        choice_labels,
         input_mask,
     ):
         model = FlaubertWithLMHeadModel(config)
@@ -183,6 +188,7 @@ class FlaubertModelTester(object):
         sequence_labels,
         token_labels,
         is_impossible_labels,
+        choice_labels,
         input_mask,
     ):
         model = FlaubertForQuestionAnsweringSimple(config)
@@ -212,6 +218,7 @@ class FlaubertModelTester(object):
         sequence_labels,
         token_labels,
         is_impossible_labels,
+        choice_labels,
         input_mask,
     ):
         model = FlaubertForQuestionAnswering(config)
@@ -278,6 +285,7 @@ class FlaubertModelTester(object):
         sequence_labels,
         token_labels,
         is_impossible_labels,
+        choice_labels,
         input_mask,
     ):
         model = FlaubertForSequenceClassification(config)
@@ -304,6 +312,7 @@ class FlaubertModelTester(object):
         sequence_labels,
         token_labels,
         is_impossible_labels,
+        choice_labels,
         input_mask,
     ):
         config.num_labels = self.num_labels
@@ -319,6 +328,38 @@ class FlaubertModelTester(object):
         self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.seq_length, self.num_labels])
         self.check_loss_output(result)
 
+    def create_and_check_flaubert_multiple_choice(
+        self,
+        config,
+        input_ids,
+        token_type_ids,
+        input_lengths,
+        sequence_labels,
+        token_labels,
+        is_impossible_labels,
+        choice_labels,
+        input_mask,
+    ):
+        config.num_choices = self.num_choices
+        model = FlaubertForMultipleChoice(config=config)
+        model.to(torch_device)
+        model.eval()
+        multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
+        multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
+        multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
+        loss, logits = model(
+            multiple_choice_inputs_ids,
+            attention_mask=multiple_choice_input_mask,
+            token_type_ids=multiple_choice_token_type_ids,
+            labels=choice_labels,
+        )
+        result = {
+            "loss": loss,
+            "logits": logits,
+        }
+        self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.num_choices])
+        self.check_loss_output(result)
+
     def prepare_config_and_inputs_for_common(self):
         config_and_inputs = self.prepare_config_and_inputs()
         (
@@ -329,6 +370,7 @@ class FlaubertModelTester(object):
             sequence_labels,
             token_labels,
             is_impossible_labels,
+            choice_labels,
             input_mask,
         ) = config_and_inputs
         inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "lengths": input_lengths}
@@ -346,6 +388,7 @@ class FlaubertModelTest(ModelTesterMixin, unittest.TestCase):
             FlaubertForQuestionAnsweringSimple,
             FlaubertForSequenceClassification,
             FlaubertForTokenClassification,
+            FlaubertForMultipleChoice,
         )
         if is_torch_available()
         else ()
@@ -382,6 +425,10 @@ class FlaubertModelTest(ModelTesterMixin, unittest.TestCase):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_flaubert_token_classif(*config_and_inputs)
 
+    def test_flaubert_multiple_choice(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_flaubert_multiple_choice(*config_and_inputs)
+
     @slow
     def test_model_from_pretrained(self):
         for model_name in FLAUBERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
diff --git a/tests/test_modeling_tf_common.py b/tests/test_modeling_tf_common.py
index 839c064209..88bfaa63cd 100644
--- a/tests/test_modeling_tf_common.py
+++ b/tests/test_modeling_tf_common.py
@@ -80,8 +80,8 @@ class TFModelTesterMixin:
     def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
         if model_class in TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING.values():
             inputs_dict = {
-                k: tf.tile(tf.expand_dims(v, 1), (1, self.model_tester.num_choices, 1))
-                if isinstance(v, tf.Tensor) and v.ndim != 0
+                k: tf.tile(tf.expand_dims(v, 1), (1, self.model_tester.num_choices) + (1,) * (v.ndim - 1))
+                if isinstance(v, tf.Tensor) and v.ndim > 0
                 else v
                 for k, v in inputs_dict.items()
             }
diff --git a/tests/test_modeling_tf_flaubert.py b/tests/test_modeling_tf_flaubert.py
index 1b3e6d8823..399c78ca53 100644
--- a/tests/test_modeling_tf_flaubert.py
+++ b/tests/test_modeling_tf_flaubert.py
@@ -18,11 +18,340 @@ import unittest
 from transformers import is_tf_available
 from transformers.testing_utils import require_tf, slow
 
+from .test_configuration_common import ConfigTester
+from .test_modeling_tf_common import TFModelTesterMixin, ids_tensor
+
 
 if is_tf_available():
     import tensorflow as tf
     import numpy as np
-    from transformers import TFFlaubertModel
+
+    from transformers import (
+        FlaubertConfig,
+        TFFlaubertModel,
+        TFFlaubertWithLMHeadModel,
+        TFFlaubertForSequenceClassification,
+        TFFlaubertForQuestionAnsweringSimple,
+        TFFlaubertForTokenClassification,
+        TFFlaubertForMultipleChoice,
+        TF_FLAUBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
+    )
+
+
+class TFFlaubertModelTester:
+    def __init__(
+        self, parent,
+    ):
+        self.parent = parent
+        self.batch_size = 13
+        self.seq_length = 7
+        self.is_training = True
+        self.use_input_lengths = True
+        self.use_token_type_ids = True
+        self.use_labels = True
+        self.gelu_activation = True
+        self.sinusoidal_embeddings = False
+        self.causal = False
+        self.asm = False
+        self.n_langs = 2
+        self.vocab_size = 99
+        self.n_special = 0
+        self.hidden_size = 32
+        self.num_hidden_layers = 5
+        self.num_attention_heads = 4
+        self.hidden_dropout_prob = 0.1
+        self.attention_probs_dropout_prob = 0.1
+        self.max_position_embeddings = 512
+        self.type_vocab_size = 16
+        self.type_sequence_label_size = 2
+        self.initializer_range = 0.02
+        self.num_labels = 3
+        self.num_choices = 4
+        self.summary_type = "last"
+        self.use_proj = True
+        self.scope = None
+        self.bos_token_id = 0
+
+    def prepare_config_and_inputs(self):
+        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
+        input_mask = ids_tensor([self.batch_size, self.seq_length], 2, dtype=tf.float32)
+
+        input_lengths = None
+        if self.use_input_lengths:
+            input_lengths = (
+                ids_tensor([self.batch_size], vocab_size=2) + self.seq_length - 2
+            )  # small variation of seq_length
+
+        token_type_ids = None
+        if self.use_token_type_ids:
+            token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.n_langs)
+
+        sequence_labels = None
+        token_labels = None
+        is_impossible_labels = None
+        if self.use_labels:
+            sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
+            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
+            is_impossible_labels = ids_tensor([self.batch_size], 2, dtype=tf.float32)
+            choice_labels = ids_tensor([self.batch_size], self.num_choices)
+
+        config = FlaubertConfig(
+            vocab_size=self.vocab_size,
+            n_special=self.n_special,
+            emb_dim=self.hidden_size,
+            n_layers=self.num_hidden_layers,
+            n_heads=self.num_attention_heads,
+            dropout=self.hidden_dropout_prob,
+            attention_dropout=self.attention_probs_dropout_prob,
+            gelu_activation=self.gelu_activation,
+            sinusoidal_embeddings=self.sinusoidal_embeddings,
+            asm=self.asm,
+            causal=self.causal,
+            n_langs=self.n_langs,
+            max_position_embeddings=self.max_position_embeddings,
+            initializer_range=self.initializer_range,
+            summary_type=self.summary_type,
+            use_proj=self.use_proj,
+            bos_token_id=self.bos_token_id,
+        )
+
+        return (
+            config,
+            input_ids,
+            token_type_ids,
+            input_lengths,
+            sequence_labels,
+            token_labels,
+            is_impossible_labels,
+            choice_labels,
+            input_mask,
+        )
+
+    def create_and_check_flaubert_model(
+        self,
+        config,
+        input_ids,
+        token_type_ids,
+        input_lengths,
+        sequence_labels,
+        token_labels,
+        is_impossible_labels,
+        choice_labels,
+        input_mask,
+    ):
+        model = TFFlaubertModel(config=config)
+        inputs = {"input_ids": input_ids, "lengths": input_lengths, "langs": token_type_ids}
+        outputs = model(inputs)
+
+        inputs = [input_ids, input_mask]
+        outputs = model(inputs)
+        sequence_output = outputs[0]
+        result = {
+            "sequence_output": sequence_output.numpy(),
+        }
+        self.parent.assertListEqual(
+            list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size]
+        )
+
+    def create_and_check_flaubert_lm_head(
+        self,
+        config,
+        input_ids,
+        token_type_ids,
+        input_lengths,
+        sequence_labels,
+        token_labels,
+        is_impossible_labels,
+        choice_labels,
+        input_mask,
+    ):
+        model = TFFlaubertWithLMHeadModel(config)
+
+        inputs = {"input_ids": input_ids, "lengths": input_lengths, "langs": token_type_ids}
+        outputs = model(inputs)
+
+        logits = outputs[0]
+
+        result = {
+            "logits": logits.numpy(),
+        }
+
+        self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.seq_length, self.vocab_size])
+
+    def create_and_check_flaubert_qa(
+        self,
+        config,
+        input_ids,
+        token_type_ids,
+        input_lengths,
+        sequence_labels,
+        token_labels,
+        is_impossible_labels,
+        choice_labels,
+        input_mask,
+    ):
+        model = TFFlaubertForQuestionAnsweringSimple(config)
+
+        inputs = {"input_ids": input_ids, "lengths": input_lengths}
+
+        start_logits, end_logits = model(inputs)
+
+        result = {
+            "start_logits": start_logits.numpy(),
+            "end_logits": end_logits.numpy(),
+        }
+
+        self.parent.assertListEqual(list(result["start_logits"].shape), [self.batch_size, self.seq_length])
+        self.parent.assertListEqual(list(result["end_logits"].shape), [self.batch_size, self.seq_length])
+
+    def create_and_check_flaubert_sequence_classif(
+        self,
+        config,
+        input_ids,
+        token_type_ids,
+        input_lengths,
+        sequence_labels,
+        token_labels,
+        is_impossible_labels,
+        choice_labels,
+        input_mask,
+    ):
+        model = TFFlaubertForSequenceClassification(config)
+
+        inputs = {"input_ids": input_ids, "lengths": input_lengths}
+
+        (logits,) = model(inputs)
+
+        result = {
+            "logits": logits.numpy(),
+        }
+
+        self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.type_sequence_label_size])
+
+    def create_and_check_flaubert_for_token_classification(
+        self,
+        config,
+        input_ids,
+        token_type_ids,
+        input_lengths,
+        sequence_labels,
+        token_labels,
+        is_impossible_labels,
+        choice_labels,
+        input_mask,
+    ):
+        config.num_labels = self.num_labels
+        model = TFFlaubertForTokenClassification(config=config)
+        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
+        (logits,) = model(inputs)
+        result = {
+            "logits": logits.numpy(),
+        }
+        self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.seq_length, self.num_labels])
+
+    def create_and_check_flaubert_for_multiple_choice(
+        self,
+        config,
+        input_ids,
+        token_type_ids,
+        input_lengths,
+        sequence_labels,
+        token_labels,
+        is_impossible_labels,
+        choice_labels,
+        input_mask,
+    ):
+        config.num_choices = self.num_choices
+        model = TFFlaubertForMultipleChoice(config=config)
+        multiple_choice_inputs_ids = tf.tile(tf.expand_dims(input_ids, 1), (1, self.num_choices, 1))
+        multiple_choice_input_mask = tf.tile(tf.expand_dims(input_mask, 1), (1, self.num_choices, 1))
+        multiple_choice_token_type_ids = tf.tile(tf.expand_dims(token_type_ids, 1), (1, self.num_choices, 1))
+        inputs = {
+            "input_ids": multiple_choice_inputs_ids,
+            "attention_mask": multiple_choice_input_mask,
+            "token_type_ids": multiple_choice_token_type_ids,
+        }
+        (logits,) = model(inputs)
+        result = {"logits": logits.numpy()}
+        self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_choices])
+
+    def prepare_config_and_inputs_for_common(self):
+        config_and_inputs = self.prepare_config_and_inputs()
+        (
+            config,
+            input_ids,
+            token_type_ids,
+            input_lengths,
+            sequence_labels,
+            token_labels,
+            is_impossible_labels,
+            choice_labels,
+            input_mask,
+        ) = config_and_inputs
+        inputs_dict = {
+            "input_ids": input_ids,
+            "token_type_ids": token_type_ids,
+            "langs": token_type_ids,
+            "lengths": input_lengths,
+        }
+        return config, inputs_dict
+
+
+@require_tf
+class TFFlaubertModelTest(TFModelTesterMixin, unittest.TestCase):
+
+    all_model_classes = (
+        (
+            TFFlaubertModel,
+            TFFlaubertWithLMHeadModel,
+            TFFlaubertForSequenceClassification,
+            TFFlaubertForQuestionAnsweringSimple,
+            TFFlaubertForTokenClassification,
+            TFFlaubertForMultipleChoice,
+        )
+        if is_tf_available()
+        else ()
+    )
+    all_generative_model_classes = (
+        (TFFlaubertWithLMHeadModel,) if is_tf_available() else ()
+    )  # TODO (PVP): Check other models whether language generation is also applicable
+
+    def setUp(self):
+        self.model_tester = TFFlaubertModelTester(self)
+        self.config_tester = ConfigTester(self, config_class=FlaubertConfig, emb_dim=37)
+
+    def test_config(self):
+        self.config_tester.run_common_tests()
+
+    def test_flaubert_model(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_flaubert_model(*config_and_inputs)
+
+    def test_flaubert_lm_head(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_flaubert_lm_head(*config_and_inputs)
+
+    def test_flaubert_qa(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_flaubert_qa(*config_and_inputs)
+
+    def test_flaubert_sequence_classif(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_flaubert_sequence_classif(*config_and_inputs)
+
+    def test_for_token_classification(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_flaubert_for_token_classification(*config_and_inputs)
+
+    def test_for_multiple_choice(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_flaubert_for_multiple_choice(*config_and_inputs)
+
+    @slow
+    def test_model_from_pretrained(self):
+        for model_name in TF_FLAUBERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
+            model = TFFlaubertModel.from_pretrained(model_name)
+            self.assertIsNotNone(model)
 
 
 @require_tf
diff --git a/tests/test_modeling_tf_xlm.py b/tests/test_modeling_tf_xlm.py
index 26cdb0a39c..1903f4a8df 100644
--- a/tests/test_modeling_tf_xlm.py
+++ b/tests/test_modeling_tf_xlm.py
@@ -32,6 +32,7 @@ if is_tf_available():
         TFXLMForSequenceClassification,
         TFXLMForQuestionAnsweringSimple,
         TFXLMForTokenClassification,
+        TFXLMForMultipleChoice,
         TF_XLM_PRETRAINED_MODEL_ARCHIVE_LIST,
     )
 
@@ -91,6 +92,7 @@ class TFXLMModelTester:
             sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
             token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
             is_impossible_labels = ids_tensor([self.batch_size], 2, dtype=tf.float32)
+            choice_labels = ids_tensor([self.batch_size], self.num_choices)
 
         config = XLMConfig(
             vocab_size=self.vocab_size,
@@ -120,6 +122,7 @@ class TFXLMModelTester:
             sequence_labels,
             token_labels,
             is_impossible_labels,
+            choice_labels,
             input_mask,
         )
 
@@ -132,6 +135,7 @@ class TFXLMModelTester:
         sequence_labels,
         token_labels,
         is_impossible_labels,
+        choice_labels,
         input_mask,
     ):
         model = TFXLMModel(config=config)
@@ -157,6 +161,7 @@ class TFXLMModelTester:
         sequence_labels,
         token_labels,
         is_impossible_labels,
+        choice_labels,
         input_mask,
     ):
         model = TFXLMWithLMHeadModel(config)
@@ -181,6 +186,7 @@ class TFXLMModelTester:
         sequence_labels,
         token_labels,
         is_impossible_labels,
+        choice_labels,
         input_mask,
     ):
         model = TFXLMForQuestionAnsweringSimple(config)
@@ -206,6 +212,7 @@ class TFXLMModelTester:
         sequence_labels,
         token_labels,
         is_impossible_labels,
+        choice_labels,
         input_mask,
     ):
         model = TFXLMForSequenceClassification(config)
@@ -229,6 +236,7 @@ class TFXLMModelTester:
         sequence_labels,
         token_labels,
         is_impossible_labels,
+        choice_labels,
         input_mask,
     ):
         config.num_labels = self.num_labels
@@ -240,6 +248,32 @@ class TFXLMModelTester:
         }
         self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.seq_length, self.num_labels])
 
+    def create_and_check_xlm_for_multiple_choice(
+        self,
+        config,
+        input_ids,
+        token_type_ids,
+        input_lengths,
+        sequence_labels,
+        token_labels,
+        is_impossible_labels,
+        choice_labels,
+        input_mask,
+    ):
+        config.num_choices = self.num_choices
+        model = TFXLMForMultipleChoice(config=config)
+        multiple_choice_inputs_ids = tf.tile(tf.expand_dims(input_ids, 1), (1, self.num_choices, 1))
+        multiple_choice_input_mask = tf.tile(tf.expand_dims(input_mask, 1), (1, self.num_choices, 1))
+        multiple_choice_token_type_ids = tf.tile(tf.expand_dims(token_type_ids, 1), (1, self.num_choices, 1))
+        inputs = {
+            "input_ids": multiple_choice_inputs_ids,
+            "attention_mask": multiple_choice_input_mask,
+            "token_type_ids": multiple_choice_token_type_ids,
+        }
+        (logits,) = model(inputs)
+        result = {"logits": logits.numpy()}
+        self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_choices])
+
     def prepare_config_and_inputs_for_common(self):
         config_and_inputs = self.prepare_config_and_inputs()
         (
@@ -250,6 +284,7 @@ class TFXLMModelTester:
             sequence_labels,
             token_labels,
             is_impossible_labels,
+            choice_labels,
             input_mask,
         ) = config_and_inputs
         inputs_dict = {
@@ -265,13 +300,13 @@ class TFXLMModelTester:
 class TFXLMModelTest(TFModelTesterMixin, unittest.TestCase):
 
     all_model_classes = (
-        # TODO The multiple choice model is missing and should be added.
         (
             TFXLMModel,
             TFXLMWithLMHeadModel,
             TFXLMForSequenceClassification,
             TFXLMForQuestionAnsweringSimple,
             TFXLMForTokenClassification,
+            TFXLMForMultipleChoice,
         )
         if is_tf_available()
         else ()
@@ -307,6 +342,10 @@ class TFXLMModelTest(TFModelTesterMixin, unittest.TestCase):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_xlm_for_token_classification(*config_and_inputs)
 
+    def test_for_multiple_choice(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_xlm_for_multiple_choice(*config_and_inputs)
+
     @slow
     def test_model_from_pretrained(self):
         for model_name in TF_XLM_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
diff --git a/tests/test_modeling_xlm.py b/tests/test_modeling_xlm.py
index 2a5cd4096a..efa9346cee 100644
--- a/tests/test_modeling_xlm.py
+++ b/tests/test_modeling_xlm.py
@@ -33,6 +33,7 @@ if is_torch_available():
         XLMForQuestionAnswering,
         XLMForSequenceClassification,
         XLMForQuestionAnsweringSimple,
+        XLMForMultipleChoice,
     )
     from transformers.modeling_xlm import XLM_PRETRAINED_MODEL_ARCHIVE_LIST
 
@@ -63,7 +64,7 @@ class XLMModelTester:
         self.max_position_embeddings = 512
         self.type_sequence_label_size = 2
         self.initializer_range = 0.02
-        self.num_labels = 3
+        self.num_labels = 2
         self.num_choices = 4
         self.summary_type = "last"
         self.use_proj = True
@@ -91,6 +92,7 @@ class XLMModelTester:
             sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
             token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
             is_impossible_labels = ids_tensor([self.batch_size], 2).float()
+            choice_labels = ids_tensor([self.batch_size], self.num_choices)
 
         config = XLMConfig(
             vocab_size=self.vocab_size,
@@ -109,6 +111,7 @@ class XLMModelTester:
             initializer_range=self.initializer_range,
             summary_type=self.summary_type,
             use_proj=self.use_proj,
+            num_labels=self.num_labels,
             bos_token_id=self.bos_token_id,
         )
 
@@ -120,6 +123,7 @@ class XLMModelTester:
             sequence_labels,
             token_labels,
             is_impossible_labels,
+            choice_labels,
             input_mask,
         )
 
@@ -135,6 +139,7 @@ class XLMModelTester:
         sequence_labels,
         token_labels,
         is_impossible_labels,
+        choice_labels,
         input_mask,
     ):
         model = XLMModel(config=config)
@@ -160,6 +165,7 @@ class XLMModelTester:
         sequence_labels,
         token_labels,
         is_impossible_labels,
+        choice_labels,
         input_mask,
     ):
         model = XLMWithLMHeadModel(config)
@@ -185,6 +191,7 @@ class XLMModelTester:
         sequence_labels,
         token_labels,
         is_impossible_labels,
+        choice_labels,
         input_mask,
     ):
         model = XLMForQuestionAnsweringSimple(config)
@@ -214,6 +221,7 @@ class XLMModelTester:
         sequence_labels,
         token_labels,
         is_impossible_labels,
+        choice_labels,
         input_mask,
     ):
         model = XLMForQuestionAnswering(config)
@@ -280,6 +288,7 @@ class XLMModelTester:
         sequence_labels,
         token_labels,
         is_impossible_labels,
+        choice_labels,
         input_mask,
     ):
         model = XLMForSequenceClassification(config)
@@ -306,6 +315,7 @@ class XLMModelTester:
         sequence_labels,
         token_labels,
         is_impossible_labels,
+        choice_labels,
         input_mask,
     ):
         config.num_labels = self.num_labels
@@ -321,6 +331,38 @@ class XLMModelTester:
         self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.seq_length, self.num_labels])
         self.check_loss_output(result)
 
+    def create_and_check_xlm_for_multiple_choice(
+        self,
+        config,
+        input_ids,
+        token_type_ids,
+        input_lengths,
+        sequence_labels,
+        token_labels,
+        is_impossible_labels,
+        choice_labels,
+        input_mask,
+    ):
+        config.num_choices = self.num_choices
+        model = XLMForMultipleChoice(config=config)
+        model.to(torch_device)
+        model.eval()
+        multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
+        multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
+        multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
+        loss, logits = model(
+            multiple_choice_inputs_ids,
+            attention_mask=multiple_choice_input_mask,
+            token_type_ids=multiple_choice_token_type_ids,
+            labels=choice_labels,
+        )
+        result = {
+            "loss": loss,
+            "logits": logits,
+        }
+        self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.num_choices])
+        self.check_loss_output(result)
+
     def prepare_config_and_inputs_for_common(self):
         config_and_inputs = self.prepare_config_and_inputs()
         (
@@ -331,6 +373,7 @@ class XLMModelTester:
             sequence_labels,
             token_labels,
             is_impossible_labels,
+            choice_labels,
             input_mask,
         ) = config_and_inputs
         inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "lengths": input_lengths}
@@ -348,6 +391,7 @@ class XLMModelTest(ModelTesterMixin, unittest.TestCase):
             XLMForSequenceClassification,
             XLMForQuestionAnsweringSimple,
             XLMForTokenClassification,
+            XLMForMultipleChoice,
         )
         if is_torch_available()
         else ()
@@ -387,6 +431,10 @@ class XLMModelTest(ModelTesterMixin, unittest.TestCase):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_xlm_token_classif(*config_and_inputs)
 
+    def test_xlm_for_multiple_choice(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_xlm_for_multiple_choice(*config_and_inputs)
+
     @slow
     def test_model_from_pretrained(self):
         for model_name in XLM_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: