From 4e10acb3e59f5ef52f383e5a82987e672f17b1fd Mon Sep 17 00:00:00 2001
From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Date: Wed, 10 Jun 2020 13:19:53 -0400
Subject: [PATCH] Add more models to common tests (#4910)

---
 src/transformers/modeling_distilbert.py |  2 +-
 src/transformers/modeling_electra.py    |  2 +-
 src/transformers/modeling_longformer.py |  7 ++++++-
 src/transformers/modeling_roberta.py    |  4 +++-
 tests/test_modeling_distilbert.py       |  8 +++++++-
 tests/test_modeling_electra.py          | 10 +++++++++-
 tests/test_modeling_longformer.py       | 14 +++++++++++++-
 tests/test_modeling_roberta.py          | 17 +++++++++++++++--
 tests/test_modeling_xlnet.py            |  4 ++++
 9 files changed, 59 insertions(+), 9 deletions(-)

diff --git a/src/transformers/modeling_distilbert.py b/src/transformers/modeling_distilbert.py
index 1713164498..522b5bef45 100644
--- a/src/transformers/modeling_distilbert.py
+++ b/src/transformers/modeling_distilbert.py
@@ -848,7 +848,7 @@ class DistilBertForTokenClassification(DistilBertPreTrainedModel):
         sequence_output = self.dropout(sequence_output)
         logits = self.classifier(sequence_output)
 
-        outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here
+        outputs = (logits,) + outputs[1:]  # add hidden states and attention if they are here
         if labels is not None:
             loss_fct = CrossEntropyLoss()
             # Only keep active parts of the loss
diff --git a/src/transformers/modeling_electra.py b/src/transformers/modeling_electra.py
index 48b78eb9c7..e85a57cddd 100644
--- a/src/transformers/modeling_electra.py
+++ b/src/transformers/modeling_electra.py
@@ -435,7 +435,7 @@ class ElectraForSequenceClassification(ElectraPreTrainedModel):
         sequence_output = discriminator_hidden_states[0]
         logits = self.classifier(sequence_output)
 
-        outputs = (logits,) + discriminator_hidden_states[2:]  # add hidden states and attention if they are here
+        outputs = (logits,) + discriminator_hidden_states[1:]  # add hidden states and attention if they are here
 
         if labels is not None:
             if self.num_labels == 1:
diff --git a/src/transformers/modeling_longformer.py b/src/transformers/modeling_longformer.py
index 3b3456b676..aaf33b078d 100644
--- a/src/transformers/modeling_longformer.py
+++ b/src/transformers/modeling_longformer.py
@@ -797,6 +797,8 @@ class LongformerForSequenceClassification(BertPreTrainedModel):
         self.longformer = LongformerModel(config)
         self.classifier = LongformerClassificationHead(config)
 
+        self.init_weights()
+
     @add_start_docstrings_to_callable(LONGFORMER_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
     def forward(
         self,
@@ -861,6 +863,7 @@ class LongformerForSequenceClassification(BertPreTrainedModel):
             token_type_ids=token_type_ids,
             position_ids=position_ids,
             inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
         )
         sequence_output = outputs[0]
         logits = self.classifier(sequence_output)
@@ -919,7 +922,7 @@ class LongformerForQuestionAnswering(BertPreTrainedModel):
     @add_start_docstrings_to_callable(LONGFORMER_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
     def forward(
         self,
-        input_ids,
+        input_ids=None,
         attention_mask=None,
         global_attention_mask=None,
         token_type_ids=None,
@@ -1099,6 +1102,7 @@ class LongformerForTokenClassification(BertPreTrainedModel):
             token_type_ids=token_type_ids,
             position_ids=position_ids,
             inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
         )
 
         sequence_output = outputs[0]
@@ -1228,6 +1232,7 @@ class LongformerForMultipleChoice(BertPreTrainedModel):
             token_type_ids=flat_token_type_ids,
             attention_mask=flat_attention_mask,
             global_attention_mask=flat_global_attention_mask,
+            output_attentions=output_attentions,
         )
         pooled_output = outputs[1]
 
diff --git a/src/transformers/modeling_roberta.py b/src/transformers/modeling_roberta.py
index 8b4bb0ce31..91807c13ae 100644
--- a/src/transformers/modeling_roberta.py
+++ b/src/transformers/modeling_roberta.py
@@ -300,6 +300,8 @@ class RobertaForSequenceClassification(BertPreTrainedModel):
         self.roberta = RobertaModel(config)
         self.classifier = RobertaClassificationHead(config)
 
+        self.init_weights()
+
     @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
     def forward(
         self,
@@ -618,7 +620,7 @@ class RobertaForQuestionAnswering(BertPreTrainedModel):
     @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
     def forward(
         self,
-        input_ids,
+        input_ids=None,
         attention_mask=None,
         token_type_ids=None,
         position_ids=None,
diff --git a/tests/test_modeling_distilbert.py b/tests/test_modeling_distilbert.py
index a90288495b..a5b9024ac6 100644
--- a/tests/test_modeling_distilbert.py
+++ b/tests/test_modeling_distilbert.py
@@ -38,7 +38,13 @@ if is_torch_available():
 class DistilBertModelTest(ModelTesterMixin, unittest.TestCase):
 
     all_model_classes = (
-        (DistilBertModel, DistilBertForMaskedLM, DistilBertForQuestionAnswering, DistilBertForSequenceClassification)
+        (
+            DistilBertModel,
+            DistilBertForMaskedLM,
+            DistilBertForQuestionAnswering,
+            DistilBertForSequenceClassification,
+            DistilBertForTokenClassification,
+        )
         if is_torch_available()
         else None
     )
diff --git a/tests/test_modeling_electra.py b/tests/test_modeling_electra.py
index 22254a81a1..9c0a676d26 100644
--- a/tests/test_modeling_electra.py
+++ b/tests/test_modeling_electra.py
@@ -39,7 +39,15 @@ if is_torch_available():
 class ElectraModelTest(ModelTesterMixin, unittest.TestCase):
 
     all_model_classes = (
-        (ElectraModel, ElectraForMaskedLM, ElectraForTokenClassification,) if is_torch_available() else ()
+        (
+            ElectraModel,
+            ElectraForPreTraining,
+            ElectraForMaskedLM,
+            ElectraForTokenClassification,
+            ElectraForSequenceClassification,
+        )
+        if is_torch_available()
+        else ()
     )
 
     class ElectraModelTester(object):
diff --git a/tests/test_modeling_longformer.py b/tests/test_modeling_longformer.py
index 7f4f993ea2..d1c7beca94 100644
--- a/tests/test_modeling_longformer.py
+++ b/tests/test_modeling_longformer.py
@@ -296,7 +296,19 @@ class LongformerModelTest(ModelTesterMixin, unittest.TestCase):
     test_headmasking = False  # head masking is not supported
     test_torchscript = False
 
-    all_model_classes = (LongformerModel, LongformerForMaskedLM,) if is_torch_available() else ()
+    all_model_classes = (
+        (
+            LongformerModel,
+            LongformerForMaskedLM,
+            # TODO: make the common tests pass for the models below, then uncomment them
+            # LongformerForSequenceClassification,
+            # LongformerForQuestionAnswering,
+            # LongformerForTokenClassification,
+            # LongformerForMultipleChoice,
+        )
+        if is_torch_available()
+        else ()
+    )
 
     def setUp(self):
         self.model_tester = LongformerModelTester(self)
diff --git a/tests/test_modeling_roberta.py b/tests/test_modeling_roberta.py
index ac9e9396dd..dbfb6e6ba4 100644
--- a/tests/test_modeling_roberta.py
+++ b/tests/test_modeling_roberta.py
@@ -29,10 +29,12 @@ if is_torch_available():
         RobertaConfig,
         RobertaModel,
         RobertaForMaskedLM,
+        RobertaForMultipleChoice,
+        RobertaForQuestionAnswering,
         RobertaForSequenceClassification,
         RobertaForTokenClassification,
     )
-    from transformers.modeling_roberta import RobertaEmbeddings, RobertaForMultipleChoice, RobertaForQuestionAnswering
+    from transformers.modeling_roberta import RobertaEmbeddings
     from transformers.modeling_roberta import ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST
     from transformers.modeling_utils import create_position_ids_from_input_ids
 
@@ -40,7 +42,18 @@ if is_torch_available():
 @require_torch
 class RobertaModelTest(ModelTesterMixin, unittest.TestCase):
 
-    all_model_classes = (RobertaForMaskedLM, RobertaModel) if is_torch_available() else ()
+    all_model_classes = (
+        (
+            RobertaForMaskedLM,
+            RobertaModel,
+            RobertaForSequenceClassification,
+            RobertaForTokenClassification,
+            RobertaForMultipleChoice,
+            RobertaForQuestionAnswering,
+        )
+        if is_torch_available()
+        else ()
+    )
 
     class RobertaModelTester(object):
         def __init__(
diff --git a/tests/test_modeling_xlnet.py b/tests/test_modeling_xlnet.py
index 1e53393916..fccee2d6a4 100644
--- a/tests/test_modeling_xlnet.py
+++ b/tests/test_modeling_xlnet.py
@@ -31,6 +31,7 @@ if is_torch_available():
         XLNetConfig,
         XLNetModel,
         XLNetLMHeadModel,
+        XLNetForMultipleChoice,
         XLNetForSequenceClassification,
         XLNetForTokenClassification,
         XLNetForQuestionAnswering,
@@ -48,6 +49,7 @@ class XLNetModelTest(ModelTesterMixin, unittest.TestCase):
             XLNetForTokenClassification,
             XLNetForSequenceClassification,
             XLNetForQuestionAnswering,
+            XLNetForMultipleChoice,
         )
         if is_torch_available()
         else ()
@@ -84,6 +86,7 @@ class XLNetModelTest(ModelTesterMixin, unittest.TestCase):
             bos_token_id=1,
             eos_token_id=2,
             pad_token_id=5,
+            num_choices=4,
         ):
             self.parent = parent
             self.batch_size = batch_size
@@ -110,6 +113,7 @@ class XLNetModelTest(ModelTesterMixin, unittest.TestCase):
             self.bos_token_id = bos_token_id
             self.pad_token_id = pad_token_id
             self.eos_token_id = eos_token_id
+            self.num_choices = num_choices
 
         def prepare_config_and_inputs(self):
             input_ids_1 = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)