From 879fe8fa75e662ffd85a567a98522bc9cffe0c6c Mon Sep 17 00:00:00 2001
From: Nicolas Patry <patry.nicolas@protonmail.com>
Date: Thu, 26 Aug 2021 14:47:11 +0200
Subject: [PATCH] Moving `summarization` pipeline to new testing format.
 (#13279)

* Moving `summarization` pipeline to new testing format.

* Remove generate_kwargs from __init__ args.
---
 .../pipelines/text2text_generation.py         |   3 +-
 tests/test_pipelines_summarization.py         | 124 ++++++++----------
 2 files changed, 59 insertions(+), 68 deletions(-)

diff --git a/src/transformers/pipelines/text2text_generation.py b/src/transformers/pipelines/text2text_generation.py
index 346f178bbc..a957a1f395 100644
--- a/src/transformers/pipelines/text2text_generation.py
+++ b/src/transformers/pipelines/text2text_generation.py
@@ -110,6 +110,7 @@ class Text2TextGenerationPipeline(Pipeline):
             - **generated_token_ids** (:obj:`torch.Tensor` or :obj:`tf.Tensor`, present when ``return_tensors=True``)
               -- The token ids of the generated text.
         """
+
         assert return_tensors or return_text, "You must specify return_tensors=True or return_text=True"
 
         with self.device_placement():
@@ -267,7 +268,7 @@ class TranslationPipeline(Text2TextGenerationPipeline):
     def _parse_and_tokenize(self, *args, src_lang, tgt_lang, truncation):
         if getattr(self.tokenizer, "_build_translation_inputs", None):
             return self.tokenizer._build_translation_inputs(
-                *args, src_lang=src_lang, tgt_lang=tgt_lang, truncation=truncation
+                *args, return_tensors=self.framework, src_lang=src_lang, tgt_lang=tgt_lang, truncation=truncation
             )
         else:
             return super()._parse_and_tokenize(*args, truncation=truncation)
diff --git a/tests/test_pipelines_summarization.py b/tests/test_pipelines_summarization.py
index 6bc55e9915..f3f77410c7 100644
--- a/tests/test_pipelines_summarization.py
+++ b/tests/test_pipelines_summarization.py
@@ -14,84 +14,74 @@
 
 import unittest
 
-from transformers import AutoTokenizer, is_torch_available, pipeline
-from transformers.testing_utils import require_torch, slow, torch_device
+from transformers import (
+    MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING,
+    TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING,
+    LEDConfig,
+    SummarizationPipeline,
+    T5Config,
+    pipeline,
+)
+from transformers.testing_utils import is_pipeline_test, require_tf, require_torch, slow, torch_device
 from transformers.tokenization_utils import TruncationStrategy
 
-from .test_pipelines_common import MonoInputPipelineCommonMixin
+from .test_pipelines_common import ANY, PipelineTestCaseMeta
 
 
-if is_torch_available():
-    import torch
-    from torch import nn
-
-    from transformers.models.bart import BartConfig, BartForConditionalGeneration
-
 DEFAULT_DEVICE_NUM = -1 if torch_device == "cpu" else 0
 
 
-class SimpleSummarizationPipelineTests(unittest.TestCase):
-    @require_torch
-    def test_input_too_long(self):
-        torch.manual_seed(0)
-        config = BartConfig(
-            vocab_size=257,
-            d_model=32,
-            encoder_layers=1,
-            decoder_layers=1,
-            encoder_ffn_dim=32,
-            decoder_ffn_dim=32,
-            # So any text > 4 should raise an exception
-            max_position_embeddings=4,
-            encoder_attention_heads=1,
-            decoder_attention_heads=1,
-            max_length=4,
-            min_length=1,
-            forced_eos_token_id=None,
+@is_pipeline_test
+class SummarizationPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):
+    model_mapping = MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING
+    tf_model_mapping = TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING
+
+    def run_pipeline_test(self, model, tokenizer, feature_extractor):
+        summarizer = SummarizationPipeline(model=model, tokenizer=tokenizer)
+
+        outputs = summarizer("(CNN)The Palestinian Authority officially became")
+        self.assertEqual(outputs, [{"summary_text": ANY(str)}])
+
+        outputs = summarizer(
+            "(CNN)The Palestinian Authority officially became ",
+            num_beams=2,
+            min_length=2,
+            max_length=5,
         )
-        model = BartForConditionalGeneration(config)
-        # Bias output towards L
-        V, C = model.lm_head.weight.shape
+        self.assertEqual(outputs, [{"summary_text": ANY(str)}])
 
-        bias = torch.zeros(V)
-        bias[76] = 10
+        if not isinstance(model.config, (T5Config, LEDConfig)):
+            # LED and T5 can handle this input; every other model is expected
+            # to raise because it is too long when no truncation is requested.
+            with self.assertRaises(Exception):
+                outputs = summarizer("This " * 1000)
+        outputs = summarizer("This " * 1000, truncation=TruncationStrategy.ONLY_FIRST)
 
-        model.lm_head.bias = nn.Parameter(bias)
+    @require_torch
+    def test_small_model_pt(self):
+        summarizer = pipeline(task="summarization", model="sshleifer/tiny-mbart", framework="pt")
+        outputs = summarizer("This is a small test")
+        self.assertEqual(
+            outputs,
+            [
+                {
+                    "summary_text": "เข้าไปเข้าไปเข้าไปเข้าไปเข้าไปเข้าไปเข้าไปเข้าไปเข้าไปเข้าไปเข้าไปเข้าไปเข้าไปเข้าไปเข้าไปเข้าไปเข้าไปเข้าไป"
+                }
+            ],
+        )
 
-        # # Generated with:
-        # import tempfile
-        # from tokenizers import Tokenizer, models
-        # from transformers import PreTrainedTokenizerFast
-        # model_max_length = 4
-        # vocab = [(chr(i), i) for i in range(256)]
-        # tokenizer = Tokenizer(models.Unigram(vocab))
-        # with tempfile.NamedTemporaryFile() as f:
-        #     tokenizer.save(f.name)
-        #     real_tokenizer = PreTrainedTokenizerFast(tokenizer_file=f.name, model_max_length=model_max_length)
-        # real_tokenizer._tokenizer.save("tokenizer.json")
-        # # + add missing config.json with albert as model_type
-        tokenizer = AutoTokenizer.from_pretrained("Narsil/small_summarization_test")
-        summarizer = pipeline(task="summarization", model=model, tokenizer=tokenizer)
-
-        with self.assertLogs("transformers", level="WARNING"):
-            with self.assertRaises(IndexError):
-                _ = summarizer("This is a test")
-
-        output = summarizer("This is a test", truncation=TruncationStrategy.ONLY_FIRST)
-        # 2 is default BOS from Bart.
-        self.assertEqual(output, [{"summary_text": "\x02 L L L"}])
-
-
-class SummarizationPipelineTests(MonoInputPipelineCommonMixin, unittest.TestCase):
-    pipeline_task = "summarization"
-    pipeline_running_kwargs = {"num_beams": 2, "min_length": 2, "max_length": 5}
-    small_models = [
-        "patrickvonplaten/t5-tiny-random",
-        "sshleifer/bart-tiny-random",
-    ]  # Models tested without the @slow decorator
-    large_models = []  # Models tested with the @slow decorator
-    invalid_inputs = [4, "<mask>"]
-    mandatory_keys = ["summary_text"]
+    @require_tf
+    def test_small_model_tf(self):
+        summarizer = pipeline(task="summarization", model="sshleifer/tiny-mbart", framework="tf")
+        outputs = summarizer("This is a small test")
+        self.assertEqual(
+            outputs,
+            [
+                {
+                    "summary_text": "เข้าไปเข้าไปเข้าไปเข้าไปเข้าไปเข้าไปเข้าไปเข้าไปเข้าไปเข้าไปเข้าไปเข้าไปเข้าไปเข้าไปเข้าไปเข้าไปเข้าไปเข้าไป"
+                }
+            ],
+        )
 
     @require_torch
     @slow