From 9c683ef01e19c4dc1216dcd1ae3c8e7c44d7b2b9 Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Thu, 26 Mar 2020 11:03:13 +0100
Subject: [PATCH] Add t5 to pipeline(task='summarization') (#3413)

* solve conflicts

* move warnings below

* incorporate changes

* add pad_to_max_length to pipelines

* add bug fix for T5 beam search

* add prefix patterns

* make style

* fix conflicts

* adapt pipelines for task specific parameters

* improve docstring

* remove unused patterns
---
 src/transformers/configuration_utils.py |  11 +++
 src/transformers/modeling_tf_utils.py   |   4 +-
 src/transformers/pipelines.py           | 124 +++++++++++++++++-------
 tests/test_pipelines.py                 |  23 ++++-
 4 files changed, 120 insertions(+), 42 deletions(-)

diff --git a/src/transformers/configuration_utils.py b/src/transformers/configuration_utils.py
index d8341a17e8..01f6b6554a 100644
--- a/src/transformers/configuration_utils.py
+++ b/src/transformers/configuration_utils.py
@@ -380,3 +380,14 @@ class PretrainedConfig(object):
         """
         with open(json_file_path, "w", encoding="utf-8") as writer:
             writer.write(self.to_json_string())
+
+    def update(self, config_dict: Dict):
+        """
+        Updates attributes of this class
+        with attributes from `config_dict`.
+
+        Args:
+            config_dict (:obj:`Dict[str, any]`): Dictionary of attributes that shall be updated for this class.
+        """
+        for key, value in config_dict.items():
+            setattr(self, key, value)  # overwrites an existing attribute or creates it if missing
diff --git a/src/transformers/modeling_tf_utils.py b/src/transformers/modeling_tf_utils.py
index 6a441c9fe1..e4e5449b1a 100644
--- a/src/transformers/modeling_tf_utils.py
+++ b/src/transformers/modeling_tf_utils.py
@@ -999,10 +999,12 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin):
             # set eos token prob to zero if min_length is not reached
             if eos_token_id is not None and cur_len < min_length:
                 # create eos_token_id boolean mask
+                num_batch_hypotheses = batch_size * num_beams
+
                 is_token_logit_eos_token = tf.convert_to_tensor(
                     [True if token is eos_token_id else False for token in range(vocab_size)], dtype=tf.bool
                 )
-                eos_token_indices_mask = tf.broadcast_to(is_token_logit_eos_token, [batch_size, vocab_size])
+                eos_token_indices_mask = tf.broadcast_to(is_token_logit_eos_token, [num_batch_hypotheses, vocab_size])
 
                 scores = set_tensor_by_indices_to_value(scores, eos_token_indices_mask, -float("inf"))
 
diff --git a/src/transformers/pipelines.py b/src/transformers/pipelines.py
index 446cd2d57d..263c2bbb82 100755
--- a/src/transformers/pipelines.py
+++ b/src/transformers/pipelines.py
@@ -31,6 +31,7 @@ from .configuration_auto import ALL_PRETRAINED_CONFIG_ARCHIVE_MAP, AutoConfig
 from .configuration_bart import BartConfig
 from .configuration_distilbert import DistilBertConfig
 from .configuration_roberta import RobertaConfig
+from .configuration_t5 import T5Config
 from .configuration_utils import PretrainedConfig
 from .configuration_xlm import XLMConfig
 from .data import SquadExample, squad_convert_examples_to_features
@@ -60,7 +61,6 @@ if is_torch_available():
         AutoModelForTokenClassification,
         AutoModelWithLMHead,
     )
-    from .modeling_bart import BartForConditionalGeneration
 
 
 logger = logging.getLogger(__name__)
@@ -336,6 +336,7 @@ class Pipeline(_ScikitCompat):
         tokenizer: PreTrainedTokenizer,
         modelcard: Optional[ModelCard] = None,
         framework: Optional[str] = None,
+        task: str = "",
         args_parser: ArgumentHandler = None,
         device: int = -1,
         binary_output: bool = False,
@@ -356,6 +357,11 @@ class Pipeline(_ScikitCompat):
         if self.framework == "pt" and self.device.type == "cuda":
             self.model = self.model.to(self.device)
 
+        # Update config with task specific parameters
+        task_specific_params = self.model.config.task_specific_params
+        if task_specific_params is not None and task in task_specific_params:
+            self.model.config.update(task_specific_params.get(task))
+
     def save_pretrained(self, save_directory):
         """
         Save the pipeline's model and tokenizer to the specified save_directory
@@ -420,7 +426,7 @@ class Pipeline(_ScikitCompat):
         """
         args = ["input_ids", "attention_mask"]
 
-        if not isinstance(self.model.config, (DistilBertConfig, XLMConfig, RobertaConfig, BartConfig)):
+        if not isinstance(self.model.config, (DistilBertConfig, XLMConfig, RobertaConfig, BartConfig, T5Config)):
             args += ["token_type_ids"]
 
         # PR #1548 (CLI) There is an issue with attention_mask
@@ -432,14 +438,18 @@ class Pipeline(_ScikitCompat):
         else:
             return {k: [feature[k] for feature in features] for k in args}
 
-    def _parse_and_tokenize(self, *texts, **kwargs):
+    def _parse_and_tokenize(self, *texts, pad_to_max_length=False, **kwargs):
         """
         Parse arguments and tokenize
         """
         # Parse arguments
         inputs = self._args_parser(*texts, **kwargs)
         inputs = self.tokenizer.batch_encode_plus(
-            inputs, add_special_tokens=True, return_tensors=self.framework, max_length=self.tokenizer.max_len
+            inputs,
+            add_special_tokens=True,
+            return_tensors=self.framework,
+            max_length=self.tokenizer.max_len,
+            pad_to_max_length=pad_to_max_length,
         )
 
         # Filter out features not available on specific models
@@ -520,6 +530,7 @@ class FeatureExtractionPipeline(Pipeline):
         framework: Optional[str] = None,
         args_parser: ArgumentHandler = None,
         device: int = -1,
+        task: str = "",
     ):
         super().__init__(
             model=model,
@@ -529,6 +540,7 @@ class FeatureExtractionPipeline(Pipeline):
             args_parser=args_parser,
             device=device,
             binary_output=True,
+            task=task,
         )
 
     def __call__(self, *args, **kwargs):
@@ -625,6 +637,7 @@ class FillMaskPipeline(Pipeline):
         args_parser: ArgumentHandler = None,
         device: int = -1,
         topk=5,
+        task: str = "",
     ):
         super().__init__(
             model=model,
@@ -634,6 +647,7 @@ class FillMaskPipeline(Pipeline):
             args_parser=args_parser,
             device=device,
             binary_output=True,
+            task=task,
         )
 
         self.topk = topk
@@ -725,6 +739,7 @@ class NerPipeline(Pipeline):
         device: int = -1,
         binary_output: bool = False,
         ignore_labels=["O"],
+        task: str = "",
     ):
         super().__init__(
             model=model,
@@ -734,6 +749,7 @@ class NerPipeline(Pipeline):
             args_parser=args_parser,
             device=device,
             binary_output=binary_output,
+            task=task,
         )
 
         self._basic_tokenizer = BasicTokenizer(do_lower_case=False)
@@ -896,6 +912,7 @@ class QuestionAnsweringPipeline(Pipeline):
         modelcard: Optional[ModelCard] = None,
         framework: Optional[str] = None,
         device: int = -1,
+        task: str = "",
         **kwargs
     ):
         super().__init__(
@@ -905,6 +922,7 @@ class QuestionAnsweringPipeline(Pipeline):
             framework=framework,
             args_parser=QuestionAnsweringArgumentHandler(),
             device=device,
+            task=task,
             **kwargs,
         )
 
@@ -1111,12 +1129,16 @@ class SummarizationPipeline(Pipeline):
 
     Usage::
 
+        # use bart in pytorch
         summarizer = pipeline("summarization")
-        summarizer("Sam Shleifer writes the best docstring examples in the whole world.")
+        summarizer("Sam Shleifer writes the best docstring examples in the whole world.", min_length=5, max_length=20)
+
+        # use t5 in tf
+        summarizer = pipeline("summarization", model="t5-base", tokenizer="t5-base", framework="tf")
+        summarizer("Sam Shleifer writes the best docstring examples in the whole world.", min_length=5, max_length=20)
 
     Supported Models:
-        The models that this pipeline can use are models that have been fine-tuned on a summarization task, which is
-        currently only ``BartForConditionalGeneration.from_pretrained('bart-large-cnn')``
+        The models that this pipeline can use are models that have been fine-tuned on a summarization task, which are currently: '`bart-large-cnn`', '`t5-small`', '`t5-base`', '`t5-large`', '`t5-3b`', '`t5-11b`'.
 
     Arguments:
         model (:obj:`str` or :obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`, `optional`, defaults to :obj:`None`):
@@ -1147,17 +1169,8 @@ class SummarizationPipeline(Pipeline):
             on the associated CUDA device id.
     """
 
-    task = "summarization"
-
     def __call__(
-        self,
-        *documents,
-        return_tensors=False,
-        return_text=True,
-        max_length=142,
-        min_length=21,
-        clean_up_tokenization_spaces=False,
-        **generate_kwargs
+        self, *documents, return_tensors=False, return_text=True, clean_up_tokenization_spaces=False, **generate_kwargs
     ):
         r"""
         Args:
@@ -1165,10 +1178,6 @@ class SummarizationPipeline(Pipeline):
             return_text: (bool, default=True) whether to add a decoded "summary_text" to each result
             return_tensors: (bool, default=False) whether to return the raw "summary_token_ids" to each result
 
-            max_length: (`optional`) int
-                The max length of the sequence to be generated. Does not include tokens in input_ids.
-            min_len: (`optional`) int
-            no_repeat_ngram_size:  (`optional`) int. ban ngrams of this length from being repeated in the generated text
             clean_up_tokenization_spaces: (`optional`) bool whether to include extra spaces in the output
             **generate_kwargs: extra kwargs passed to `self.model.generate`_
 
@@ -1180,19 +1189,60 @@ class SummarizationPipeline(Pipeline):
 
         """
         assert return_tensors or return_text, "You must specify return_tensors=True or return_text=True"
-        if self.framework == "tf":
-            raise NotImplementedError("Tensorflow not supported")
-        with self.device_placement():
-            inputs = self._parse_and_tokenize(*documents)
-            inputs = self.ensure_tensor_on_device(**inputs)
-            summaries = self.model.generate(
-                inputs["input_ids"],
-                attention_mask=inputs["attention_mask"],
-                max_length=max_length,
-                min_length=min_length,
-                do_sample=False,
-                **generate_kwargs,
+        assert len(documents) > 0, "Please provide a document to summarize"
+
+        if self.framework == "tf" and "BartForConditionalGeneration" in self.model.__class__.__name__:
+            raise NotImplementedError(
+                "Tensorflow is not yet supported for Bart. Please consider using T5, e.g. `t5-base`"
             )
+
+        prefix = self.model.config.prefix if self.model.config.prefix is not None else ""
+
+        if isinstance(documents[0], list):
+            assert (
+                self.tokenizer.pad_token_id is not None
+            ), "Please make sure that the tokenizer has a pad_token_id when using a batch input"
+
+            documents = ([prefix + document for document in documents[0]],)
+            pad_to_max_length = True
+
+        elif isinstance(documents[0], str):
+            documents = (prefix + documents[0],)
+            pad_to_max_length = False
+        else:
+            raise ValueError(
+                " `documents[0]`: {} have the wrong format. The should be either of type `str` or type `list`".format(
+                    documents[0]
+                )
+            )
+
+        with self.device_placement():
+            inputs = self._parse_and_tokenize(*documents, pad_to_max_length=pad_to_max_length)
+
+            if self.framework == "pt":
+                inputs = self.ensure_tensor_on_device(**inputs)
+                input_length = inputs["input_ids"].shape[-1]
+            elif self.framework == "tf":
+                input_length = tf.shape(inputs["input_ids"])[-1]
+
+            if input_length < self.model.config.min_length // 2:
+                logger.warning(
+                    "Your min_length is set to {}, but you input_length is only {}. You might consider decreasing min_length in config and insert config manually".format(
+                        self.model.config.min_length, input_length
+                    )
+                )
+
+            if input_length < self.model.config.max_length:
+                logger.warning(
+                    "Your max_length is set to {}, but you input_length is only {}. You might consider decreasing max_length in config and insert config manually".format(
+                        self.model.config.max_length, input_length
+                    )
+                )
+
+            summaries = self.model.generate(
+                inputs["input_ids"], attention_mask=inputs["attention_mask"], **generate_kwargs,
+            )
+
             results = []
             for summary in summaries:
                 record = {}
@@ -1266,8 +1316,8 @@ SUPPORTED_TASKS = {
     },
     "summarization": {
         "impl": SummarizationPipeline,
-        "pt": BartForConditionalGeneration if is_torch_available() else None,
-        "tf": None,
+        "tf": TFAutoModelWithLMHead if is_tf_available() else None,
+        "pt": AutoModelWithLMHead if is_torch_available() else None,
         "default": {
             "model": {"pt": "bart-large-cnn", "tf": None},
             "config": None,
@@ -1361,7 +1411,7 @@ def pipeline(
     framework = framework or get_framework(model)
 
     targeted_task = SUPPORTED_TASKS[task]
-    task, model_class = targeted_task["impl"], targeted_task[framework]
+    task_class, model_class = targeted_task["impl"], targeted_task[framework]
 
     # Use default model/config/tokenizer for the task if no model is provided
     if model is None:
@@ -1422,4 +1472,4 @@ def pipeline(
             )
         model = model_class.from_pretrained(model, config=config, **model_kwargs)
 
-    return task(model=model, tokenizer=tokenizer, modelcard=modelcard, framework=framework, **kwargs)
+    return task_class(model=model, tokenizer=tokenizer, modelcard=modelcard, framework=framework, task=task, **kwargs)
diff --git a/tests/test_pipelines.py b/tests/test_pipelines.py
index fb5e78ae9a..25d82d2adb 100644
--- a/tests/test_pipelines.py
+++ b/tests/test_pipelines.py
@@ -78,6 +78,9 @@ TF_FILL_MASK_FINETUNED_MODELS = [
     (("distilroberta-base", {"use_fast": False}), "distilroberta-base", None),
 ]
 
+SUMMARIZATION_FINETUNED_MODELS = {("bart-large-cnn", "bart-large-cnn"), ("t5-small", "t5-small")}
+TF_SUMMARIZATION_FINETUNED_MODELS = {("t5-small", "t5-small")}
+
 
 class MonoColumnInputTestCase(unittest.TestCase):
     def _test_mono_column_pipeline(
@@ -252,10 +255,22 @@ class MonoColumnInputTestCase(unittest.TestCase):
         valid_inputs = ["A string like this", ["list of strings entry 1", "list of strings v2"]]
         invalid_inputs = [4, "<mask>"]
         mandatory_keys = ["summary_text"]
-        nlp = pipeline(task="summarization")
-        self._test_mono_column_pipeline(
-            nlp, valid_inputs, invalid_inputs, mandatory_keys,
-        )
+        for model, tokenizer in SUMMARIZATION_FINETUNED_MODELS:
+            nlp = pipeline(task="summarization", model=model, tokenizer=tokenizer)
+            self._test_mono_column_pipeline(
+                nlp, valid_inputs, invalid_inputs, mandatory_keys,
+            )
+
+    @require_tf
+    def test_tf_summarization(self):
+        valid_inputs = ["A string like this", ["list of strings entry 1", "list of strings v2"]]
+        invalid_inputs = [4, "<mask>"]
+        mandatory_keys = ["summary_text"]
+        for model, tokenizer in TF_SUMMARIZATION_FINETUNED_MODELS:  # each entry is a (model, tokenizer) name pair
+            nlp = pipeline(task="summarization", model=model, tokenizer=tokenizer, framework="tf")
+            self._test_mono_column_pipeline(
+                nlp, valid_inputs, invalid_inputs, mandatory_keys,
+            )
 
 
 class MultiColumnInputTestCase(unittest.TestCase):