Adds translation pipeline (#3419)

* fix merge conflicts * add t5 summarization example * change parameters for t5 summarization * make style * add first code snippet for translation * only add prefixes * add prefix patterns * make style * renaming * fix conflicts * remove unused patterns * solve conflicts * fix merge conflicts * remove translation example * remove summarization example * make sure tensors are in numpy for float comparsion * re-add t5 config * fix t5 import config typo * make style * remove unused numpy statements * update doctstring * import translation pipeline
2020-03-26 13:50:58 +01:00
parent 3c5c567507
commit 022e8fab97
3 changed files with 187 additions and 10 deletions
--- a/src/transformers/init.py
+++ b/src/transformers/init.py
@@ -116,6 +116,7 @@ from .pipelines import (
    SummarizationPipeline,
    TextClassificationPipeline,
    TokenClassificationPipeline,
+    TranslationPipeline,
    pipeline,
 )
 from .tokenization_albert import AlbertTokenizer
--- a/src/transformers/pipelines.py
+++ b/src/transformers/pipelines.py
@@ -130,7 +130,9 @@ class PipelineDataFormat:

    SUPPORTED_FORMATS = ["json", "csv", "pipe"]

-    def __init__(self, output_path: Optional[str], input_path: Optional[str], column: Optional[str], overwrite=False):
+    def __init__(
+        self, output_path: Optional[str], input_path: Optional[str], column: Optional[str], overwrite=False,
+    ):
        self.output_path = output_path
        self.input_path = input_path
        self.column = column.split(",") if column is not None else [""]
@@ -176,7 +178,7 @@ class PipelineDataFormat:

    @staticmethod
    def from_str(
-        format: str, output_path: Optional[str], input_path: Optional[str], column: Optional[str], overwrite=False
+        format: str, output_path: Optional[str], input_path: Optional[str], column: Optional[str], overwrite=False,
    ):
        if format == "json":
            return JsonPipelineDataFormat(output_path, input_path, column, overwrite=overwrite)
@@ -189,7 +191,9 @@ class PipelineDataFormat:


 class CsvPipelineDataFormat(PipelineDataFormat):
-    def __init__(self, output_path: Optional[str], input_path: Optional[str], column: Optional[str], overwrite=False):
+    def __init__(
+        self, output_path: Optional[str], input_path: Optional[str], column: Optional[str], overwrite=False,
+    ):
        super().__init__(output_path, input_path, column, overwrite=overwrite)

    def __iter__(self):
@@ -210,7 +214,9 @@ class CsvPipelineDataFormat(PipelineDataFormat):


 class JsonPipelineDataFormat(PipelineDataFormat):
-    def __init__(self, output_path: Optional[str], input_path: Optional[str], column: Optional[str], overwrite=False):
+    def __init__(
+        self, output_path: Optional[str], input_path: Optional[str], column: Optional[str], overwrite=False,
+    ):
        super().__init__(output_path, input_path, column, overwrite=overwrite)

        with open(input_path, "r") as f:
@@ -1120,7 +1126,11 @@ class QuestionAnsweringPipeline(Pipeline):
            chars_idx += len(word) + 1

        # Join text with spaces
-        return {"answer": " ".join(words), "start": max(0, char_start_idx), "end": min(len(text), char_end_idx)}
+        return {
+            "answer": " ".join(words),
+            "start": max(0, char_start_idx),
+            "end": min(len(text), char_end_idx),
+        }


 class SummarizationPipeline(Pipeline):
@@ -1223,18 +1233,18 @@ class SummarizationPipeline(Pipeline):
                inputs = self.ensure_tensor_on_device(**inputs)
                input_length = inputs["input_ids"].shape[-1]
            elif self.framework == "tf":
-                input_length = tf.shape(inputs["input_ids"])[-1]
+                input_length = tf.shape(inputs["input_ids"])[-1].numpy()

            if input_length < self.model.config.min_length // 2:
                logger.warning(
-                    "Your min_length is set to {}, but you input_length is only {}. You might consider decreasing min_length in config and insert config manually".format(
+                    "Your min_length is set to {}, but you input_length is only {}. You might consider decreasing min_length manually, e.g. summarizer('...', min_length=10)".format(
                        self.model.config.min_length, input_length
                    )
                )

            if input_length < self.model.config.max_length:
                logger.warning(
-                    "Your max_length is set to {}, but you input_length is only {}. You might consider decreasing max_length in config and insert config manually".format(
+                    "Your max_length is set to {}, but you input_length is only {}. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)".format(
                        self.model.config.max_length, input_length
                    )
                )
@@ -1250,7 +1260,115 @@ class SummarizationPipeline(Pipeline):
                    record["summary_token_ids"] = summary
                if return_text:
                    record["summary_text"] = self.tokenizer.decode(
-                        summary, skip_special_tokens=True, clean_up_tokenization_spaces=clean_up_tokenization_spaces
+                        summary, skip_special_tokens=True, clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+                    )
+                results.append(record)
+            return results
+
+
+class TranslationPipeline(Pipeline):
+    """
+    Translates from one language to another.
+
+    Usage::
+        en_fr_translator = pipeline("translation_en_to_fr")
+        en_fr_translator("How old are you?")
+
+    Supported Models: "t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b"
+
+    Arguments:
+        model (:obj:`str` or :obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`, `optional`, defaults to :obj:`None`):
+            The model that will be used by the pipeline to make predictions. This can be :obj:`None`, a string
+            checkpoint identifier or an actual pre-trained model inheriting from
+            :class:`~transformers.PreTrainedModel` for PyTorch and :class:`~transformers.TFPreTrainedModel` for
+            TensorFlow.
+            If :obj:`None`, the default of the pipeline will be loaded.
+        tokenizer (:obj:`str` or :obj:`~transformers.PreTrainedTokenizer`, `optional`, defaults to :obj:`None`):
+            The tokenizer that will be used by the pipeline to encode data for the model. This can be :obj:`None`,
+            a string checkpoint identifier or an actual pre-trained tokenizer inheriting from
+            :class:`~transformers.PreTrainedTokenizer`.
+            If :obj:`None`, the default of the pipeline will be loaded.
+        modelcard (:obj:`str` or :class:`~transformers.ModelCard`, `optional`, defaults to :obj:`None`):
+            Model card attributed to the model for this pipeline.
+        framework (:obj:`str`, `optional`, defaults to :obj:`None`):
+            The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be
+            installed.
+            If no framework is specified, will default to the one currently installed. If no framework is specified
+            and both frameworks are installed, will default to PyTorch.
+        args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`):
+            Reference to the object in charge of parsing supplied pipeline parameters.
+        device (:obj:`int`, `optional`, defaults to :obj:`-1`):
+            Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, >=0 will run the model
+            on the associated CUDA device id.
+    """
+
+    def __call__(
+        self, *texts, return_tensors=False, return_text=True, clean_up_tokenization_spaces=False, **generate_kwargs
+    ):
+        r"""
+        Args:
+            *texts: (list of strings) articles to be summarized
+            return_text: (bool, default=True) whether to add a decoded "translation_text" to each result
+            return_tensors: (bool, default=False) whether to return the raw "translation_token_ids" to each result
+
+            **generate_kwargs: extra kwargs passed to `self.model.generate`_
+
+        Returns:
+            list of dicts with 'translation_text' and/or 'translation_token_ids' for each text_to_translate
+        .. _`self.model.generate`:
+            https://huggingface.co/transformers/model_doc/bart.html#transformers.BartForConditionalGeneration.generate
+        """
+        assert return_tensors or return_text, "You must specify return_tensors=True or return_text=True"
+
+        prefix = self.model.config.prefix if self.model.config.prefix is not None else ""
+
+        if isinstance(texts[0], list):
+            assert (
+                self.tokenizer.pad_token_id is not None
+            ), "Please make sure that the tokenizer has a pad_token_id when using a batch input"
+            texts = ([prefix + text for text in texts[0]],)
+            pad_to_max_length = True
+
+        elif isinstance(texts[0], str):
+            texts = (prefix + texts[0],)
+            pad_to_max_length = False
+        else:
+            raise ValueError(
+                " `documents[0]`: {} have the wrong format. The should be either of type `str` or type `list`".format(
+                    texts[0]
+                )
+            )
+
+        with self.device_placement():
+            inputs = self._parse_and_tokenize(*texts, pad_to_max_length=pad_to_max_length)
+
+            if self.framework == "pt":
+                inputs = self.ensure_tensor_on_device(**inputs)
+                input_length = inputs["input_ids"].shape[-1]
+
+            elif self.framework == "tf":
+                input_length = tf.shape(inputs["input_ids"])[-1].numpy()
+
+            if input_length > 0.9 * self.model.config.max_length:
+                logger.warning(
+                    "Your input_length: {} is bigger than 0.9 * max_length: {}. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)".format(
+                        input_length, self.model.config.max_length
+                    )
+                )
+
+            translations = self.model.generate(
+                inputs["input_ids"], attention_mask=inputs["attention_mask"], **generate_kwargs,
+            )
+            results = []
+            for translation in translations:
+                record = {}
+                if return_tensors:
+                    record["translation_token_ids"] = translation
+                if return_text:
+                    record["translation_text"] = self.tokenizer.decode(
+                        translation,
+                        skip_special_tokens=True,
+                        clean_up_tokenization_spaces=clean_up_tokenization_spaces,
                    )
                results.append(record)
            return results
@@ -1324,6 +1442,36 @@ SUPPORTED_TASKS = {
            "tokenizer": ("bart-large-cnn", {"use_fast": False}),
        },
    },
+    "translation_en_to_fr": {
+        "impl": TranslationPipeline,
+        "tf": TFAutoModelWithLMHead if is_tf_available() else None,
+        "pt": AutoModelWithLMHead if is_torch_available() else None,
+        "default": {
+            "model": {"pt": "t5-base", "tf": "t5-base"},
+            "config": None,
+            "tokenizer": ("t5-base", {"use_fast": False}),
+        },
+    },
+    "translation_en_to_de": {
+        "impl": TranslationPipeline,
+        "tf": TFAutoModelWithLMHead if is_tf_available() else None,
+        "pt": AutoModelWithLMHead if is_torch_available() else None,
+        "default": {
+            "model": {"pt": "t5-base", "tf": "t5-base"},
+            "config": None,
+            "tokenizer": ("t5-base", {"use_fast": False}),
+        },
+    },
+    "translation_en_to_ro": {
+        "impl": TranslationPipeline,
+        "tf": TFAutoModelWithLMHead if is_tf_available() else None,
+        "pt": AutoModelWithLMHead if is_torch_available() else None,
+        "default": {
+            "model": {"pt": "t5-base", "tf": "t5-base"},
+            "config": None,
+            "tokenizer": ("t5-base", {"use_fast": False}),
+        },
+    },
 }


@@ -1472,4 +1620,4 @@ def pipeline(
            )
        model = model_class.from_pretrained(model, config=config, **model_kwargs)

-    return task_class(model=model, tokenizer=tokenizer, modelcard=modelcard, framework=framework, task=task, **kwargs)
+    return task_class(model=model, tokenizer=tokenizer, modelcard=modelcard, framework=framework, task=task, **kwargs,)
--- a/tests/test_pipelines.py
+++ b/tests/test_pipelines.py
@@ -81,6 +81,12 @@ TF_FILL_MASK_FINETUNED_MODELS = [
 SUMMARIZATION_FINETUNED_MODELS = {("bart-large-cnn", "bart-large-cnn"), ("t5-small", "t5-small")}
 TF_SUMMARIZATION_FINETUNED_MODELS = {("t5-small", "t5-small")}

+TRANSLATION_FINETUNED_MODELS = {
+    ("t5-small", "t5-small", "translation_en_to_de"),
+    ("t5-small", "t5-small", "translation_en_to_ro"),
+}
+TF_TRANSLATION_FINETUNED_MODELS = {("t5-small", "t5-small", "translation_en_to_fr")}
+

 class MonoColumnInputTestCase(unittest.TestCase):
    def _test_mono_column_pipeline(
@@ -272,6 +278,28 @@ class MonoColumnInputTestCase(unittest.TestCase):
                nlp, valid_inputs, invalid_inputs, mandatory_keys,
            )

+    @require_torch
+    def test_translation(self):
+        valid_inputs = ["A string like this", ["list of strings entry 1", "list of strings v2"]]
+        invalid_inputs = [4, "<mask>"]
+        mandatory_keys = ["translation_text"]
+        for model, tokenizer, task in TRANSLATION_FINETUNED_MODELS:
+            nlp = pipeline(task=task, model=model, tokenizer=tokenizer)
+            self._test_mono_column_pipeline(
+                nlp, valid_inputs, invalid_inputs, mandatory_keys,
+            )
+
+    @require_tf
+    def test_tf_translation(self):
+        valid_inputs = ["A string like this", ["list of strings entry 1", "list of strings v2"]]
+        invalid_inputs = [4, "<mask>"]
+        mandatory_keys = ["translation_text"]
+        for model, tokenizer, task in TF_TRANSLATION_FINETUNED_MODELS:
+            nlp = pipeline(task=task, model=model, tokenizer=tokenizer, framework="tf")
+            self._test_mono_column_pipeline(
+                nlp, valid_inputs, invalid_inputs, mandatory_keys,
+            )
+

 class MultiColumnInputTestCase(unittest.TestCase):
    def _test_multicolumn_pipeline(self, nlp, valid_inputs: list, invalid_inputs: list, output_keys: Iterable[str]):