Add Image To Text Generation pipeline (#18821)

* Add Image2TextGenerationPipeline to supported pipelines
* Add Flax and Tensorflow support
* Add Flax and Tensorflow small tests
* Add default model for Tensorflow
* Add docstring
* Fix doc style
* Add tiny models for pytorch and flax
* Remove flax from pipeline. Fix tests
* Use ydshieh/vit-gpt2-coco-en as a default for both PyTorch and Tensorflow
* Fix Tensorflow support

Co-authored-by: Olivier Dehaene <olivier@huggingface.co>
2022-09-01 18:07:14 +02:00
parent c61f116b63
commit ddb69e5af8
5 changed files with 292 additions and 1 deletions
--- a/tests/pipelines/test_pipelines_image2text_generation.py
+++ b/tests/pipelines/test_pipelines_image2text_generation.py
@@ -0,0 +1,171 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+from transformers import MODEL_FOR_VISION_2_SEQ_MAPPING, TF_MODEL_FOR_VISION_2_SEQ_MAPPING, is_vision_available
+from transformers.pipelines import pipeline
+from transformers.testing_utils import is_pipeline_test, require_tf, require_torch, require_vision, slow
+
+from .test_pipelines_common import ANY, PipelineTestCaseMeta
+
+
+# PIL is optional: when vision support is missing, define a placeholder `Image` so the
+# module-level references to `Image.open` below still resolve at import time.
+if not is_vision_available():
+
+    class Image:
+        """Placeholder for `PIL.Image` that only provides a no-op `open`."""
+
+        @staticmethod
+        def open(*args, **kwargs):
+            """Accept any arguments and return None instead of loading an image."""
+            return None
+
+else:
+    from PIL import Image
+
+
+@is_pipeline_test
+@require_vision
+class Image2TextGenerationPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):
+    """Tests for the `image2text-generation` pipeline with PyTorch and TensorFlow checkpoints.
+
+    `get_test_pipeline` and `run_pipeline_test` are not named `test_*`; they are presumably
+    driven by `PipelineTestCaseMeta` using the two mappings below (confirm in
+    `test_pipelines_common`). The `test_*` methods check exact generations for fixed checkpoints.
+    """
+
+    model_mapping = MODEL_FOR_VISION_2_SEQ_MAPPING
+    tf_model_mapping = TF_MODEL_FOR_VISION_2_SEQ_MAPPING
+
+    def get_test_pipeline(self, model, tokenizer, feature_extractor):
+        """Build the pipeline under test together with sample inputs.
+
+        Returns:
+            A `(pipe, examples)` tuple where `examples` holds the same COCO fixture twice:
+            once as an opened image object and once as a file path.
+        """
+        pipe = pipeline("image2text-generation", model=model, tokenizer=tokenizer, feature_extractor=feature_extractor)
+        examples = [
+            Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png"),
+            "./tests/fixtures/tests_samples/COCO/000000039769.png",
+        ]
+        return pipe, examples
+
+    def run_pipeline_test(self, pipe, examples):
+        """Check only the output structure: one single-entry list of generations per example."""
+        outputs = pipe(examples)
+        self.assertEqual(
+            outputs,
+            [
+                [{"generated_text": ANY(str)}],
+                [{"generated_text": ANY(str)}],
+            ],
+        )
+
+    @require_tf
+    def test_small_model_tf(self):
+        """Check the exact generation of the tiny random checkpoint when run with TensorFlow."""
+        # Pin the backend: without `framework`, `pipeline` picks one itself, so an environment
+        # that also has PyTorch installed may not exercise the TensorFlow model at all.
+        pipe = pipeline("image2text-generation", model="hf-internal-testing/tiny-random-vit-gpt2", framework="tf")
+        image = "./tests/fixtures/tests_samples/COCO/000000039769.png"
+        expected = {
+            "generated_text": (
+                " intermedi intermedi intermedi intermedi intermedi "
+                "explorer explorer explorer explorer explorer explorer "
+                "explorer medicine medicine medicine medicine medicine "
+                "medicine medicine"
+            )
+        }
+
+        # A single image gives a flat list of generations.
+        self.assertEqual(pipe(image), [expected])
+        # A list of images gives one list of generations per image.
+        self.assertEqual(pipe([image, image]), [[expected], [expected]])
+
+    @require_torch
+    def test_small_model_pt(self):
+        """Check the exact generation of the tiny random checkpoint when run with PyTorch."""
+        # Pinned for symmetry with the TensorFlow test so the backend never depends on the environment.
+        pipe = pipeline("image2text-generation", model="hf-internal-testing/tiny-random-vit-gpt2", framework="pt")
+        image = "./tests/fixtures/tests_samples/COCO/000000039769.png"
+        expected = {
+            "generated_text": "growthgrowthgrowthgrowthgrowthgrowthgrowthgrowthgrowthgrowthgrowthgrowthgrowthgrowthgrowthgrowthgrowthGOGO"
+        }
+
+        # A single image gives a flat list of generations.
+        self.assertEqual(pipe(image), [expected])
+        # A list of images gives one list of generations per image.
+        self.assertEqual(pipe([image, image]), [[expected], [expected]])
+
+    @slow
+    @require_torch
+    def test_large_model_pt(self):
+        """Check the caption produced by the full-size checkpoint when run with PyTorch."""
+        pipe = pipeline("image2text-generation", model="ydshieh/vit-gpt2-coco-en", framework="pt")
+        image = "./tests/fixtures/tests_samples/COCO/000000039769.png"
+        # The trailing space is part of the expected generation.
+        expected = {"generated_text": "a cat laying on a blanket next to a cat laying on a bed "}
+
+        self.assertEqual(pipe(image), [expected])
+        self.assertEqual(pipe([image, image]), [[expected], [expected]])
+
+    @slow
+    @require_tf
+    def test_large_model_tf(self):
+        """Check the caption produced by the full-size checkpoint when run with TensorFlow."""
+        # Pin the backend so this test cannot silently run the PyTorch model instead.
+        pipe = pipeline("image2text-generation", model="ydshieh/vit-gpt2-coco-en", framework="tf")
+        image = "./tests/fixtures/tests_samples/COCO/000000039769.png"
+        # The trailing space is part of the expected generation.
+        expected = {"generated_text": "a cat laying on a blanket next to a cat laying on a bed "}
+
+        self.assertEqual(pipe(image), [expected])
+        self.assertEqual(pipe([image, image]), [[expected], [expected]])