From 66336dc18374cdba550759cc923c36217159d4c9 Mon Sep 17 00:00:00 2001
From: Sijun He <sijun.he@hotmail.com>
Date: Mon, 13 Jun 2022 19:49:44 +0800
Subject: [PATCH] Add Visual Question Answering (VQA) pipeline (#17286)

* wip

* rebase

* all tests pass

* rebase

* ready for PR

* address comments

* fix styles

* add require_torch to pipeline test

* remove remote image to improve CI consistency

* address comments; fix tf/flax tests

* address comments; fix tf/flax tests

* fix tests; add alias

* repo consistency tests

* Update src/transformers/pipelines/visual_question_answering.py

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>

* address comments

* Update src/transformers/pipelines/visual_question_answering.py

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>

* merge

* Update src/transformers/models/auto/modeling_auto.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* merge

Co-authored-by: Sijun He <sijunhe@Sijuns-MacBook-Pro.local>
Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>
Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
---
 docs/source/en/main_classes/pipelines.mdx     |   7 ++
 docs/source/en/model_doc/auto.mdx             |   4 +
 src/transformers/__init__.py                  |   6 +
 src/transformers/models/auto/__init__.py      |   4 +
 .../models/auto/feature_extraction_auto.py    |   1 +
 src/transformers/models/auto/modeling_auto.py |  20 +++
 .../models/auto/tokenization_auto.py          |   1 +
 src/transformers/pipelines/__init__.py        |  17 +++
 .../pipelines/visual_question_answering.py    | 115 ++++++++++++++++++
 src/transformers/utils/dummy_pt_objects.py    |  10 ++
 ...est_pipelines_visual_question_answering.py | 115 ++++++++++++++++++
 11 files changed, 300 insertions(+)
 create mode 100644 src/transformers/pipelines/visual_question_answering.py
 create mode 100644 tests/pipelines/test_pipelines_visual_question_answering.py

diff --git a/docs/source/en/main_classes/pipelines.mdx b/docs/source/en/main_classes/pipelines.mdx
index af82d16750..1dc76e67cd 100644
--- a/docs/source/en/main_classes/pipelines.mdx
+++ b/docs/source/en/main_classes/pipelines.mdx
@@ -38,6 +38,7 @@ There are two categories of pipeline abstractions to be aware about:
   - [`Text2TextGenerationPipeline`]
   - [`TokenClassificationPipeline`]
   - [`TranslationPipeline`]
+  - [`VisualQuestionAnsweringPipeline`]
   - [`ZeroShotClassificationPipeline`]
   - [`ZeroShotImageClassificationPipeline`]
 
@@ -423,6 +424,12 @@ See [`TokenClassificationPipeline`] for all details.
     - __call__
     - all
 
+### VisualQuestionAnsweringPipeline
+
+[[autodoc]] VisualQuestionAnsweringPipeline
+    - __call__
+    - all
+
 ### ZeroShotClassificationPipeline
 
 [[autodoc]] ZeroShotClassificationPipeline
diff --git a/docs/source/en/model_doc/auto.mdx b/docs/source/en/model_doc/auto.mdx
index 4a4b59e9c1..6c32166389 100644
--- a/docs/source/en/model_doc/auto.mdx
+++ b/docs/source/en/model_doc/auto.mdx
@@ -122,6 +122,10 @@ Likewise, if your `NewModel` is a subclass of [`PreTrainedModel`], make sure its
 
 [[autodoc]] AutoModelForVision2Seq
 
+## AutoModelForVisualQuestionAnswering
+
+[[autodoc]] AutoModelForVisualQuestionAnswering
+
 ## AutoModelForAudioClassification
 
 [[autodoc]] AutoModelForAudioClassification
diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index a47754e38e..b55ad04dc7 100755
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -377,6 +377,7 @@ _import_structure = {
         "TextGenerationPipeline",
         "TokenClassificationPipeline",
         "TranslationPipeline",
+        "VisualQuestionAnsweringPipeline",
         "ZeroShotClassificationPipeline",
         "ZeroShotImageClassificationPipeline",
         "pipeline",
@@ -758,6 +759,7 @@ else:
             "MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING",
             "MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING",
             "MODEL_FOR_VISION_2_SEQ_MAPPING",
+            "MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING",
             "MODEL_MAPPING",
             "MODEL_WITH_LM_HEAD_MAPPING",
             "AutoModel",
@@ -783,6 +785,7 @@ else:
             "AutoModelForTableQuestionAnswering",
             "AutoModelForTokenClassification",
             "AutoModelForVision2Seq",
+            "AutoModelForVisualQuestionAnswering",
             "AutoModelWithLMHead",
         ]
     )
@@ -2961,6 +2964,7 @@ if TYPE_CHECKING:
         TextGenerationPipeline,
         TokenClassificationPipeline,
         TranslationPipeline,
+        VisualQuestionAnsweringPipeline,
         ZeroShotClassificationPipeline,
         ZeroShotImageClassificationPipeline,
         pipeline,
@@ -3291,6 +3295,7 @@ if TYPE_CHECKING:
             MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING,
             MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
             MODEL_FOR_VISION_2_SEQ_MAPPING,
+            MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING,
             MODEL_MAPPING,
             MODEL_WITH_LM_HEAD_MAPPING,
             AutoModel,
@@ -3316,6 +3321,7 @@ if TYPE_CHECKING:
             AutoModelForTableQuestionAnswering,
             AutoModelForTokenClassification,
             AutoModelForVision2Seq,
+            AutoModelForVisualQuestionAnswering,
             AutoModelWithLMHead,
         )
         from .models.bart import (
diff --git a/src/transformers/models/auto/__init__.py b/src/transformers/models/auto/__init__.py
index c3efdde105..09961bae14 100644
--- a/src/transformers/models/auto/__init__.py
+++ b/src/transformers/models/auto/__init__.py
@@ -64,6 +64,7 @@ else:
         "MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING",
         "MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING",
         "MODEL_FOR_VISION_2_SEQ_MAPPING",
+        "MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING",
         "MODEL_MAPPING",
         "MODEL_WITH_LM_HEAD_MAPPING",
         "AutoModel",
@@ -89,6 +90,7 @@ else:
         "AutoModelForTableQuestionAnswering",
         "AutoModelForTokenClassification",
         "AutoModelForVision2Seq",
+        "AutoModelForVisualQuestionAnswering",
         "AutoModelWithLMHead",
     ]
 
@@ -202,6 +204,7 @@ if TYPE_CHECKING:
             MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING,
             MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
             MODEL_FOR_VISION_2_SEQ_MAPPING,
+            MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING,
             MODEL_MAPPING,
             MODEL_WITH_LM_HEAD_MAPPING,
             AutoModel,
@@ -227,6 +230,7 @@ if TYPE_CHECKING:
             AutoModelForTableQuestionAnswering,
             AutoModelForTokenClassification,
             AutoModelForVision2Seq,
+            AutoModelForVisualQuestionAnswering,
             AutoModelWithLMHead,
         )
 
diff --git a/src/transformers/models/auto/feature_extraction_auto.py b/src/transformers/models/auto/feature_extraction_auto.py
index 0f970b938c..d87e5752c0 100644
--- a/src/transformers/models/auto/feature_extraction_auto.py
+++ b/src/transformers/models/auto/feature_extraction_auto.py
@@ -64,6 +64,7 @@ FEATURE_EXTRACTOR_MAPPING_NAMES = OrderedDict(
         ("speech_to_text", "Speech2TextFeatureExtractor"),
         ("swin", "ViTFeatureExtractor"),
         ("van", "ConvNextFeatureExtractor"),
+        ("vilt", "ViltFeatureExtractor"),
         ("vit", "ViTFeatureExtractor"),
         ("vit_mae", "ViTFeatureExtractor"),
         ("wav2vec2", "Wav2Vec2FeatureExtractor"),
diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py
index bda7009c1e..3f2854f913 100644
--- a/src/transformers/models/auto/modeling_auto.py
+++ b/src/transformers/models/auto/modeling_auto.py
@@ -548,6 +548,12 @@ MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING_NAMES = OrderedDict(
     ]
 )
 
+MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING_NAMES = OrderedDict(
+    [
+        ("vilt", "ViltForQuestionAnswering"),
+    ]
+)
+
 MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
     [
         # Model for Token Classification mapping
@@ -706,6 +712,9 @@ MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING = _LazyAutoMapping(
     CONFIG_MAPPING_NAMES, MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING_NAMES
 )
 MODEL_FOR_VISION_2_SEQ_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES)
+MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING = _LazyAutoMapping(
+    CONFIG_MAPPING_NAMES, MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING_NAMES
+)
 MODEL_FOR_MASKED_LM_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FOR_MASKED_LM_MAPPING_NAMES)
 MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING = _LazyAutoMapping(
     CONFIG_MAPPING_NAMES, MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING_NAMES
@@ -813,6 +822,17 @@ AutoModelForTableQuestionAnswering = auto_class_update(
 )
 
 
+class AutoModelForVisualQuestionAnswering(_BaseAutoModelClass):
+    _model_mapping = MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING
+
+
+AutoModelForVisualQuestionAnswering = auto_class_update(
+    AutoModelForVisualQuestionAnswering,
+    head_doc="visual question answering",
+    checkpoint_for_example="dandelin/vilt-b32-finetuned-vqa",
+)
+
+
 class AutoModelForTokenClassification(_BaseAutoModelClass):
     _model_mapping = MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING
 
diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py
index 5980eed726..49a1d7f822 100644
--- a/src/transformers/models/auto/tokenization_auto.py
+++ b/src/transformers/models/auto/tokenization_auto.py
@@ -229,6 +229,7 @@ else:
             ("tapas", ("TapasTokenizer", None)),
             ("tapex", ("TapexTokenizer", None)),
             ("transfo-xl", ("TransfoXLTokenizer", None)),
+            ("vilt", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
             ("visual_bert", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
             ("wav2vec2", ("Wav2Vec2CTCTokenizer", None)),
             ("wav2vec2-conformer", ("Wav2Vec2CTCTokenizer", None)),
diff --git a/src/transformers/pipelines/__init__.py b/src/transformers/pipelines/__init__.py
index 802eee9975..d5c59f3078 100755
--- a/src/transformers/pipelines/__init__.py
+++ b/src/transformers/pipelines/__init__.py
@@ -61,6 +61,7 @@ from .token_classification import (
     TokenClassificationArgumentHandler,
     TokenClassificationPipeline,
 )
+from .visual_question_answering import VisualQuestionAnsweringPipeline
 from .zero_shot_classification import ZeroShotClassificationArgumentHandler, ZeroShotClassificationPipeline
 from .zero_shot_image_classification import ZeroShotImageClassificationPipeline
 
@@ -94,6 +95,7 @@ if is_torch_available():
         MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
         MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING,
         MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
+        MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING,
         AutoModel,
         AutoModelForAudioClassification,
         AutoModelForCausalLM,
@@ -109,6 +111,7 @@ if is_torch_available():
         AutoModelForSpeechSeq2Seq,
         AutoModelForTableQuestionAnswering,
         AutoModelForTokenClassification,
+        AutoModelForVisualQuestionAnswering,
     )
 if TYPE_CHECKING:
     from ..modeling_tf_utils import TFPreTrainedModel
@@ -121,6 +124,7 @@ logger = logging.get_logger(__name__)
 TASK_ALIASES = {
     "sentiment-analysis": "text-classification",
     "ner": "token-classification",
+    "vqa": "visual-question-answering",
 }
 SUPPORTED_TASKS = {
     "audio-classification": {
@@ -190,6 +194,19 @@ SUPPORTED_TASKS = {
         },
         "type": "text",
     },
+    "visual-question-answering": {
+        "impl": VisualQuestionAnsweringPipeline,
+        "pt": (AutoModelForVisualQuestionAnswering,) if is_torch_available() else (),
+        "tf": (),
+        "default": {
+            "model": {
+                "pt": "dandelin/vilt-b32-finetuned-vqa",
+                "tokenizer": "dandelin/vilt-b32-finetuned-vqa",
+                "feature_extractor": "dandelin/vilt-b32-finetuned-vqa",
+            },
+        },
+        "type": "multimodal",
+    },
     "fill-mask": {
         "impl": FillMaskPipeline,
         "tf": (TFAutoModelForMaskedLM,) if is_tf_available() else (),
diff --git a/src/transformers/pipelines/visual_question_answering.py b/src/transformers/pipelines/visual_question_answering.py
new file mode 100644
index 0000000000..34a7a3b10d
--- /dev/null
+++ b/src/transformers/pipelines/visual_question_answering.py
@@ -0,0 +1,115 @@
+from typing import Union
+
+from ..utils import add_end_docstrings, is_torch_available, is_vision_available, logging
+from .base import PIPELINE_INIT_ARGS, Pipeline
+
+
+if is_vision_available():
+    from PIL import Image
+
+    from ..image_utils import load_image
+
+if is_torch_available():
+    from ..models.auto.modeling_auto import MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING
+
+logger = logging.get_logger(__name__)
+
+
+@add_end_docstrings(PIPELINE_INIT_ARGS)
+class VisualQuestionAnsweringPipeline(Pipeline):
+    """
+    Visual Question Answering pipeline using a `AutoModelForVisualQuestionAnswering`. This pipeline is currently only
+    available in PyTorch.
+
+    This visual question answering pipeline can currently be loaded from [`pipeline`] using the following task
+    identifiers: `"visual-question-answering", "vqa"`.
+
+    The models that this pipeline can use are models that have been fine-tuned on a visual question answering task. See
+    the up-to-date list of available models on
+    [huggingface.co/models](https://huggingface.co/models?filter=visual-question-answering).
+    """
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.check_model_type(MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING)
+
+    def _sanitize_parameters(self, top_k=None, padding=None, truncation=None, **kwargs):
+        preprocess_params, postprocess_params = {}, {}
+        if padding is not None:
+            preprocess_params["padding"] = padding
+        if truncation is not None:
+            preprocess_params["truncation"] = truncation
+        if top_k is not None:
+            postprocess_params["top_k"] = top_k
+        return preprocess_params, {}, postprocess_params
+
+    def __call__(self, image: Union["Image.Image", str], question: str = None, **kwargs):
+        r"""
+        Answers open-ended questions about images. The pipeline accepts several types of inputs which are detailed
+        below:
+
+        - `pipeline(image=image, question=question)`
+        - `pipeline({"image": image, "question": question})`
+        - `pipeline([{"image": image, "question": question}])`
+        - `pipeline([{"image": image, "question": question}, {"image": image, "question": question}])`
+
+        Args:
+            image (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`):
+                The pipeline handles three types of images:
+
+                - A string containing a http link pointing to an image
+                - A string containing a local path to an image
+                - An image loaded in PIL directly
+
+                The pipeline accepts either a single image or a batch of images. If given a single image, it can be
+                broadcasted to multiple questions.
+            question (`str`, `List[str]`):
+                The question(s) asked. If given a single question, it can be broadcasted to multiple images.
+            top_k (`int`, *optional*, defaults to 5):
+                The number of top labels that will be returned by the pipeline. If the provided number is higher than
+                the number of labels available in the model configuration, it will default to the number of labels.
+        Return:
+            A dictionary or a list of dictionaries containing the result. The dictionaries contain the following keys:
+
+            - **label** (`str`) -- The label identified by the model.
+            - **score** (`int`) -- The score attributed by the model for that label.
+        """
+        if isinstance(image, (Image.Image, str)) and isinstance(question, str):
+            inputs = {"image": image, "question": question}
+        else:
+            """
+            Supports the following format
+            - {"image": image, "question": question}
+            - [{"image": image, "question": question}]
+            - Generator and datasets
+            """
+            inputs = image
+        results = super().__call__(inputs, **kwargs)
+        return results
+
+    def preprocess(self, inputs, padding=False, truncation=False):
+        image = load_image(inputs["image"])
+        model_inputs = self.tokenizer(
+            inputs["question"], return_tensors=self.framework, padding=padding, truncation=truncation
+        )
+        image_features = self.feature_extractor(images=image, return_tensors=self.framework)
+        model_inputs.update(image_features)
+        return model_inputs
+
+    def _forward(self, model_inputs):
+        model_outputs = self.model(**model_inputs)
+        return model_outputs
+
+    def postprocess(self, model_outputs, top_k=5):
+        if top_k > self.model.config.num_labels:
+            top_k = self.model.config.num_labels
+
+        if self.framework == "pt":
+            probs = model_outputs.logits.sigmoid()[0]
+            scores, ids = probs.topk(top_k)
+        else:
+            raise ValueError(f"Unsupported framework: {self.framework}")
+
+        scores = scores.tolist()
+        ids = ids.tolist()
+        return [{"score": score, "answer": self.model.config.id2label[_id]} for score, _id in zip(scores, ids)]
diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py
index 334dd33f87..2ba91c10ab 100644
--- a/src/transformers/utils/dummy_pt_objects.py
+++ b/src/transformers/utils/dummy_pt_objects.py
@@ -409,6 +409,9 @@ MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING = None
 MODEL_FOR_VISION_2_SEQ_MAPPING = None
 
 
+MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING = None
+
+
 MODEL_MAPPING = None
 
 
@@ -576,6 +579,13 @@ class AutoModelForVision2Seq(metaclass=DummyObject):
         requires_backends(self, ["torch"])
 
 
+class AutoModelForVisualQuestionAnswering(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
 class AutoModelWithLMHead(metaclass=DummyObject):
     _backends = ["torch"]
 
diff --git a/tests/pipelines/test_pipelines_visual_question_answering.py b/tests/pipelines/test_pipelines_visual_question_answering.py
new file mode 100644
index 0000000000..d3315681f4
--- /dev/null
+++ b/tests/pipelines/test_pipelines_visual_question_answering.py
@@ -0,0 +1,115 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+from transformers import MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING, is_vision_available
+from transformers.pipelines import pipeline
+from transformers.testing_utils import (
+    is_pipeline_test,
+    nested_simplify,
+    require_tf,
+    require_torch,
+    require_vision,
+    slow,
+)
+
+from .test_pipelines_common import ANY, PipelineTestCaseMeta
+
+
+if is_vision_available():
+    from PIL import Image
+else:
+
+    class Image:
+        @staticmethod
+        def open(*args, **kwargs):
+            pass
+
+
+@is_pipeline_test
+@require_torch
+@require_vision
+class VisualQuestionAnsweringPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):
+    model_mapping = MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING
+
+    def get_test_pipeline(self, model, tokenizer, feature_extractor):
+        vqa_pipeline = pipeline("visual-question-answering", model="hf-internal-testing/tiny-vilt-random-vqa")
+        examples = [
+            {
+                "image": Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png"),
+                "question": "How many cats are there?",
+            },
+            {
+                "image": "./tests/fixtures/tests_samples/COCO/000000039769.png",
+                "question": "How many cats are there?",
+            },
+        ]
+        return vqa_pipeline, examples
+
+    def run_pipeline_test(self, vqa_pipeline, examples):
+        outputs = vqa_pipeline(examples, top_k=1)
+        self.assertEqual(
+            outputs,
+            [
+                [{"score": ANY(float), "answer": ANY(str)}],
+                [{"score": ANY(float), "answer": ANY(str)}],
+            ],
+        )
+
+    @require_torch
+    def test_small_model_pt(self):
+        vqa_pipeline = pipeline("visual-question-answering", model="hf-internal-testing/tiny-vilt-random-vqa")
+        image = "./tests/fixtures/tests_samples/COCO/000000039769.png"
+        question = "How many cats are there?"
+
+        outputs = vqa_pipeline(image=image, question="How many cats are there?", top_k=2)
+        self.assertEqual(
+            outputs, [{"score": ANY(float), "answer": ANY(str)}, {"score": ANY(float), "answer": ANY(str)}]
+        )
+
+        outputs = vqa_pipeline({"image": image, "question": question}, top_k=2)
+        self.assertEqual(
+            outputs, [{"score": ANY(float), "answer": ANY(str)}, {"score": ANY(float), "answer": ANY(str)}]
+        )
+
+    @slow
+    @require_torch
+    def test_large_model_pt(self):
+        vqa_pipeline = pipeline("visual-question-answering", model="dandelin/vilt-b32-finetuned-vqa")
+        image = "./tests/fixtures/tests_samples/COCO/000000039769.png"
+        question = "How many cats are there?"
+
+        outputs = vqa_pipeline(image=image, question=question, top_k=2)
+        self.assertEqual(
+            nested_simplify(outputs, decimals=4), [{"score": 0.8799, "answer": "2"}, {"score": 0.296, "answer": "1"}]
+        )
+
+        outputs = vqa_pipeline({"image": image, "question": question}, top_k=2)
+        self.assertEqual(
+            nested_simplify(outputs, decimals=4), [{"score": 0.8799, "answer": "2"}, {"score": 0.296, "answer": "1"}]
+        )
+
+        outputs = vqa_pipeline(
+            [{"image": image, "question": question}, {"image": image, "question": question}], top_k=2
+        )
+        self.assertEqual(
+            nested_simplify(outputs, decimals=4),
+            [[{"score": 0.8799, "answer": "2"}, {"score": 0.296, "answer": "1"}]] * 2,
+        )
+
+    @require_tf
+    @unittest.skip("Visual question answering not implemented in TF")
+    def test_small_model_tf(self):
+        pass