Add Visual Question Answering (VQA) pipeline (#17286)

* wip * rebase * all tests pass * rebase * ready for PR * address comments * fix styles * add require_torch to pipeline test * remove remote image to improve CI consistency * address comments; fix tf/flax tests * address comments; fix tf/flax tests * fix tests; add alias * repo consistency tests * Update src/transformers/pipelines/visual_question_answering.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> * address comments * Update src/transformers/pipelines/visual_question_answering.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> * merge * Update src/transformers/models/auto/modeling_auto.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * merge Co-authored-by: Sijun He <sijunhe@Sijuns-MacBook-Pro.local> Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com> Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2022-06-13 19:49:44 +08:00
parent a5282ab4bc
commit 66336dc183
11 changed files with 300 additions and 0 deletions
--- a/docs/source/en/main_classes/pipelines.mdx
+++ b/docs/source/en/main_classes/pipelines.mdx
@@ -38,6 +38,7 @@ There are two categories of pipeline abstractions to be aware about:
  - [`Text2TextGenerationPipeline`]
  - [`TokenClassificationPipeline`]
  - [`TranslationPipeline`]
  - [`VisualQuestionAnsweringPipeline`]
  - [`ZeroShotClassificationPipeline`]
  - [`ZeroShotImageClassificationPipeline`]
@@ -423,6 +424,12 @@ See [`TokenClassificationPipeline`] for all details.
    - __call__
    - all
 ### VisualQuestionAnsweringPipeline
 [[autodoc]] VisualQuestionAnsweringPipeline
    - __call__
    - all
 ### ZeroShotClassificationPipeline
 [[autodoc]] ZeroShotClassificationPipeline
--- a/docs/source/en/model_doc/auto.mdx
+++ b/docs/source/en/model_doc/auto.mdx
@@ -122,6 +122,10 @@ Likewise, if your `NewModel` is a subclass of [`PreTrainedModel`], make sure its
 [[autodoc]] AutoModelForVision2Seq
 ## AutoModelForVisualQuestionAnswering
 [[autodoc]] AutoModelForVisualQuestionAnswering
 ## AutoModelForAudioClassification
 [[autodoc]] AutoModelForAudioClassification
--- a/src/transformers/init.py
+++ b/src/transformers/init.py
@@ -377,6 +377,7 @@ _import_structure = {
        "TextGenerationPipeline",
        "TokenClassificationPipeline",
        "TranslationPipeline",
        "VisualQuestionAnsweringPipeline",
        "ZeroShotClassificationPipeline",
        "ZeroShotImageClassificationPipeline",
        "pipeline",
@@ -758,6 +759,7 @@ else:
            "MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING",
            "MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING",
            "MODEL_FOR_VISION_2_SEQ_MAPPING",
            "MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING",
            "MODEL_MAPPING",
            "MODEL_WITH_LM_HEAD_MAPPING",
            "AutoModel",
@@ -783,6 +785,7 @@ else:
            "AutoModelForTableQuestionAnswering",
            "AutoModelForTokenClassification",
            "AutoModelForVision2Seq",
            "AutoModelForVisualQuestionAnswering",
            "AutoModelWithLMHead",
        ]
    )
@@ -2961,6 +2964,7 @@ if TYPE_CHECKING:
        TextGenerationPipeline,
        TokenClassificationPipeline,
        TranslationPipeline,
        VisualQuestionAnsweringPipeline,
        ZeroShotClassificationPipeline,
        ZeroShotImageClassificationPipeline,
        pipeline,
@@ -3291,6 +3295,7 @@ if TYPE_CHECKING:
            MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING,
            MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
            MODEL_FOR_VISION_2_SEQ_MAPPING,
            MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING,
            MODEL_MAPPING,
            MODEL_WITH_LM_HEAD_MAPPING,
            AutoModel,
@@ -3316,6 +3321,7 @@ if TYPE_CHECKING:
            AutoModelForTableQuestionAnswering,
            AutoModelForTokenClassification,
            AutoModelForVision2Seq,
            AutoModelForVisualQuestionAnswering,
            AutoModelWithLMHead,
        )
        from .models.bart import (
--- a/src/transformers/models/auto/init.py
+++ b/src/transformers/models/auto/init.py
@@ -64,6 +64,7 @@ else:
        "MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING",
        "MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING",
        "MODEL_FOR_VISION_2_SEQ_MAPPING",
        "MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING",
        "MODEL_MAPPING",
        "MODEL_WITH_LM_HEAD_MAPPING",
        "AutoModel",
@@ -89,6 +90,7 @@ else:
        "AutoModelForTableQuestionAnswering",
        "AutoModelForTokenClassification",
        "AutoModelForVision2Seq",
        "AutoModelForVisualQuestionAnswering",
        "AutoModelWithLMHead",
    ]
@@ -202,6 +204,7 @@ if TYPE_CHECKING:
            MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING,
            MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
            MODEL_FOR_VISION_2_SEQ_MAPPING,
            MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING,
            MODEL_MAPPING,
            MODEL_WITH_LM_HEAD_MAPPING,
            AutoModel,
@@ -227,6 +230,7 @@ if TYPE_CHECKING:
            AutoModelForTableQuestionAnswering,
            AutoModelForTokenClassification,
            AutoModelForVision2Seq,
            AutoModelForVisualQuestionAnswering,
            AutoModelWithLMHead,
        )
--- a/src/transformers/models/auto/feature_extraction_auto.py
+++ b/src/transformers/models/auto/feature_extraction_auto.py
@@ -64,6 +64,7 @@ FEATURE_EXTRACTOR_MAPPING_NAMES = OrderedDict(
        ("speech_to_text", "Speech2TextFeatureExtractor"),
        ("swin", "ViTFeatureExtractor"),
        ("van", "ConvNextFeatureExtractor"),
        ("vilt", "ViltFeatureExtractor"),
        ("vit", "ViTFeatureExtractor"),
        ("vit_mae", "ViTFeatureExtractor"),
        ("wav2vec2", "Wav2Vec2FeatureExtractor"),
--- a/src/transformers/models/auto/modeling_auto.py
+++ b/src/transformers/models/auto/modeling_auto.py
@@ -548,6 +548,12 @@ MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING_NAMES = OrderedDict(
    ]
 )
 MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING_NAMES = OrderedDict(
    [
        ("vilt", "ViltForQuestionAnswering"),
    ]
 )
 MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
    [
        # Model for Token Classification mapping
@@ -706,6 +712,9 @@ MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING = _LazyAutoMapping(
    CONFIG_MAPPING_NAMES, MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING_NAMES
 )
 MODEL_FOR_VISION_2_SEQ_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES)
 MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING = _LazyAutoMapping(
    CONFIG_MAPPING_NAMES, MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING_NAMES
 )
 MODEL_FOR_MASKED_LM_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FOR_MASKED_LM_MAPPING_NAMES)
 MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING = _LazyAutoMapping(
    CONFIG_MAPPING_NAMES, MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING_NAMES
@@ -813,6 +822,17 @@ AutoModelForTableQuestionAnswering = auto_class_update(
 )
 class AutoModelForVisualQuestionAnswering(_BaseAutoModelClass):
    _model_mapping = MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING
 AutoModelForVisualQuestionAnswering = auto_class_update(
    AutoModelForVisualQuestionAnswering,
    head_doc="visual question answering",
    checkpoint_for_example="dandelin/vilt-b32-finetuned-vqa",
 )
 class AutoModelForTokenClassification(_BaseAutoModelClass):
    _model_mapping = MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING
--- a/src/transformers/models/auto/tokenization_auto.py
+++ b/src/transformers/models/auto/tokenization_auto.py
@@ -229,6 +229,7 @@ else:
            ("tapas", ("TapasTokenizer", None)),
            ("tapex", ("TapexTokenizer", None)),
            ("transfo-xl", ("TransfoXLTokenizer", None)),
            ("vilt", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
            ("visual_bert", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
            ("wav2vec2", ("Wav2Vec2CTCTokenizer", None)),
            ("wav2vec2-conformer", ("Wav2Vec2CTCTokenizer", None)),
--- a/src/transformers/pipelines/init.py
+++ b/src/transformers/pipelines/init.py
@@ -61,6 +61,7 @@ from .token_classification import (
    TokenClassificationArgumentHandler,
    TokenClassificationPipeline,
 )
 from .visual_question_answering import VisualQuestionAnsweringPipeline
 from .zero_shot_classification import ZeroShotClassificationArgumentHandler, ZeroShotClassificationPipeline
 from .zero_shot_image_classification import ZeroShotImageClassificationPipeline
@@ -94,6 +95,7 @@ if is_torch_available():
        MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
        MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING,
        MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
        MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING,
        AutoModel,
        AutoModelForAudioClassification,
        AutoModelForCausalLM,
@@ -109,6 +111,7 @@ if is_torch_available():
        AutoModelForSpeechSeq2Seq,
        AutoModelForTableQuestionAnswering,
        AutoModelForTokenClassification,
        AutoModelForVisualQuestionAnswering,
    )
 if TYPE_CHECKING:
    from ..modeling_tf_utils import TFPreTrainedModel
@@ -121,6 +124,7 @@ logger = logging.get_logger(__name__)
 TASK_ALIASES = {
    "sentiment-analysis": "text-classification",
    "ner": "token-classification",
    "vqa": "visual-question-answering",
 }
 SUPPORTED_TASKS = {
    "audio-classification": {
@@ -190,6 +194,19 @@ SUPPORTED_TASKS = {
        },
        "type": "text",
    },
    "visual-question-answering": {
        "impl": VisualQuestionAnsweringPipeline,
        "pt": (AutoModelForVisualQuestionAnswering,) if is_torch_available() else (),
        "tf": (),
        "default": {
            "model": {
                "pt": "dandelin/vilt-b32-finetuned-vqa",
                "tokenizer": "dandelin/vilt-b32-finetuned-vqa",
                "feature_extractor": "dandelin/vilt-b32-finetuned-vqa",
            },
        },
        "type": "multimodal",
    },
    "fill-mask": {
        "impl": FillMaskPipeline,
        "tf": (TFAutoModelForMaskedLM,) if is_tf_available() else (),
--- a/src/transformers/pipelines/visual_question_answering.py
+++ b/src/transformers/pipelines/visual_question_answering.py
@@ -0,0 +1,115 @@
 from typing import Union
 from ..utils import add_end_docstrings, is_torch_available, is_vision_available, logging
 from .base import PIPELINE_INIT_ARGS, Pipeline
 if is_vision_available():
    from PIL import Image
    from ..image_utils import load_image
 if is_torch_available():
    from ..models.auto.modeling_auto import MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING
 logger = logging.get_logger(__name__)
@add_end_docstrings(PIPELINE_INIT_ARGS)
 class VisualQuestionAnsweringPipeline(Pipeline):
    """
    Visual Question Answering pipeline using a `AutoModelForVisualQuestionAnswering`. This pipeline is currently only
    available in PyTorch.
    This visual question answering pipeline can currently be loaded from [`pipeline`] using the following task
    identifiers: `"visual-question-answering", "vqa"`.
    The models that this pipeline can use are models that have been fine-tuned on a visual question answering task. See
    the up-to-date list of available models on
    [huggingface.co/models](https://huggingface.co/models?filter=visual-question-answering).
    """
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.check_model_type(MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING)
    def _sanitize_parameters(self, top_k=None, padding=None, truncation=None, **kwargs):
        preprocess_params, postprocess_params = {}, {}
        if padding is not None:
            preprocess_params["padding"] = padding
        if truncation is not None:
            preprocess_params["truncation"] = truncation
        if top_k is not None:
            postprocess_params["top_k"] = top_k
        return preprocess_params, {}, postprocess_params
    def __call__(self, image: Union["Image.Image", str], question: str = None, **kwargs):
        r"""
        Answers open-ended questions about images. The pipeline accepts several types of inputs which are detailed
        below:
        - `pipeline(image=image, question=question)`
        - `pipeline({"image": image, "question": question})`
        - `pipeline([{"image": image, "question": question}])`
        - `pipeline([{"image": image, "question": question}, {"image": image, "question": question}])`
        Args:
            image (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`):
                The pipeline handles three types of images:
                - A string containing a http link pointing to an image
                - A string containing a local path to an image
                - An image loaded in PIL directly
                The pipeline accepts either a single image or a batch of images. If given a single image, it can be
                broadcasted to multiple questions.
            question (`str`, `List[str]`):
                The question(s) asked. If given a single question, it can be broadcasted to multiple images.
            top_k (`int`, *optional*, defaults to 5):
                The number of top labels that will be returned by the pipeline. If the provided number is higher than
                the number of labels available in the model configuration, it will default to the number of labels.
        Return:
            A dictionary or a list of dictionaries containing the result. The dictionaries contain the following keys:
            - **label** (`str`) -- The label identified by the model.
            - **score** (`int`) -- The score attributed by the model for that label.
        """
        if isinstance(image, (Image.Image, str)) and isinstance(question, str):
            inputs = {"image": image, "question": question}
        else:
            """
            Supports the following format
            - {"image": image, "question": question}
            - [{"image": image, "question": question}]
            - Generator and datasets
            """
            inputs = image
        results = super().__call__(inputs, **kwargs)
        return results
    def preprocess(self, inputs, padding=False, truncation=False):
        image = load_image(inputs["image"])
        model_inputs = self.tokenizer(
            inputs["question"], return_tensors=self.framework, padding=padding, truncation=truncation
        )
        image_features = self.feature_extractor(images=image, return_tensors=self.framework)
        model_inputs.update(image_features)
        return model_inputs
    def _forward(self, model_inputs):
        model_outputs = self.model(**model_inputs)
        return model_outputs
    def postprocess(self, model_outputs, top_k=5):
        if top_k > self.model.config.num_labels:
            top_k = self.model.config.num_labels
        if self.framework == "pt":
            probs = model_outputs.logits.sigmoid()[0]
            scores, ids = probs.topk(top_k)
        else:
            raise ValueError(f"Unsupported framework: {self.framework}")
        scores = scores.tolist()
        ids = ids.tolist()
        return [{"score": score, "answer": self.model.config.id2label[_id]} for score, _id in zip(scores, ids)]
--- a/src/transformers/utils/dummy_pt_objects.py
+++ b/src/transformers/utils/dummy_pt_objects.py
@@ -409,6 +409,9 @@ MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING = None
 MODEL_FOR_VISION_2_SEQ_MAPPING = None
 MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING = None
 MODEL_MAPPING = None
@@ -576,6 +579,13 @@ class AutoModelForVision2Seq(metaclass=DummyObject):
        requires_backends(self, ["torch"])
 class AutoModelForVisualQuestionAnswering(metaclass=DummyObject):
    _backends = ["torch"]
    def __init__(self, *args, **kwargs):
        requires_backends(self, ["torch"])
 class AutoModelWithLMHead(metaclass=DummyObject):
    _backends = ["torch"]
--- a/tests/pipelines/test_pipelines_visual_question_answering.py
+++ b/tests/pipelines/test_pipelines_visual_question_answering.py
@@ -0,0 +1,115 @@
 # Copyright 2022 The HuggingFace Team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import unittest
 from transformers import MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING, is_vision_available
 from transformers.pipelines import pipeline
 from transformers.testing_utils import (
    is_pipeline_test,
    nested_simplify,
    require_tf,
    require_torch,
    require_vision,
    slow,
 )
 from .test_pipelines_common import ANY, PipelineTestCaseMeta
 if is_vision_available():
    from PIL import Image
 else:
    class Image:
        @staticmethod
        def open(*args, **kwargs):
            pass
@is_pipeline_test
@require_torch
@require_vision
 class VisualQuestionAnsweringPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):
    model_mapping = MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING
    def get_test_pipeline(self, model, tokenizer, feature_extractor):
        vqa_pipeline = pipeline("visual-question-answering", model="hf-internal-testing/tiny-vilt-random-vqa")
        examples = [
            {
                "image": Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png"),
                "question": "How many cats are there?",
            },
            {
                "image": "./tests/fixtures/tests_samples/COCO/000000039769.png",
                "question": "How many cats are there?",
            },
        ]
        return vqa_pipeline, examples
    def run_pipeline_test(self, vqa_pipeline, examples):
        outputs = vqa_pipeline(examples, top_k=1)
        self.assertEqual(
            outputs,
            [
                [{"score": ANY(float), "answer": ANY(str)}],
                [{"score": ANY(float), "answer": ANY(str)}],
            ],
        )
    @require_torch
    def test_small_model_pt(self):
        vqa_pipeline = pipeline("visual-question-answering", model="hf-internal-testing/tiny-vilt-random-vqa")
        image = "./tests/fixtures/tests_samples/COCO/000000039769.png"
        question = "How many cats are there?"
        outputs = vqa_pipeline(image=image, question="How many cats are there?", top_k=2)
        self.assertEqual(
            outputs, [{"score": ANY(float), "answer": ANY(str)}, {"score": ANY(float), "answer": ANY(str)}]
        )
        outputs = vqa_pipeline({"image": image, "question": question}, top_k=2)
        self.assertEqual(
            outputs, [{"score": ANY(float), "answer": ANY(str)}, {"score": ANY(float), "answer": ANY(str)}]
        )
    @slow
    @require_torch
    def test_large_model_pt(self):
        vqa_pipeline = pipeline("visual-question-answering", model="dandelin/vilt-b32-finetuned-vqa")
        image = "./tests/fixtures/tests_samples/COCO/000000039769.png"
        question = "How many cats are there?"
        outputs = vqa_pipeline(image=image, question=question, top_k=2)
        self.assertEqual(
            nested_simplify(outputs, decimals=4), [{"score": 0.8799, "answer": "2"}, {"score": 0.296, "answer": "1"}]
        )
        outputs = vqa_pipeline({"image": image, "question": question}, top_k=2)
        self.assertEqual(
            nested_simplify(outputs, decimals=4), [{"score": 0.8799, "answer": "2"}, {"score": 0.296, "answer": "1"}]
        )
        outputs = vqa_pipeline(
            [{"image": image, "question": question}, {"image": image, "question": question}], top_k=2
        )
        self.assertEqual(
            nested_simplify(outputs, decimals=4),
            [[{"score": 0.8799, "answer": "2"}, {"score": 0.296, "answer": "1"}]] * 2,
        )
    @require_tf
    @unittest.skip("Visual question answering not implemented in TF")
    def test_small_model_tf(self):
        pass