From 3b44d2f0424ce46411fd0b6beb72aaafdf10c361 Mon Sep 17 00:00:00 2001
From: Matt <Rocketknight1@users.noreply.github.com>
Date: Tue, 8 Oct 2024 13:34:28 +0100
Subject: [PATCH] Image pipelines spec compliance (#33899)

* Update many similar visual pipelines

* Add input tests

* Add ImageToText as well

* Add output tests

* Add output tests

* Add output tests

* OutputElement -> Output

* Correctly test elements

* make fixup

* fix typo in the task list

* Fix VQA testing

* Add copyright to image_classification.py

* Revert changes to VQA pipeline because outputs have differences - will move to another PR

* make fixup

* Remove deprecation warnings
---
 .../pipelines/depth_estimation.py             | 26 +++++++++++-----
 .../pipelines/image_classification.py         | 31 +++++++++++++++----
 .../pipelines/image_segmentation.py           | 18 +++++++----
 src/transformers/pipelines/image_to_text.py   | 18 +++++++----
 .../pipelines/object_detection.py             | 13 +++++---
 .../zero_shot_image_classification.py         | 26 ++++++++++------
 .../test_pipelines_depth_estimation.py        |  5 +++
 .../test_pipelines_image_classification.py    |  7 +++++
 .../test_pipelines_image_segmentation.py      |  6 ++++
 .../pipelines/test_pipelines_image_to_text.py |  5 +++
 .../test_pipelines_object_detection.py        |  6 +++-
 ...ipelines_zero_shot_image_classification.py |  6 ++++
 tests/test_pipeline_mixin.py                  | 28 +++++++++++++++--
 13 files changed, 152 insertions(+), 43 deletions(-)

diff --git a/src/transformers/pipelines/depth_estimation.py b/src/transformers/pipelines/depth_estimation.py
index be45bdde8d..f70f8d85c1 100644
--- a/src/transformers/pipelines/depth_estimation.py
+++ b/src/transformers/pipelines/depth_estimation.py
@@ -1,3 +1,4 @@
+import warnings
 from typing import List, Union
 
 import numpy as np
@@ -50,12 +51,12 @@ class DepthEstimationPipeline(Pipeline):
         requires_backends(self, "vision")
         self.check_model_type(MODEL_FOR_DEPTH_ESTIMATION_MAPPING_NAMES)
 
-    def __call__(self, images: Union[str, List[str], "Image.Image", List["Image.Image"]], **kwargs):
+    def __call__(self, inputs: Union[str, List[str], "Image.Image", List["Image.Image"]] = None, **kwargs):
         """
         Predict the depth(s) of the image(s) passed as inputs.
 
         Args:
-            images (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`):
+            inputs (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`):
                 The pipeline handles three types of images:
 
                 - A string containing a http link pointing to an image
@@ -65,9 +66,10 @@ class DepthEstimationPipeline(Pipeline):
                 The pipeline accepts either a single image or a batch of images, which must then be passed as a string.
                 Images in a batch must all be in the same format: all as http links, all as local paths, or all as PIL
                 images.
-            timeout (`float`, *optional*, defaults to None):
-                The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and
-                the call may block forever.
+            parameters (`Dict`, *optional*):
+                A dictionary of argument names to parameter values, to control pipeline behaviour.
+                The only parameter available right now is `timeout`, which is the length of time, in seconds,
+                that the pipeline should wait before giving up on trying to download an image.
 
         Return:
             A dictionary or a list of dictionaries containing result. If the input is a single image, will return a
@@ -79,12 +81,22 @@ class DepthEstimationPipeline(Pipeline):
             - **predicted_depth** (`torch.Tensor`) -- The predicted depth by the model as a `torch.Tensor`.
             - **depth** (`PIL.Image`) -- The predicted depth by the model as a `PIL.Image`.
         """
-        return super().__call__(images, **kwargs)
+        # After deprecation of this is completed, remove the default `None` value for `images`
+        if "images" in kwargs:
+            inputs = kwargs.pop("images")
+        if inputs is None:
+            raise ValueError("Cannot call the depth-estimation pipeline without an inputs argument!")
+        return super().__call__(inputs, **kwargs)
 
-    def _sanitize_parameters(self, timeout=None, **kwargs):
+    def _sanitize_parameters(self, timeout=None, parameters=None, **kwargs):
         preprocess_params = {}
         if timeout is not None:
+            warnings.warn(
+                "The `timeout` argument is deprecated and will be removed in version 5 of Transformers", FutureWarning
+            )
             preprocess_params["timeout"] = timeout
+        if isinstance(parameters, dict) and "timeout" in parameters:
+            preprocess_params["timeout"] = parameters["timeout"]
         return preprocess_params, {}, {}
 
     def preprocess(self, image, timeout=None):
diff --git a/src/transformers/pipelines/image_classification.py b/src/transformers/pipelines/image_classification.py
index 8aaa66e6c4..20ad72e790 100644
--- a/src/transformers/pipelines/image_classification.py
+++ b/src/transformers/pipelines/image_classification.py
@@ -1,3 +1,17 @@
+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import warnings
 from typing import List, Union
 
 import numpy as np
@@ -99,6 +113,9 @@ class ImageClassificationPipeline(Pipeline):
     def _sanitize_parameters(self, top_k=None, function_to_apply=None, timeout=None):
         preprocess_params = {}
         if timeout is not None:
+            warnings.warn(
+                "The `timeout` argument is deprecated and will be removed in version 5 of Transformers", FutureWarning
+            )
             preprocess_params["timeout"] = timeout
         postprocess_params = {}
         if top_k is not None:
@@ -109,12 +126,12 @@ class ImageClassificationPipeline(Pipeline):
             postprocess_params["function_to_apply"] = function_to_apply
         return preprocess_params, {}, postprocess_params
 
-    def __call__(self, images: Union[str, List[str], "Image.Image", List["Image.Image"]], **kwargs):
+    def __call__(self, inputs: Union[str, List[str], "Image.Image", List["Image.Image"]] = None, **kwargs):
         """
         Assign labels to the image(s) passed as inputs.
 
         Args:
-            images (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`):
+            inputs (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`):
                 The pipeline handles three types of images:
 
                 - A string containing a http link pointing to an image
@@ -142,9 +159,6 @@ class ImageClassificationPipeline(Pipeline):
             top_k (`int`, *optional*, defaults to 5):
                 The number of top labels that will be returned by the pipeline. If the provided number is higher than
                 the number of labels available in the model configuration, it will default to the number of labels.
-            timeout (`float`, *optional*, defaults to None):
-                The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and
-                the call may block forever.
 
         Return:
             A dictionary or a list of dictionaries containing result. If the input is a single image, will return a
@@ -156,7 +170,12 @@ class ImageClassificationPipeline(Pipeline):
             - **label** (`str`) -- The label identified by the model.
             - **score** (`int`) -- The score attributed by the model for that label.
         """
-        return super().__call__(images, **kwargs)
+        # After deprecation of this is completed, remove the default `None` value for `images`
+        if "images" in kwargs:
+            inputs = kwargs.pop("images")
+        if inputs is None:
+            raise ValueError("Cannot call the image-classification pipeline without an inputs argument!")
+        return super().__call__(inputs, **kwargs)
 
     def preprocess(self, image, timeout=None):
         image = load_image(image, timeout=timeout)
diff --git a/src/transformers/pipelines/image_segmentation.py b/src/transformers/pipelines/image_segmentation.py
index e0fd3b7d85..0ac653fd1e 100644
--- a/src/transformers/pipelines/image_segmentation.py
+++ b/src/transformers/pipelines/image_segmentation.py
@@ -1,3 +1,4 @@
+import warnings
 from typing import Any, Dict, List, Union
 
 import numpy as np
@@ -90,16 +91,19 @@ class ImageSegmentationPipeline(Pipeline):
         if "overlap_mask_area_threshold" in kwargs:
             postprocess_kwargs["overlap_mask_area_threshold"] = kwargs["overlap_mask_area_threshold"]
         if "timeout" in kwargs:
+            warnings.warn(
+                "The `timeout` argument is deprecated and will be removed in version 5 of Transformers", FutureWarning
+            )
             preprocess_kwargs["timeout"] = kwargs["timeout"]
 
         return preprocess_kwargs, {}, postprocess_kwargs
 
-    def __call__(self, images, **kwargs) -> Union[Predictions, List[Prediction]]:
+    def __call__(self, inputs=None, **kwargs) -> Union[Predictions, List[Prediction]]:
         """
         Perform segmentation (detect masks & classes) in the image(s) passed as inputs.
 
         Args:
-            images (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`):
+            inputs (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`):
                 The pipeline handles three types of images:
 
                 - A string containing an HTTP(S) link pointing to an image
@@ -118,9 +122,6 @@ class ImageSegmentationPipeline(Pipeline):
                 Threshold to use when turning the predicted masks into binary values.
             overlap_mask_area_threshold (`float`, *optional*, defaults to 0.5):
                 Mask overlap threshold to eliminate small, disconnected segments.
-            timeout (`float`, *optional*, defaults to None):
-                The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and
-                the call may block forever.
 
         Return:
             A dictionary or a list of dictionaries containing the result. If the input is a single image, will return a
@@ -136,7 +137,12 @@ class ImageSegmentationPipeline(Pipeline):
             - **score** (*optional* `float`) -- Optionally, when the model is capable of estimating a confidence of the
               "object" described by the label and the mask.
         """
-        return super().__call__(images, **kwargs)
+        # After deprecation of this is completed, remove the default `None` value for `images`
+        if "images" in kwargs:
+            inputs = kwargs.pop("images")
+        if inputs is None:
+            raise ValueError("Cannot call the image-classification pipeline without an inputs argument!")
+        return super().__call__(inputs, **kwargs)
 
     def preprocess(self, image, subtask=None, timeout=None):
         image = load_image(image, timeout=timeout)
diff --git a/src/transformers/pipelines/image_to_text.py b/src/transformers/pipelines/image_to_text.py
index 91d44c46d2..4beaa48192 100644
--- a/src/transformers/pipelines/image_to_text.py
+++ b/src/transformers/pipelines/image_to_text.py
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import warnings
 from typing import List, Union
 
 from ..utils import (
@@ -80,6 +81,9 @@ class ImageToTextPipeline(Pipeline):
         if prompt is not None:
             preprocess_params["prompt"] = prompt
         if timeout is not None:
+            warnings.warn(
+                "The `timeout` argument is deprecated and will be removed in version 5 of Transformers", FutureWarning
+            )
             preprocess_params["timeout"] = timeout
 
         if max_new_tokens is not None:
@@ -94,12 +98,12 @@ class ImageToTextPipeline(Pipeline):
 
         return preprocess_params, forward_params, {}
 
-    def __call__(self, images: Union[str, List[str], "Image.Image", List["Image.Image"]], **kwargs):
+    def __call__(self, inputs: Union[str, List[str], "Image.Image", List["Image.Image"]] = None, **kwargs):
         """
         Assign labels to the image(s) passed as inputs.
 
         Args:
-            images (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`):
+            inputs (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`):
                 The pipeline handles three types of images:
 
                 - A string containing a HTTP(s) link pointing to an image
@@ -113,16 +117,18 @@ class ImageToTextPipeline(Pipeline):
 
             generate_kwargs (`Dict`, *optional*):
                 Pass it to send all of these arguments directly to `generate` allowing full control of this function.
-            timeout (`float`, *optional*, defaults to None):
-                The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and
-                the call may block forever.
 
         Return:
             A list or a list of list of `dict`: Each result comes as a dictionary with the following key:
 
             - **generated_text** (`str`) -- The generated text.
         """
-        return super().__call__(images, **kwargs)
+        # After deprecation of this is completed, remove the default `None` value for `images`
+        if "images" in kwargs:
+            inputs = kwargs.pop("images")
+        if inputs is None:
+            raise ValueError("Cannot call the image-to-text pipeline without an inputs argument!")
+        return super().__call__(inputs, **kwargs)
 
     def preprocess(self, image, prompt=None, timeout=None):
         image = load_image(image, timeout=timeout)
diff --git a/src/transformers/pipelines/object_detection.py b/src/transformers/pipelines/object_detection.py
index d3e2135790..c135b1e131 100644
--- a/src/transformers/pipelines/object_detection.py
+++ b/src/transformers/pipelines/object_detection.py
@@ -1,3 +1,4 @@
+import warnings
 from typing import Any, Dict, List, Union
 
 from ..utils import add_end_docstrings, is_torch_available, is_vision_available, logging, requires_backends
@@ -63,6 +64,9 @@ class ObjectDetectionPipeline(Pipeline):
     def _sanitize_parameters(self, **kwargs):
         preprocess_params = {}
         if "timeout" in kwargs:
+            warnings.warn(
+                "The `timeout` argument is deprecated and will be removed in version 5 of Transformers", FutureWarning
+            )
             preprocess_params["timeout"] = kwargs["timeout"]
         postprocess_kwargs = {}
         if "threshold" in kwargs:
@@ -74,7 +78,7 @@ class ObjectDetectionPipeline(Pipeline):
         Detect objects (bounding boxes & classes) in the image(s) passed as inputs.
 
         Args:
-            images (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`):
+            inputs (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`):
                 The pipeline handles three types of images:
 
                 - A string containing an HTTP(S) link pointing to an image
@@ -85,9 +89,6 @@ class ObjectDetectionPipeline(Pipeline):
                 same format: all as HTTP(S) links, all as local paths, or all as PIL images.
             threshold (`float`, *optional*, defaults to 0.5):
                 The probability necessary to make a prediction.
-            timeout (`float`, *optional*, defaults to None):
-                The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and
-                the call may block forever.
 
         Return:
             A list of dictionaries or a list of list of dictionaries containing the result. If the input is a single
@@ -100,7 +101,9 @@ class ObjectDetectionPipeline(Pipeline):
             - **score** (`float`) -- The score attributed by the model for that label.
             - **box** (`List[Dict[str, int]]`) -- The bounding box of detected object in image's original size.
         """
-
+        # After deprecation of this is completed, remove the default `None` value for `images`
+        if "images" in kwargs and "inputs" not in kwargs:
+            kwargs["inputs"] = kwargs.pop("images")
         return super().__call__(*args, **kwargs)
 
     def preprocess(self, image, timeout=None):
diff --git a/src/transformers/pipelines/zero_shot_image_classification.py b/src/transformers/pipelines/zero_shot_image_classification.py
index b7e13e782e..253c684fcb 100644
--- a/src/transformers/pipelines/zero_shot_image_classification.py
+++ b/src/transformers/pipelines/zero_shot_image_classification.py
@@ -1,3 +1,4 @@
+import warnings
 from collections import UserDict
 from typing import List, Union
 
@@ -73,12 +74,12 @@ class ZeroShotImageClassificationPipeline(Pipeline):
             else MODEL_FOR_ZERO_SHOT_IMAGE_CLASSIFICATION_MAPPING_NAMES
         )
 
-    def __call__(self, images: Union[str, List[str], "Image", List["Image"]], **kwargs):
+    def __call__(self, image: Union[str, List[str], "Image", List["Image"]] = None, **kwargs):
         """
         Assign labels to the image(s) passed as inputs.
 
         Args:
-            images (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`):
+            image (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`):
                 The pipeline handles three types of images:
 
                 - A string containing a http link pointing to an image
@@ -93,13 +94,6 @@ class ZeroShotImageClassificationPipeline(Pipeline):
                 replacing the placeholder with the candidate_labels. Pass "{}" if *candidate_labels* are
                 already formatted.
 
-            timeout (`float`, *optional*, defaults to None):
-                The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and
-                the call may block forever.
-
-            tokenizer_kwargs (`dict`, *optional*):
-                Additional dictionary of keyword arguments passed along to the tokenizer.
-
         Return:
             A list of dictionaries containing one entry per proposed label. Each dictionary contains the
             following keys:
@@ -107,17 +101,29 @@ class ZeroShotImageClassificationPipeline(Pipeline):
             - **score** (`float`) -- The score attributed by the model to that label. It is a value between
                 0 and 1, computed as the `softmax` of `logits_per_image`.
         """
-        return super().__call__(images, **kwargs)
+        # After deprecation of this is completed, remove the default `None` value for `image`
+        if "images" in kwargs:
+            image = kwargs.pop("images")
+        if image is None:
+            raise ValueError("Cannot call the zero-shot-image-classification pipeline without an images argument!")
+        return super().__call__(image, **kwargs)
 
     def _sanitize_parameters(self, tokenizer_kwargs=None, **kwargs):
         preprocess_params = {}
         if "candidate_labels" in kwargs:
             preprocess_params["candidate_labels"] = kwargs["candidate_labels"]
         if "timeout" in kwargs:
+            warnings.warn(
+                "The `timeout` argument is deprecated and will be removed in version 5 of Transformers", FutureWarning
+            )
             preprocess_params["timeout"] = kwargs["timeout"]
         if "hypothesis_template" in kwargs:
             preprocess_params["hypothesis_template"] = kwargs["hypothesis_template"]
         if tokenizer_kwargs is not None:
+            warnings.warn(
+                "The `tokenizer_kwargs` argument is deprecated and will be removed in version 5 of Transformers",
+                FutureWarning,
+            )
             preprocess_params["tokenizer_kwargs"] = tokenizer_kwargs
 
         return preprocess_params, {}, {}
diff --git a/tests/pipelines/test_pipelines_depth_estimation.py b/tests/pipelines/test_pipelines_depth_estimation.py
index 16edeed453..03cc86abec 100644
--- a/tests/pipelines/test_pipelines_depth_estimation.py
+++ b/tests/pipelines/test_pipelines_depth_estimation.py
@@ -14,11 +14,13 @@
 
 import unittest
 
+from huggingface_hub import DepthEstimationOutput
 from huggingface_hub.utils import insecure_hashlib
 
 from transformers import MODEL_FOR_DEPTH_ESTIMATION_MAPPING, is_torch_available, is_vision_available
 from transformers.pipelines import DepthEstimationPipeline, pipeline
 from transformers.testing_utils import (
+    compare_pipeline_output_to_hub_spec,
     is_pipeline_test,
     nested_simplify,
     require_tf,
@@ -94,6 +96,9 @@ class DepthEstimationPipelineTests(unittest.TestCase):
             outputs,
         )
 
+        for single_output in outputs:
+            compare_pipeline_output_to_hub_spec(single_output, DepthEstimationOutput)
+
     @require_tf
     @unittest.skip(reason="Depth estimation is not implemented in TF")
     def test_small_model_tf(self):
diff --git a/tests/pipelines/test_pipelines_image_classification.py b/tests/pipelines/test_pipelines_image_classification.py
index 823c66c16f..8c95489253 100644
--- a/tests/pipelines/test_pipelines_image_classification.py
+++ b/tests/pipelines/test_pipelines_image_classification.py
@@ -14,6 +14,8 @@
 
 import unittest
 
+from huggingface_hub import ImageClassificationOutputElement
+
 from transformers import (
     MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING,
     TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING,
@@ -23,6 +25,7 @@ from transformers import (
 )
 from transformers.pipelines import ImageClassificationPipeline, pipeline
 from transformers.testing_utils import (
+    compare_pipeline_output_to_hub_spec,
     is_pipeline_test,
     nested_simplify,
     require_tf,
@@ -121,6 +124,10 @@ class ImageClassificationPipelineTests(unittest.TestCase):
             ],
         )
 
+        for single_output in outputs:
+            for output_element in single_output:
+                compare_pipeline_output_to_hub_spec(output_element, ImageClassificationOutputElement)
+
     @require_torch
     def test_small_model_pt(self):
         small_model = "hf-internal-testing/tiny-random-vit"
diff --git a/tests/pipelines/test_pipelines_image_segmentation.py b/tests/pipelines/test_pipelines_image_segmentation.py
index 523bd0b52b..c62909331f 100644
--- a/tests/pipelines/test_pipelines_image_segmentation.py
+++ b/tests/pipelines/test_pipelines_image_segmentation.py
@@ -20,6 +20,7 @@ import datasets
 import numpy as np
 import requests
 from datasets import load_dataset
+from huggingface_hub import ImageSegmentationOutputElement
 from huggingface_hub.utils import insecure_hashlib
 
 from transformers import (
@@ -36,6 +37,7 @@ from transformers import (
     pipeline,
 )
 from transformers.testing_utils import (
+    compare_pipeline_output_to_hub_spec,
     is_pipeline_test,
     nested_simplify,
     require_tf,
@@ -168,6 +170,10 @@ class ImageSegmentationPipelineTests(unittest.TestCase):
             f"Expected [{n}, {n}, {n}, {n}, {n}], got {[len(item) for item in outputs]}",
         )
 
+        for single_output in outputs:
+            for output_element in single_output:
+                compare_pipeline_output_to_hub_spec(output_element, ImageSegmentationOutputElement)
+
     @require_tf
     @unittest.skip(reason="Image segmentation not implemented in TF")
     def test_small_model_tf(self):
diff --git a/tests/pipelines/test_pipelines_image_to_text.py b/tests/pipelines/test_pipelines_image_to_text.py
index 43a796da46..7033164324 100644
--- a/tests/pipelines/test_pipelines_image_to_text.py
+++ b/tests/pipelines/test_pipelines_image_to_text.py
@@ -15,10 +15,12 @@
 import unittest
 
 import requests
+from huggingface_hub import ImageToTextOutput
 
 from transformers import MODEL_FOR_VISION_2_SEQ_MAPPING, TF_MODEL_FOR_VISION_2_SEQ_MAPPING, is_vision_available
 from transformers.pipelines import ImageToTextPipeline, pipeline
 from transformers.testing_utils import (
+    compare_pipeline_output_to_hub_spec,
     is_pipeline_test,
     require_tf,
     require_torch,
@@ -103,6 +105,9 @@ class ImageToTextPipelineTests(unittest.TestCase):
             [{"generated_text": "growth"}],
         )
 
+        for single_output in outputs:
+            compare_pipeline_output_to_hub_spec(single_output, ImageToTextOutput)
+
     @require_torch
     def test_small_model_pt(self):
         pipe = pipeline("image-to-text", model="hf-internal-testing/tiny-random-vit-gpt2")
diff --git a/tests/pipelines/test_pipelines_object_detection.py b/tests/pipelines/test_pipelines_object_detection.py
index f14e5e6b68..448257bd05 100644
--- a/tests/pipelines/test_pipelines_object_detection.py
+++ b/tests/pipelines/test_pipelines_object_detection.py
@@ -14,6 +14,8 @@
 
 import unittest
 
+from huggingface_hub import ObjectDetectionOutputElement
+
 from transformers import (
     MODEL_FOR_OBJECT_DETECTION_MAPPING,
     AutoFeatureExtractor,
@@ -22,7 +24,8 @@ from transformers import (
     is_vision_available,
     pipeline,
 )
-from transformers.testing_utils import (
+from transformers.testing_utils import (  #
+    compare_pipeline_output_to_hub_spec,
     is_pipeline_test,
     nested_simplify,
     require_pytesseract,
@@ -101,6 +104,7 @@ class ObjectDetectionPipelineTests(unittest.TestCase):
                         "box": {"xmin": ANY(int), "ymin": ANY(int), "xmax": ANY(int), "ymax": ANY(int)},
                     },
                 )
+                compare_pipeline_output_to_hub_spec(detected_object, ObjectDetectionOutputElement)
 
     @require_tf
     @unittest.skip(reason="Object detection not implemented in TF")
diff --git a/tests/pipelines/test_pipelines_zero_shot_image_classification.py b/tests/pipelines/test_pipelines_zero_shot_image_classification.py
index b57adf609d..bbeaeff3c1 100644
--- a/tests/pipelines/test_pipelines_zero_shot_image_classification.py
+++ b/tests/pipelines/test_pipelines_zero_shot_image_classification.py
@@ -14,9 +14,12 @@
 
 import unittest
 
+from huggingface_hub import ZeroShotImageClassificationOutputElement
+
 from transformers import is_vision_available
 from transformers.pipelines import pipeline
 from transformers.testing_utils import (
+    compare_pipeline_output_to_hub_spec,
     is_pipeline_test,
     nested_simplify,
     require_tf,
@@ -127,6 +130,9 @@ class ZeroShotImageClassificationPipelineTests(unittest.TestCase):
             ],
         )
 
+        for single_output in output:
+            compare_pipeline_output_to_hub_spec(single_output, ZeroShotImageClassificationOutputElement)
+
     @require_torch
     def test_small_model_pt_fp16(self):
         self.test_small_model_pt(torch_dtype="float16")
diff --git a/tests/test_pipeline_mixin.py b/tests/test_pipeline_mixin.py
index e3c650a0e0..329d505723 100644
--- a/tests/test_pipeline_mixin.py
+++ b/tests/test_pipeline_mixin.py
@@ -25,9 +25,27 @@ from pathlib import Path
 from textwrap import dedent
 from typing import get_args
 
-from huggingface_hub import AudioClassificationInput, AutomaticSpeechRecognitionInput
+from huggingface_hub import (
+    AudioClassificationInput,
+    AutomaticSpeechRecognitionInput,
+    DepthEstimationInput,
+    ImageClassificationInput,
+    ImageSegmentationInput,
+    ImageToTextInput,
+    ObjectDetectionInput,
+    ZeroShotImageClassificationInput,
+)
 
-from transformers.pipelines import AudioClassificationPipeline, AutomaticSpeechRecognitionPipeline
+from transformers.pipelines import (
+    AudioClassificationPipeline,
+    AutomaticSpeechRecognitionPipeline,
+    DepthEstimationPipeline,
+    ImageClassificationPipeline,
+    ImageSegmentationPipeline,
+    ImageToTextPipeline,
+    ObjectDetectionPipeline,
+    ZeroShotImageClassificationPipeline,
+)
 from transformers.testing_utils import (
     is_pipeline_test,
     require_decord,
@@ -105,6 +123,12 @@ task_to_pipeline_and_spec_mapping = {
     # task spec in the HF Hub
     "audio-classification": (AudioClassificationPipeline, AudioClassificationInput),
     "automatic-speech-recognition": (AutomaticSpeechRecognitionPipeline, AutomaticSpeechRecognitionInput),
+    "depth-estimation": (DepthEstimationPipeline, DepthEstimationInput),
+    "image-classification": (ImageClassificationPipeline, ImageClassificationInput),
+    "image-segmentation": (ImageSegmentationPipeline, ImageSegmentationInput),
+    "image-to-text": (ImageToTextPipeline, ImageToTextInput),
+    "object-detection": (ObjectDetectionPipeline, ObjectDetectionInput),
+    "zero-shot-image-classification": (ZeroShotImageClassificationPipeline, ZeroShotImageClassificationInput),
 }
 
 for task, task_info in pipeline_test_mapping.items():