From 3b44d2f0424ce46411fd0b6beb72aaafdf10c361 Mon Sep 17 00:00:00 2001 From: Matt Date: Tue, 8 Oct 2024 13:34:28 +0100 Subject: [PATCH] Image pipelines spec compliance (#33899) * Update many similar visual pipelines * Add input tests * Add ImageToText as well * Add output tests * Add output tests * Add output tests * OutputElement -> Output * Correctly test elements * make fixup * fix typo in the task list * Fix VQA testing * Add copyright to image_classification.py * Revert changes to VQA pipeline because outputs have differences - will move to another PR * make fixup * Remove deprecation warnings --- .../pipelines/depth_estimation.py | 26 +++++++++++----- .../pipelines/image_classification.py | 31 +++++++++++++++---- .../pipelines/image_segmentation.py | 18 +++++++---- src/transformers/pipelines/image_to_text.py | 18 +++++++---- .../pipelines/object_detection.py | 13 +++++--- .../zero_shot_image_classification.py | 26 ++++++++++------ .../test_pipelines_depth_estimation.py | 5 +++ .../test_pipelines_image_classification.py | 7 +++++ .../test_pipelines_image_segmentation.py | 6 ++++ .../pipelines/test_pipelines_image_to_text.py | 5 +++ .../test_pipelines_object_detection.py | 6 +++- ...ipelines_zero_shot_image_classification.py | 6 ++++ tests/test_pipeline_mixin.py | 28 +++++++++++++++-- 13 files changed, 152 insertions(+), 43 deletions(-) diff --git a/src/transformers/pipelines/depth_estimation.py b/src/transformers/pipelines/depth_estimation.py index be45bdde8d..f70f8d85c1 100644 --- a/src/transformers/pipelines/depth_estimation.py +++ b/src/transformers/pipelines/depth_estimation.py @@ -1,3 +1,4 @@ +import warnings from typing import List, Union import numpy as np @@ -50,12 +51,12 @@ class DepthEstimationPipeline(Pipeline): requires_backends(self, "vision") self.check_model_type(MODEL_FOR_DEPTH_ESTIMATION_MAPPING_NAMES) - def __call__(self, images: Union[str, List[str], "Image.Image", List["Image.Image"]], **kwargs): + def __call__(self, inputs: Union[str, 
List[str], "Image.Image", List["Image.Image"]] = None, **kwargs): """ Predict the depth(s) of the image(s) passed as inputs. Args: - images (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`): + inputs (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`): The pipeline handles three types of images: - A string containing a http link pointing to an image @@ -65,9 +66,10 @@ class DepthEstimationPipeline(Pipeline): The pipeline accepts either a single image or a batch of images, which must then be passed as a string. Images in a batch must all be in the same format: all as http links, all as local paths, or all as PIL images. - timeout (`float`, *optional*, defaults to None): - The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and - the call may block forever. + parameters (`Dict`, *optional*): + A dictionary of argument names to parameter values, to control pipeline behaviour. + The only parameter available right now is `timeout`, which is the length of time, in seconds, + that the pipeline should wait before giving up on trying to download an image. Return: A dictionary or a list of dictionaries containing result. If the input is a single image, will return a @@ -79,12 +81,22 @@ class DepthEstimationPipeline(Pipeline): - **predicted_depth** (`torch.Tensor`) -- The predicted depth by the model as a `torch.Tensor`. - **depth** (`PIL.Image`) -- The predicted depth by the model as a `PIL.Image`. 
""" - return super().__call__(images, **kwargs) + # After deprecation of this is completed, remove the default `None` value for `images` + if "images" in kwargs: + inputs = kwargs.pop("images") + if inputs is None: + raise ValueError("Cannot call the depth-estimation pipeline without an inputs argument!") + return super().__call__(inputs, **kwargs) - def _sanitize_parameters(self, timeout=None, **kwargs): + def _sanitize_parameters(self, timeout=None, parameters=None, **kwargs): preprocess_params = {} if timeout is not None: + warnings.warn( + "The `timeout` argument is deprecated and will be removed in version 5 of Transformers", FutureWarning + ) preprocess_params["timeout"] = timeout + if isinstance(parameters, dict) and "timeout" in parameters: + preprocess_params["timeout"] = parameters["timeout"] return preprocess_params, {}, {} def preprocess(self, image, timeout=None): diff --git a/src/transformers/pipelines/image_classification.py b/src/transformers/pipelines/image_classification.py index 8aaa66e6c4..20ad72e790 100644 --- a/src/transformers/pipelines/image_classification.py +++ b/src/transformers/pipelines/image_classification.py @@ -1,3 +1,17 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import warnings from typing import List, Union import numpy as np @@ -99,6 +113,9 @@ class ImageClassificationPipeline(Pipeline): def _sanitize_parameters(self, top_k=None, function_to_apply=None, timeout=None): preprocess_params = {} if timeout is not None: + warnings.warn( + "The `timeout` argument is deprecated and will be removed in version 5 of Transformers", FutureWarning + ) preprocess_params["timeout"] = timeout postprocess_params = {} if top_k is not None: @@ -109,12 +126,12 @@ class ImageClassificationPipeline(Pipeline): postprocess_params["function_to_apply"] = function_to_apply return preprocess_params, {}, postprocess_params - def __call__(self, images: Union[str, List[str], "Image.Image", List["Image.Image"]], **kwargs): + def __call__(self, inputs: Union[str, List[str], "Image.Image", List["Image.Image"]] = None, **kwargs): """ Assign labels to the image(s) passed as inputs. Args: - images (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`): + inputs (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`): The pipeline handles three types of images: - A string containing a http link pointing to an image @@ -142,9 +159,6 @@ class ImageClassificationPipeline(Pipeline): top_k (`int`, *optional*, defaults to 5): The number of top labels that will be returned by the pipeline. If the provided number is higher than the number of labels available in the model configuration, it will default to the number of labels. - timeout (`float`, *optional*, defaults to None): - The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and - the call may block forever. Return: A dictionary or a list of dictionaries containing result. If the input is a single image, will return a @@ -156,7 +170,12 @@ class ImageClassificationPipeline(Pipeline): - **label** (`str`) -- The label identified by the model. - **score** (`int`) -- The score attributed by the model for that label. 
""" - return super().__call__(images, **kwargs) + # After deprecation of this is completed, remove the default `None` value for `images` + if "images" in kwargs: + inputs = kwargs.pop("images") + if inputs is None: + raise ValueError("Cannot call the image-classification pipeline without an inputs argument!") + return super().__call__(inputs, **kwargs) def preprocess(self, image, timeout=None): image = load_image(image, timeout=timeout) diff --git a/src/transformers/pipelines/image_segmentation.py b/src/transformers/pipelines/image_segmentation.py index e0fd3b7d85..0ac653fd1e 100644 --- a/src/transformers/pipelines/image_segmentation.py +++ b/src/transformers/pipelines/image_segmentation.py @@ -1,3 +1,4 @@ +import warnings from typing import Any, Dict, List, Union import numpy as np @@ -90,16 +91,19 @@ class ImageSegmentationPipeline(Pipeline): if "overlap_mask_area_threshold" in kwargs: postprocess_kwargs["overlap_mask_area_threshold"] = kwargs["overlap_mask_area_threshold"] if "timeout" in kwargs: + warnings.warn( + "The `timeout` argument is deprecated and will be removed in version 5 of Transformers", FutureWarning + ) preprocess_kwargs["timeout"] = kwargs["timeout"] return preprocess_kwargs, {}, postprocess_kwargs - def __call__(self, images, **kwargs) -> Union[Predictions, List[Prediction]]: + def __call__(self, inputs=None, **kwargs) -> Union[Predictions, List[Prediction]]: """ Perform segmentation (detect masks & classes) in the image(s) passed as inputs. Args: - images (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`): + inputs (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`): The pipeline handles three types of images: - A string containing an HTTP(S) link pointing to an image @@ -118,9 +122,6 @@ class ImageSegmentationPipeline(Pipeline): Threshold to use when turning the predicted masks into binary values. overlap_mask_area_threshold (`float`, *optional*, defaults to 0.5): Mask overlap threshold to eliminate small, disconnected segments. 
- timeout (`float`, *optional*, defaults to None): - The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and - the call may block forever. Return: A dictionary or a list of dictionaries containing the result. If the input is a single image, will return a @@ -136,7 +137,12 @@ class ImageSegmentationPipeline(Pipeline): - **score** (*optional* `float`) -- Optionally, when the model is capable of estimating a confidence of the "object" described by the label and the mask. """ - return super().__call__(images, **kwargs) + # After deprecation of the `images` keyword is completed, remove the default `None` value for `inputs` + if "images" in kwargs: + inputs = kwargs.pop("images") + if inputs is None: + raise ValueError("Cannot call the image-segmentation pipeline without an inputs argument!") + return super().__call__(inputs, **kwargs) def preprocess(self, image, subtask=None, timeout=None): image = load_image(image, timeout=timeout) diff --git a/src/transformers/pipelines/image_to_text.py b/src/transformers/pipelines/image_to_text.py index 91d44c46d2..4beaa48192 100644 --- a/src/transformers/pipelines/image_to_text.py +++ b/src/transformers/pipelines/image_to_text.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import warnings from typing import List, Union from ..utils import ( @@ -80,6 +81,9 @@ class ImageToTextPipeline(Pipeline): if prompt is not None: preprocess_params["prompt"] = prompt if timeout is not None: + warnings.warn( + "The `timeout` argument is deprecated and will be removed in version 5 of Transformers", FutureWarning + ) preprocess_params["timeout"] = timeout if max_new_tokens is not None: @@ -94,12 +98,12 @@ class ImageToTextPipeline(Pipeline): return preprocess_params, forward_params, {} - def __call__(self, images: Union[str, List[str], "Image.Image", List["Image.Image"]], **kwargs): + def __call__(self, inputs: Union[str, List[str], "Image.Image", List["Image.Image"]] = None, **kwargs): """ Assign labels to the image(s) passed as inputs. Args: - images (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`): + inputs (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`): The pipeline handles three types of images: - A string containing a HTTP(s) link pointing to an image @@ -113,16 +117,18 @@ class ImageToTextPipeline(Pipeline): generate_kwargs (`Dict`, *optional*): Pass it to send all of these arguments directly to `generate` allowing full control of this function. - timeout (`float`, *optional*, defaults to None): - The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and - the call may block forever. Return: A list or a list of list of `dict`: Each result comes as a dictionary with the following key: - **generated_text** (`str`) -- The generated text. 
""" - return super().__call__(images, **kwargs) + # After deprecation of this is completed, remove the default `None` value for `images` + if "images" in kwargs: + inputs = kwargs.pop("images") + if inputs is None: + raise ValueError("Cannot call the image-to-text pipeline without an inputs argument!") + return super().__call__(inputs, **kwargs) def preprocess(self, image, prompt=None, timeout=None): image = load_image(image, timeout=timeout) diff --git a/src/transformers/pipelines/object_detection.py b/src/transformers/pipelines/object_detection.py index d3e2135790..c135b1e131 100644 --- a/src/transformers/pipelines/object_detection.py +++ b/src/transformers/pipelines/object_detection.py @@ -1,3 +1,4 @@ +import warnings from typing import Any, Dict, List, Union from ..utils import add_end_docstrings, is_torch_available, is_vision_available, logging, requires_backends @@ -63,6 +64,9 @@ class ObjectDetectionPipeline(Pipeline): def _sanitize_parameters(self, **kwargs): preprocess_params = {} if "timeout" in kwargs: + warnings.warn( + "The `timeout` argument is deprecated and will be removed in version 5 of Transformers", FutureWarning + ) preprocess_params["timeout"] = kwargs["timeout"] postprocess_kwargs = {} if "threshold" in kwargs: @@ -74,7 +78,7 @@ class ObjectDetectionPipeline(Pipeline): Detect objects (bounding boxes & classes) in the image(s) passed as inputs. Args: - images (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`): + inputs (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`): The pipeline handles three types of images: - A string containing an HTTP(S) link pointing to an image @@ -85,9 +89,6 @@ class ObjectDetectionPipeline(Pipeline): same format: all as HTTP(S) links, all as local paths, or all as PIL images. threshold (`float`, *optional*, defaults to 0.5): The probability necessary to make a prediction. - timeout (`float`, *optional*, defaults to None): - The maximum time in seconds to wait for fetching images from the web. 
If None, no timeout is set and - the call may block forever. Return: A list of dictionaries or a list of list of dictionaries containing the result. If the input is a single @@ -100,7 +101,9 @@ class ObjectDetectionPipeline(Pipeline): - **score** (`float`) -- The score attributed by the model for that label. - **box** (`List[Dict[str, int]]`) -- The bounding box of detected object in image's original size. """ - + # After deprecation of this is completed, remove the default `None` value for `images` + if "images" in kwargs and "inputs" not in kwargs: + kwargs["inputs"] = kwargs.pop("images") return super().__call__(*args, **kwargs) def preprocess(self, image, timeout=None): diff --git a/src/transformers/pipelines/zero_shot_image_classification.py b/src/transformers/pipelines/zero_shot_image_classification.py index b7e13e782e..253c684fcb 100644 --- a/src/transformers/pipelines/zero_shot_image_classification.py +++ b/src/transformers/pipelines/zero_shot_image_classification.py @@ -1,3 +1,4 @@ +import warnings from collections import UserDict from typing import List, Union @@ -73,12 +74,12 @@ class ZeroShotImageClassificationPipeline(Pipeline): else MODEL_FOR_ZERO_SHOT_IMAGE_CLASSIFICATION_MAPPING_NAMES ) - def __call__(self, images: Union[str, List[str], "Image", List["Image"]], **kwargs): + def __call__(self, image: Union[str, List[str], "Image", List["Image"]] = None, **kwargs): """ Assign labels to the image(s) passed as inputs. Args: - images (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`): + image (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`): The pipeline handles three types of images: - A string containing a http link pointing to an image @@ -93,13 +94,6 @@ class ZeroShotImageClassificationPipeline(Pipeline): replacing the placeholder with the candidate_labels. Pass "{}" if *candidate_labels* are already formatted. - timeout (`float`, *optional*, defaults to None): - The maximum time in seconds to wait for fetching images from the web. 
If None, no timeout is set and - the call may block forever. - - tokenizer_kwargs (`dict`, *optional*): - Additional dictionary of keyword arguments passed along to the tokenizer. - Return: A list of dictionaries containing one entry per proposed label. Each dictionary contains the following keys: @@ -107,17 +101,29 @@ class ZeroShotImageClassificationPipeline(Pipeline): - **score** (`float`) -- The score attributed by the model to that label. It is a value between 0 and 1, computed as the `softmax` of `logits_per_image`. """ - return super().__call__(images, **kwargs) + # After deprecation of this is completed, remove the default `None` value for `image` + if "images" in kwargs: + image = kwargs.pop("images") + if image is None: + raise ValueError("Cannot call the zero-shot-image-classification pipeline without an images argument!") + return super().__call__(image, **kwargs) def _sanitize_parameters(self, tokenizer_kwargs=None, **kwargs): preprocess_params = {} if "candidate_labels" in kwargs: preprocess_params["candidate_labels"] = kwargs["candidate_labels"] if "timeout" in kwargs: + warnings.warn( + "The `timeout` argument is deprecated and will be removed in version 5 of Transformers", FutureWarning + ) preprocess_params["timeout"] = kwargs["timeout"] if "hypothesis_template" in kwargs: preprocess_params["hypothesis_template"] = kwargs["hypothesis_template"] if tokenizer_kwargs is not None: + warnings.warn( + "The `tokenizer_kwargs` argument is deprecated and will be removed in version 5 of Transformers", + FutureWarning, + ) preprocess_params["tokenizer_kwargs"] = tokenizer_kwargs return preprocess_params, {}, {} diff --git a/tests/pipelines/test_pipelines_depth_estimation.py b/tests/pipelines/test_pipelines_depth_estimation.py index 16edeed453..03cc86abec 100644 --- a/tests/pipelines/test_pipelines_depth_estimation.py +++ b/tests/pipelines/test_pipelines_depth_estimation.py @@ -14,11 +14,13 @@ import unittest +from huggingface_hub import DepthEstimationOutput 
from huggingface_hub.utils import insecure_hashlib from transformers import MODEL_FOR_DEPTH_ESTIMATION_MAPPING, is_torch_available, is_vision_available from transformers.pipelines import DepthEstimationPipeline, pipeline from transformers.testing_utils import ( + compare_pipeline_output_to_hub_spec, is_pipeline_test, nested_simplify, require_tf, @@ -94,6 +96,9 @@ class DepthEstimationPipelineTests(unittest.TestCase): outputs, ) + for single_output in outputs: + compare_pipeline_output_to_hub_spec(single_output, DepthEstimationOutput) + @require_tf @unittest.skip(reason="Depth estimation is not implemented in TF") def test_small_model_tf(self): diff --git a/tests/pipelines/test_pipelines_image_classification.py b/tests/pipelines/test_pipelines_image_classification.py index 823c66c16f..8c95489253 100644 --- a/tests/pipelines/test_pipelines_image_classification.py +++ b/tests/pipelines/test_pipelines_image_classification.py @@ -14,6 +14,8 @@ import unittest +from huggingface_hub import ImageClassificationOutputElement + from transformers import ( MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING, TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING, @@ -23,6 +25,7 @@ from transformers import ( ) from transformers.pipelines import ImageClassificationPipeline, pipeline from transformers.testing_utils import ( + compare_pipeline_output_to_hub_spec, is_pipeline_test, nested_simplify, require_tf, @@ -121,6 +124,10 @@ class ImageClassificationPipelineTests(unittest.TestCase): ], ) + for single_output in outputs: + for output_element in single_output: + compare_pipeline_output_to_hub_spec(output_element, ImageClassificationOutputElement) + @require_torch def test_small_model_pt(self): small_model = "hf-internal-testing/tiny-random-vit" diff --git a/tests/pipelines/test_pipelines_image_segmentation.py b/tests/pipelines/test_pipelines_image_segmentation.py index 523bd0b52b..c62909331f 100644 --- a/tests/pipelines/test_pipelines_image_segmentation.py +++ 
b/tests/pipelines/test_pipelines_image_segmentation.py @@ -20,6 +20,7 @@ import datasets import numpy as np import requests from datasets import load_dataset +from huggingface_hub import ImageSegmentationOutputElement from huggingface_hub.utils import insecure_hashlib from transformers import ( @@ -36,6 +37,7 @@ from transformers import ( pipeline, ) from transformers.testing_utils import ( + compare_pipeline_output_to_hub_spec, is_pipeline_test, nested_simplify, require_tf, @@ -168,6 +170,10 @@ class ImageSegmentationPipelineTests(unittest.TestCase): f"Expected [{n}, {n}, {n}, {n}, {n}], got {[len(item) for item in outputs]}", ) + for single_output in outputs: + for output_element in single_output: + compare_pipeline_output_to_hub_spec(output_element, ImageSegmentationOutputElement) + @require_tf @unittest.skip(reason="Image segmentation not implemented in TF") def test_small_model_tf(self): diff --git a/tests/pipelines/test_pipelines_image_to_text.py b/tests/pipelines/test_pipelines_image_to_text.py index 43a796da46..7033164324 100644 --- a/tests/pipelines/test_pipelines_image_to_text.py +++ b/tests/pipelines/test_pipelines_image_to_text.py @@ -15,10 +15,12 @@ import unittest import requests +from huggingface_hub import ImageToTextOutput from transformers import MODEL_FOR_VISION_2_SEQ_MAPPING, TF_MODEL_FOR_VISION_2_SEQ_MAPPING, is_vision_available from transformers.pipelines import ImageToTextPipeline, pipeline from transformers.testing_utils import ( + compare_pipeline_output_to_hub_spec, is_pipeline_test, require_tf, require_torch, @@ -103,6 +105,9 @@ class ImageToTextPipelineTests(unittest.TestCase): [{"generated_text": "growth"}], ) + for single_output in outputs: + compare_pipeline_output_to_hub_spec(single_output, ImageToTextOutput) + @require_torch def test_small_model_pt(self): pipe = pipeline("image-to-text", model="hf-internal-testing/tiny-random-vit-gpt2") diff --git a/tests/pipelines/test_pipelines_object_detection.py 
b/tests/pipelines/test_pipelines_object_detection.py index f14e5e6b68..448257bd05 100644 --- a/tests/pipelines/test_pipelines_object_detection.py +++ b/tests/pipelines/test_pipelines_object_detection.py @@ -14,6 +14,8 @@ import unittest +from huggingface_hub import ObjectDetectionOutputElement + from transformers import ( MODEL_FOR_OBJECT_DETECTION_MAPPING, AutoFeatureExtractor, @@ -22,7 +24,8 @@ from transformers import ( is_vision_available, pipeline, ) -from transformers.testing_utils import ( +from transformers.testing_utils import ( # + compare_pipeline_output_to_hub_spec, is_pipeline_test, nested_simplify, require_pytesseract, @@ -101,6 +104,7 @@ class ObjectDetectionPipelineTests(unittest.TestCase): "box": {"xmin": ANY(int), "ymin": ANY(int), "xmax": ANY(int), "ymax": ANY(int)}, }, ) + compare_pipeline_output_to_hub_spec(detected_object, ObjectDetectionOutputElement) @require_tf @unittest.skip(reason="Object detection not implemented in TF") diff --git a/tests/pipelines/test_pipelines_zero_shot_image_classification.py b/tests/pipelines/test_pipelines_zero_shot_image_classification.py index b57adf609d..bbeaeff3c1 100644 --- a/tests/pipelines/test_pipelines_zero_shot_image_classification.py +++ b/tests/pipelines/test_pipelines_zero_shot_image_classification.py @@ -14,9 +14,12 @@ import unittest +from huggingface_hub import ZeroShotImageClassificationOutputElement + from transformers import is_vision_available from transformers.pipelines import pipeline from transformers.testing_utils import ( + compare_pipeline_output_to_hub_spec, is_pipeline_test, nested_simplify, require_tf, @@ -127,6 +130,9 @@ class ZeroShotImageClassificationPipelineTests(unittest.TestCase): ], ) + for single_output in output: + compare_pipeline_output_to_hub_spec(single_output, ZeroShotImageClassificationOutputElement) + @require_torch def test_small_model_pt_fp16(self): self.test_small_model_pt(torch_dtype="float16") diff --git a/tests/test_pipeline_mixin.py 
b/tests/test_pipeline_mixin.py index e3c650a0e0..329d505723 100644 --- a/tests/test_pipeline_mixin.py +++ b/tests/test_pipeline_mixin.py @@ -25,9 +25,27 @@ from pathlib import Path from textwrap import dedent from typing import get_args -from huggingface_hub import AudioClassificationInput, AutomaticSpeechRecognitionInput +from huggingface_hub import ( + AudioClassificationInput, + AutomaticSpeechRecognitionInput, + DepthEstimationInput, + ImageClassificationInput, + ImageSegmentationInput, + ImageToTextInput, + ObjectDetectionInput, + ZeroShotImageClassificationInput, +) -from transformers.pipelines import AudioClassificationPipeline, AutomaticSpeechRecognitionPipeline +from transformers.pipelines import ( + AudioClassificationPipeline, + AutomaticSpeechRecognitionPipeline, + DepthEstimationPipeline, + ImageClassificationPipeline, + ImageSegmentationPipeline, + ImageToTextPipeline, + ObjectDetectionPipeline, + ZeroShotImageClassificationPipeline, +) from transformers.testing_utils import ( is_pipeline_test, require_decord, @@ -105,6 +123,12 @@ task_to_pipeline_and_spec_mapping = { # task spec in the HF Hub "audio-classification": (AudioClassificationPipeline, AudioClassificationInput), "automatic-speech-recognition": (AutomaticSpeechRecognitionPipeline, AutomaticSpeechRecognitionInput), + "depth-estimation": (DepthEstimationPipeline, DepthEstimationInput), + "image-classification": (ImageClassificationPipeline, ImageClassificationInput), + "image-segmentation": (ImageSegmentationPipeline, ImageSegmentationInput), + "image-to-text": (ImageToTextPipeline, ImageToTextInput), + "object-detection": (ObjectDetectionPipeline, ObjectDetectionInput), + "zero-shot-image-classification": (ZeroShotImageClassificationPipeline, ZeroShotImageClassificationInput), } for task, task_info in pipeline_test_mapping.items():