From 99e79054225c4547bb2870526a287320aef0bd32 Mon Sep 17 00:00:00 2001
From: Nicolas Patry <patry.nicolas@protonmail.com>
Date: Wed, 25 Jan 2023 10:16:31 +0100
Subject: [PATCH] Supporting `ImageProcessor` in place of `FeatureExtractor`
 for pipelines (#20851)

* Fixing the pipeline with image processor.

* Update the slow test.

* Using only the first image processor.

* Include exclusion mechanism for Image processor.

* Do not handle Gitconfig, deemed as a bug.

* Apply suggestions from code review

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Remove `conversational` changes. They are not supposed to be here.

* Address first row of comments.

* Remove OneFormer modifications.

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
---
 src/transformers/pipelines/__init__.py        | 32 +++++++++++++
 src/transformers/pipelines/base.py            | 12 ++++-
 .../pipelines/image_segmentation.py           | 20 +++++---
 src/transformers/utils/generic.py             | 13 +++--
 .../test_pipelines_audio_classification.py    |  2 +-
 ..._pipelines_automatic_speech_recognition.py |  2 +-
 tests/pipelines/test_pipelines_common.py      | 47 +++++++++++++++----
 .../test_pipelines_conversational.py          |  2 +-
 .../test_pipelines_depth_estimation.py        |  2 +-
 ...t_pipelines_document_question_answering.py |  2 +-
 .../test_pipelines_feature_extraction.py      |  2 +-
 tests/pipelines/test_pipelines_fill_mask.py   |  2 +-
 .../test_pipelines_image_classification.py    |  2 +-
 .../test_pipelines_image_segmentation.py      | 17 +++++--
 .../pipelines/test_pipelines_image_to_text.py |  2 +-
 .../test_pipelines_object_detection.py        |  2 +-
 .../test_pipelines_question_answering.py      |  2 +-
 .../pipelines/test_pipelines_summarization.py |  2 +-
 .../test_pipelines_text2text_generation.py    |  2 +-
 .../test_pipelines_text_classification.py     |  2 +-
 .../test_pipelines_text_generation.py         |  2 +-
 .../test_pipelines_token_classification.py    |  2 +-
 tests/pipelines/test_pipelines_translation.py |  2 +-
 .../test_pipelines_video_classification.py    |  2 +-
 ...est_pipelines_visual_question_answering.py |  2 +-
 tests/pipelines/test_pipelines_zero_shot.py   |  2 +-
 ...ipelines_zero_shot_image_classification.py |  2 +-
 ...st_pipelines_zero_shot_object_detection.py |  2 +-
 28 files changed, 138 insertions(+), 47 deletions(-)

diff --git a/src/transformers/pipelines/__init__.py b/src/transformers/pipelines/__init__.py
index fd231930f1..992f14f26d 100755
--- a/src/transformers/pipelines/__init__.py
+++ b/src/transformers/pipelines/__init__.py
@@ -31,8 +31,10 @@ from huggingface_hub import model_info
 from ..configuration_utils import PretrainedConfig
 from ..dynamic_module_utils import get_class_from_dynamic_module
 from ..feature_extraction_utils import PreTrainedFeatureExtractor
+from ..image_processing_utils import BaseImageProcessor
 from ..models.auto.configuration_auto import AutoConfig
 from ..models.auto.feature_extraction_auto import FEATURE_EXTRACTOR_MAPPING, AutoFeatureExtractor
+from ..models.auto.image_processing_auto import IMAGE_PROCESSOR_MAPPING, AutoImageProcessor
 from ..models.auto.modeling_auto import AutoModelForDepthEstimation
 from ..models.auto.tokenization_auto import TOKENIZER_MAPPING, AutoTokenizer
 from ..tokenization_utils import PreTrainedTokenizer
@@ -374,6 +376,7 @@ SUPPORTED_TASKS = {
 }
 
 NO_FEATURE_EXTRACTOR_TASKS = set()
+NO_IMAGE_PROCESSOR_TASKS = set()
 NO_TOKENIZER_TASKS = set()
 # Those model configs are special, they are generic over their task, meaning
 # any tokenizer/feature_extractor might be use for a given model so we cannot
@@ -383,6 +386,7 @@ MULTI_MODEL_CONFIGS = {"SpeechEncoderDecoderConfig", "VisionEncoderDecoderConfig
 for task, values in SUPPORTED_TASKS.items():
     if values["type"] == "text":
         NO_FEATURE_EXTRACTOR_TASKS.add(task)
+        NO_IMAGE_PROCESSOR_TASKS.add(task)
     elif values["type"] in {"audio", "image", "video"}:
         NO_TOKENIZER_TASKS.add(task)
     elif values["type"] != "multimodal":
@@ -482,6 +486,7 @@ def pipeline(
     config: Optional[Union[str, PretrainedConfig]] = None,
     tokenizer: Optional[Union[str, PreTrainedTokenizer, PreTrainedTokenizerFast]] = None,
     feature_extractor: Optional[Union[str, PreTrainedFeatureExtractor]] = None,
+    image_processor: Optional[Union[str, BaseImageProcessor]] = None,
     framework: Optional[str] = None,
     revision: Optional[str] = None,
     use_fast: bool = True,
@@ -766,6 +771,7 @@ def pipeline(
 
     load_tokenizer = type(model_config) in TOKENIZER_MAPPING or model_config.tokenizer_class is not None
     load_feature_extractor = type(model_config) in FEATURE_EXTRACTOR_MAPPING or feature_extractor is not None
+    load_image_processor = type(model_config) in IMAGE_PROCESSOR_MAPPING or image_processor is not None
 
     if (
         tokenizer is None
@@ -799,6 +805,8 @@ def pipeline(
 
     if task in NO_FEATURE_EXTRACTOR_TASKS:
         load_feature_extractor = False
+    if task in NO_IMAGE_PROCESSOR_TASKS:
+        load_image_processor = False
 
     if load_tokenizer:
         # Try to infer tokenizer from model or config name (if provided as str)
@@ -829,6 +837,27 @@ def pipeline(
                 tokenizer_identifier, use_fast=use_fast, _from_pipeline=task, **hub_kwargs, **tokenizer_kwargs
             )
 
+    if load_image_processor:
+        # Try to infer image processor from model or config name (if provided as str)
+        if image_processor is None:
+            if isinstance(model_name, str):
+                image_processor = model_name
+            elif isinstance(config, str):
+                image_processor = config
+            else:
+                # Impossible to guess what is the right image_processor here
+                raise Exception(
+                    "Impossible to guess which image processor to use. "
+                    "Please provide a PreTrainedImageProcessor class or a path/identifier "
+                    "to a pretrained image processor."
+                )
+
+        # Instantiate image_processor if needed
+        if isinstance(image_processor, (str, tuple)):
+            image_processor = AutoImageProcessor.from_pretrained(
+                image_processor, _from_pipeline=task, **hub_kwargs, **model_kwargs
+            )
+
     if load_feature_extractor:
         # Try to infer feature extractor from model or config name (if provided as str)
         if feature_extractor is None:
@@ -897,6 +926,9 @@ def pipeline(
     if torch_dtype is not None:
         kwargs["torch_dtype"] = torch_dtype
 
+    if image_processor is not None:
+        kwargs["image_processor"] = image_processor
+
     if device is not None:
         kwargs["device"] = device
 
diff --git a/src/transformers/pipelines/base.py b/src/transformers/pipelines/base.py
index 28d6ee1937..3905d28d26 100644
--- a/src/transformers/pipelines/base.py
+++ b/src/transformers/pipelines/base.py
@@ -31,6 +31,7 @@ from packaging import version
 
 from ..dynamic_module_utils import custom_object_save
 from ..feature_extraction_utils import PreTrainedFeatureExtractor
+from ..image_processing_utils import BaseImageProcessor
 from ..modelcard import ModelCard
 from ..models.auto.configuration_auto import AutoConfig
 from ..tokenization_utils import PreTrainedTokenizer
@@ -743,6 +744,7 @@ class Pipeline(_ScikitCompat):
         model: Union["PreTrainedModel", "TFPreTrainedModel"],
         tokenizer: Optional[PreTrainedTokenizer] = None,
         feature_extractor: Optional[PreTrainedFeatureExtractor] = None,
+        image_processor: Optional[BaseImageProcessor] = None,
         modelcard: Optional[ModelCard] = None,
         framework: Optional[str] = None,
         task: str = "",
@@ -759,6 +761,7 @@ class Pipeline(_ScikitCompat):
         self.model = model
         self.tokenizer = tokenizer
         self.feature_extractor = feature_extractor
+        self.image_processor = image_processor
         self.modelcard = modelcard
         self.framework = framework
         if is_torch_available() and self.framework == "pt":
@@ -1012,7 +1015,9 @@ class Pipeline(_ScikitCompat):
         if "TOKENIZERS_PARALLELISM" not in os.environ:
             logger.info("Disabling tokenizer parallelism, we're using DataLoader multithreading already")
             os.environ["TOKENIZERS_PARALLELISM"] = "false"
-        collate_fn = no_collate_fn if batch_size == 1 else pad_collate_fn(self.tokenizer, self.feature_extractor)
+        # TODO: hack, the image_processor stands in for a missing feature_extractor when building collate_fn
+        feature_extractor = self.feature_extractor if self.feature_extractor is not None else self.image_processor
+        collate_fn = no_collate_fn if batch_size == 1 else pad_collate_fn(self.tokenizer, feature_extractor)
         dataloader = DataLoader(dataset, num_workers=num_workers, batch_size=batch_size, collate_fn=collate_fn)
         model_iterator = PipelineIterator(dataloader, self.forward, forward_params, loader_batch_size=batch_size)
         final_iterator = PipelineIterator(model_iterator, self.postprocess, postprocess_params)
@@ -1121,7 +1126,10 @@ class ChunkPipeline(Pipeline):
             )
             num_workers = 1
         dataset = PipelineChunkIterator(inputs, self.preprocess, preprocess_params)
-        collate_fn = no_collate_fn if batch_size == 1 else pad_collate_fn(self.tokenizer, self.feature_extractor)
+
+        # TODO: hack, the image_processor stands in for a missing feature_extractor when building collate_fn
+        feature_extractor = self.feature_extractor if self.feature_extractor is not None else self.image_processor
+        collate_fn = no_collate_fn if batch_size == 1 else pad_collate_fn(self.tokenizer, feature_extractor)
         dataloader = DataLoader(dataset, num_workers=num_workers, batch_size=batch_size, collate_fn=collate_fn)
         model_iterator = PipelinePackIterator(dataloader, self.forward, forward_params, loader_batch_size=batch_size)
         final_iterator = PipelineIterator(model_iterator, self.postprocess, postprocess_params)
diff --git a/src/transformers/pipelines/image_segmentation.py b/src/transformers/pipelines/image_segmentation.py
index 9fdb0dc331..5be5b858dc 100644
--- a/src/transformers/pipelines/image_segmentation.py
+++ b/src/transformers/pipelines/image_segmentation.py
@@ -67,6 +67,12 @@ class ImageSegmentationPipeline(Pipeline):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
 
+        if self.image_processor is None and self.feature_extractor is not None:
+            # Backward compatible change, if users called
+            # ImageSegmentationPipeline(.., feature_extractor=MyFeatureExtractor())
+            # then we should keep working
+            self.image_processor = self.feature_extractor
+
         if self.framework == "tf":
             raise ValueError(f"The {self.__class__} is only available in PyTorch.")
 
@@ -137,7 +143,7 @@ class ImageSegmentationPipeline(Pipeline):
     def preprocess(self, image):
         image = load_image(image)
         target_size = [(image.height, image.width)]
-        inputs = self.feature_extractor(images=[image], return_tensors="pt")
+        inputs = self.image_processor(images=[image], return_tensors="pt")
         inputs["target_size"] = target_size
         return inputs
 
@@ -152,10 +158,10 @@ class ImageSegmentationPipeline(Pipeline):
     ):
 
         fn = None
-        if subtask in {"panoptic", None} and hasattr(self.feature_extractor, "post_process_panoptic_segmentation"):
-            fn = self.feature_extractor.post_process_panoptic_segmentation
-        elif subtask in {"instance", None} and hasattr(self.feature_extractor, "post_process_instance_segmentation"):
-            fn = self.feature_extractor.post_process_instance_segmentation
+        if subtask in {"panoptic", None} and hasattr(self.image_processor, "post_process_panoptic_segmentation"):
+            fn = self.image_processor.post_process_panoptic_segmentation
+        elif subtask in {"instance", None} and hasattr(self.image_processor, "post_process_instance_segmentation"):
+            fn = self.image_processor.post_process_instance_segmentation
 
         if fn is not None:
             outputs = fn(
@@ -176,8 +182,8 @@ class ImageSegmentationPipeline(Pipeline):
                 score = segment["score"]
                 annotation.append({"score": score, "label": label, "mask": mask})
 
-        elif subtask in {"semantic", None} and hasattr(self.feature_extractor, "post_process_semantic_segmentation"):
-            outputs = self.feature_extractor.post_process_semantic_segmentation(
+        elif subtask in {"semantic", None} and hasattr(self.image_processor, "post_process_semantic_segmentation"):
+            outputs = self.image_processor.post_process_semantic_segmentation(
                 model_outputs, target_sizes=model_outputs["target_size"]
             )[0]
 
diff --git a/src/transformers/utils/generic.py b/src/transformers/utils/generic.py
index b601d1e61b..d138e0c1d4 100644
--- a/src/transformers/utils/generic.py
+++ b/src/transformers/utils/generic.py
@@ -29,9 +29,6 @@ import numpy as np
 from .import_utils import is_flax_available, is_tf_available, is_torch_available, is_torch_fx_proxy
 
 
-if is_tf_available():
-    import tensorflow as tf
-
 if is_flax_available():
     import jax.numpy as jnp
 
@@ -437,6 +434,8 @@ def transpose(array, axes=None):
     elif is_torch_tensor(array):
         return array.T if axes is None else array.permute(*axes)
     elif is_tf_tensor(array):
+        import tensorflow as tf
+
         return tf.transpose(array, perm=axes)
     elif is_jax_tensor(array):
         return jnp.transpose(array, axes=axes)
@@ -454,6 +453,8 @@ def reshape(array, newshape):
     elif is_torch_tensor(array):
         return array.reshape(*newshape)
     elif is_tf_tensor(array):
+        import tensorflow as tf
+
         return tf.reshape(array, newshape)
     elif is_jax_tensor(array):
         return jnp.reshape(array, newshape)
@@ -471,6 +472,8 @@ def squeeze(array, axis=None):
     elif is_torch_tensor(array):
         return array.squeeze() if axis is None else array.squeeze(dim=axis)
     elif is_tf_tensor(array):
+        import tensorflow as tf
+
         return tf.squeeze(array, axis=axis)
     elif is_jax_tensor(array):
         return jnp.squeeze(array, axis=axis)
@@ -488,6 +491,8 @@ def expand_dims(array, axis):
     elif is_torch_tensor(array):
         return array.unsqueeze(dim=axis)
     elif is_tf_tensor(array):
+        import tensorflow as tf
+
         return tf.expand_dims(array, axis=axis)
     elif is_jax_tensor(array):
         return jnp.expand_dims(array, axis=axis)
@@ -504,6 +509,8 @@ def tensor_size(array):
     elif is_torch_tensor(array):
         return array.numel()
     elif is_tf_tensor(array):
+        import tensorflow as tf
+
         return tf.size(array)
     elif is_jax_tensor(array):
         return array.size
diff --git a/tests/pipelines/test_pipelines_audio_classification.py b/tests/pipelines/test_pipelines_audio_classification.py
index 3f957132fd..2eccf8e6c3 100644
--- a/tests/pipelines/test_pipelines_audio_classification.py
+++ b/tests/pipelines/test_pipelines_audio_classification.py
@@ -27,7 +27,7 @@ from .test_pipelines_common import ANY, PipelineTestCaseMeta
 class AudioClassificationPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):
     model_mapping = MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING
 
-    def get_test_pipeline(self, model, tokenizer, feature_extractor):
+    def get_test_pipeline(self, model, tokenizer, feature_extractor, image_processor):
         audio_classifier = AudioClassificationPipeline(model=model, feature_extractor=feature_extractor)
 
         # test with a raw waveform
diff --git a/tests/pipelines/test_pipelines_automatic_speech_recognition.py b/tests/pipelines/test_pipelines_automatic_speech_recognition.py
index 3a5dcc7f43..a1204eb9f9 100644
--- a/tests/pipelines/test_pipelines_automatic_speech_recognition.py
+++ b/tests/pipelines/test_pipelines_automatic_speech_recognition.py
@@ -61,7 +61,7 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase, metaclass=Pipel
         + (MODEL_FOR_CTC_MAPPING.items() if MODEL_FOR_CTC_MAPPING else [])
     }
 
-    def get_test_pipeline(self, model, tokenizer, feature_extractor):
+    def get_test_pipeline(self, model, tokenizer, feature_extractor, image_processor):
         if tokenizer is None:
             # Side effect of no Fast Tokenizer class for these model, so skipping
             # But the slow tokenizer test should still run as they're quite small
diff --git a/tests/pipelines/test_pipelines_common.py b/tests/pipelines/test_pipelines_common.py
index 8da9ed89a6..aad1645830 100644
--- a/tests/pipelines/test_pipelines_common.py
+++ b/tests/pipelines/test_pipelines_common.py
@@ -33,8 +33,10 @@ from huggingface_hub import HfFolder, Repository, create_repo, delete_repo, set_
 from requests.exceptions import HTTPError
 from transformers import (
     FEATURE_EXTRACTOR_MAPPING,
+    IMAGE_PROCESSOR_MAPPING,
     TOKENIZER_MAPPING,
     AutoFeatureExtractor,
+    AutoImageProcessor,
     AutoModelForSequenceClassification,
     AutoTokenizer,
     DistilBertForSequenceClassification,
@@ -154,8 +156,6 @@ def get_tiny_feature_extractor_from_checkpoint(checkpoint, tiny_config, feature_
                 feature_extractor = None
         except Exception:
             feature_extractor = None
-    if hasattr(tiny_config, "image_size") and feature_extractor:
-        feature_extractor = feature_extractor.__class__(size=tiny_config.image_size, crop_size=tiny_config.image_size)
 
     # Audio Spectogram Transformer specific.
     if feature_extractor.__class__.__name__ == "ASTFeatureExtractor":
@@ -168,9 +168,28 @@ def get_tiny_feature_extractor_from_checkpoint(checkpoint, tiny_config, feature_
         feature_extractor = feature_extractor.__class__(
             feature_size=tiny_config.input_feat_per_channel, num_mel_bins=tiny_config.input_feat_per_channel
         )
+    # TODO remove this, once those have been moved to `image_processor`.
+    if hasattr(tiny_config, "image_size") and feature_extractor:
+        feature_extractor = feature_extractor.__class__(size=tiny_config.image_size, crop_size=tiny_config.image_size)
     return feature_extractor
 
 
+def get_tiny_image_processor_from_checkpoint(checkpoint, tiny_config, image_processor_class):
+    try:  # first choice: let AutoImageProcessor resolve the processor from the checkpoint
+        image_processor = AutoImageProcessor.from_pretrained(checkpoint)
+    except Exception:  # best-effort: any loading failure falls back to the class passed in
+        try:
+            if image_processor_class is not None:
+                image_processor = image_processor_class()  # default-constructed, no arguments
+            else:
+                image_processor = None
+        except Exception:  # default construction failed too: return None to the caller
+            image_processor = None
+    if hasattr(tiny_config, "image_size") and image_processor:  # re-create with sizes taken from the tiny config
+        image_processor = image_processor.__class__(size=tiny_config.image_size, crop_size=tiny_config.image_size)
+    return image_processor  # may be None when no image processor could be built
+
+
 class ANY:
     def __init__(self, *_types):
         self._types = _types
@@ -184,7 +203,9 @@ class ANY:
 
 class PipelineTestCaseMeta(type):
     def __new__(mcs, name, bases, dct):
-        def gen_test(ModelClass, checkpoint, tiny_config, tokenizer_class, feature_extractor_class):
+        def gen_test(
+            ModelClass, checkpoint, tiny_config, tokenizer_class, feature_extractor_class, image_processor_class
+        ):
             @skipIf(
                 tiny_config is None,
                 "TinyConfig does not exist, make sure that you defined a `_CONFIG_FOR_DOC` variable in the modeling"
@@ -231,16 +252,21 @@ class PipelineTestCaseMeta(type):
                         self.skipTest(f"Ignoring {ModelClass}, cannot create a simple tokenizer")
                 else:
                     tokenizer = None
+
                 feature_extractor = get_tiny_feature_extractor_from_checkpoint(
                     checkpoint, tiny_config, feature_extractor_class
                 )
 
-                if tokenizer is None and feature_extractor is None:
+                image_processor = get_tiny_image_processor_from_checkpoint(
+                    checkpoint, tiny_config, image_processor_class
+                )
+
+                if tokenizer is None and feature_extractor is None and image_processor is None:
                     self.skipTest(
-                        f"Ignoring {ModelClass}, cannot create a tokenizer or feature_extractor (PerceiverConfig with"
-                        " no FastTokenizer ?)"
+                        f"Ignoring {ModelClass}, cannot create a tokenizer or feature_extractor or image_processor"
+                        " (PerceiverConfig with no FastTokenizer ?)"
                     )
-                pipeline, examples = self.get_test_pipeline(model, tokenizer, feature_extractor)
+                pipeline, examples = self.get_test_pipeline(model, tokenizer, feature_extractor, image_processor)
                 if pipeline is None:
                     # The test can disable itself, but it should be very marginal
                     # Concerns: Wav2Vec2ForCTC without tokenizer test (FastTokenizer don't exist)
@@ -283,6 +309,10 @@ class PipelineTestCaseMeta(type):
                         feature_extractor_name = (
                             feature_extractor_class.__name__ if feature_extractor_class else "nofeature_extractor"
                         )
+                        image_processor_class = IMAGE_PROCESSOR_MAPPING.get(configuration, None)
+                        image_processor_name = (
+                            image_processor_class.__name__ if image_processor_class else "noimage_processor"
+                        )
                         if not tokenizer_classes:
                             # We need to test even if there are no tokenizers.
                             tokenizer_classes = [None]
@@ -300,7 +330,7 @@ class PipelineTestCaseMeta(type):
                             else:
                                 tokenizer_name = "notokenizer"
 
-                            test_name = f"test_{prefix}_{configuration.__name__}_{model_architecture.__name__}_{tokenizer_name}_{feature_extractor_name}"
+                            test_name = f"test_{prefix}_{configuration.__name__}_{model_architecture.__name__}_{tokenizer_name}_{feature_extractor_name}_{image_processor_name}"
 
                             if tokenizer_class is not None or feature_extractor_class is not None:
                                 dct[test_name] = gen_test(
@@ -309,6 +339,7 @@ class PipelineTestCaseMeta(type):
                                     tiny_config,
                                     tokenizer_class,
                                     feature_extractor_class,
+                                    image_processor_class,
                                 )
 
         @abstractmethod
diff --git a/tests/pipelines/test_pipelines_conversational.py b/tests/pipelines/test_pipelines_conversational.py
index 39ad2175dc..a3a2c3b694 100644
--- a/tests/pipelines/test_pipelines_conversational.py
+++ b/tests/pipelines/test_pipelines_conversational.py
@@ -53,7 +53,7 @@ class ConversationalPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseM
         else []
     )
 
-    def get_test_pipeline(self, model, tokenizer, feature_extractor):
+    def get_test_pipeline(self, model, tokenizer, feature_extractor, image_processor):
         conversation_agent = ConversationalPipeline(model=model, tokenizer=tokenizer)
         return conversation_agent, [Conversation("Hi there!")]
 
diff --git a/tests/pipelines/test_pipelines_depth_estimation.py b/tests/pipelines/test_pipelines_depth_estimation.py
index d42ba2a067..593cd940a9 100644
--- a/tests/pipelines/test_pipelines_depth_estimation.py
+++ b/tests/pipelines/test_pipelines_depth_estimation.py
@@ -47,7 +47,7 @@ class DepthEstimationPipelineTests(unittest.TestCase, metaclass=PipelineTestCase
 
     model_mapping = MODEL_FOR_DEPTH_ESTIMATION_MAPPING
 
-    def get_test_pipeline(self, model, tokenizer, feature_extractor):
+    def get_test_pipeline(self, model, tokenizer, feature_extractor, image_processor):
         depth_estimator = DepthEstimationPipeline(model=model, feature_extractor=feature_extractor)
         return depth_estimator, [
             "./tests/fixtures/tests_samples/COCO/000000039769.png",
diff --git a/tests/pipelines/test_pipelines_document_question_answering.py b/tests/pipelines/test_pipelines_document_question_answering.py
index c73decda0a..8ace9cfe2b 100644
--- a/tests/pipelines/test_pipelines_document_question_answering.py
+++ b/tests/pipelines/test_pipelines_document_question_answering.py
@@ -59,7 +59,7 @@ class DocumentQuestionAnsweringPipelineTests(unittest.TestCase, metaclass=Pipeli
 
     @require_pytesseract
     @require_vision
-    def get_test_pipeline(self, model, tokenizer, feature_extractor):
+    def get_test_pipeline(self, model, tokenizer, feature_extractor, image_processor):
         dqa_pipeline = pipeline(
             "document-question-answering", model=model, tokenizer=tokenizer, feature_extractor=feature_extractor
         )
diff --git a/tests/pipelines/test_pipelines_feature_extraction.py b/tests/pipelines/test_pipelines_feature_extraction.py
index 28cde51a8e..4aa58873e8 100644
--- a/tests/pipelines/test_pipelines_feature_extraction.py
+++ b/tests/pipelines/test_pipelines_feature_extraction.py
@@ -175,7 +175,7 @@ class FeatureExtractionPipelineTests(unittest.TestCase, metaclass=PipelineTestCa
             raise ValueError("We expect lists of floats, nothing else")
         return shape
 
-    def get_test_pipeline(self, model, tokenizer, feature_extractor):
+    def get_test_pipeline(self, model, tokenizer, feature_extractor, image_processor):
         if tokenizer is None:
             self.skipTest("No tokenizer")
             return
diff --git a/tests/pipelines/test_pipelines_fill_mask.py b/tests/pipelines/test_pipelines_fill_mask.py
index 760c475524..a19902a61d 100644
--- a/tests/pipelines/test_pipelines_fill_mask.py
+++ b/tests/pipelines/test_pipelines_fill_mask.py
@@ -206,7 +206,7 @@ class FillMaskPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):
         unmasker.tokenizer.pad_token = None
         self.run_pipeline_test(unmasker, [])
 
-    def get_test_pipeline(self, model, tokenizer, feature_extractor):
+    def get_test_pipeline(self, model, tokenizer, feature_extractor, image_processor):
         if tokenizer is None or tokenizer.mask_token_id is None:
             self.skipTest("The provided tokenizer has no mask token, (probably reformer or wav2vec2)")
 
diff --git a/tests/pipelines/test_pipelines_image_classification.py b/tests/pipelines/test_pipelines_image_classification.py
index 8c83888ba0..90612d21b7 100644
--- a/tests/pipelines/test_pipelines_image_classification.py
+++ b/tests/pipelines/test_pipelines_image_classification.py
@@ -49,7 +49,7 @@ class ImageClassificationPipelineTests(unittest.TestCase, metaclass=PipelineTest
     model_mapping = MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING
     tf_model_mapping = TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING
 
-    def get_test_pipeline(self, model, tokenizer, feature_extractor):
+    def get_test_pipeline(self, model, tokenizer, feature_extractor, image_processor):
         image_classifier = ImageClassificationPipeline(model=model, feature_extractor=feature_extractor, top_k=2)
         examples = [
             Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png"),
diff --git a/tests/pipelines/test_pipelines_image_segmentation.py b/tests/pipelines/test_pipelines_image_segmentation.py
index 889a4ba041..8f022e68da 100644
--- a/tests/pipelines/test_pipelines_image_segmentation.py
+++ b/tests/pipelines/test_pipelines_image_segmentation.py
@@ -26,6 +26,7 @@ from transformers import (
     MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING,
     MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING,
     AutoFeatureExtractor,
+    AutoImageProcessor,
     AutoModelForImageSegmentation,
     AutoModelForInstanceSegmentation,
     DetrForSegmentation,
@@ -80,8 +81,10 @@ class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCa
         + (MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING.items() if MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING else [])
     }
 
-    def get_test_pipeline(self, model, tokenizer, feature_extractor):
-        image_segmenter = ImageSegmentationPipeline(model=model, feature_extractor=feature_extractor)
+    def get_test_pipeline(self, model, tokenizer, feature_extractor, image_processor):
+        image_segmenter = ImageSegmentationPipeline(
+            model=model, feature_extractor=feature_extractor, image_processor=image_processor
+        )
         return image_segmenter, [
             "./tests/fixtures/tests_samples/COCO/000000039769.png",
             "./tests/fixtures/tests_samples/COCO/000000039769.png",
@@ -139,7 +142,11 @@ class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCa
             "./tests/fixtures/tests_samples/COCO/000000039769.png",
         ]
         outputs = image_segmenter(
-            batch, threshold=0.0, mask_threshold=0, overlap_mask_area_threshold=0, batch_size=batch_size
+            batch,
+            threshold=0.0,
+            mask_threshold=0,
+            overlap_mask_area_threshold=0,
+            batch_size=batch_size,
         )
         self.assertEqual(len(batch), len(outputs))
         self.assertEqual(len(outputs[0]), n)
@@ -188,10 +195,10 @@ class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCa
         model_id = "hf-internal-testing/tiny-detr-mobilenetsv3-panoptic"
 
         model = AutoModelForImageSegmentation.from_pretrained(model_id)
-        feature_extractor = AutoFeatureExtractor.from_pretrained(model_id)
+        image_processor = AutoImageProcessor.from_pretrained(model_id)
         image_segmenter = ImageSegmentationPipeline(
             model=model,
-            feature_extractor=feature_extractor,
+            image_processor=image_processor,
             subtask="panoptic",
             threshold=0.0,
             mask_threshold=0.0,
diff --git a/tests/pipelines/test_pipelines_image_to_text.py b/tests/pipelines/test_pipelines_image_to_text.py
index 0e1e805f9b..c6fdaa3102 100644
--- a/tests/pipelines/test_pipelines_image_to_text.py
+++ b/tests/pipelines/test_pipelines_image_to_text.py
@@ -36,7 +36,7 @@ class ImageToTextPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta
     model_mapping = MODEL_FOR_VISION_2_SEQ_MAPPING
     tf_model_mapping = TF_MODEL_FOR_VISION_2_SEQ_MAPPING
 
-    def get_test_pipeline(self, model, tokenizer, feature_extractor):
+    def get_test_pipeline(self, model, tokenizer, feature_extractor, image_processor):
         pipe = pipeline("image-to-text", model=model, tokenizer=tokenizer, feature_extractor=feature_extractor)
         examples = [
             Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png"),
diff --git a/tests/pipelines/test_pipelines_object_detection.py b/tests/pipelines/test_pipelines_object_detection.py
index 043e73cb6e..caf9001ece 100644
--- a/tests/pipelines/test_pipelines_object_detection.py
+++ b/tests/pipelines/test_pipelines_object_detection.py
@@ -51,7 +51,7 @@ else:
 class ObjectDetectionPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):
     model_mapping = MODEL_FOR_OBJECT_DETECTION_MAPPING
 
-    def get_test_pipeline(self, model, tokenizer, feature_extractor):
+    def get_test_pipeline(self, model, tokenizer, feature_extractor, image_processor):
         object_detector = ObjectDetectionPipeline(model=model, feature_extractor=feature_extractor)
         return object_detector, ["./tests/fixtures/tests_samples/COCO/000000039769.png"]
 
diff --git a/tests/pipelines/test_pipelines_question_answering.py b/tests/pipelines/test_pipelines_question_answering.py
index 496b1685d9..84447ac230 100644
--- a/tests/pipelines/test_pipelines_question_answering.py
+++ b/tests/pipelines/test_pipelines_question_answering.py
@@ -31,7 +31,7 @@ class QAPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):
     model_mapping = MODEL_FOR_QUESTION_ANSWERING_MAPPING
     tf_model_mapping = TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING
 
-    def get_test_pipeline(self, model, tokenizer, feature_extractor):
+    def get_test_pipeline(self, model, tokenizer, feature_extractor, image_processor):
         if isinstance(model.config, LxmertConfig):
             # This is an bimodal model, we need to find a more consistent way
             # to switch on those models.
diff --git a/tests/pipelines/test_pipelines_summarization.py b/tests/pipelines/test_pipelines_summarization.py
index 781716b5ba..aa8cd86fb8 100644
--- a/tests/pipelines/test_pipelines_summarization.py
+++ b/tests/pipelines/test_pipelines_summarization.py
@@ -34,7 +34,7 @@ class SummarizationPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMe
     model_mapping = MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING
     tf_model_mapping = TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING
 
-    def get_test_pipeline(self, model, tokenizer, feature_extractor):
+    def get_test_pipeline(self, model, tokenizer, feature_extractor, image_processor):
         summarizer = SummarizationPipeline(model=model, tokenizer=tokenizer)
         return summarizer, ["(CNN)The Palestinian Authority officially became", "Some other text"]
 
diff --git a/tests/pipelines/test_pipelines_text2text_generation.py b/tests/pipelines/test_pipelines_text2text_generation.py
index 772190fb63..4fe9e6d150 100644
--- a/tests/pipelines/test_pipelines_text2text_generation.py
+++ b/tests/pipelines/test_pipelines_text2text_generation.py
@@ -34,7 +34,7 @@ class Text2TextGenerationPipelineTests(unittest.TestCase, metaclass=PipelineTest
     model_mapping = MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING
     tf_model_mapping = TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING
 
-    def get_test_pipeline(self, model, tokenizer, feature_extractor):
+    def get_test_pipeline(self, model, tokenizer, feature_extractor, image_processor):
         generator = Text2TextGenerationPipeline(model=model, tokenizer=tokenizer)
         return generator, ["Something to write", "Something else"]
 
diff --git a/tests/pipelines/test_pipelines_text_classification.py b/tests/pipelines/test_pipelines_text_classification.py
index 80e8e2559f..849751b917 100644
--- a/tests/pipelines/test_pipelines_text_classification.py
+++ b/tests/pipelines/test_pipelines_text_classification.py
@@ -129,7 +129,7 @@ class TextClassificationPipelineTests(unittest.TestCase, metaclass=PipelineTestC
         outputs = text_classifier("Birds are a type of animal")
         self.assertEqual(nested_simplify(outputs), [{"label": "POSITIVE", "score": 0.988}])
 
-    def get_test_pipeline(self, model, tokenizer, feature_extractor):
+    def get_test_pipeline(self, model, tokenizer, feature_extractor, image_processor):
         text_classifier = TextClassificationPipeline(model=model, tokenizer=tokenizer)
         return text_classifier, ["HuggingFace is in", "This is another test"]
 
diff --git a/tests/pipelines/test_pipelines_text_generation.py b/tests/pipelines/test_pipelines_text_generation.py
index 922a4e24b2..5dc3e5e34f 100644
--- a/tests/pipelines/test_pipelines_text_generation.py
+++ b/tests/pipelines/test_pipelines_text_generation.py
@@ -143,7 +143,7 @@ class TextGenerationPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseM
             ],
         )
 
-    def get_test_pipeline(self, model, tokenizer, feature_extractor):
+    def get_test_pipeline(self, model, tokenizer, feature_extractor, image_processor):
         text_generator = TextGenerationPipeline(model=model, tokenizer=tokenizer)
         return text_generator, ["This is a test", "Another test"]
 
diff --git a/tests/pipelines/test_pipelines_token_classification.py b/tests/pipelines/test_pipelines_token_classification.py
index 2e44448e13..1999be6494 100644
--- a/tests/pipelines/test_pipelines_token_classification.py
+++ b/tests/pipelines/test_pipelines_token_classification.py
@@ -37,7 +37,7 @@ class TokenClassificationPipelineTests(unittest.TestCase, metaclass=PipelineTest
     model_mapping = MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING
     tf_model_mapping = TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING
 
-    def get_test_pipeline(self, model, tokenizer, feature_extractor):
+    def get_test_pipeline(self, model, tokenizer, feature_extractor, image_processor):
         token_classifier = TokenClassificationPipeline(model=model, tokenizer=tokenizer)
         return token_classifier, ["A simple string", "A simple string that is quite a bit longer"]
 
diff --git a/tests/pipelines/test_pipelines_translation.py b/tests/pipelines/test_pipelines_translation.py
index d8de606f69..3fc19a9064 100644
--- a/tests/pipelines/test_pipelines_translation.py
+++ b/tests/pipelines/test_pipelines_translation.py
@@ -34,7 +34,7 @@ class TranslationPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta
     model_mapping = MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING
     tf_model_mapping = TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING
 
-    def get_test_pipeline(self, model, tokenizer, feature_extractor):
+    def get_test_pipeline(self, model, tokenizer, feature_extractor, image_processor):
         if isinstance(model.config, MBartConfig):
             src_lang, tgt_lang = list(tokenizer.lang_code_to_id.keys())[:2]
             translator = TranslationPipeline(model=model, tokenizer=tokenizer, src_lang=src_lang, tgt_lang=tgt_lang)
diff --git a/tests/pipelines/test_pipelines_video_classification.py b/tests/pipelines/test_pipelines_video_classification.py
index 25ddcfaf2d..2e5a777fd5 100644
--- a/tests/pipelines/test_pipelines_video_classification.py
+++ b/tests/pipelines/test_pipelines_video_classification.py
@@ -35,7 +35,7 @@ from .test_pipelines_common import ANY, PipelineTestCaseMeta
 class VideoClassificationPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):
     model_mapping = MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING
 
-    def get_test_pipeline(self, model, tokenizer, feature_extractor):
+    def get_test_pipeline(self, model, tokenizer, feature_extractor, image_processor):
         example_video_filepath = hf_hub_download(
             repo_id="nateraw/video-demo", filename="archery.mp4", repo_type="dataset"
         )
diff --git a/tests/pipelines/test_pipelines_visual_question_answering.py b/tests/pipelines/test_pipelines_visual_question_answering.py
index bf3a532b10..796b39a267 100644
--- a/tests/pipelines/test_pipelines_visual_question_answering.py
+++ b/tests/pipelines/test_pipelines_visual_question_answering.py
@@ -36,7 +36,7 @@ else:
 class VisualQuestionAnsweringPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):
     model_mapping = MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING
 
-    def get_test_pipeline(self, model, tokenizer, feature_extractor):
+    def get_test_pipeline(self, model, tokenizer, feature_extractor, image_processor):
         vqa_pipeline = pipeline("visual-question-answering", model="hf-internal-testing/tiny-vilt-random-vqa")
         examples = [
             {
diff --git a/tests/pipelines/test_pipelines_zero_shot.py b/tests/pipelines/test_pipelines_zero_shot.py
index 5c78db1aa9..57b77d791b 100644
--- a/tests/pipelines/test_pipelines_zero_shot.py
+++ b/tests/pipelines/test_pipelines_zero_shot.py
@@ -30,7 +30,7 @@ class ZeroShotClassificationPipelineTests(unittest.TestCase, metaclass=PipelineT
     model_mapping = MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING
     tf_model_mapping = TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING
 
-    def get_test_pipeline(self, model, tokenizer, feature_extractor):
+    def get_test_pipeline(self, model, tokenizer, feature_extractor, image_processor):
         classifier = ZeroShotClassificationPipeline(
             model=model, tokenizer=tokenizer, candidate_labels=["polics", "health"]
         )
diff --git a/tests/pipelines/test_pipelines_zero_shot_image_classification.py b/tests/pipelines/test_pipelines_zero_shot_image_classification.py
index d0396f4e9a..41451f9386 100644
--- a/tests/pipelines/test_pipelines_zero_shot_image_classification.py
+++ b/tests/pipelines/test_pipelines_zero_shot_image_classification.py
@@ -37,7 +37,7 @@ class ZeroShotImageClassificationPipelineTests(unittest.TestCase, metaclass=Pipe
     # and only CLIP would be there for now.
     # model_mapping = {CLIPConfig: CLIPModel}
 
-    # def get_test_pipeline(self, model, tokenizer, feature_extractor):
+    # def get_test_pipeline(self, model, tokenizer, feature_extractor, image_processor):
     #     if tokenizer is None:
     #         # Side effect of no Fast Tokenizer class for these model, so skipping
     #         # But the slow tokenizer test should still run as they're quite small
diff --git a/tests/pipelines/test_pipelines_zero_shot_object_detection.py b/tests/pipelines/test_pipelines_zero_shot_object_detection.py
index c48b8c381d..304a9ca020 100644
--- a/tests/pipelines/test_pipelines_zero_shot_object_detection.py
+++ b/tests/pipelines/test_pipelines_zero_shot_object_detection.py
@@ -36,7 +36,7 @@ class ZeroShotObjectDetectionPipelineTests(unittest.TestCase, metaclass=Pipeline
 
     model_mapping = MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING
 
-    def get_test_pipeline(self, model, tokenizer, feature_extractor):
+    def get_test_pipeline(self, model, tokenizer, feature_extractor, image_processor):
         object_detector = pipeline(
             "zero-shot-object-detection", model="hf-internal-testing/tiny-random-owlvit-object-detection"
         )