Vision processors - replace FE with IPs (#20590)

* Replace FE references with IPs
* Update processor tests
* Update src/transformers/models/clip/processing_clip.py
  Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
* Update src/transformers/models/clip/processing_clip.py
  Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
* Update warning messages v4.27 -> v5
* Fixup
* Update Chinese CLIP processor
* Add feature_extractor property
* Add attributes
* Add tests

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2022-12-09 10:48:34 +00:00
parent 704027f0ef
commit a95fd35426
22 changed files with 681 additions and 375 deletions
--- a/src/transformers/models/auto/processing_auto.py
+++ b/src/transformers/models/auto/processing_auto.py
@@ -287,8 +287,8 @@ class AutoProcessor:
        raise ValueError(
            f"Unrecognized processing class in {pretrained_model_name_or_path}. Can't instantiate a processor, a "
-            "tokenizer or a feature extractor for this model. Make sure the repository contains the files of at least "
+            "tokenizer, an image processor or a feature extractor for this model. Make sure the repository contains "
-            "one of those processing classes."
+            "the files of at least one of those processing classes."
        )
    @staticmethod
--- a/src/transformers/models/chinese_clip/processing_chinese_clip.py
+++ b/src/transformers/models/chinese_clip/processing_chinese_clip.py
@@ -15,39 +15,56 @@
 """
 Image/Text processor class for Chinese-CLIP
 """
 import warnings
 from ...processing_utils import ProcessorMixin
 from ...tokenization_utils_base import BatchEncoding
 class ChineseCLIPProcessor(ProcessorMixin):
    r"""
-    Constructs a Chinese-CLIP processor which wraps a Chinese-CLIP feature extractor and a Chinese-CLIP tokenizer into
+    Constructs a Chinese-CLIP processor which wraps a Chinese-CLIP image processor and a Chinese-CLIP tokenizer into a
-    a single processor.
+    single processor.
-    [`ChineseCLIPProcessor`] offers all the functionalities of [`ChineseCLIPFeatureExtractor`] and
+    [`ChineseCLIPProcessor`] offers all the functionalities of [`ChineseCLIPImageProcessor`] and [`BertTokenizerFast`].
-    [`BertTokenizerFast`]. See the [`~ChineseCLIPProcessor.__call__`] and [`~ChineseCLIPProcessor.decode`] for more
+    See the [`~ChineseCLIPProcessor.__call__`] and [`~ChineseCLIPProcessor.decode`] for more information.
-    information.
    Args:
-        feature_extractor ([`ChineseCLIPFeatureExtractor`]):
+        image_processor ([`ChineseCLIPImageProcessor`]):
-            The feature extractor is a required input.
+            The image processor is a required input.
        tokenizer ([`BertTokenizerFast`]):
            The tokenizer is a required input.
    """
-    feature_extractor_class = "ChineseCLIPFeatureExtractor"
+    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "ChineseCLIPImageProcessor"
    tokenizer_class = ("BertTokenizer", "BertTokenizerFast")
-    def __init__(self, feature_extractor, tokenizer):
+    def __init__(self, image_processor=None, tokenizer=None, **kwargs):
-        super().__init__(feature_extractor, tokenizer)
+        if "feature_extractor" in kwargs:
-        self.current_processor = self.feature_extractor
+            warnings.warn(
                "The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`"
                " instead.",
                FutureWarning,
            )
            feature_extractor = kwargs.pop("feature_extractor")
        image_processor = image_processor if image_processor is not None else feature_extractor
        if image_processor is None:
            raise ValueError("You need to specify an `image_processor`.")
        if tokenizer is None:
            raise ValueError("You need to specify a `tokenizer`.")
        super().__init__(image_processor, tokenizer)
        self.current_processor = self.image_processor
    def __call__(self, text=None, images=None, return_tensors=None, **kwargs):
        """
        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
        and `kwargs` arguments to BertTokenizerFast's [`~BertTokenizerFast.__call__`] if `text` is not `None` to encode
        the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
-        CLIPFeatureExtractor's [`~CLIPFeatureExtractor.__call__`] if `images` is not `None`. Please refer to the
+        CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
-        doctsring of the above two methods for more information.
+        of the above two methods for more information.
        Args:
            text (`str`, `List[str]`, `List[List[str]]`):
@@ -84,7 +101,7 @@ class ChineseCLIPProcessor(ProcessorMixin):
            encoding = self.tokenizer(text, return_tensors=return_tensors, **kwargs)
        if images is not None:
-            image_features = self.feature_extractor(images, return_tensors=return_tensors, **kwargs)
+            image_features = self.image_processor(images, return_tensors=return_tensors, **kwargs)
        if text is not None and images is not None:
            encoding["pixel_values"] = image_features.pixel_values
@@ -111,5 +128,13 @@ class ChineseCLIPProcessor(ProcessorMixin):
    @property
    def model_input_names(self):
        tokenizer_input_names = self.tokenizer.model_input_names
-        feature_extractor_input_names = self.feature_extractor.model_input_names
+        image_processor_input_names = self.image_processor.model_input_names
-        return list(dict.fromkeys(tokenizer_input_names + feature_extractor_input_names))
+        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
    @property
    def feature_extractor_class(self):
        warnings.warn(
            "`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead.",
            FutureWarning,
        )
        return self.image_processor_class
--- a/src/transformers/models/clip/processing_clip.py
+++ b/src/transformers/models/clip/processing_clip.py
@@ -15,37 +15,54 @@
 """
 Image/Text processor class for CLIP
 """
 import warnings
 from ...processing_utils import ProcessorMixin
 from ...tokenization_utils_base import BatchEncoding
 class CLIPProcessor(ProcessorMixin):
    r"""
-    Constructs a CLIP processor which wraps a CLIP feature extractor and a CLIP tokenizer into a single processor.
+    Constructs a CLIP processor which wraps a CLIP image processor and a CLIP tokenizer into a single processor.
-    [`CLIPProcessor`] offers all the functionalities of [`CLIPFeatureExtractor`] and [`CLIPTokenizerFast`]. See the
+    [`CLIPProcessor`] offers all the functionalities of [`CLIPImageProcessor`] and [`CLIPTokenizerFast`]. See the
    [`~CLIPProcessor.__call__`] and [`~CLIPProcessor.decode`] for more information.
    Args:
-        feature_extractor ([`CLIPFeatureExtractor`]):
+        image_processor ([`CLIPImageProcessor`]):
-            The feature extractor is a required input.
+            The image processor is a required input.
        tokenizer ([`CLIPTokenizerFast`]):
            The tokenizer is a required input.
    """
-    feature_extractor_class = "CLIPFeatureExtractor"
+    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "CLIPImageProcessor"
    tokenizer_class = ("CLIPTokenizer", "CLIPTokenizerFast")
-    def __init__(self, feature_extractor, tokenizer):
+    def __init__(self, image_processor=None, tokenizer=None, **kwargs):
-        super().__init__(feature_extractor, tokenizer)
+        if "feature_extractor" in kwargs:
-        self.current_processor = self.feature_extractor
+            warnings.warn(
                "The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`"
                " instead.",
                FutureWarning,
            )
            feature_extractor = kwargs.pop("feature_extractor")
        image_processor = image_processor if image_processor is not None else feature_extractor
        if image_processor is None:
            raise ValueError("You need to specify an `image_processor`.")
        if tokenizer is None:
            raise ValueError("You need to specify a `tokenizer`.")
        super().__init__(image_processor, tokenizer)
    def __call__(self, text=None, images=None, return_tensors=None, **kwargs):
        """
        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
        and `kwargs` arguments to CLIPTokenizerFast's [`~CLIPTokenizerFast.__call__`] if `text` is not `None` to encode
        the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
-        CLIPFeatureExtractor's [`~CLIPFeatureExtractor.__call__`] if `images` is not `None`. Please refer to the
+        CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
-        doctsring of the above two methods for more information.
+        of the above two methods for more information.
        Args:
            text (`str`, `List[str]`, `List[List[str]]`):
@@ -82,7 +99,7 @@ class CLIPProcessor(ProcessorMixin):
            encoding = self.tokenizer(text, return_tensors=return_tensors, **kwargs)
        if images is not None:
-            image_features = self.feature_extractor(images, return_tensors=return_tensors, **kwargs)
+            image_features = self.image_processor(images, return_tensors=return_tensors, **kwargs)
        if text is not None and images is not None:
            encoding["pixel_values"] = image_features.pixel_values
@@ -109,5 +126,21 @@ class CLIPProcessor(ProcessorMixin):
    @property
    def model_input_names(self):
        tokenizer_input_names = self.tokenizer.model_input_names
-        feature_extractor_input_names = self.feature_extractor.model_input_names
+        image_processor_input_names = self.image_processor.model_input_names
-        return list(dict.fromkeys(tokenizer_input_names + feature_extractor_input_names))
+        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
    @property
    def feature_extractor_class(self):
        warnings.warn(
            "`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead.",
            FutureWarning,
        )
        return self.image_processor_class
    @property
    def feature_extractor(self):
        warnings.warn(
            "`feature_extractor` is deprecated and will be removed in v5. Use `image_processor` instead.",
            FutureWarning,
        )
        return self.image_processor
--- a/src/transformers/models/clipseg/processing_clipseg.py
+++ b/src/transformers/models/clipseg/processing_clipseg.py
@@ -15,38 +15,54 @@
 """
 Image/Text processor class for CLIPSeg
 """
 import warnings
 from ...processing_utils import ProcessorMixin
 from ...tokenization_utils_base import BatchEncoding
 class CLIPSegProcessor(ProcessorMixin):
    r"""
-    Constructs a CLIPSeg processor which wraps a CLIPSeg feature extractor and a CLIP tokenizer into a single
+    Constructs a CLIPSeg processor which wraps a CLIPSeg image processor and a CLIP tokenizer into a single processor.
-    processor.
-    [`CLIPSegProcessor`] offers all the functionalities of [`ViTFeatureExtractor`] and [`CLIPTokenizerFast`]. See the
+    [`CLIPSegProcessor`] offers all the functionalities of [`ViTImageProcessor`] and [`CLIPTokenizerFast`]. See the
    [`~CLIPSegProcessor.__call__`] and [`~CLIPSegProcessor.decode`] for more information.
    Args:
-        feature_extractor ([`ViTFeatureExtractor`]):
+        image_processor ([`ViTImageProcessor`]):
-            The feature extractor is a required input.
+            The image processor is a required input.
        tokenizer ([`CLIPTokenizerFast`]):
            The tokenizer is a required input.
    """
-    feature_extractor_class = "ViTFeatureExtractor"
+    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "ViTImageProcessor"
    tokenizer_class = ("CLIPTokenizer", "CLIPTokenizerFast")
-    def __init__(self, feature_extractor, tokenizer):
+    def __init__(self, image_processor=None, tokenizer=None, **kwargs):
-        super().__init__(feature_extractor, tokenizer)
+        if "feature_extractor" in kwargs:
-        self.current_processor = self.feature_extractor
+            warnings.warn(
                "The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`"
                " instead.",
                FutureWarning,
            )
            feature_extractor = kwargs.pop("feature_extractor")
        image_processor = image_processor if image_processor is not None else feature_extractor
        if image_processor is None:
            raise ValueError("You need to specify an `image_processor`.")
        if tokenizer is None:
            raise ValueError("You need to specify a `tokenizer`.")
        super().__init__(image_processor, tokenizer)
    def __call__(self, text=None, images=None, return_tensors=None, **kwargs):
        """
        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
        and `kwargs` arguments to CLIPTokenizerFast's [`~CLIPTokenizerFast.__call__`] if `text` is not `None` to encode
        the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
-        ViTFeatureExtractor's [`~ViTFeatureExtractor.__call__`] if `images` is not `None`. Please refer to the
+        ViTImageProcessor's [`~ViTImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring of
-        doctsring of the above two methods for more information.
+        the above two methods for more information.
        Args:
            text (`str`, `List[str]`, `List[List[str]]`):
@@ -83,7 +99,7 @@ class CLIPSegProcessor(ProcessorMixin):
            encoding = self.tokenizer(text, return_tensors=return_tensors, **kwargs)
        if images is not None:
-            image_features = self.feature_extractor(images, return_tensors=return_tensors, **kwargs)
+            image_features = self.image_processor(images, return_tensors=return_tensors, **kwargs)
        if text is not None and images is not None:
            encoding["pixel_values"] = image_features.pixel_values
@@ -106,3 +122,19 @@ class CLIPSegProcessor(ProcessorMixin):
        the docstring of this method for more information.
        """
        return self.tokenizer.decode(*args, **kwargs)
    @property
    def feature_extractor_class(self):
        warnings.warn(
            "`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead.",
            FutureWarning,
        )
        return self.image_processor_class
    @property
    def feature_extractor(self):
        warnings.warn(
            "`feature_extractor` is deprecated and will be removed in v5. Use `image_processor` instead.",
            FutureWarning,
        )
        return self.image_processor
--- a/src/transformers/models/donut/processing_donut.py
+++ b/src/transformers/models/donut/processing_donut.py
@@ -24,7 +24,7 @@ from ...processing_utils import ProcessorMixin
 class DonutProcessor(ProcessorMixin):
    r"""
-    Constructs a Donut processor which wraps a Donut feature extractor and an XLMRoBERTa tokenizer into a single
+    Constructs a Donut processor which wraps a Donut image processor and an XLMRoBERTa tokenizer into a single
    processor.
    [`DonutProcessor`] offers all the functionalities of [`DonutFeatureExtractor`] and
@@ -32,8 +32,8 @@ class DonutProcessor(ProcessorMixin):
    [`~DonutProcessor.decode`] for more information.
    Args:
-        feature_extractor ([`DonutFeatureExtractor`]):
+        image_processor ([`DonutFeatureExtractor`]):
-            An instance of [`DonutFeatureExtractor`]. The feature extractor is a required input.
+            An instance of [`DonutFeatureExtractor`]. The image processor is a required input.
        tokenizer ([`XLMRobertaTokenizer`/`XLMRobertaTokenizerFast`]):
            An instance of [`XLMRobertaTokenizer`/`XLMRobertaTokenizerFast`]. The tokenizer is a required input.
    """
@@ -44,7 +44,7 @@ class DonutProcessor(ProcessorMixin):
    def __init__(self, image_processor=None, tokenizer=None, **kwargs):
        if "feature_extractor" in kwargs:
            warnings.warn(
-                "The `feature_extractor` argument is deprecated and will be removed in v4.27, use `image_processor`"
+                "The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`"
                " instead.",
                FutureWarning,
            )
@@ -176,8 +176,15 @@ class DonutProcessor(ProcessorMixin):
    @property
    def feature_extractor_class(self):
        warnings.warn(
-            "`feature_extractor_class` is deprecated and will be removed in v4.27. Use `image_processor_class`"
+            "`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead.",
-            " instead.",
            FutureWarning,
        )
        return self.image_processor_class
    @property
    def feature_extractor(self):
        warnings.warn(
            "`feature_extractor` is deprecated and will be removed in v5. Use `image_processor` instead.",
            FutureWarning,
        )
        return self.image_processor
--- a/src/transformers/models/flava/processing_flava.py
+++ b/src/transformers/models/flava/processing_flava.py
@@ -15,6 +15,8 @@
 """
 Image/Text processor class for FLAVA
 """
 import warnings
 from typing import List, Optional, Union
 from ...image_utils import ImageInput
@@ -25,21 +27,36 @@ from ...utils import TensorType
 class FlavaProcessor(ProcessorMixin):
    r"""
-    Constructs a FLAVA processor which wraps a FLAVA feature extractor and a FLAVA tokenizer into a single processor.
+    Constructs a FLAVA processor which wraps a FLAVA image processor and a FLAVA tokenizer into a single processor.
    [`FlavaProcessor`] offers all the functionalities of [`FlavaFeatureExtractor`] and [`BertTokenizerFast`]. See the
    [`~FlavaProcessor.__call__`] and [`~FlavaProcessor.decode`] for more information.
    Args:
-        feature_extractor ([`FlavaFeatureExtractor`]): The feature extractor is a required input.
+        image_processor ([`FlavaFeatureExtractor`]): The image processor is a required input.
        tokenizer ([`BertTokenizerFast`]): The tokenizer is a required input.
    """
-    feature_extractor_class = "FlavaFeatureExtractor"
+    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "FlavaFeatureExtractor"
    tokenizer_class = ("BertTokenizer", "BertTokenizerFast")
-    def __init__(self, feature_extractor, tokenizer):
+    def __init__(self, image_processor=None, tokenizer=None, **kwargs):
-        super().__init__(feature_extractor, tokenizer)
+        if "feature_extractor" in kwargs:
-        self.current_processor = self.feature_extractor
+            warnings.warn(
                "The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`"
                " instead.",
                FutureWarning,
            )
            feature_extractor = kwargs.pop("feature_extractor")
        image_processor = image_processor if image_processor is not None else feature_extractor
        if image_processor is None:
            raise ValueError("You need to specify an `image_processor`.")
        if tokenizer is None:
            raise ValueError("You need to specify a `tokenizer`.")
        super().__init__(image_processor, tokenizer)
        self.current_processor = self.image_processor
    def __call__(
        self,
@@ -93,7 +110,7 @@ class FlavaProcessor(ProcessorMixin):
                **kwargs,
            )
        if images is not None:
-            image_features = self.feature_extractor(
+            image_features = self.image_processor(
                images,
                return_image_mask=return_image_mask,
                return_codebook_pixels=return_codebook_pixels,
@@ -126,5 +143,21 @@ class FlavaProcessor(ProcessorMixin):
    @property
    def model_input_names(self):
        tokenizer_input_names = self.tokenizer.model_input_names
-        feature_extractor_input_names = self.feature_extractor.model_input_names
+        image_processor_input_names = self.image_processor.model_input_names
-        return list(dict.fromkeys(tokenizer_input_names + feature_extractor_input_names))
+        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
    @property
    def feature_extractor_class(self):
        warnings.warn(
            "`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead.",
            FutureWarning,
        )
        return self.image_processor_class
    @property
    def feature_extractor(self):
        warnings.warn(
            "`feature_extractor` is deprecated and will be removed in v5. Use `image_processor` instead.",
            FutureWarning,
        )
        return self.image_processor
--- a/src/transformers/models/layoutlmv2/processing_layoutlmv2.py
+++ b/src/transformers/models/layoutlmv2/processing_layoutlmv2.py
@@ -15,6 +15,8 @@
 """
 Processor class for LayoutLMv2.
 """
 import warnings
 from typing import List, Optional, Union
 from ...processing_utils import ProcessorMixin
@@ -24,26 +26,44 @@ from ...utils import TensorType
 class LayoutLMv2Processor(ProcessorMixin):
    r"""
-    Constructs a LayoutLMv2 processor which combines a LayoutLMv2 feature extractor and a LayoutLMv2 tokenizer into a
+    Constructs a LayoutLMv2 processor which combines a LayoutLMv2 image processor and a LayoutLMv2 tokenizer into a
    single processor.
    [`LayoutLMv2Processor`] offers all the functionalities you need to prepare data for the model.
-    It first uses [`LayoutLMv2FeatureExtractor`] to resize document images to a fixed size, and optionally applies OCR
+    It first uses [`LayoutLMv2ImageProcessor`] to resize document images to a fixed size, and optionally applies OCR to
-    to get words and normalized bounding boxes. These are then provided to [`LayoutLMv2Tokenizer`] or
+    get words and normalized bounding boxes. These are then provided to [`LayoutLMv2Tokenizer`] or
    [`LayoutLMv2TokenizerFast`], which turns the words and bounding boxes into token-level `input_ids`,
    `attention_mask`, `token_type_ids`, `bbox`. Optionally, one can provide integer `word_labels`, which are turned
    into token-level `labels` for token classification tasks (such as FUNSD, CORD).
    Args:
-        feature_extractor (`LayoutLMv2FeatureExtractor`):
+        image_processor (`LayoutLMv2ImageProcessor`):
-            An instance of [`LayoutLMv2FeatureExtractor`]. The feature extractor is a required input.
+            An instance of [`LayoutLMv2ImageProcessor`]. The image processor is a required input.
        tokenizer (`LayoutLMv2Tokenizer` or `LayoutLMv2TokenizerFast`):
            An instance of [`LayoutLMv2Tokenizer`] or [`LayoutLMv2TokenizerFast`]. The tokenizer is a required input.
    """
-    feature_extractor_class = "LayoutLMv2FeatureExtractor"
+    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "LayoutLMv2ImageProcessor"
    tokenizer_class = ("LayoutLMv2Tokenizer", "LayoutLMv2TokenizerFast")
    def __init__(self, image_processor=None, tokenizer=None, **kwargs):
        if "feature_extractor" in kwargs:
            warnings.warn(
                "The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`"
                " instead.",
                FutureWarning,
            )
            feature_extractor = kwargs.pop("feature_extractor")
        image_processor = image_processor if image_processor is not None else feature_extractor
        if image_processor is None:
            raise ValueError("You need to specify an `image_processor`.")
        if tokenizer is None:
            raise ValueError("You need to specify a `tokenizer`.")
        super().__init__(image_processor, tokenizer)
    def __call__(
        self,
        images,
@@ -68,37 +88,36 @@ class LayoutLMv2Processor(ProcessorMixin):
        **kwargs
    ) -> BatchEncoding:
        """
-        This method first forwards the `images` argument to [`~LayoutLMv2FeatureExtractor.__call__`]. In case
+        This method first forwards the `images` argument to [`~LayoutLMv2ImageProcessor.__call__`]. In case
-        [`LayoutLMv2FeatureExtractor`] was initialized with `apply_ocr` set to `True`, it passes the obtained words and
+        [`LayoutLMv2ImageProcessor`] was initialized with `apply_ocr` set to `True`, it passes the obtained words and
        bounding boxes along with the additional arguments to [`~LayoutLMv2Tokenizer.__call__`] and returns the output,
-        together with resized `images`. In case [`LayoutLMv2FeatureExtractor`] was initialized with `apply_ocr` set to
+        together with resized `images`. In case [`LayoutLMv2ImageProcessor`] was initialized with `apply_ocr` set to
        `False`, it passes the words (`text`/`text_pair`) and `boxes` specified by the user along with the additional
        arguments to [`~LayoutLMv2Tokenizer.__call__`] and returns the output, together with resized `images`.
        Please refer to the docstring of the above two methods for more information.
        """
        # verify input
-        if self.feature_extractor.apply_ocr and (boxes is not None):
+        if self.image_processor.apply_ocr and (boxes is not None):
            raise ValueError(
-                "You cannot provide bounding boxes "
+                "You cannot provide bounding boxes if you initialized the image processor with apply_ocr set to True."
-                "if you initialized the feature extractor with apply_ocr set to True."
            )
-        if self.feature_extractor.apply_ocr and (word_labels is not None):
+        if self.image_processor.apply_ocr and (word_labels is not None):
            raise ValueError(
-                "You cannot provide word labels if you initialized the feature extractor with apply_ocr set to True."
+                "You cannot provide word labels if you initialized the image processor with apply_ocr set to True."
            )
        if return_overflowing_tokens is True and return_offsets_mapping is False:
            raise ValueError("You cannot return overflowing tokens without returning the offsets mapping.")
-        # first, apply the feature extractor
+        # first, apply the image processor
-        features = self.feature_extractor(images=images, return_tensors=return_tensors)
+        features = self.image_processor(images=images, return_tensors=return_tensors)
        # second, apply the tokenizer
-        if text is not None and self.feature_extractor.apply_ocr and text_pair is None:
+        if text is not None and self.image_processor.apply_ocr and text_pair is None:
            if isinstance(text, str):
-                text = [text]  # add batch dimension (as the feature extractor always adds a batch dimension)
+                text = [text]  # add batch dimension (as the image processor always adds a batch dimension)
            text_pair = features["words"]
        encoded_inputs = self.tokenizer(
@@ -162,3 +181,19 @@ class LayoutLMv2Processor(ProcessorMixin):
    @property
    def model_input_names(self):
        return ["input_ids", "bbox", "token_type_ids", "attention_mask", "image"]
    @property
    def feature_extractor_class(self):
        warnings.warn(
            "`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead.",
            FutureWarning,
        )
        return self.image_processor_class
    @property
    def feature_extractor(self):
        warnings.warn(
            "`feature_extractor` is deprecated and will be removed in v5. Use `image_processor` instead.",
            FutureWarning,
        )
        return self.image_processor
--- a/src/transformers/models/layoutlmv3/processing_layoutlmv3.py
+++ b/src/transformers/models/layoutlmv3/processing_layoutlmv3.py
@@ -15,6 +15,8 @@
 """
 Processor class for LayoutLMv3.
 """
 import warnings
 from typing import List, Optional, Union
 from ...processing_utils import ProcessorMixin
@@ -24,26 +26,44 @@ from ...utils import TensorType
 class LayoutLMv3Processor(ProcessorMixin):
    r"""
-    Constructs a LayoutLMv3 processor which combines a LayoutLMv3 feature extractor and a LayoutLMv3 tokenizer into a
+    Constructs a LayoutLMv3 processor which combines a LayoutLMv3 image processor and a LayoutLMv3 tokenizer into a
    single processor.
    [`LayoutLMv3Processor`] offers all the functionalities you need to prepare data for the model.
-    It first uses [`LayoutLMv3FeatureExtractor`] to resize and normalize document images, and optionally applies OCR to
+    It first uses [`LayoutLMv3ImageProcessor`] to resize and normalize document images, and optionally applies OCR to
    get words and normalized bounding boxes. These are then provided to [`LayoutLMv3Tokenizer`] or
    [`LayoutLMv3TokenizerFast`], which turns the words and bounding boxes into token-level `input_ids`,
    `attention_mask`, `token_type_ids`, `bbox`. Optionally, one can provide integer `word_labels`, which are turned
    into token-level `labels` for token classification tasks (such as FUNSD, CORD).
    Args:
-        feature_extractor (`LayoutLMv3FeatureExtractor`):
+        image_processor (`LayoutLMv3ImageProcessor`):
-            An instance of [`LayoutLMv3FeatureExtractor`]. The feature extractor is a required input.
+            An instance of [`LayoutLMv3ImageProcessor`]. The image processor is a required input.
        tokenizer (`LayoutLMv3Tokenizer` or `LayoutLMv3TokenizerFast`):
            An instance of [`LayoutLMv3Tokenizer`] or [`LayoutLMv3TokenizerFast`]. The tokenizer is a required input.
    """
-    feature_extractor_class = "LayoutLMv3FeatureExtractor"
+    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "LayoutLMv3ImageProcessor"
    tokenizer_class = ("LayoutLMv3Tokenizer", "LayoutLMv3TokenizerFast")
    def __init__(self, image_processor=None, tokenizer=None, **kwargs):
        if "feature_extractor" in kwargs:
            warnings.warn(
                "The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`"
                " instead.",
                FutureWarning,
            )
            feature_extractor = kwargs.pop("feature_extractor")
        image_processor = image_processor if image_processor is not None else feature_extractor
        if image_processor is None:
            raise ValueError("You need to specify an `image_processor`.")
        if tokenizer is None:
            raise ValueError("You need to specify a `tokenizer`.")
        super().__init__(image_processor, tokenizer)
    def __call__(
        self,
        images,
@@ -68,35 +88,34 @@ class LayoutLMv3Processor(ProcessorMixin):
        **kwargs
    ) -> BatchEncoding:
        """
-        This method first forwards the `images` argument to [`~LayoutLMv3FeatureExtractor.__call__`]. In case
+        This method first forwards the `images` argument to [`~LayoutLMv3ImageProcessor.__call__`]. In case
-        [`LayoutLMv3FeatureExtractor`] was initialized with `apply_ocr` set to `True`, it passes the obtained words and
+        [`LayoutLMv3ImageProcessor`] was initialized with `apply_ocr` set to `True`, it passes the obtained words and
        bounding boxes along with the additional arguments to [`~LayoutLMv3Tokenizer.__call__`] and returns the output,
-        together with resized and normalized `pixel_values`. In case [`LayoutLMv3FeatureExtractor`] was initialized
+        together with resized and normalized `pixel_values`. In case [`LayoutLMv3ImageProcessor`] was initialized with
-        with `apply_ocr` set to `False`, it passes the words (`text`/``text_pair`) and `boxes` specified by the user
+        `apply_ocr` set to `False`, it passes the words (`text`/``text_pair`) and `boxes` specified by the user along
-        along with the additional arguments to [`~LayoutLMv3Tokenizer.__call__`] and returns the output, together with
+        with the additional arguments to [`~LayoutLMv3Tokenizer.__call__`] and returns the output, together with
        resized and normalized `pixel_values`.
        Please refer to the docstring of the above two methods for more information.
        """
        # verify input
-        if self.feature_extractor.apply_ocr and (boxes is not None):
+        if self.image_processor.apply_ocr and (boxes is not None):
            raise ValueError(
-                "You cannot provide bounding boxes "
+                "You cannot provide bounding boxes if you initialized the image processor with apply_ocr set to True."
                "if you initialized the feature extractor with apply_ocr set to True."
            )
-        if self.feature_extractor.apply_ocr and (word_labels is not None):
+        if self.image_processor.apply_ocr and (word_labels is not None):
            raise ValueError(
-                "You cannot provide word labels if you initialized the feature extractor with apply_ocr set to True."
+                "You cannot provide word labels if you initialized the image processor with apply_ocr set to True."
            )
-        # first, apply the feature extractor
+        # first, apply the image processor
-        features = self.feature_extractor(images=images, return_tensors=return_tensors)
+        features = self.image_processor(images=images, return_tensors=return_tensors)
        # second, apply the tokenizer
-        if text is not None and self.feature_extractor.apply_ocr and text_pair is None:
+        if text is not None and self.image_processor.apply_ocr and text_pair is None:
            if isinstance(text, str):
-                text = [text]  # add batch dimension (as the feature extractor always adds a batch dimension)
+                text = [text]  # add batch dimension (as the image processor always adds a batch dimension)
            text_pair = features["words"]
        encoded_inputs = self.tokenizer(
@@ -160,3 +179,19 @@ class LayoutLMv3Processor(ProcessorMixin):
    @property
    def model_input_names(self):
        """
        Names of the model inputs reported by this processor, returned as a new list on each access.
        """
        input_names = ["input_ids", "bbox", "attention_mask", "pixel_values"]
        return input_names
    @property
    def feature_extractor_class(self):
        """
        Deprecated alias of `image_processor_class`; issues a `FutureWarning` when accessed.
        """
        deprecation_message = (
            "`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead."
        )
        warnings.warn(deprecation_message, FutureWarning)
        return self.image_processor_class
    @property
    def feature_extractor(self):
        """
        Deprecated alias of `image_processor`; issues a `FutureWarning` when accessed.
        """
        deprecation_message = (
            "`feature_extractor` is deprecated and will be removed in v5. Use `image_processor` instead."
        )
        warnings.warn(deprecation_message, FutureWarning)
        return self.image_processor
--- a/src/transformers/models/owlvit/image_processing_owlvit.py
+++ b/src/transformers/models/owlvit/image_processing_owlvit.py
@@ -81,10 +81,10 @@ def box_iou(boxes1, boxes2):
 class OwlViTImageProcessor(BaseImageProcessor):
    r"""
-    Constructs an OWL-ViT feature extractor.
+    Constructs an OWL-ViT image processor.
-    This feature extractor inherits from [`FeatureExtractionMixin`] which contains most of the main methods. Users
+    This image processor inherits from [`ImageProcessingMixin`] which contains most of the main methods. Users should
-    should refer to this superclass for more information regarding those methods.
+    refer to this superclass for more information regarding those methods.
    Args:
        do_resize (`bool`, *optional*, defaults to `True`):
@@ -115,7 +115,6 @@ class OwlViTImageProcessor(BaseImageProcessor):
        image_std (`List[int]`, *optional*, defaults to `[0.26862954, 0.26130258, 0.27577711]`):
            The sequence of standard deviations for each channel, to be used when normalizing images.
    """
    model_input_names = ["pixel_values"]
    def __init__(
@@ -139,7 +138,7 @@ class OwlViTImageProcessor(BaseImageProcessor):
        crop_size = get_size_dict(crop_size, default_to_square=True)
        # Early versions of the OWL-ViT config on the hub had "rescale" as a flag. This clashes with the
-        # vision feature extractor method `rescale` as it would be set as an attribute during the super().__init__
+        # vision image processor method `rescale` as it would be set as an attribute during the super().__init__
        # call. This is for backwards compatibility.
        if "rescale" in kwargs:
            rescale_val = kwargs.pop("rescale")
--- a/src/transformers/models/owlvit/processing_owlvit.py
+++ b/src/transformers/models/owlvit/processing_owlvit.py
@@ -16,6 +16,7 @@
 Image/Text processor class for OWL-ViT
 """
 import warnings
 from typing import List
 import numpy as np
@@ -28,29 +29,44 @@ from ...tokenization_utils_base import BatchEncoding
 class OwlViTProcessor(ProcessorMixin):
    r"""
-    Constructs an OWL-ViT processor which wraps [`OwlViTFeatureExtractor`] and [`CLIPTokenizer`]/[`CLIPTokenizerFast`]
+    Constructs an OWL-ViT processor which wraps [`OwlViTImageProcessor`] and [`CLIPTokenizer`]/[`CLIPTokenizerFast`]
-    into a single processor that interits both the feature extractor and tokenizer functionalities. See the
+    into a single processor that interits both the image processor and tokenizer functionalities. See the
    [`~OwlViTProcessor.__call__`] and [`~OwlViTProcessor.decode`] for more information.
    Args:
-        feature_extractor ([`OwlViTFeatureExtractor`]):
+        image_processor ([`OwlViTImageProcessor`]):
            The image processor is a required input.
        tokenizer ([`CLIPTokenizer`, `CLIPTokenizerFast`]):
            The tokenizer is a required input.
    """
-    feature_extractor_class = "OwlViTFeatureExtractor"
+    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "OwlViTImageProcessor"
    tokenizer_class = ("CLIPTokenizer", "CLIPTokenizerFast")
-    def __init__(self, *args, **kwargs):
+    def __init__(self, image_processor=None, tokenizer=None, **kwargs):
-        super().__init__(*args, **kwargs)
+        if "feature_extractor" in kwargs:
            warnings.warn(
                "The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`"
                " instead.",
                FutureWarning,
            )
            feature_extractor = kwargs.pop("feature_extractor")
        image_processor = image_processor if image_processor is not None else feature_extractor
        if image_processor is None:
            raise ValueError("You need to specify an `image_processor`.")
        if tokenizer is None:
            raise ValueError("You need to specify a `tokenizer`.")
        super().__init__(image_processor, tokenizer)
    def __call__(self, text=None, images=None, query_images=None, padding="max_length", return_tensors="np", **kwargs):
        """
        Main method to prepare for the model one or several text(s) and image(s). This method forwards the `text` and
        `kwargs` arguments to CLIPTokenizerFast's [`~CLIPTokenizerFast.__call__`] if `text` is not `None` to encode:
        the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
-        CLIPFeatureExtractor's [`~CLIPFeatureExtractor.__call__`] if `images` is not `None`. Please refer to the
+        CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the doctsring
-        doctsring of the above two methods for more information.
+        of the above two methods for more information.
        Args:
            text (`str`, `List[str]`, `List[List[str]]`):
@@ -137,13 +153,13 @@ class OwlViTProcessor(ProcessorMixin):
        if query_images is not None:
            encoding = BatchEncoding()
-            query_pixel_values = self.feature_extractor(
+            query_pixel_values = self.image_processor(
                query_images, return_tensors=return_tensors, **kwargs
            ).pixel_values
            encoding["query_pixel_values"] = query_pixel_values
        if images is not None:
-            image_features = self.feature_extractor(images, return_tensors=return_tensors, **kwargs)
+            image_features = self.image_processor(images, return_tensors=return_tensors, **kwargs)
        if text is not None and images is not None:
            encoding["pixel_values"] = image_features.pixel_values
@@ -158,17 +174,17 @@ class OwlViTProcessor(ProcessorMixin):
    def post_process(self, *args, **kwargs):
        """
-        This method forwards all its arguments to [`OwlViTFeatureExtractor.post_process`]. Please refer to the
+        This method forwards all its arguments to [`OwlViTImageProcessor.post_process`]. Please refer to the docstring
-        docstring of this method for more information.
+        of this method for more information.
        """
-        return self.feature_extractor.post_process(*args, **kwargs)
+        return self.image_processor.post_process(*args, **kwargs)
    def post_process_image_guided_detection(self, *args, **kwargs):
        """
-        This method forwards all its arguments to [`OwlViTFeatureExtractor.post_process_one_shot_object_detection`].
+        This method forwards all its arguments to [`OwlViTImageProcessor.post_process_one_shot_object_detection`].
        Please refer to the docstring of this method for more information.
        """
-        return self.feature_extractor.post_process_image_guided_detection(*args, **kwargs)
+        return self.image_processor.post_process_image_guided_detection(*args, **kwargs)
    def batch_decode(self, *args, **kwargs):
        """
@@ -183,3 +199,19 @@ class OwlViTProcessor(ProcessorMixin):
        the docstring of this method for more information.
        """
        return self.tokenizer.decode(*args, **kwargs)
    @property
    def feature_extractor_class(self):
        """
        Deprecated alias of `image_processor_class`; issues a `FutureWarning` when accessed.
        """
        deprecation_message = (
            "`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead."
        )
        warnings.warn(deprecation_message, FutureWarning)
        return self.image_processor_class
    @property
    def feature_extractor(self):
        """
        Deprecated alias of `image_processor`; issues a `FutureWarning` when accessed.
        """
        deprecation_message = (
            "`feature_extractor` is deprecated and will be removed in v5. Use `image_processor` instead."
        )
        warnings.warn(deprecation_message, FutureWarning)
        return self.image_processor
--- a/src/transformers/models/trocr/processing_trocr.py
+++ b/src/transformers/models/trocr/processing_trocr.py
@@ -42,7 +42,7 @@ class TrOCRProcessor(ProcessorMixin):
    def __init__(self, image_processor=None, tokenizer=None, **kwargs):
        if "feature_extractor" in kwargs:
            warnings.warn(
-                "The `feature_extractor` argument is deprecated and will be removed in v4.27, use `image_processor`"
+                "The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`"
                " instead.",
                FutureWarning,
            )
@@ -124,8 +124,15 @@ class TrOCRProcessor(ProcessorMixin):
    @property
    def feature_extractor_class(self):
        warnings.warn(
-            "`feature_extractor_class` is deprecated and will be removed in v4.27. Use `image_processor_class`"
+            "`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead.",
            " instead.",
            FutureWarning,
        )
        return self.image_processor_class
    @property
    def feature_extractor(self):
        """
        Deprecated alias of `image_processor`; issues a `FutureWarning` when accessed.
        """
        deprecation_message = (
            "`feature_extractor` is deprecated and will be removed in v5. Use `image_processor` instead."
        )
        warnings.warn(deprecation_message, FutureWarning)
        return self.image_processor
--- a/src/transformers/models/vilt/processing_vilt.py
+++ b/src/transformers/models/vilt/processing_vilt.py
@@ -16,6 +16,7 @@
 Processor class for ViLT.
 """
 import warnings
 from typing import List, Optional, Union
 from ...processing_utils import ProcessorMixin
@@ -25,23 +26,38 @@ from ...utils import TensorType
 class ViltProcessor(ProcessorMixin):
    r"""
-    Constructs a ViLT processor which wraps a BERT tokenizer and ViLT feature extractor into a single processor.
+    Constructs a ViLT processor which wraps a BERT tokenizer and ViLT image processor into a single processor.
    [`ViltProcessor`] offers all the functionalities of [`ViltFeatureExtractor`] and [`BertTokenizerFast`]. See the
    docstring of [`~ViltProcessor.__call__`] and [`~ViltProcessor.decode`] for more information.
    Args:
-        feature_extractor (`ViltFeatureExtractor`):
+        image_processor (`ViltFeatureExtractor`):
-            An instance of [`ViltFeatureExtractor`]. The feature extractor is a required input.
+            An instance of [`ViltFeatureExtractor`]. The image processor is a required input.
        tokenizer (`BertTokenizerFast`):
            An instance of ['BertTokenizerFast`]. The tokenizer is a required input.
    """
-    feature_extractor_class = "ViltFeatureExtractor"
+    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "ViltFeatureExtractor"
    tokenizer_class = ("BertTokenizer", "BertTokenizerFast")
-    def __init__(self, feature_extractor, tokenizer):
+    def __init__(self, image_processor=None, tokenizer=None, **kwargs):
-        super().__init__(feature_extractor, tokenizer)
+        if "feature_extractor" in kwargs:
-        self.current_processor = self.feature_extractor
+            warnings.warn(
                "The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`"
                " instead.",
                FutureWarning,
            )
            feature_extractor = kwargs.pop("feature_extractor")
        image_processor = image_processor if image_processor is not None else feature_extractor
        if image_processor is None:
            raise ValueError("You need to specify an `image_processor`.")
        if tokenizer is None:
            raise ValueError("You need to specify a `tokenizer`.")
        super().__init__(image_processor, tokenizer)
        self.current_processor = self.image_processor
    def __call__(
        self,
@@ -88,8 +104,8 @@ class ViltProcessor(ProcessorMixin):
            **kwargs,
        )
        # add pixel_values + pixel_mask
-        encoding_feature_extractor = self.feature_extractor(images, return_tensors=return_tensors)
+        encoding_image_processor = self.image_processor(images, return_tensors=return_tensors)
-        encoding.update(encoding_feature_extractor)
+        encoding.update(encoding_image_processor)
        return encoding
@@ -110,5 +126,21 @@ class ViltProcessor(ProcessorMixin):
    @property
    def model_input_names(self):
        tokenizer_input_names = self.tokenizer.model_input_names
-        feature_extractor_input_names = self.feature_extractor.model_input_names
+        image_processor_input_names = self.image_processor.model_input_names
-        return list(dict.fromkeys(tokenizer_input_names + feature_extractor_input_names))
+        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
    @property
    def feature_extractor_class(self):
        """
        Deprecated alias of `image_processor_class`; issues a `FutureWarning` when accessed.
        """
        deprecation_message = (
            "`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead."
        )
        warnings.warn(deprecation_message, FutureWarning)
        return self.image_processor_class
    @property
    def feature_extractor(self):
        """
        Deprecated alias of `image_processor`; issues a `FutureWarning` when accessed.
        """
        deprecation_message = (
            "`feature_extractor` is deprecated and will be removed in v5. Use `image_processor` instead."
        )
        warnings.warn(deprecation_message, FutureWarning)
        return self.image_processor
--- a/src/transformers/models/vision_text_dual_encoder/processing_vision_text_dual_encoder.py
+++ b/src/transformers/models/vision_text_dual_encoder/processing_vision_text_dual_encoder.py
@@ -44,7 +44,7 @@ class VisionTextDualEncoderProcessor(ProcessorMixin):
    def __init__(self, image_processor=None, tokenizer=None, **kwargs):
        if "feature_extractor" in kwargs:
            warnings.warn(
-                "The `feature_extractor` argument is deprecated and will be removed in v4.27, use `image_processor`"
+                "The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`"
                " instead.",
                FutureWarning,
            )
@@ -132,10 +132,18 @@ class VisionTextDualEncoderProcessor(ProcessorMixin):
        image_processor_input_names = self.image_processor.model_input_names
        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
    @property
    def feature_extractor_class(self):
        warnings.warn(
-            "`feature_extractor_class` is deprecated and will be removed in v4.27. Use `image_processor_class`"
+            "`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead.",
            " instead.",
            FutureWarning,
        )
        return self.image_processor_class
    @property
    def feature_extractor(self):
        """
        Deprecated alias of `image_processor`; issues a `FutureWarning` when accessed.
        """
        deprecation_message = (
            "`feature_extractor` is deprecated and will be removed in v5. Use `image_processor` instead."
        )
        warnings.warn(deprecation_message, FutureWarning)
        return self.image_processor
--- a/src/transformers/models/x_clip/processing_x_clip.py
+++ b/src/transformers/models/x_clip/processing_x_clip.py
@@ -15,38 +15,55 @@
 """
 Image/Text processor class for XCLIP
 """
 import warnings
 from ...processing_utils import ProcessorMixin
 from ...tokenization_utils_base import BatchEncoding
 class XCLIPProcessor(ProcessorMixin):
    r"""
-    Constructs an X-CLIP processor which wraps a VideoMAE feature extractor and a CLIP tokenizer into a single
+    Constructs an X-CLIP processor which wraps a VideoMAE image processor and a CLIP tokenizer into a single processor.
    processor.
-    [`XCLIPProcessor`] offers all the functionalities of [`VideoMAEFeatureExtractor`] and [`CLIPTokenizerFast`]. See
+    [`XCLIPProcessor`] offers all the functionalities of [`VideoMAEImageProcessor`] and [`CLIPTokenizerFast`]. See the
-    the [`~XCLIPProcessor.__call__`] and [`~XCLIPProcessor.decode`] for more information.
+    [`~XCLIPProcessor.__call__`] and [`~XCLIPProcessor.decode`] for more information.
    Args:
-        feature_extractor ([`VideoMAEFeatureExtractor`]):
+        image_processor ([`VideoMAEImageProcessor`]):
-            The feature extractor is a required input.
+            The image processor is a required input.
        tokenizer ([`CLIPTokenizerFast`]):
            The tokenizer is a required input.
    """
-    feature_extractor_class = "VideoMAEFeatureExtractor"
+    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "VideoMAEImageProcessor"
    tokenizer_class = ("CLIPTokenizer", "CLIPTokenizerFast")
-    def __init__(self, feature_extractor, tokenizer):
+    def __init__(self, image_processor=None, tokenizer=None, **kwargs):
-        super().__init__(feature_extractor, tokenizer)
+        if "feature_extractor" in kwargs:
-        self.current_processor = self.feature_extractor
+            warnings.warn(
                "The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`"
                " instead.",
                FutureWarning,
            )
            feature_extractor = kwargs.pop("feature_extractor")
        image_processor = image_processor if image_processor is not None else feature_extractor
        if image_processor is None:
            raise ValueError("You need to specify an `image_processor`.")
        if tokenizer is None:
            raise ValueError("You need to specify a `tokenizer`.")
        super().__init__(image_processor, tokenizer)
        self.current_processor = self.image_processor
    def __call__(self, text=None, videos=None, return_tensors=None, **kwargs):
        """
        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
        and `kwargs` arguments to CLIPTokenizerFast's [`~CLIPTokenizerFast.__call__`] if `text` is not `None` to encode
        the text. To prepare the image(s), this method forwards the `videos` and `kwargs` arguments to
-        VideoMAEFeatureExtractor's [`~VideoMAEFeatureExtractor.__call__`] if `videos` is not `None`. Please refer to
+        VideoMAEImageProcessor's [`~VideoMAEImageProcessor.__call__`] if `videos` is not `None`. Please refer to the
-        the doctsring of the above two methods for more information.
+        doctsring of the above two methods for more information.
        Args:
            text (`str`, `List[str]`, `List[List[str]]`):
@@ -84,7 +101,7 @@ class XCLIPProcessor(ProcessorMixin):
            encoding = self.tokenizer(text, return_tensors=return_tensors, **kwargs)
        if videos is not None:
-            image_features = self.feature_extractor(videos, return_tensors=return_tensors, **kwargs)
+            image_features = self.image_processor(videos, return_tensors=return_tensors, **kwargs)
        if text is not None and videos is not None:
            encoding["pixel_values"] = image_features.pixel_values
@@ -111,3 +128,19 @@ class XCLIPProcessor(ProcessorMixin):
    @property
    def model_input_names(self):
        """
        Names of the model inputs reported by this processor, returned as a new list on each access.
        """
        input_names = ["input_ids", "attention_mask", "position_ids", "pixel_values"]
        return input_names
    @property
    def feature_extractor_class(self):
        """
        Deprecated alias of `image_processor_class`; issues a `FutureWarning` when accessed.
        """
        deprecation_message = (
            "`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead."
        )
        warnings.warn(deprecation_message, FutureWarning)
        return self.image_processor_class
    @property
    def feature_extractor(self):
        """
        Deprecated alias of `image_processor`; issues a `FutureWarning` when accessed.
        """
        deprecation_message = (
            "`feature_extractor` is deprecated and will be removed in v5. Use `image_processor` instead."
        )
        warnings.warn(deprecation_message, FutureWarning)
        return self.image_processor
--- a/src/transformers/processing_utils.py
+++ b/src/transformers/processing_utils.py
@@ -158,7 +158,8 @@ class ProcessorMixin(PushToHubMixin):
        <Tip>
        This class method is simply calling the feature extractor
-        [`~feature_extraction_utils.FeatureExtractionMixin.from_pretrained`] and the tokenizer
+        [`~feature_extraction_utils.FeatureExtractionMixin.from_pretrained`], image processor
        [`~image_processing_utils.ImageProcessingMixin`] and the tokenizer
        [`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`] methods. Please refer to the docstrings of the
        methods above for more information.
--- a/tests/models/chinese_clip/test_processor_chinese_clip.py
+++ b/tests/models/chinese_clip/test_processor_chinese_clip.py
@@ -30,7 +30,7 @@ from transformers.utils import FEATURE_EXTRACTOR_NAME, is_vision_available
 if is_vision_available():
    from PIL import Image
-    from transformers import ChineseCLIPFeatureExtractor, ChineseCLIPProcessor
+    from transformers import ChineseCLIPImageProcessor, ChineseCLIPProcessor
@require_vision
@@ -62,7 +62,7 @@ class ChineseCLIPProcessorTest(unittest.TestCase):
        with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
            vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
-        feature_extractor_map = {
+        image_processor_map = {
            "do_resize": True,
            "size": {"height": 224, "width": 224},
            "do_center_crop": True,
@@ -72,9 +72,9 @@ class ChineseCLIPProcessorTest(unittest.TestCase):
            "image_std": [0.26862954, 0.26130258, 0.27577711],
            "do_convert_rgb": True,
        }
-        self.feature_extractor_file = os.path.join(self.tmpdirname, FEATURE_EXTRACTOR_NAME)
+        self.image_processor_file = os.path.join(self.tmpdirname, FEATURE_EXTRACTOR_NAME)
-        with open(self.feature_extractor_file, "w", encoding="utf-8") as fp:
+        with open(self.image_processor_file, "w", encoding="utf-8") as fp:
-            json.dump(feature_extractor_map, fp)
+            json.dump(image_processor_map, fp)
    def get_tokenizer(self, **kwargs):
        """Load a `BertTokenizer` from `self.tmpdirname`, forwarding `kwargs` to `from_pretrained`."""
        tokenizer_dir = self.tmpdirname
        return BertTokenizer.from_pretrained(tokenizer_dir, **kwargs)
@@ -82,8 +82,8 @@ class ChineseCLIPProcessorTest(unittest.TestCase):
    def get_rust_tokenizer(self, **kwargs):
        """Load a `BertTokenizerFast` from `self.tmpdirname`, forwarding `kwargs` to `from_pretrained`."""
        tokenizer_dir = self.tmpdirname
        return BertTokenizerFast.from_pretrained(tokenizer_dir, **kwargs)
-    def get_feature_extractor(self, **kwargs):
+    def get_image_processor(self, **kwargs):
-        return ChineseCLIPFeatureExtractor.from_pretrained(self.tmpdirname, **kwargs)
+        return ChineseCLIPImageProcessor.from_pretrained(self.tmpdirname, **kwargs)
    def tearDown(self):
        shutil.rmtree(self.tmpdirname)
@@ -102,13 +102,13 @@ class ChineseCLIPProcessorTest(unittest.TestCase):
    def test_save_load_pretrained_default(self):
        tokenizer_slow = self.get_tokenizer()
        tokenizer_fast = self.get_rust_tokenizer()
-        feature_extractor = self.get_feature_extractor()
+        image_processor = self.get_image_processor()
-        processor_slow = ChineseCLIPProcessor(tokenizer=tokenizer_slow, feature_extractor=feature_extractor)
+        processor_slow = ChineseCLIPProcessor(tokenizer=tokenizer_slow, image_processor=image_processor)
        processor_slow.save_pretrained(self.tmpdirname)
        processor_slow = ChineseCLIPProcessor.from_pretrained(self.tmpdirname, use_fast=False)
-        processor_fast = ChineseCLIPProcessor(tokenizer=tokenizer_fast, feature_extractor=feature_extractor)
+        processor_fast = ChineseCLIPProcessor(tokenizer=tokenizer_fast, image_processor=image_processor)
        processor_fast.save_pretrained(self.tmpdirname)
        processor_fast = ChineseCLIPProcessor.from_pretrained(self.tmpdirname)
@@ -118,19 +118,17 @@ class ChineseCLIPProcessorTest(unittest.TestCase):
        self.assertIsInstance(processor_slow.tokenizer, BertTokenizer)
        self.assertIsInstance(processor_fast.tokenizer, BertTokenizerFast)
-        self.assertEqual(processor_slow.feature_extractor.to_json_string(), feature_extractor.to_json_string())
+        self.assertEqual(processor_slow.image_processor.to_json_string(), image_processor.to_json_string())
-        self.assertEqual(processor_fast.feature_extractor.to_json_string(), feature_extractor.to_json_string())
+        self.assertEqual(processor_fast.image_processor.to_json_string(), image_processor.to_json_string())
-        self.assertIsInstance(processor_slow.feature_extractor, ChineseCLIPFeatureExtractor)
+        self.assertIsInstance(processor_slow.image_processor, ChineseCLIPImageProcessor)
-        self.assertIsInstance(processor_fast.feature_extractor, ChineseCLIPFeatureExtractor)
+        self.assertIsInstance(processor_fast.image_processor, ChineseCLIPImageProcessor)
    def test_save_load_pretrained_additional_features(self):
-        processor = ChineseCLIPProcessor(
+        processor = ChineseCLIPProcessor(tokenizer=self.get_tokenizer(), image_processor=self.get_image_processor())
            tokenizer=self.get_tokenizer(), feature_extractor=self.get_feature_extractor()
        )
        processor.save_pretrained(self.tmpdirname)
        tokenizer_add_kwargs = self.get_tokenizer(cls_token="(CLS)", sep_token="(SEP)")
-        feature_extractor_add_kwargs = self.get_feature_extractor(do_normalize=False)
+        image_processor_add_kwargs = self.get_image_processor(do_normalize=False)
        processor = ChineseCLIPProcessor.from_pretrained(
            self.tmpdirname, cls_token="(CLS)", sep_token="(SEP)", do_normalize=False
@@ -139,28 +137,28 @@ class ChineseCLIPProcessorTest(unittest.TestCase):
        self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab())
        self.assertIsInstance(processor.tokenizer, BertTokenizerFast)
-        self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor_add_kwargs.to_json_string())
+        self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string())
-        self.assertIsInstance(processor.feature_extractor, ChineseCLIPFeatureExtractor)
+        self.assertIsInstance(processor.image_processor, ChineseCLIPImageProcessor)
-    def test_feature_extractor(self):
+    def test_image_processor(self):
-        feature_extractor = self.get_feature_extractor()
+        image_processor = self.get_image_processor()
        tokenizer = self.get_tokenizer()
-        processor = ChineseCLIPProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
+        processor = ChineseCLIPProcessor(tokenizer=tokenizer, image_processor=image_processor)
        image_input = self.prepare_image_inputs()
-        input_feat_extract = feature_extractor(image_input, return_tensors="np")
+        input_feat_extract = image_processor(image_input, return_tensors="np")
        input_processor = processor(images=image_input, return_tensors="np")
        for key in input_feat_extract.keys():
            self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2)
    def test_tokenizer(self):
-        feature_extractor = self.get_feature_extractor()
+        image_processor = self.get_image_processor()
        tokenizer = self.get_tokenizer()
-        processor = ChineseCLIPProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
+        processor = ChineseCLIPProcessor(tokenizer=tokenizer, image_processor=image_processor)
        input_str = "Alexandra，T-shirt的价格是15便士。"
@@ -172,10 +170,10 @@ class ChineseCLIPProcessorTest(unittest.TestCase):
            self.assertListEqual(encoded_tok[key], encoded_processor[key])
    def test_processor(self):
-        feature_extractor = self.get_feature_extractor()
+        image_processor = self.get_image_processor()
        tokenizer = self.get_tokenizer()
-        processor = ChineseCLIPProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
+        processor = ChineseCLIPProcessor(tokenizer=tokenizer, image_processor=image_processor)
        input_str = "Alexandra，T-shirt的价格是15便士。"
        image_input = self.prepare_image_inputs()
@@ -189,10 +187,10 @@ class ChineseCLIPProcessorTest(unittest.TestCase):
            processor()
    def test_tokenizer_decode(self):
-        feature_extractor = self.get_feature_extractor()
+        image_processor = self.get_image_processor()
        tokenizer = self.get_tokenizer()
-        processor = ChineseCLIPProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
+        processor = ChineseCLIPProcessor(tokenizer=tokenizer, image_processor=image_processor)
        predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]]
@@ -202,10 +200,10 @@ class ChineseCLIPProcessorTest(unittest.TestCase):
        self.assertListEqual(decoded_tok, decoded_processor)
    def test_model_input_names(self):
-        feature_extractor = self.get_feature_extractor()
+        image_processor = self.get_image_processor()
        tokenizer = self.get_tokenizer()
-        processor = ChineseCLIPProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
+        processor = ChineseCLIPProcessor(tokenizer=tokenizer, image_processor=image_processor)
        input_str = "Alexandra，T-shirt的价格是15便士。"
        image_input = self.prepare_image_inputs()
--- a/tests/models/clip/test_processor_clip.py
+++ b/tests/models/clip/test_processor_clip.py
@@ -24,13 +24,13 @@ import pytest
 from transformers import CLIPTokenizer, CLIPTokenizerFast
 from transformers.models.clip.tokenization_clip import VOCAB_FILES_NAMES
 from transformers.testing_utils import require_vision
-from transformers.utils import FEATURE_EXTRACTOR_NAME, is_vision_available
+from transformers.utils import IMAGE_PROCESSOR_NAME, is_vision_available
 if is_vision_available():
    from PIL import Image
-    from transformers import CLIPFeatureExtractor, CLIPProcessor
+    from transformers import CLIPImageProcessor, CLIPProcessor
@require_vision
@@ -52,7 +52,7 @@ class CLIPProcessorTest(unittest.TestCase):
        with open(self.merges_file, "w", encoding="utf-8") as fp:
            fp.write("\n".join(merges))
-        feature_extractor_map = {
+        image_processor_map = {
            "do_resize": True,
            "size": 20,
            "do_center_crop": True,
@@ -61,9 +61,9 @@ class CLIPProcessorTest(unittest.TestCase):
            "image_mean": [0.48145466, 0.4578275, 0.40821073],
            "image_std": [0.26862954, 0.26130258, 0.27577711],
        }
-        self.feature_extractor_file = os.path.join(self.tmpdirname, FEATURE_EXTRACTOR_NAME)
+        self.image_processor_file = os.path.join(self.tmpdirname, IMAGE_PROCESSOR_NAME)
-        with open(self.feature_extractor_file, "w", encoding="utf-8") as fp:
+        with open(self.image_processor_file, "w", encoding="utf-8") as fp:
-            json.dump(feature_extractor_map, fp)
+            json.dump(image_processor_map, fp)
    def get_tokenizer(self, **kwargs):
        return CLIPTokenizer.from_pretrained(self.tmpdirname, **kwargs)
@@ -71,8 +71,8 @@ class CLIPProcessorTest(unittest.TestCase):
    def get_rust_tokenizer(self, **kwargs):
        return CLIPTokenizerFast.from_pretrained(self.tmpdirname, **kwargs)
-    def get_feature_extractor(self, **kwargs):
+    def get_image_processor(self, **kwargs):
-        return CLIPFeatureExtractor.from_pretrained(self.tmpdirname, **kwargs)
+        return CLIPImageProcessor.from_pretrained(self.tmpdirname, **kwargs)
    def tearDown(self):
        shutil.rmtree(self.tmpdirname)
@@ -91,13 +91,13 @@ class CLIPProcessorTest(unittest.TestCase):
    def test_save_load_pretrained_default(self):
        tokenizer_slow = self.get_tokenizer()
        tokenizer_fast = self.get_rust_tokenizer()
-        feature_extractor = self.get_feature_extractor()
+        image_processor = self.get_image_processor()
-        processor_slow = CLIPProcessor(tokenizer=tokenizer_slow, feature_extractor=feature_extractor)
+        processor_slow = CLIPProcessor(tokenizer=tokenizer_slow, image_processor=image_processor)
        processor_slow.save_pretrained(self.tmpdirname)
        processor_slow = CLIPProcessor.from_pretrained(self.tmpdirname, use_fast=False)
-        processor_fast = CLIPProcessor(tokenizer=tokenizer_fast, feature_extractor=feature_extractor)
+        processor_fast = CLIPProcessor(tokenizer=tokenizer_fast, image_processor=image_processor)
        processor_fast.save_pretrained(self.tmpdirname)
        processor_fast = CLIPProcessor.from_pretrained(self.tmpdirname)
@@ -107,17 +107,17 @@ class CLIPProcessorTest(unittest.TestCase):
        self.assertIsInstance(processor_slow.tokenizer, CLIPTokenizer)
        self.assertIsInstance(processor_fast.tokenizer, CLIPTokenizerFast)
-        self.assertEqual(processor_slow.feature_extractor.to_json_string(), feature_extractor.to_json_string())
+        self.assertEqual(processor_slow.image_processor.to_json_string(), image_processor.to_json_string())
-        self.assertEqual(processor_fast.feature_extractor.to_json_string(), feature_extractor.to_json_string())
+        self.assertEqual(processor_fast.image_processor.to_json_string(), image_processor.to_json_string())
-        self.assertIsInstance(processor_slow.feature_extractor, CLIPFeatureExtractor)
+        self.assertIsInstance(processor_slow.image_processor, CLIPImageProcessor)
-        self.assertIsInstance(processor_fast.feature_extractor, CLIPFeatureExtractor)
+        self.assertIsInstance(processor_fast.image_processor, CLIPImageProcessor)
    def test_save_load_pretrained_additional_features(self):
-        processor = CLIPProcessor(tokenizer=self.get_tokenizer(), feature_extractor=self.get_feature_extractor())
+        processor = CLIPProcessor(tokenizer=self.get_tokenizer(), image_processor=self.get_image_processor())
        processor.save_pretrained(self.tmpdirname)
        tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)")
-        feature_extractor_add_kwargs = self.get_feature_extractor(do_normalize=False, padding_value=1.0)
+        image_processor_add_kwargs = self.get_image_processor(do_normalize=False, padding_value=1.0)
        processor = CLIPProcessor.from_pretrained(
            self.tmpdirname, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False, padding_value=1.0
@@ -126,28 +126,28 @@ class CLIPProcessorTest(unittest.TestCase):
        self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab())
        self.assertIsInstance(processor.tokenizer, CLIPTokenizerFast)
-        self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor_add_kwargs.to_json_string())
+        self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string())
-        self.assertIsInstance(processor.feature_extractor, CLIPFeatureExtractor)
+        self.assertIsInstance(processor.image_processor, CLIPImageProcessor)
-    def test_feature_extractor(self):
+    def test_image_processor(self):
-        feature_extractor = self.get_feature_extractor()
+        image_processor = self.get_image_processor()
        tokenizer = self.get_tokenizer()
-        processor = CLIPProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
+        processor = CLIPProcessor(tokenizer=tokenizer, image_processor=image_processor)
        image_input = self.prepare_image_inputs()
-        input_feat_extract = feature_extractor(image_input, return_tensors="np")
+        input_image_proc = image_processor(image_input, return_tensors="np")
        input_processor = processor(images=image_input, return_tensors="np")
-        for key in input_feat_extract.keys():
+        for key in input_image_proc.keys():
-            self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2)
+            self.assertAlmostEqual(input_image_proc[key].sum(), input_processor[key].sum(), delta=1e-2)
    def test_tokenizer(self):
-        feature_extractor = self.get_feature_extractor()
+        image_processor = self.get_image_processor()
        tokenizer = self.get_tokenizer()
-        processor = CLIPProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
+        processor = CLIPProcessor(tokenizer=tokenizer, image_processor=image_processor)
        input_str = "lower newer"
@@ -159,10 +159,10 @@ class CLIPProcessorTest(unittest.TestCase):
            self.assertListEqual(encoded_tok[key], encoded_processor[key])
    def test_processor(self):
-        feature_extractor = self.get_feature_extractor()
+        image_processor = self.get_image_processor()
        tokenizer = self.get_tokenizer()
-        processor = CLIPProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
+        processor = CLIPProcessor(tokenizer=tokenizer, image_processor=image_processor)
        input_str = "lower newer"
        image_input = self.prepare_image_inputs()
@@ -176,10 +176,10 @@ class CLIPProcessorTest(unittest.TestCase):
            processor()
    def test_tokenizer_decode(self):
-        feature_extractor = self.get_feature_extractor()
+        image_processor = self.get_image_processor()
        tokenizer = self.get_tokenizer()
-        processor = CLIPProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
+        processor = CLIPProcessor(tokenizer=tokenizer, image_processor=image_processor)
        predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]]
@@ -189,10 +189,10 @@ class CLIPProcessorTest(unittest.TestCase):
        self.assertListEqual(decoded_tok, decoded_processor)
    def test_model_input_names(self):
-        feature_extractor = self.get_feature_extractor()
+        image_processor = self.get_image_processor()
        tokenizer = self.get_tokenizer()
-        processor = CLIPProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
+        processor = CLIPProcessor(tokenizer=tokenizer, image_processor=image_processor)
        input_str = "lower newer"
        image_input = self.prepare_image_inputs()
--- a/tests/models/clipseg/test_processor_clipseg.py
+++ b/tests/models/clipseg/test_processor_clipseg.py
@@ -24,13 +24,13 @@ import pytest
 from transformers import CLIPTokenizer, CLIPTokenizerFast
 from transformers.models.clip.tokenization_clip import VOCAB_FILES_NAMES
 from transformers.testing_utils import require_vision
-from transformers.utils import FEATURE_EXTRACTOR_NAME, is_vision_available
+from transformers.utils import IMAGE_PROCESSOR_NAME, is_vision_available
 if is_vision_available():
    from PIL import Image
-    from transformers import CLIPSegProcessor, ViTFeatureExtractor
+    from transformers import CLIPSegProcessor, ViTImageProcessor
@require_vision
@@ -52,7 +52,7 @@ class CLIPSegProcessorTest(unittest.TestCase):
        with open(self.merges_file, "w", encoding="utf-8") as fp:
            fp.write("\n".join(merges))
-        feature_extractor_map = {
+        image_processor_map = {
            "do_resize": True,
            "size": 20,
            "do_center_crop": True,
@@ -61,9 +61,9 @@ class CLIPSegProcessorTest(unittest.TestCase):
            "image_mean": [0.48145466, 0.4578275, 0.40821073],
            "image_std": [0.26862954, 0.26130258, 0.27577711],
        }
-        self.feature_extractor_file = os.path.join(self.tmpdirname, FEATURE_EXTRACTOR_NAME)
+        self.image_processor_file = os.path.join(self.tmpdirname, IMAGE_PROCESSOR_NAME)
-        with open(self.feature_extractor_file, "w", encoding="utf-8") as fp:
+        with open(self.image_processor_file, "w", encoding="utf-8") as fp:
-            json.dump(feature_extractor_map, fp)
+            json.dump(image_processor_map, fp)
    def get_tokenizer(self, **kwargs):
        return CLIPTokenizer.from_pretrained(self.tmpdirname, **kwargs)
@@ -71,8 +71,8 @@ class CLIPSegProcessorTest(unittest.TestCase):
    def get_rust_tokenizer(self, **kwargs):
        return CLIPTokenizerFast.from_pretrained(self.tmpdirname, **kwargs)
-    def get_feature_extractor(self, **kwargs):
+    def get_image_processor(self, **kwargs):
-        return ViTFeatureExtractor.from_pretrained(self.tmpdirname, **kwargs)
+        return ViTImageProcessor.from_pretrained(self.tmpdirname, **kwargs)
    def tearDown(self):
        shutil.rmtree(self.tmpdirname)
@@ -90,13 +90,13 @@ class CLIPSegProcessorTest(unittest.TestCase):
    def test_save_load_pretrained_default(self):
        tokenizer_slow = self.get_tokenizer()
        tokenizer_fast = self.get_rust_tokenizer()
-        feature_extractor = self.get_feature_extractor()
+        image_processor = self.get_image_processor()
-        processor_slow = CLIPSegProcessor(tokenizer=tokenizer_slow, feature_extractor=feature_extractor)
+        processor_slow = CLIPSegProcessor(tokenizer=tokenizer_slow, image_processor=image_processor)
        processor_slow.save_pretrained(self.tmpdirname)
        processor_slow = CLIPSegProcessor.from_pretrained(self.tmpdirname, use_fast=False)
-        processor_fast = CLIPSegProcessor(tokenizer=tokenizer_fast, feature_extractor=feature_extractor)
+        processor_fast = CLIPSegProcessor(tokenizer=tokenizer_fast, image_processor=image_processor)
        processor_fast.save_pretrained(self.tmpdirname)
        processor_fast = CLIPSegProcessor.from_pretrained(self.tmpdirname)
@@ -106,17 +106,17 @@ class CLIPSegProcessorTest(unittest.TestCase):
        self.assertIsInstance(processor_slow.tokenizer, CLIPTokenizer)
        self.assertIsInstance(processor_fast.tokenizer, CLIPTokenizerFast)
-        self.assertEqual(processor_slow.feature_extractor.to_json_string(), feature_extractor.to_json_string())
+        self.assertEqual(processor_slow.image_processor.to_json_string(), image_processor.to_json_string())
-        self.assertEqual(processor_fast.feature_extractor.to_json_string(), feature_extractor.to_json_string())
+        self.assertEqual(processor_fast.image_processor.to_json_string(), image_processor.to_json_string())
-        self.assertIsInstance(processor_slow.feature_extractor, ViTFeatureExtractor)
+        self.assertIsInstance(processor_slow.image_processor, ViTImageProcessor)
-        self.assertIsInstance(processor_fast.feature_extractor, ViTFeatureExtractor)
+        self.assertIsInstance(processor_fast.image_processor, ViTImageProcessor)
    def test_save_load_pretrained_additional_features(self):
-        processor = CLIPSegProcessor(tokenizer=self.get_tokenizer(), feature_extractor=self.get_feature_extractor())
+        processor = CLIPSegProcessor(tokenizer=self.get_tokenizer(), image_processor=self.get_image_processor())
        processor.save_pretrained(self.tmpdirname)
        tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)")
-        feature_extractor_add_kwargs = self.get_feature_extractor(do_normalize=False, padding_value=1.0)
+        image_processor_add_kwargs = self.get_image_processor(do_normalize=False, padding_value=1.0)
        processor = CLIPSegProcessor.from_pretrained(
            self.tmpdirname, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False, padding_value=1.0
@@ -125,28 +125,28 @@ class CLIPSegProcessorTest(unittest.TestCase):
        self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab())
        self.assertIsInstance(processor.tokenizer, CLIPTokenizerFast)
-        self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor_add_kwargs.to_json_string())
+        self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string())
-        self.assertIsInstance(processor.feature_extractor, ViTFeatureExtractor)
+        self.assertIsInstance(processor.image_processor, ViTImageProcessor)
-    def test_feature_extractor(self):
+    def test_image_processor(self):
-        feature_extractor = self.get_feature_extractor()
+        image_processor = self.get_image_processor()
        tokenizer = self.get_tokenizer()
-        processor = CLIPSegProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
+        processor = CLIPSegProcessor(tokenizer=tokenizer, image_processor=image_processor)
        image_input = self.prepare_image_inputs()
-        input_feat_extract = feature_extractor(image_input, return_tensors="np")
+        input_feat_extract = image_processor(image_input, return_tensors="np")
        input_processor = processor(images=image_input, return_tensors="np")
        for key in input_feat_extract.keys():
            self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2)
    def test_tokenizer(self):
-        feature_extractor = self.get_feature_extractor()
+        image_processor = self.get_image_processor()
        tokenizer = self.get_tokenizer()
-        processor = CLIPSegProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
+        processor = CLIPSegProcessor(tokenizer=tokenizer, image_processor=image_processor)
        input_str = "lower newer"
@@ -158,10 +158,10 @@ class CLIPSegProcessorTest(unittest.TestCase):
            self.assertListEqual(encoded_tok[key], encoded_processor[key])
    def test_processor(self):
-        feature_extractor = self.get_feature_extractor()
+        image_processor = self.get_image_processor()
        tokenizer = self.get_tokenizer()
-        processor = CLIPSegProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
+        processor = CLIPSegProcessor(tokenizer=tokenizer, image_processor=image_processor)
        input_str = "lower newer"
        image_input = self.prepare_image_inputs()
@@ -175,10 +175,10 @@ class CLIPSegProcessorTest(unittest.TestCase):
            processor()
    def test_tokenizer_decode(self):
-        feature_extractor = self.get_feature_extractor()
+        image_processor = self.get_image_processor()
        tokenizer = self.get_tokenizer()
-        processor = CLIPSegProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
+        processor = CLIPSegProcessor(tokenizer=tokenizer, image_processor=image_processor)
        predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]]
--- a/tests/models/flava/test_processor_flava.py
+++ b/tests/models/flava/test_processor_flava.py
@@ -25,13 +25,13 @@ import pytest
 from transformers import BertTokenizer, BertTokenizerFast
 from transformers.models.bert.tokenization_bert import VOCAB_FILES_NAMES
 from transformers.testing_utils import require_vision
-from transformers.utils import FEATURE_EXTRACTOR_NAME, is_vision_available
+from transformers.utils import IMAGE_PROCESSOR_NAME, is_vision_available
 if is_vision_available():
    from PIL import Image
-    from transformers import FlavaFeatureExtractor, FlavaProcessor
+    from transformers import FlavaImageProcessor, FlavaProcessor
    from transformers.models.flava.image_processing_flava import (
        FLAVA_CODEBOOK_MEAN,
        FLAVA_CODEBOOK_STD,
@@ -53,7 +53,7 @@ class FlavaProcessorTest(unittest.TestCase):
        with open(self.vocab_file, "w", encoding="utf-8") as fp:
            fp.write("".join([x + "\n" for x in vocab_tokens]))
-        feature_extractor_map = {
+        image_processor_map = {
            "image_mean": FLAVA_IMAGE_MEAN,
            "image_std": FLAVA_IMAGE_STD,
            "do_normalize": True,
@@ -77,9 +77,9 @@ class FlavaProcessorTest(unittest.TestCase):
            "codebook_image_std": FLAVA_CODEBOOK_STD,
        }
-        self.feature_extractor_file = os.path.join(self.tmpdirname, FEATURE_EXTRACTOR_NAME)
+        self.image_processor_file = os.path.join(self.tmpdirname, IMAGE_PROCESSOR_NAME)
-        with open(self.feature_extractor_file, "w", encoding="utf-8") as fp:
+        with open(self.image_processor_file, "w", encoding="utf-8") as fp:
-            json.dump(feature_extractor_map, fp)
+            json.dump(image_processor_map, fp)
    def get_tokenizer(self, **kwargs):
        return BertTokenizer.from_pretrained(self.tmpdirname, **kwargs)
@@ -87,8 +87,8 @@ class FlavaProcessorTest(unittest.TestCase):
    def get_rust_tokenizer(self, **kwargs):
        return BertTokenizerFast.from_pretrained(self.tmpdirname, **kwargs)
-    def get_feature_extractor(self, **kwargs):
+    def get_image_processor(self, **kwargs):
-        return FlavaFeatureExtractor.from_pretrained(self.tmpdirname, **kwargs)
+        return FlavaImageProcessor.from_pretrained(self.tmpdirname, **kwargs)
    def tearDown(self):
        shutil.rmtree(self.tmpdirname)
@@ -107,13 +107,13 @@ class FlavaProcessorTest(unittest.TestCase):
    def test_save_load_pretrained_default(self):
        tokenizer_slow = self.get_tokenizer()
        tokenizer_fast = self.get_rust_tokenizer()
-        feature_extractor = self.get_feature_extractor()
+        image_processor = self.get_image_processor()
-        processor_slow = FlavaProcessor(tokenizer=tokenizer_slow, feature_extractor=feature_extractor)
+        processor_slow = FlavaProcessor(tokenizer=tokenizer_slow, image_processor=image_processor)
        processor_slow.save_pretrained(self.tmpdirname)
        processor_slow = FlavaProcessor.from_pretrained(self.tmpdirname, use_fast=False)
-        processor_fast = FlavaProcessor(tokenizer=tokenizer_fast, feature_extractor=feature_extractor)
+        processor_fast = FlavaProcessor(tokenizer=tokenizer_fast, image_processor=image_processor)
        processor_fast.save_pretrained(self.tmpdirname)
        processor_fast = FlavaProcessor.from_pretrained(self.tmpdirname)
@@ -123,17 +123,17 @@ class FlavaProcessorTest(unittest.TestCase):
        self.assertIsInstance(processor_slow.tokenizer, BertTokenizer)
        self.assertIsInstance(processor_fast.tokenizer, BertTokenizerFast)
-        self.assertEqual(processor_slow.feature_extractor.to_json_string(), feature_extractor.to_json_string())
+        self.assertEqual(processor_slow.image_processor.to_json_string(), image_processor.to_json_string())
-        self.assertEqual(processor_fast.feature_extractor.to_json_string(), feature_extractor.to_json_string())
+        self.assertEqual(processor_fast.image_processor.to_json_string(), image_processor.to_json_string())
-        self.assertIsInstance(processor_slow.feature_extractor, FlavaFeatureExtractor)
+        self.assertIsInstance(processor_slow.image_processor, FlavaImageProcessor)
-        self.assertIsInstance(processor_fast.feature_extractor, FlavaFeatureExtractor)
+        self.assertIsInstance(processor_fast.image_processor, FlavaImageProcessor)
    def test_save_load_pretrained_additional_features(self):
-        processor = FlavaProcessor(tokenizer=self.get_tokenizer(), feature_extractor=self.get_feature_extractor())
+        processor = FlavaProcessor(tokenizer=self.get_tokenizer(), image_processor=self.get_image_processor())
        processor.save_pretrained(self.tmpdirname)
        tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)")
-        feature_extractor_add_kwargs = self.get_feature_extractor(do_normalize=False, padding_value=1.0)
+        image_processor_add_kwargs = self.get_image_processor(do_normalize=False, padding_value=1.0)
        processor = FlavaProcessor.from_pretrained(
            self.tmpdirname, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False, padding_value=1.0
@@ -142,18 +142,18 @@ class FlavaProcessorTest(unittest.TestCase):
        self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab())
        self.assertIsInstance(processor.tokenizer, BertTokenizerFast)
-        self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor_add_kwargs.to_json_string())
+        self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string())
-        self.assertIsInstance(processor.feature_extractor, FlavaFeatureExtractor)
+        self.assertIsInstance(processor.image_processor, FlavaImageProcessor)
-    def test_feature_extractor(self):
+    def test_image_processor(self):
-        feature_extractor = self.get_feature_extractor()
+        image_processor = self.get_image_processor()
        tokenizer = self.get_tokenizer()
-        processor = FlavaProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
+        processor = FlavaProcessor(tokenizer=tokenizer, image_processor=image_processor)
        image_input = self.prepare_image_inputs()
-        input_feat_extract = feature_extractor(image_input, return_tensors="np")
+        input_feat_extract = image_processor(image_input, return_tensors="np")
        input_processor = processor(images=image_input, return_tensors="np")
        for key in input_feat_extract.keys():
@@ -161,7 +161,7 @@ class FlavaProcessorTest(unittest.TestCase):
        # With rest of the args
        random.seed(1234)
-        input_feat_extract = feature_extractor(
+        input_feat_extract = image_processor(
            image_input, return_image_mask=True, return_codebook_pixels=True, return_tensors="np"
        )
        random.seed(1234)
@@ -173,10 +173,10 @@ class FlavaProcessorTest(unittest.TestCase):
            self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2)
    def test_tokenizer(self):
-        feature_extractor = self.get_feature_extractor()
+        image_processor = self.get_image_processor()
        tokenizer = self.get_tokenizer()
-        processor = FlavaProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
+        processor = FlavaProcessor(tokenizer=tokenizer, image_processor=image_processor)
        input_str = "lower newer"
@@ -188,10 +188,10 @@ class FlavaProcessorTest(unittest.TestCase):
            self.assertListEqual(encoded_tok[key], encoded_processor[key])
    def test_processor(self):
-        feature_extractor = self.get_feature_extractor()
+        image_processor = self.get_image_processor()
        tokenizer = self.get_tokenizer()
-        processor = FlavaProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
+        processor = FlavaProcessor(tokenizer=tokenizer, image_processor=image_processor)
        input_str = "lower newer"
        image_input = self.prepare_image_inputs()
@@ -220,10 +220,10 @@ class FlavaProcessorTest(unittest.TestCase):
            processor()
    def test_tokenizer_decode(self):
-        feature_extractor = self.get_feature_extractor()
+        image_processor = self.get_image_processor()
        tokenizer = self.get_tokenizer()
-        processor = FlavaProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
+        processor = FlavaProcessor(tokenizer=tokenizer, image_processor=image_processor)
        predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]]
@@ -233,10 +233,10 @@ class FlavaProcessorTest(unittest.TestCase):
        self.assertListEqual(decoded_tok, decoded_processor)
    def test_model_input_names(self):
-        feature_extractor = self.get_feature_extractor()
+        image_processor = self.get_image_processor()
        tokenizer = self.get_tokenizer()
-        processor = FlavaProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
+        processor = FlavaProcessor(tokenizer=tokenizer, image_processor=image_processor)
        input_str = "lower newer"
        image_input = self.prepare_image_inputs()
--- a/tests/models/layoutlmv2/test_processor_layoutlmv2.py
+++ b/tests/models/layoutlmv2/test_processor_layoutlmv2.py
@@ -31,7 +31,7 @@ from transformers.utils import FEATURE_EXTRACTOR_NAME, cached_property, is_pytes
 if is_pytesseract_available():
    from PIL import Image
-    from transformers import LayoutLMv2FeatureExtractor, LayoutLMv2Processor
+    from transformers import LayoutLMv2ImageProcessor, LayoutLMv2Processor
@require_pytesseract
@@ -59,7 +59,7 @@ class LayoutLMv2ProcessorTest(unittest.TestCase):
            "lowest",
        ]
-        feature_extractor_map = {
+        image_processor_map = {
            "do_resize": True,
            "size": 224,
            "apply_ocr": True,
@@ -69,9 +69,9 @@ class LayoutLMv2ProcessorTest(unittest.TestCase):
        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
        with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
            vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
-        self.feature_extraction_file = os.path.join(self.tmpdirname, FEATURE_EXTRACTOR_NAME)
+        self.image_processing_file = os.path.join(self.tmpdirname, FEATURE_EXTRACTOR_NAME)
-        with open(self.feature_extraction_file, "w", encoding="utf-8") as fp:
+        with open(self.image_processing_file, "w", encoding="utf-8") as fp:
-            fp.write(json.dumps(feature_extractor_map) + "\n")
+            fp.write(json.dumps(image_processor_map) + "\n")
    def get_tokenizer(self, **kwargs) -> PreTrainedTokenizer:
        return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)
@@ -82,8 +82,8 @@ class LayoutLMv2ProcessorTest(unittest.TestCase):
    def get_tokenizers(self, **kwargs) -> List[PreTrainedTokenizerBase]:
        return [self.get_tokenizer(**kwargs), self.get_rust_tokenizer(**kwargs)]
-    def get_feature_extractor(self, **kwargs):
+    def get_image_processor(self, **kwargs):
-        return LayoutLMv2FeatureExtractor.from_pretrained(self.tmpdirname, **kwargs)
+        return LayoutLMv2ImageProcessor.from_pretrained(self.tmpdirname, **kwargs)
    def tearDown(self):
        shutil.rmtree(self.tmpdirname)
@@ -100,10 +100,10 @@ class LayoutLMv2ProcessorTest(unittest.TestCase):
        return image_inputs
    def test_save_load_pretrained_default(self):
-        feature_extractor = self.get_feature_extractor()
+        image_processor = self.get_image_processor()
        tokenizers = self.get_tokenizers()
        for tokenizer in tokenizers:
-            processor = LayoutLMv2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
+            processor = LayoutLMv2Processor(image_processor=image_processor, tokenizer=tokenizer)
            processor.save_pretrained(self.tmpdirname)
            processor = LayoutLMv2Processor.from_pretrained(self.tmpdirname)
@@ -111,16 +111,16 @@ class LayoutLMv2ProcessorTest(unittest.TestCase):
            self.assertEqual(processor.tokenizer.get_vocab(), tokenizer.get_vocab())
            self.assertIsInstance(processor.tokenizer, (LayoutLMv2Tokenizer, LayoutLMv2TokenizerFast))
-            self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor.to_json_string())
+            self.assertEqual(processor.image_processor.to_json_string(), image_processor.to_json_string())
-            self.assertIsInstance(processor.feature_extractor, LayoutLMv2FeatureExtractor)
+            self.assertIsInstance(processor.image_processor, LayoutLMv2ImageProcessor)
    def test_save_load_pretrained_additional_features(self):
-        processor = LayoutLMv2Processor(feature_extractor=self.get_feature_extractor(), tokenizer=self.get_tokenizer())
+        processor = LayoutLMv2Processor(image_processor=self.get_image_processor(), tokenizer=self.get_tokenizer())
        processor.save_pretrained(self.tmpdirname)
        # slow tokenizer
        tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)")
-        feature_extractor_add_kwargs = self.get_feature_extractor(do_resize=False, size=30)
+        image_processor_add_kwargs = self.get_image_processor(do_resize=False, size=30)
        processor = LayoutLMv2Processor.from_pretrained(
            self.tmpdirname, use_fast=False, bos_token="(BOS)", eos_token="(EOS)", do_resize=False, size=30
@@ -129,12 +129,12 @@ class LayoutLMv2ProcessorTest(unittest.TestCase):
        self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab())
        self.assertIsInstance(processor.tokenizer, LayoutLMv2Tokenizer)
-        self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor_add_kwargs.to_json_string())
+        self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string())
-        self.assertIsInstance(processor.feature_extractor, LayoutLMv2FeatureExtractor)
+        self.assertIsInstance(processor.image_processor, LayoutLMv2ImageProcessor)
        # fast tokenizer
        tokenizer_add_kwargs = self.get_rust_tokenizer(bos_token="(BOS)", eos_token="(EOS)")
-        feature_extractor_add_kwargs = self.get_feature_extractor(do_resize=False, size=30)
+        image_processor_add_kwargs = self.get_image_processor(do_resize=False, size=30)
        processor = LayoutLMv2Processor.from_pretrained(
            self.tmpdirname, bos_token="(BOS)", eos_token="(EOS)", do_resize=False, size=30
@@ -143,14 +143,14 @@ class LayoutLMv2ProcessorTest(unittest.TestCase):
        self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab())
        self.assertIsInstance(processor.tokenizer, LayoutLMv2TokenizerFast)
-        self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor_add_kwargs.to_json_string())
+        self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string())
-        self.assertIsInstance(processor.feature_extractor, LayoutLMv2FeatureExtractor)
+        self.assertIsInstance(processor.image_processor, LayoutLMv2ImageProcessor)
    def test_model_input_names(self):
-        feature_extractor = self.get_feature_extractor()
+        image_processor = self.get_image_processor()
        tokenizer = self.get_tokenizer()
-        processor = LayoutLMv2Processor(tokenizer=tokenizer, feature_extractor=feature_extractor)
+        processor = LayoutLMv2Processor(tokenizer=tokenizer, image_processor=image_processor)
        input_str = "lower newer"
        image_input = self.prepare_image_inputs()
@@ -220,15 +220,15 @@ class LayoutLMv2ProcessorIntegrationTests(unittest.TestCase):
    def test_processor_case_1(self):
        # case 1: document image classification (training, inference) + token classification (inference), apply_ocr = True
-        feature_extractor = LayoutLMv2FeatureExtractor()
+        image_processor = LayoutLMv2ImageProcessor()
        tokenizers = self.get_tokenizers
        images = self.get_images
        for tokenizer in tokenizers:
-            processor = LayoutLMv2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
+            processor = LayoutLMv2Processor(image_processor=image_processor, tokenizer=tokenizer)
            # not batched
-            input_feat_extract = feature_extractor(images[0], return_tensors="pt")
+            input_image_proc = image_processor(images[0], return_tensors="pt")
            input_processor = processor(images[0], return_tensors="pt")
            # verify keys
@@ -237,9 +237,7 @@ class LayoutLMv2ProcessorIntegrationTests(unittest.TestCase):
            self.assertListEqual(actual_keys, expected_keys)
            # verify image
-            self.assertAlmostEqual(
+            self.assertAlmostEqual(input_image_proc["pixel_values"].sum(), input_processor["image"].sum(), delta=1e-2)
-                input_feat_extract["pixel_values"].sum(), input_processor["image"].sum(), delta=1e-2
-            )
            # verify input_ids
            # this was obtained with Tesseract 4.1.1
@@ -250,7 +248,7 @@ class LayoutLMv2ProcessorIntegrationTests(unittest.TestCase):
            self.assertSequenceEqual(decoding, expected_decoding)
            # batched
-            input_feat_extract = feature_extractor(images, return_tensors="pt")
+            input_image_proc = image_processor(images, return_tensors="pt")
            input_processor = processor(images, padding=True, return_tensors="pt")
            # verify keys
@@ -259,9 +257,7 @@ class LayoutLMv2ProcessorIntegrationTests(unittest.TestCase):
            self.assertListEqual(actual_keys, expected_keys)
            # verify images
-            self.assertAlmostEqual(
+            self.assertAlmostEqual(input_image_proc["pixel_values"].sum(), input_processor["image"].sum(), delta=1e-2)
-                input_feat_extract["pixel_values"].sum(), input_processor["image"].sum(), delta=1e-2
-            )
            # verify input_ids
            # this was obtained with Tesseract 4.1.1
@@ -275,12 +271,12 @@ class LayoutLMv2ProcessorIntegrationTests(unittest.TestCase):
    def test_processor_case_2(self):
        # case 2: document image classification (training, inference) + token classification (inference), apply_ocr=False
-        feature_extractor = LayoutLMv2FeatureExtractor(apply_ocr=False)
+        image_processor = LayoutLMv2ImageProcessor(apply_ocr=False)
        tokenizers = self.get_tokenizers
        images = self.get_images
        for tokenizer in tokenizers:
-            processor = LayoutLMv2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
+            processor = LayoutLMv2Processor(image_processor=image_processor, tokenizer=tokenizer)
            # not batched
            words = ["hello", "world"]
@@ -329,12 +325,12 @@ class LayoutLMv2ProcessorIntegrationTests(unittest.TestCase):
    def test_processor_case_3(self):
        # case 3: token classification (training), apply_ocr=False
-        feature_extractor = LayoutLMv2FeatureExtractor(apply_ocr=False)
+        image_processor = LayoutLMv2ImageProcessor(apply_ocr=False)
        tokenizers = self.get_tokenizers
        images = self.get_images
        for tokenizer in tokenizers:
-            processor = LayoutLMv2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
+            processor = LayoutLMv2Processor(image_processor=image_processor, tokenizer=tokenizer)
            # not batched
            words = ["weirdly", "world"]
@@ -394,12 +390,12 @@ class LayoutLMv2ProcessorIntegrationTests(unittest.TestCase):
    def test_processor_case_4(self):
        # case 4: visual question answering (inference), apply_ocr=True
-        feature_extractor = LayoutLMv2FeatureExtractor()
+        image_processor = LayoutLMv2ImageProcessor()
        tokenizers = self.get_tokenizers
        images = self.get_images
        for tokenizer in tokenizers:
-            processor = LayoutLMv2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
+            processor = LayoutLMv2Processor(image_processor=image_processor, tokenizer=tokenizer)
            # not batched
            question = "What's his name?"
@@ -445,12 +441,12 @@ class LayoutLMv2ProcessorIntegrationTests(unittest.TestCase):
    def test_processor_case_5(self):
        # case 5: visual question answering (inference), apply_ocr=False
-        feature_extractor = LayoutLMv2FeatureExtractor(apply_ocr=False)
+        image_processor = LayoutLMv2ImageProcessor(apply_ocr=False)
        tokenizers = self.get_tokenizers
        images = self.get_images
        for tokenizer in tokenizers:
-            processor = LayoutLMv2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
+            processor = LayoutLMv2Processor(image_processor=image_processor, tokenizer=tokenizer)
            # not batched
            question = "What's his name?"
--- a/tests/models/layoutlmv3/test_processor_layoutlmv3.py
+++ b/tests/models/layoutlmv3/test_processor_layoutlmv3.py
@@ -31,7 +31,7 @@ from transformers.utils import FEATURE_EXTRACTOR_NAME, cached_property, is_pytes
 if is_pytesseract_available():
    from PIL import Image
-    from transformers import LayoutLMv3FeatureExtractor, LayoutLMv3Processor
+    from transformers import LayoutLMv3ImageProcessor, LayoutLMv3Processor
@require_pytesseract
@@ -76,7 +76,7 @@ class LayoutLMv3ProcessorTest(unittest.TestCase):
        with open(self.merges_file, "w", encoding="utf-8") as fp:
            fp.write("\n".join(merges))
-        feature_extractor_map = {
+        image_processor_map = {
            "do_resize": True,
            "size": 224,
            "apply_ocr": True,
@@ -84,7 +84,7 @@ class LayoutLMv3ProcessorTest(unittest.TestCase):
        self.feature_extraction_file = os.path.join(self.tmpdirname, FEATURE_EXTRACTOR_NAME)
        with open(self.feature_extraction_file, "w", encoding="utf-8") as fp:
-            fp.write(json.dumps(feature_extractor_map) + "\n")
+            fp.write(json.dumps(image_processor_map) + "\n")
    def get_tokenizer(self, **kwargs) -> PreTrainedTokenizer:
        return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)
@@ -95,8 +95,8 @@ class LayoutLMv3ProcessorTest(unittest.TestCase):
    def get_tokenizers(self, **kwargs) -> List[PreTrainedTokenizerBase]:
        return [self.get_tokenizer(**kwargs), self.get_rust_tokenizer(**kwargs)]
-    def get_feature_extractor(self, **kwargs):
+    def get_image_processor(self, **kwargs):
-        return LayoutLMv3FeatureExtractor.from_pretrained(self.tmpdirname, **kwargs)
+        return LayoutLMv3ImageProcessor.from_pretrained(self.tmpdirname, **kwargs)
    def tearDown(self):
        shutil.rmtree(self.tmpdirname)
@@ -113,10 +113,10 @@ class LayoutLMv3ProcessorTest(unittest.TestCase):
        return image_inputs
    def test_save_load_pretrained_default(self):
-        feature_extractor = self.get_feature_extractor()
+        image_processor = self.get_image_processor()
        tokenizers = self.get_tokenizers()
        for tokenizer in tokenizers:
-            processor = LayoutLMv3Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
+            processor = LayoutLMv3Processor(image_processor=image_processor, tokenizer=tokenizer)
            processor.save_pretrained(self.tmpdirname)
            processor = LayoutLMv3Processor.from_pretrained(self.tmpdirname)
@@ -124,16 +124,16 @@ class LayoutLMv3ProcessorTest(unittest.TestCase):
            self.assertEqual(processor.tokenizer.get_vocab(), tokenizer.get_vocab())
            self.assertIsInstance(processor.tokenizer, (LayoutLMv3Tokenizer, LayoutLMv3TokenizerFast))
-            self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor.to_json_string())
+            self.assertEqual(processor.image_processor.to_json_string(), image_processor.to_json_string())
-            self.assertIsInstance(processor.feature_extractor, LayoutLMv3FeatureExtractor)
+            self.assertIsInstance(processor.image_processor, LayoutLMv3ImageProcessor)
    def test_save_load_pretrained_additional_features(self):
-        processor = LayoutLMv3Processor(feature_extractor=self.get_feature_extractor(), tokenizer=self.get_tokenizer())
+        processor = LayoutLMv3Processor(image_processor=self.get_image_processor(), tokenizer=self.get_tokenizer())
        processor.save_pretrained(self.tmpdirname)
        # slow tokenizer
        tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)")
-        feature_extractor_add_kwargs = self.get_feature_extractor(do_resize=False, size=30)
+        image_processor_add_kwargs = self.get_image_processor(do_resize=False, size=30)
        processor = LayoutLMv3Processor.from_pretrained(
            self.tmpdirname, use_fast=False, bos_token="(BOS)", eos_token="(EOS)", do_resize=False, size=30
@@ -142,12 +142,12 @@ class LayoutLMv3ProcessorTest(unittest.TestCase):
        self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab())
        self.assertIsInstance(processor.tokenizer, LayoutLMv3Tokenizer)
-        self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor_add_kwargs.to_json_string())
+        self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string())
-        self.assertIsInstance(processor.feature_extractor, LayoutLMv3FeatureExtractor)
+        self.assertIsInstance(processor.image_processor, LayoutLMv3ImageProcessor)
        # fast tokenizer
        tokenizer_add_kwargs = self.get_rust_tokenizer(bos_token="(BOS)", eos_token="(EOS)")
-        feature_extractor_add_kwargs = self.get_feature_extractor(do_resize=False, size=30)
+        image_processor_add_kwargs = self.get_image_processor(do_resize=False, size=30)
        processor = LayoutLMv3Processor.from_pretrained(
            self.tmpdirname, bos_token="(BOS)", eos_token="(EOS)", do_resize=False, size=30
@@ -156,14 +156,14 @@ class LayoutLMv3ProcessorTest(unittest.TestCase):
        self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab())
        self.assertIsInstance(processor.tokenizer, LayoutLMv3TokenizerFast)
-        self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor_add_kwargs.to_json_string())
+        self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string())
-        self.assertIsInstance(processor.feature_extractor, LayoutLMv3FeatureExtractor)
+        self.assertIsInstance(processor.image_processor, LayoutLMv3ImageProcessor)
    def test_model_input_names(self):
-        feature_extractor = self.get_feature_extractor()
+        image_processor = self.get_image_processor()
        tokenizer = self.get_tokenizer()
-        processor = LayoutLMv3Processor(tokenizer=tokenizer, feature_extractor=feature_extractor)
+        processor = LayoutLMv3Processor(tokenizer=tokenizer, image_processor=image_processor)
        input_str = "lower newer"
        image_input = self.prepare_image_inputs()
@@ -200,15 +200,15 @@ class LayoutLMv3ProcessorIntegrationTests(unittest.TestCase):
    def test_processor_case_1(self):
        # case 1: document image classification (training, inference) + token classification (inference), apply_ocr = True
-        feature_extractor = LayoutLMv3FeatureExtractor()
+        image_processor = LayoutLMv3ImageProcessor()
        tokenizers = self.get_tokenizers
        images = self.get_images
        for tokenizer in tokenizers:
-            processor = LayoutLMv3Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
+            processor = LayoutLMv3Processor(image_processor=image_processor, tokenizer=tokenizer)
            # not batched
-            input_feat_extract = feature_extractor(images[0], return_tensors="pt")
+            input_image_proc = image_processor(images[0], return_tensors="pt")
            input_processor = processor(images[0], return_tensors="pt")
            # verify keys
@@ -218,7 +218,7 @@ class LayoutLMv3ProcessorIntegrationTests(unittest.TestCase):
            # verify image
            self.assertAlmostEqual(
-                input_feat_extract["pixel_values"].sum(), input_processor["pixel_values"].sum(), delta=1e-2
+                input_image_proc["pixel_values"].sum(), input_processor["pixel_values"].sum(), delta=1e-2
            )
            # verify input_ids
@@ -230,7 +230,7 @@ class LayoutLMv3ProcessorIntegrationTests(unittest.TestCase):
            self.assertSequenceEqual(decoding, expected_decoding)
            # batched
-            input_feat_extract = feature_extractor(images, return_tensors="pt")
+            input_image_proc = image_processor(images, return_tensors="pt")
            input_processor = processor(images, padding=True, return_tensors="pt")
            # verify keys
@@ -240,7 +240,7 @@ class LayoutLMv3ProcessorIntegrationTests(unittest.TestCase):
            # verify images
            self.assertAlmostEqual(
-                input_feat_extract["pixel_values"].sum(), input_processor["pixel_values"].sum(), delta=1e-2
+                input_image_proc["pixel_values"].sum(), input_processor["pixel_values"].sum(), delta=1e-2
            )
            # verify input_ids
@@ -255,12 +255,12 @@ class LayoutLMv3ProcessorIntegrationTests(unittest.TestCase):
    def test_processor_case_2(self):
        # case 2: document image classification (training, inference) + token classification (inference), apply_ocr=False
-        feature_extractor = LayoutLMv3FeatureExtractor(apply_ocr=False)
+        image_processor = LayoutLMv3ImageProcessor(apply_ocr=False)
        tokenizers = self.get_tokenizers
        images = self.get_images
        for tokenizer in tokenizers:
-            processor = LayoutLMv3Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
+            processor = LayoutLMv3Processor(image_processor=image_processor, tokenizer=tokenizer)
            # not batched
            words = ["hello", "world"]
@@ -309,12 +309,12 @@ class LayoutLMv3ProcessorIntegrationTests(unittest.TestCase):
    def test_processor_case_3(self):
        # case 3: token classification (training), apply_ocr=False
-        feature_extractor = LayoutLMv3FeatureExtractor(apply_ocr=False)
+        image_processor = LayoutLMv3ImageProcessor(apply_ocr=False)
        tokenizers = self.get_tokenizers
        images = self.get_images
        for tokenizer in tokenizers:
-            processor = LayoutLMv3Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
+            processor = LayoutLMv3Processor(image_processor=image_processor, tokenizer=tokenizer)
            # not batched
            words = ["weirdly", "world"]
@@ -374,12 +374,12 @@ class LayoutLMv3ProcessorIntegrationTests(unittest.TestCase):
    def test_processor_case_4(self):
        # case 4: visual question answering (inference), apply_ocr=True
-        feature_extractor = LayoutLMv3FeatureExtractor()
+        image_processor = LayoutLMv3ImageProcessor()
        tokenizers = self.get_tokenizers
        images = self.get_images
        for tokenizer in tokenizers:
-            processor = LayoutLMv3Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
+            processor = LayoutLMv3Processor(image_processor=image_processor, tokenizer=tokenizer)
            # not batched
            question = "What's his name?"
@@ -425,12 +425,12 @@ class LayoutLMv3ProcessorIntegrationTests(unittest.TestCase):
    def test_processor_case_5(self):
        # case 5: visual question answering (inference), apply_ocr=False
-        feature_extractor = LayoutLMv3FeatureExtractor(apply_ocr=False)
+        image_processor = LayoutLMv3ImageProcessor(apply_ocr=False)
        tokenizers = self.get_tokenizers
        images = self.get_images
        for tokenizer in tokenizers:
-            processor = LayoutLMv3Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
+            processor = LayoutLMv3Processor(image_processor=image_processor, tokenizer=tokenizer)
            # not batched
            question = "What's his name?"
--- a/tests/models/owlvit/test_processor_owlvit.py
+++ b/tests/models/owlvit/test_processor_owlvit.py
@@ -24,13 +24,13 @@ import pytest
 from transformers import CLIPTokenizer, CLIPTokenizerFast
 from transformers.models.clip.tokenization_clip import VOCAB_FILES_NAMES
 from transformers.testing_utils import require_vision
-from transformers.utils import FEATURE_EXTRACTOR_NAME, is_vision_available
+from transformers.utils import IMAGE_PROCESSOR_NAME, is_vision_available
 if is_vision_available():
    from PIL import Image
-    from transformers import OwlViTFeatureExtractor, OwlViTProcessor
+    from transformers import OwlViTImageProcessor, OwlViTProcessor
@require_vision
@@ -52,7 +52,7 @@ class OwlViTProcessorTest(unittest.TestCase):
        with open(self.merges_file, "w", encoding="utf-8") as fp:
            fp.write("\n".join(merges))
-        feature_extractor_map = {
+        image_processor_map = {
            "do_resize": True,
            "size": 20,
            "do_center_crop": True,
@@ -61,9 +61,9 @@ class OwlViTProcessorTest(unittest.TestCase):
            "image_mean": [0.48145466, 0.4578275, 0.40821073],
            "image_std": [0.26862954, 0.26130258, 0.27577711],
        }
-        self.feature_extractor_file = os.path.join(self.tmpdirname, FEATURE_EXTRACTOR_NAME)
+        self.image_processor_file = os.path.join(self.tmpdirname, IMAGE_PROCESSOR_NAME)
-        with open(self.feature_extractor_file, "w", encoding="utf-8") as fp:
+        with open(self.image_processor_file, "w", encoding="utf-8") as fp:
-            json.dump(feature_extractor_map, fp)
+            json.dump(image_processor_map, fp)
    def get_tokenizer(self, **kwargs):
        return CLIPTokenizer.from_pretrained(self.tmpdirname, pad_token="!", **kwargs)
@@ -71,8 +71,8 @@ class OwlViTProcessorTest(unittest.TestCase):
    def get_rust_tokenizer(self, **kwargs):
        return CLIPTokenizerFast.from_pretrained(self.tmpdirname, pad_token="!", **kwargs)
-    def get_feature_extractor(self, **kwargs):
+    def get_image_processor(self, **kwargs):
-        return OwlViTFeatureExtractor.from_pretrained(self.tmpdirname, **kwargs)
+        return OwlViTImageProcessor.from_pretrained(self.tmpdirname, **kwargs)
    def tearDown(self):
        shutil.rmtree(self.tmpdirname)
@@ -91,13 +91,13 @@ class OwlViTProcessorTest(unittest.TestCase):
    def test_save_load_pretrained_default(self):
        tokenizer_slow = self.get_tokenizer()
        tokenizer_fast = self.get_rust_tokenizer()
-        feature_extractor = self.get_feature_extractor()
+        image_processor = self.get_image_processor()
-        processor_slow = OwlViTProcessor(tokenizer=tokenizer_slow, feature_extractor=feature_extractor)
+        processor_slow = OwlViTProcessor(tokenizer=tokenizer_slow, image_processor=image_processor)
        processor_slow.save_pretrained(self.tmpdirname)
        processor_slow = OwlViTProcessor.from_pretrained(self.tmpdirname, use_fast=False)
-        processor_fast = OwlViTProcessor(tokenizer=tokenizer_fast, feature_extractor=feature_extractor)
+        processor_fast = OwlViTProcessor(tokenizer=tokenizer_fast, image_processor=image_processor)
        processor_fast.save_pretrained(self.tmpdirname)
        processor_fast = OwlViTProcessor.from_pretrained(self.tmpdirname)
@@ -107,17 +107,17 @@ class OwlViTProcessorTest(unittest.TestCase):
        self.assertIsInstance(processor_slow.tokenizer, CLIPTokenizer)
        self.assertIsInstance(processor_fast.tokenizer, CLIPTokenizerFast)
-        self.assertEqual(processor_slow.feature_extractor.to_json_string(), feature_extractor.to_json_string())
+        self.assertEqual(processor_slow.image_processor.to_json_string(), image_processor.to_json_string())
-        self.assertEqual(processor_fast.feature_extractor.to_json_string(), feature_extractor.to_json_string())
+        self.assertEqual(processor_fast.image_processor.to_json_string(), image_processor.to_json_string())
-        self.assertIsInstance(processor_slow.feature_extractor, OwlViTFeatureExtractor)
+        self.assertIsInstance(processor_slow.image_processor, OwlViTImageProcessor)
-        self.assertIsInstance(processor_fast.feature_extractor, OwlViTFeatureExtractor)
+        self.assertIsInstance(processor_fast.image_processor, OwlViTImageProcessor)
    def test_save_load_pretrained_additional_features(self):
-        processor = OwlViTProcessor(tokenizer=self.get_tokenizer(), feature_extractor=self.get_feature_extractor())
+        processor = OwlViTProcessor(tokenizer=self.get_tokenizer(), image_processor=self.get_image_processor())
        processor.save_pretrained(self.tmpdirname)
        tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)")
-        feature_extractor_add_kwargs = self.get_feature_extractor(do_normalize=False)
+        image_processor_add_kwargs = self.get_image_processor(do_normalize=False)
        processor = OwlViTProcessor.from_pretrained(
            self.tmpdirname, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False
@@ -126,28 +126,28 @@ class OwlViTProcessorTest(unittest.TestCase):
        self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab())
        self.assertIsInstance(processor.tokenizer, CLIPTokenizerFast)
-        self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor_add_kwargs.to_json_string())
+        self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string())
-        self.assertIsInstance(processor.feature_extractor, OwlViTFeatureExtractor)
+        self.assertIsInstance(processor.image_processor, OwlViTImageProcessor)
-    def test_feature_extractor(self):
+    def test_image_processor(self):
-        feature_extractor = self.get_feature_extractor()
+        image_processor = self.get_image_processor()
        tokenizer = self.get_tokenizer()
-        processor = OwlViTProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
+        processor = OwlViTProcessor(tokenizer=tokenizer, image_processor=image_processor)
        image_input = self.prepare_image_inputs()
-        input_feat_extract = feature_extractor(image_input, return_tensors="np")
+        input_image_proc = image_processor(image_input, return_tensors="np")
        input_processor = processor(images=image_input, return_tensors="np")
-        for key in input_feat_extract.keys():
+        for key in input_image_proc.keys():
-            self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2)
+            self.assertAlmostEqual(input_image_proc[key].sum(), input_processor[key].sum(), delta=1e-2)
    def test_tokenizer(self):
-        feature_extractor = self.get_feature_extractor()
+        image_processor = self.get_image_processor()
        tokenizer = self.get_tokenizer()
-        processor = OwlViTProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
+        processor = OwlViTProcessor(tokenizer=tokenizer, image_processor=image_processor)
        input_str = "lower newer"
@@ -159,10 +159,10 @@ class OwlViTProcessorTest(unittest.TestCase):
            self.assertListEqual(encoded_tok[key][0].tolist(), encoded_processor[key][0].tolist())
    def test_processor(self):
-        feature_extractor = self.get_feature_extractor()
+        image_processor = self.get_image_processor()
        tokenizer = self.get_tokenizer()
-        processor = OwlViTProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
+        processor = OwlViTProcessor(tokenizer=tokenizer, image_processor=image_processor)
        input_str = "lower newer"
        image_input = self.prepare_image_inputs()
@@ -228,10 +228,10 @@ class OwlViTProcessorTest(unittest.TestCase):
        self.assertListEqual(list(input_ids[1]), predicted_ids[1])
    def test_processor_case2(self):
-        feature_extractor = self.get_feature_extractor()
+        image_processor = self.get_image_processor()
        tokenizer = self.get_tokenizer()
-        processor = OwlViTProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
+        processor = OwlViTProcessor(tokenizer=tokenizer, image_processor=image_processor)
        image_input = self.prepare_image_inputs()
        query_input = self.prepare_image_inputs()
@@ -245,10 +245,10 @@ class OwlViTProcessorTest(unittest.TestCase):
            processor()
    def test_tokenizer_decode(self):
-        feature_extractor = self.get_feature_extractor()
+        image_processor = self.get_image_processor()
        tokenizer = self.get_tokenizer()
-        processor = OwlViTProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
+        processor = OwlViTProcessor(tokenizer=tokenizer, image_processor=image_processor)
        predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]]