Remove hardcoded slow image processor class in processors supporting fast ones (#36266)

* Add fast image processor class to processors supporting them * fix test kosmos2
2025-03-12 18:39:25 -04:00
parent 0013ba61e5
commit bc3253f076
16 changed files with 31 additions and 18 deletions
--- a/src/transformers/models/altclip/processing_altclip.py
+++ b/src/transformers/models/altclip/processing_altclip.py
@@ -44,7 +44,7 @@ class AltCLIPProcessor(ProcessorMixin):
    """

    attributes = ["image_processor", "tokenizer"]
-    image_processor_class = "CLIPImageProcessor"
+    image_processor_class = ("CLIPImageProcessor", "CLIPImageProcessorFast")
    tokenizer_class = ("XLMRobertaTokenizer", "XLMRobertaTokenizerFast")

    @deprecate_kwarg(old_name="feature_extractor", version="5.0.0", new_name="image_processor")
--- a/src/transformers/models/auto/image_processing_auto.py
+++ b/src/transformers/models/auto/image_processing_auto.py
@@ -490,7 +490,7 @@ class AutoImageProcessor:
                image_processor_auto_map = config.auto_map["AutoImageProcessor"]

        image_processor_class = None
-        # TODO: @yoni, change logic in v4.48 (when use_fast set to True by default)
+        # TODO: @yoni, change logic in v4.50 (when use_fast set to True by default)
        if image_processor_type is not None:
            # if use_fast is not set and the processor was saved with a fast processor, we use it, otherwise we use the slow processor.
            if use_fast is None:
@@ -498,7 +498,7 @@ class AutoImageProcessor:
                if not use_fast:
                    logger.warning_once(
                        "Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. "
-                        "`use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. "
+                        "`use_fast=True` will be the default behavior in v4.50, even if the model was saved with a slow processor. "
                        "This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`."
                    )
            # Update class name to reflect the use_fast option. If class is not found, we fall back to the slow version.
--- a/src/transformers/models/blip/processing_blip.py
+++ b/src/transformers/models/blip/processing_blip.py
@@ -56,7 +56,7 @@ class BlipProcessor(ProcessorMixin):

    attributes = ["image_processor", "tokenizer"]
    valid_kwargs = []
-    image_processor_class = "BlipImageProcessor"
+    image_processor_class = ("BlipImageProcessor", "BlipImageProcessorFast")
    tokenizer_class = ("BertTokenizer", "BertTokenizerFast")

    def __init__(self, image_processor, tokenizer, **kwargs):
--- a/src/transformers/models/blip_2/processing_blip_2.py
+++ b/src/transformers/models/blip_2/processing_blip_2.py
@@ -68,7 +68,7 @@ class Blip2Processor(ProcessorMixin):

    attributes = ["image_processor", "tokenizer"]
    valid_kwargs = ["num_query_tokens"]
-    image_processor_class = "BlipImageProcessor"
+    image_processor_class = ("BlipImageProcessor", "BlipImageProcessorFast")
    tokenizer_class = "AutoTokenizer"

    def __init__(self, image_processor, tokenizer, num_query_tokens=None, **kwargs):
--- a/src/transformers/models/clip/processing_clip.py
+++ b/src/transformers/models/clip/processing_clip.py
@@ -37,7 +37,7 @@ class CLIPProcessor(ProcessorMixin):
    """

    attributes = ["image_processor", "tokenizer"]
-    image_processor_class = "CLIPImageProcessor"
+    image_processor_class = ("CLIPImageProcessor", "CLIPImageProcessorFast")
    tokenizer_class = ("CLIPTokenizer", "CLIPTokenizerFast")

    def __init__(self, image_processor=None, tokenizer=None, **kwargs):
--- a/src/transformers/models/clipseg/processing_clipseg.py
+++ b/src/transformers/models/clipseg/processing_clipseg.py
@@ -37,7 +37,7 @@ class CLIPSegProcessor(ProcessorMixin):
    """

    attributes = ["image_processor", "tokenizer"]
-    image_processor_class = "ViTImageProcessor"
+    image_processor_class = ("ViTImageProcessor", "ViTImageProcessorFast")
    tokenizer_class = ("CLIPTokenizer", "CLIPTokenizerFast")

    def __init__(self, image_processor=None, tokenizer=None, **kwargs):
--- a/src/transformers/models/colpali/processing_colpali.py
+++ b/src/transformers/models/colpali/processing_colpali.py
@@ -91,7 +91,7 @@ class ColPaliProcessor(ProcessorMixin):

    attributes = ["image_processor", "tokenizer"]
    valid_kwargs = ["chat_template"]
-    image_processor_class = "SiglipImageProcessor"
+    image_processor_class = ("SiglipImageProcessor", "SiglipImageProcessorFast")
    tokenizer_class = ("GemmaTokenizer", "GemmaTokenizerFast")

    visual_prompt_prefix: ClassVar[str] = "Describe the image."
--- a/src/transformers/models/instructblip/processing_instructblip.py
+++ b/src/transformers/models/instructblip/processing_instructblip.py
@@ -73,7 +73,7 @@ class InstructBlipProcessor(ProcessorMixin):

    attributes = ["image_processor", "tokenizer", "qformer_tokenizer"]
    valid_kwargs = ["num_query_tokens"]
-    image_processor_class = "BlipImageProcessor"
+    image_processor_class = ("BlipImageProcessor", "BlipImageProcessorFast")
    tokenizer_class = "AutoTokenizer"
    qformer_tokenizer_class = "AutoTokenizer"

--- a/src/transformers/models/kosmos2/processing_kosmos2.py
+++ b/src/transformers/models/kosmos2/processing_kosmos2.py
@@ -85,7 +85,7 @@ class Kosmos2Processor(ProcessorMixin):

    attributes = ["image_processor", "tokenizer"]
    valid_kwargs = ["num_patch_index_tokens"]
-    image_processor_class = "CLIPImageProcessor"
+    image_processor_class = ("CLIPImageProcessor", "CLIPImageProcessorFast")
    tokenizer_class = "AutoTokenizer"

    def __init__(self, image_processor, tokenizer, num_patch_index_tokens=1024, *kwargs):
--- a/src/transformers/models/llava_next_video/processing_llava_next_video.py
+++ b/src/transformers/models/llava_next_video/processing_llava_next_video.py
@@ -85,7 +85,7 @@ class LlavaNextVideoProcessor(ProcessorMixin):
        "video_token",
        "num_additional_image_tokens",
    ]
-    image_processor_class = "LlavaNextImageProcessor"
+    image_processor_class = ("LlavaNextImageProcessor", "LlavaNextImageProcessorFast")
    video_processor_class = "LlavaNextVideoImageProcessor"
    tokenizer_class = ("LlamaTokenizer", "LlamaTokenizerFast")

--- a/src/transformers/models/mgp_str/processing_mgp_str.py
+++ b/src/transformers/models/mgp_str/processing_mgp_str.py
@@ -51,7 +51,7 @@ class MgpstrProcessor(ProcessorMixin):
    """

    attributes = ["image_processor", "char_tokenizer"]
-    image_processor_class = "ViTImageProcessor"
+    image_processor_class = ("ViTImageProcessor", "ViTImageProcessorFast")
    char_tokenizer_class = "MgpstrTokenizer"

    def __init__(self, image_processor=None, tokenizer=None, **kwargs):
--- a/src/transformers/models/omdet_turbo/processing_omdet_turbo.py
+++ b/src/transformers/models/omdet_turbo/processing_omdet_turbo.py
@@ -216,7 +216,7 @@ class OmDetTurboProcessor(ProcessorMixin):
    """

    attributes = ["image_processor", "tokenizer"]
-    image_processor_class = "DetrImageProcessor"
+    image_processor_class = ("DetrImageProcessor", "DetrImageProcessorFast")
    tokenizer_class = "AutoTokenizer"

    def __init__(self, image_processor, tokenizer):
--- a/src/transformers/models/paligemma/processing_paligemma.py
+++ b/src/transformers/models/paligemma/processing_paligemma.py
@@ -117,7 +117,7 @@ class PaliGemmaProcessor(ProcessorMixin):

    attributes = ["image_processor", "tokenizer"]
    valid_kwargs = ["chat_template"]
-    image_processor_class = "SiglipImageProcessor"
+    image_processor_class = ("SiglipImageProcessor", "SiglipImageProcessorFast")
    tokenizer_class = ("GemmaTokenizer", "GemmaTokenizerFast")

    def __init__(
--- a/src/transformers/models/siglip/processing_siglip.py
+++ b/src/transformers/models/siglip/processing_siglip.py
@@ -40,7 +40,7 @@ class SiglipProcessor(ProcessorMixin):
    """

    attributes = ["image_processor", "tokenizer"]
-    image_processor_class = "SiglipImageProcessor"
+    image_processor_class = ("SiglipImageProcessor", "SiglipImageProcessorFast")
    tokenizer_class = "AutoTokenizer"

    def __init__(self, image_processor, tokenizer):
--- a/src/transformers/processing_utils.py
+++ b/src/transformers/processing_utils.py
@@ -1105,6 +1105,16 @@ class ProcessorMixin(PushToHubMixin):
            class_name = getattr(cls, f"{attribute_name}_class")
            if isinstance(class_name, tuple):
                classes = tuple(getattr(transformers_module, n) if n is not None else None for n in class_name)
+                if attribute_name == "image_processor":
+                    # TODO: @yoni, change logic in v4.50 (when use_fast set to True by default)
+                    use_fast = kwargs.get("use_fast", None)
+                    if use_fast is None:
+                        logger.warning_once(
+                            "Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. "
+                            "`use_fast=True` will be the default behavior in v4.50, even if the model was saved with a slow processor. "
+                            "This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`."
+                        )
+                else:
                    use_fast = kwargs.get("use_fast", True)
                if use_fast and classes[1] is not None:
                    attribute_class = classes[1]
--- a/tests/models/kosmos2/test_processor_kosmos2.py
+++ b/tests/models/kosmos2/test_processor_kosmos2.py
@@ -70,11 +70,14 @@ class Kosmos2ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        processor = Kosmos2Processor(image_processor, fast_tokenizer)
        processor.save_pretrained(self.tmpdirname)

-    # We override this method to take the fast tokenizer or image processor by default
+    # We override this method to take the fast tokenizer by default
    def get_component(self, attribute, **kwargs):
        assert attribute in self.processor_class.attributes
        component_class_name = getattr(self.processor_class, f"{attribute}_class")
        if isinstance(component_class_name, tuple):
+            if attribute == "image_processor":
+                component_class_name = component_class_name[0]
+            else:
                component_class_name = component_class_name[-1]

        component_class = processor_class_from_name(component_class_name)