From ea219ed164bead55a5513e8cfaa17a25d5613b9e Mon Sep 17 00:00:00 2001 From: Yoni Gozlan <74535834+yonigozlan@users.noreply.github.com> Date: Wed, 12 Mar 2025 19:44:05 -0400 Subject: [PATCH] Remove differences between init and preprocess kwargs for fast image processors (#36186) * Remove differences between init and preprocess kwargs in fast image processors * make modifs got_ocr2 * update gemma3 --- .../image_processing_utils_fast.py | 53 +++++++++++-------- .../image_processing_convnext_fast.py | 18 ++----- .../image_processing_deformable_detr_fast.py | 29 +++++----- .../models/detr/image_processing_detr_fast.py | 31 +++++------ .../gemma3/image_processing_gemma3_fast.py | 25 +++------ .../image_processing_got_ocr2_fast.py | 18 ++----- .../llava/image_processing_llava_fast.py | 18 ++----- .../image_processing_llava_next_fast.py | 19 ++----- .../image_processing_llava_onevision_fast.py | 19 ++----- .../pixtral/image_processing_pixtral_fast.py | 18 ++----- .../image_processing_qwen2_vl_fast.py | 8 +-- .../rt_detr/image_processing_rt_detr_fast.py | 29 +++++----- .../models/rt_detr/modular_rt_detr.py | 22 ++++---- tests/test_image_processing_common.py | 24 ++++++--- utils/modular_model_converter.py | 3 +- 15 files changed, 136 insertions(+), 198 deletions(-) diff --git a/src/transformers/image_processing_utils_fast.py b/src/transformers/image_processing_utils_fast.py index f990ce100d..a87db33704 100644 --- a/src/transformers/image_processing_utils_fast.py +++ b/src/transformers/image_processing_utils_fast.py @@ -126,7 +126,7 @@ def divide_to_patches( return patches -class DefaultFastImageProcessorInitKwargs(TypedDict, total=False): +class DefaultFastImageProcessorKwargs(TypedDict, total=False): do_resize: Optional[bool] size: Optional[Dict[str, int]] default_to_square: Optional[bool] @@ -139,9 +139,6 @@ class DefaultFastImageProcessorInitKwargs(TypedDict, total=False): image_mean: Optional[Union[float, List[float]]] image_std: Optional[Union[float, List[float]]] do_convert_rgb: Optional[bool] - - -class DefaultFastImageProcessorPreprocessKwargs(DefaultFastImageProcessorInitKwargs): return_tensors: Optional[Union[str, TensorType]] data_format: Optional[ChannelDimension] input_data_format: Optional[Union[str, ChannelDimension]] @@ -185,8 +182,20 @@ BASE_IMAGE_PROCESSOR_FAST_DOCSTRING = r""" Standard deviation to use if normalizing the image. This is a float or list of floats the length of the number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method. Can be overridden by the `image_std` parameter in the `preprocess` method. - do_convert_rgb (`bool`, *optional*, defaults to `self.image_std`): - Whether to convert the image to RGB.""" + do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`): + Whether to convert the image to RGB. + return_tensors (`str` or `TensorType`, *optional*, defaults to `self.return_tensors`): + Returns stacked tensors if set to `pt, otherwise returns a list of tensors. + data_format (`ChannelDimension` or `str`, *optional*, defaults to `self.data_format`): + Only `ChannelDimension.FIRST` is supported. Added for compatibility with slow processors. + input_data_format (`ChannelDimension` or `str`, *optional*, defaults to `self.input_data_format`): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + device (`torch.device`, *optional*, defaults to `self.device`): + The device to process the images on. If unset, the device is inferred from the input images.""" BASE_IMAGE_PROCESSOR_FAST_DOCSTRING_PREPROCESS = r""" Preprocess an image or batch of images. @@ -219,20 +228,17 @@ BASE_IMAGE_PROCESSOR_FAST_DOCSTRING_PREPROCESS = r""" `True`. do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`): Whether to convert the image to RGB. - return_tensors (`str` or `TensorType`, *optional*): + return_tensors (`str` or `TensorType`, *optional*, defaults to `self.return_tensors`): Returns stacked tensors if set to `pt, otherwise returns a list of tensors. - data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): - The channel dimension format for the output image. Can be one of: - - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. - - Unset: Use the channel dimension format of the input image. - input_data_format (`ChannelDimension` or `str`, *optional*): + data_format (`ChannelDimension` or `str`, *optional*, defaults to `self.data_format`): + Only `ChannelDimension.FIRST` is supported. Added for compatibility with slow processors. + input_data_format (`ChannelDimension` or `str`, *optional*, defaults to `self.input_data_format`): The channel dimension format for the input image. If unset, the channel dimension format is inferred from the input image. Can be one of: - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. - device (`torch.device`, *optional*): + device (`torch.device`, *optional*, defaults to `self.device`): The device to process the images on. If unset, the device is inferred from the input images.""" @@ -253,13 +259,16 @@ class BaseImageProcessorFast(BaseImageProcessor): rescale_factor = 1 / 255 do_normalize = None do_convert_rgb = None + return_tensors = None + data_format = ChannelDimension.FIRST + input_data_format = None + device = None model_input_names = ["pixel_values"] - valid_init_kwargs = DefaultFastImageProcessorInitKwargs - valid_preprocess_kwargs = DefaultFastImageProcessorPreprocessKwargs + valid_kwargs = DefaultFastImageProcessorKwargs def __init__( self, - **kwargs: Unpack[DefaultFastImageProcessorInitKwargs], + **kwargs: Unpack[DefaultFastImageProcessorKwargs], ) -> None: super().__init__(**kwargs) size = kwargs.pop("size", self.size) @@ -270,7 +279,7 @@ class BaseImageProcessorFast(BaseImageProcessor): ) crop_size = kwargs.pop("crop_size", self.crop_size) self.crop_size = get_size_dict(crop_size, param_name="crop_size") if crop_size is not None else None - for key in self.valid_init_kwargs.__annotations__.keys(): + for key in self.valid_kwargs.__annotations__.keys(): kwarg = kwargs.pop(key, None) if kwarg is not None: setattr(self, key, kwarg) @@ -553,14 +562,12 @@ class BaseImageProcessorFast(BaseImageProcessor): def preprocess( self, images: ImageInput, - **kwargs: Unpack[DefaultFastImageProcessorPreprocessKwargs], + **kwargs: Unpack[DefaultFastImageProcessorKwargs], ) -> BatchFeature: - validate_kwargs( - captured_kwargs=kwargs.keys(), valid_processor_keys=self.valid_preprocess_kwargs.__annotations__.keys() - ) + validate_kwargs(captured_kwargs=kwargs.keys(), valid_processor_keys=self.valid_kwargs.__annotations__.keys()) # Set default kwargs from self. This ensures that if a kwarg is not provided # by the user, it gets its default value from the instance, or is set to None. - for kwarg_name in self.valid_preprocess_kwargs.__annotations__: + for kwarg_name in self.valid_kwargs.__annotations__: kwargs.setdefault(kwarg_name, getattr(self, kwarg_name, None)) # Extract parameters that are only used for preparing the input images diff --git a/src/transformers/models/convnext/image_processing_convnext_fast.py b/src/transformers/models/convnext/image_processing_convnext_fast.py index c2a8e37d53..19f959f07d 100644 --- a/src/transformers/models/convnext/image_processing_convnext_fast.py +++ b/src/transformers/models/convnext/image_processing_convnext_fast.py @@ -21,8 +21,7 @@ from ...image_processing_utils_fast import ( BASE_IMAGE_PROCESSOR_FAST_DOCSTRING, BASE_IMAGE_PROCESSOR_FAST_DOCSTRING_PREPROCESS, BaseImageProcessorFast, - DefaultFastImageProcessorInitKwargs, - DefaultFastImageProcessorPreprocessKwargs, + DefaultFastImageProcessorKwargs, group_images_by_shape, reorder_images, ) @@ -54,11 +53,7 @@ if is_torchvision_available(): from torchvision.transforms import functional as F -class ConvNextFastImageProcessorInitKwargs(DefaultFastImageProcessorInitKwargs): - crop_pct: Optional[float] - - -class ConvNextFastImageProcessorPreprocessKwargs(DefaultFastImageProcessorPreprocessKwargs): +class ConvNextFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): crop_pct: Optional[float] @@ -81,10 +76,9 @@ class ConvNextImageProcessorFast(BaseImageProcessorFast): do_rescale = True do_normalize = True crop_pct = 224 / 256 - valid_init_kwargs = ConvNextFastImageProcessorInitKwargs - valid_preprocess_kwargs = ConvNextFastImageProcessorPreprocessKwargs + valid_kwargs = ConvNextFastImageProcessorKwargs - def __init__(self, **kwargs: Unpack[ConvNextFastImageProcessorInitKwargs]): + def __init__(self, **kwargs: Unpack[ConvNextFastImageProcessorKwargs]): super().__init__(**kwargs) @add_start_docstrings( @@ -95,9 +89,7 @@ class ConvNextImageProcessorFast(BaseImageProcessorFast): overridden by `crop_pct` in the`preprocess` method. """, ) - def preprocess( - self, images: ImageInput, **kwargs: Unpack[ConvNextFastImageProcessorPreprocessKwargs] - ) -> BatchFeature: + def preprocess(self, images: ImageInput, **kwargs: Unpack[ConvNextFastImageProcessorKwargs]) -> BatchFeature: return super().preprocess(images, **kwargs) def resize( diff --git a/src/transformers/models/deformable_detr/image_processing_deformable_detr_fast.py b/src/transformers/models/deformable_detr/image_processing_deformable_detr_fast.py index 2aee1802ce..850370e593 100644 --- a/src/transformers/models/deformable_detr/image_processing_deformable_detr_fast.py +++ b/src/transformers/models/deformable_detr/image_processing_deformable_detr_fast.py @@ -12,8 +12,7 @@ from ...image_processing_utils_fast import ( BASE_IMAGE_PROCESSOR_FAST_DOCSTRING, BASE_IMAGE_PROCESSOR_FAST_DOCSTRING_PREPROCESS, BaseImageProcessorFast, - DefaultFastImageProcessorInitKwargs, - DefaultFastImageProcessorPreprocessKwargs, + DefaultFastImageProcessorKwargs, SizeDict, get_image_size_for_max_height_width, get_max_height_width, @@ -58,21 +57,12 @@ elif is_torchvision_available(): logger = logging.get_logger(__name__) -class DeformableDetrFastImageProcessorInitKwargs(DefaultFastImageProcessorInitKwargs): +class DeformableDetrFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): format: Optional[Union[str, AnnotationFormat]] do_convert_annotations: Optional[bool] do_pad: Optional[bool] pad_size: Optional[Dict[str, int]] - - -class DeformableDetrFastImageProcessorPreprocessKwargs(DefaultFastImageProcessorPreprocessKwargs): - format: Optional[AnnotationFormat] - annotations: Optional[Dict] - do_convert_annotations: Optional[bool] - do_pad: Optional[bool] - pad_size: Optional[Dict[str, int]] return_segmentation_masks: Optional[bool] - masks_path: Optional[Union[str, pathlib.Path]] SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC) @@ -294,6 +284,8 @@ def prepare_coco_panoptic_annotation( The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest height and width in the batch. + return_segmentation_masks (`bool`, *optional*, defaults to `False`): + Whether to return segmentation masks. """, ) class DeformableDetrImageProcessorFast(BaseImageProcessorFast): @@ -308,10 +300,9 @@ class DeformableDetrImageProcessorFast(BaseImageProcessorFast): size = {"shortest_edge": 800, "longest_edge": 1333} default_to_square = False model_input_names = ["pixel_values", "pixel_mask"] - valid_init_kwargs = DeformableDetrFastImageProcessorInitKwargs - valid_preprocess_kwargs = DeformableDetrFastImageProcessorPreprocessKwargs + valid_kwargs = DeformableDetrFastImageProcessorKwargs - def __init__(self, **kwargs: Unpack[DeformableDetrFastImageProcessorInitKwargs]) -> None: + def __init__(self, **kwargs: Unpack[DeformableDetrFastImageProcessorKwargs]) -> None: if "pad_and_return_pixel_mask" in kwargs: kwargs["do_pad"] = kwargs.pop("pad_and_return_pixel_mask") @@ -605,7 +596,11 @@ class DeformableDetrImageProcessorFast(BaseImageProcessorFast): """, ) def preprocess( - self, images: ImageInput, **kwargs: Unpack[DeformableDetrFastImageProcessorPreprocessKwargs] + self, + images: ImageInput, + annotations: Optional[Union[AnnotationType, List[AnnotationType]]] = None, + masks_path: Optional[Union[str, pathlib.Path]] = None, + **kwargs: Unpack[DeformableDetrFastImageProcessorKwargs], ) -> BatchFeature: if "pad_and_return_pixel_mask" in kwargs: kwargs["do_pad"] = kwargs.pop("pad_and_return_pixel_mask") @@ -621,7 +616,7 @@ class DeformableDetrImageProcessorFast(BaseImageProcessorFast): ) kwargs["size"] = kwargs.pop("max_size") - return super().preprocess(images, **kwargs) + return super().preprocess(images, annotations=annotations, masks_path=masks_path, **kwargs) def _preprocess( self, diff --git a/src/transformers/models/detr/image_processing_detr_fast.py b/src/transformers/models/detr/image_processing_detr_fast.py index e49b176167..8d29a5796f 100644 --- a/src/transformers/models/detr/image_processing_detr_fast.py +++ b/src/transformers/models/detr/image_processing_detr_fast.py @@ -24,8 +24,7 @@ from ...image_processing_utils_fast import ( BASE_IMAGE_PROCESSOR_FAST_DOCSTRING, BASE_IMAGE_PROCESSOR_FAST_DOCSTRING_PREPROCESS, BaseImageProcessorFast, - DefaultFastImageProcessorInitKwargs, - DefaultFastImageProcessorPreprocessKwargs, + DefaultFastImageProcessorKwargs, SizeDict, get_image_size_for_max_height_width, get_max_height_width, @@ -283,21 +282,12 @@ def prepare_coco_panoptic_annotation( return new_target -class DetrFastImageProcessorInitKwargs(DefaultFastImageProcessorInitKwargs): +class DetrFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): format: Optional[Union[str, AnnotationFormat]] do_convert_annotations: Optional[bool] do_pad: Optional[bool] pad_size: Optional[Dict[str, int]] - - -class DetrFastImageProcessorPreprocessKwargs(DefaultFastImageProcessorPreprocessKwargs): - format: Optional[AnnotationFormat] - annotations: Optional[Dict] - do_convert_annotations: Optional[bool] - do_pad: Optional[bool] - pad_size: Optional[Dict[str, int]] return_segmentation_masks: Optional[bool] - masks_path: Optional[Union[str, pathlib.Path]] @add_start_docstrings( @@ -319,6 +309,8 @@ class DetrFastImageProcessorPreprocessKwargs(DefaultFastImageProcessorPreprocess The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest height and width in the batch. + return_segmentation_masks (`bool`, *optional*, defaults to `False`): + Whether to return segmentation masks. """, ) class DetrImageProcessorFast(BaseImageProcessorFast): @@ -333,10 +325,9 @@ class DetrImageProcessorFast(BaseImageProcessorFast): size = {"shortest_edge": 800, "longest_edge": 1333} default_to_square = False model_input_names = ["pixel_values", "pixel_mask"] - valid_init_kwargs = DetrFastImageProcessorInitKwargs - valid_preprocess_kwargs = DetrFastImageProcessorPreprocessKwargs + valid_kwargs = DetrFastImageProcessorKwargs - def __init__(self, **kwargs: Unpack[DetrFastImageProcessorInitKwargs]) -> None: + def __init__(self, **kwargs: Unpack[DetrFastImageProcessorKwargs]) -> None: if "pad_and_return_pixel_mask" in kwargs: kwargs["do_pad"] = kwargs.pop("pad_and_return_pixel_mask") @@ -629,7 +620,13 @@ class DetrImageProcessorFast(BaseImageProcessorFast): Path to the directory containing the segmentation masks. """, ) - def preprocess(self, images: ImageInput, **kwargs: Unpack[DetrFastImageProcessorPreprocessKwargs]) -> BatchFeature: + def preprocess( + self, + images: ImageInput, + annotations: Optional[Union[AnnotationType, List[AnnotationType]]] = None, + masks_path: Optional[Union[str, pathlib.Path]] = None, + **kwargs: Unpack[DetrFastImageProcessorKwargs], + ) -> BatchFeature: if "pad_and_return_pixel_mask" in kwargs: kwargs["do_pad"] = kwargs.pop("pad_and_return_pixel_mask") logger.warning_once( @@ -644,7 +641,7 @@ class DetrImageProcessorFast(BaseImageProcessorFast): ) kwargs["size"] = kwargs.pop("max_size") - return super().preprocess(images, **kwargs) + return super().preprocess(images, annotations=annotations, masks_path=masks_path, **kwargs) def _preprocess( self, diff --git a/src/transformers/models/gemma3/image_processing_gemma3_fast.py b/src/transformers/models/gemma3/image_processing_gemma3_fast.py index 0a26f25231..50dfcb920f 100644 --- a/src/transformers/models/gemma3/image_processing_gemma3_fast.py +++ b/src/transformers/models/gemma3/image_processing_gemma3_fast.py @@ -24,8 +24,7 @@ from ...image_processing_utils_fast import ( BASE_IMAGE_PROCESSOR_FAST_DOCSTRING_PREPROCESS, BaseImageProcessorFast, BatchFeature, - DefaultFastImageProcessorInitKwargs, - DefaultFastImageProcessorPreprocessKwargs, + DefaultFastImageProcessorKwargs, get_size_dict, group_images_by_shape, reorder_images, @@ -67,14 +66,7 @@ if is_torchvision_available(): logger = logging.get_logger(__name__) -class Gemma3FastImageProcessorInitKwargs(DefaultFastImageProcessorInitKwargs): - do_pan_and_scan: Optional[bool] - pan_and_scan_min_crop_size: Optional[int] - pan_and_scan_max_num_crops: Optional[int] - pan_and_scan_min_ratio_to_activate: Optional[float] - - -class Gemma3FastImageProcessorPreprocessKwargs(DefaultFastImageProcessorPreprocessKwargs): +class Gemma3FastImageProcessorKwargs(DefaultFastImageProcessorKwargs): do_pan_and_scan: Optional[bool] pan_and_scan_min_crop_size: Optional[int] pan_and_scan_max_num_crops: Optional[int] @@ -108,10 +100,9 @@ class Gemma3ImageProcessorFast(BaseImageProcessorFast): pan_and_scan_min_crop_size = None pan_and_scan_max_num_crops = None pan_and_scan_min_ratio_to_activate = None - valid_init_kwargs = Gemma3FastImageProcessorInitKwargs - valid_preprocess_kwargs = Gemma3FastImageProcessorPreprocessKwargs + valid_kwargs = Gemma3FastImageProcessorKwargs - def __init__(self, **kwargs: Unpack[Gemma3FastImageProcessorInitKwargs]): + def __init__(self, **kwargs: Unpack[Gemma3FastImageProcessorKwargs]): super().__init__(**kwargs) def _prepare_images_structure( @@ -262,14 +253,12 @@ class Gemma3ImageProcessorFast(BaseImageProcessorFast): def preprocess( self, images: ImageInput, - **kwargs: Unpack[Gemma3FastImageProcessorPreprocessKwargs], + **kwargs: Unpack[Gemma3FastImageProcessorKwargs], ) -> BatchFeature: - validate_kwargs( - captured_kwargs=kwargs.keys(), valid_processor_keys=self.valid_preprocess_kwargs.__annotations__.keys() - ) + validate_kwargs(captured_kwargs=kwargs.keys(), valid_processor_keys=self.valid_kwargs.__annotations__.keys()) # Set default kwargs from self. This ensures that if a kwarg is not provided # by the user, it gets its default value from the instance, or is set to None. - for kwarg_name in self.valid_preprocess_kwargs.__annotations__: + for kwarg_name in self.valid_kwargs.__annotations__: kwargs.setdefault(kwarg_name, getattr(self, kwarg_name, None)) # Extract parameters that are only used for preparing the input images diff --git a/src/transformers/models/got_ocr2/image_processing_got_ocr2_fast.py b/src/transformers/models/got_ocr2/image_processing_got_ocr2_fast.py index 5103f73b11..8498e37803 100644 --- a/src/transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +++ b/src/transformers/models/got_ocr2/image_processing_got_ocr2_fast.py @@ -21,8 +21,7 @@ from ...image_processing_utils_fast import ( BASE_IMAGE_PROCESSOR_FAST_DOCSTRING, BASE_IMAGE_PROCESSOR_FAST_DOCSTRING_PREPROCESS, BaseImageProcessorFast, - DefaultFastImageProcessorInitKwargs, - DefaultFastImageProcessorPreprocessKwargs, + DefaultFastImageProcessorKwargs, group_images_by_shape, reorder_images, ) @@ -54,13 +53,7 @@ if is_torchvision_available(): from torchvision.transforms import functional as F -class GotOcr2ImageProcessorInitKwargs(DefaultFastImageProcessorInitKwargs): - crop_to_patches: Optional[bool] - min_patches: Optional[int] - max_patches: Optional[int] - - -class GotOcr2ImageProcessorPreprocessKwargs(DefaultFastImageProcessorPreprocessKwargs): +class GotOcr2ImageProcessorKwargs(DefaultFastImageProcessorKwargs): crop_to_patches: Optional[bool] min_patches: Optional[int] max_patches: Optional[int] @@ -93,10 +86,9 @@ class GotOcr2ImageProcessorFast(BaseImageProcessorFast): crop_to_patches = False min_patches = 1 max_patches = 12 - valid_init_kwargs = GotOcr2ImageProcessorInitKwargs - valid_preprocess_kwargs = GotOcr2ImageProcessorPreprocessKwargs + valid_kwargs = GotOcr2ImageProcessorKwargs - def __init__(self, **kwargs: Unpack[GotOcr2ImageProcessorInitKwargs]): + def __init__(self, **kwargs: Unpack[valid_kwargs]): super().__init__(**kwargs) @add_start_docstrings( @@ -113,7 +105,7 @@ class GotOcr2ImageProcessorFast(BaseImageProcessorFast): set to `True`. Can be overridden by the `max_patches` parameter in the `preprocess` method. """, ) - def preprocess(self, images: ImageInput, **kwargs: Unpack[GotOcr2ImageProcessorPreprocessKwargs]) -> BatchFeature: + def preprocess(self, images: ImageInput, **kwargs: Unpack[valid_kwargs]) -> BatchFeature: return super().preprocess(images, **kwargs) def crop_image_to_patches( diff --git a/src/transformers/models/llava/image_processing_llava_fast.py b/src/transformers/models/llava/image_processing_llava_fast.py index e582336e97..d85eb89b7c 100644 --- a/src/transformers/models/llava/image_processing_llava_fast.py +++ b/src/transformers/models/llava/image_processing_llava_fast.py @@ -23,8 +23,7 @@ from ...image_processing_utils_fast import ( BASE_IMAGE_PROCESSOR_FAST_DOCSTRING, BASE_IMAGE_PROCESSOR_FAST_DOCSTRING_PREPROCESS, BaseImageProcessorFast, - DefaultFastImageProcessorInitKwargs, - DefaultFastImageProcessorPreprocessKwargs, + DefaultFastImageProcessorKwargs, group_images_by_shape, reorder_images, ) @@ -61,11 +60,7 @@ if is_torchvision_available(): from torchvision.transforms import functional as F -class LlavaFastImageProcessorInitKwargs(DefaultFastImageProcessorInitKwargs): - do_pad: Optional[bool] - - -class LlavaFastImageProcessorPreprocessKwargs(DefaultFastImageProcessorPreprocessKwargs): +class LlavaFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): do_pad: Optional[bool] @@ -90,10 +85,9 @@ class LlavaImageProcessorFast(BaseImageProcessorFast): do_rescale = True do_normalize = True do_convert_rgb = True - valid_init_kwargs = LlavaFastImageProcessorInitKwargs - valid_preprocess_kwargs = LlavaFastImageProcessorPreprocessKwargs + valid_kwargs = LlavaFastImageProcessorKwargs - def __init__(self, **kwargs: Unpack[LlavaFastImageProcessorInitKwargs]) -> None: + def __init__(self, **kwargs: Unpack[LlavaFastImageProcessorKwargs]) -> None: super().__init__(**kwargs) @add_start_docstrings( @@ -103,9 +97,7 @@ class LlavaImageProcessorFast(BaseImageProcessorFast): Whether to pad the image to a square based on the longest edge. Can be overridden by the `do_pad` parameter """, ) - def preprocess( - self, images: ImageInput, **kwargs: Unpack[LlavaFastImageProcessorPreprocessKwargs] - ) -> BatchFeature: + def preprocess(self, images: ImageInput, **kwargs: Unpack[LlavaFastImageProcessorKwargs]) -> BatchFeature: return super().preprocess(images, **kwargs) def pad_to_square( diff --git a/src/transformers/models/llava_next/image_processing_llava_next_fast.py b/src/transformers/models/llava_next/image_processing_llava_next_fast.py index 1323f303b0..d4caf2a19a 100644 --- a/src/transformers/models/llava_next/image_processing_llava_next_fast.py +++ b/src/transformers/models/llava_next/image_processing_llava_next_fast.py @@ -21,8 +21,7 @@ from ...image_processing_utils_fast import ( BASE_IMAGE_PROCESSOR_FAST_DOCSTRING, BASE_IMAGE_PROCESSOR_FAST_DOCSTRING_PREPROCESS, BaseImageProcessorFast, - DefaultFastImageProcessorInitKwargs, - DefaultFastImageProcessorPreprocessKwargs, + DefaultFastImageProcessorKwargs, divide_to_patches, group_images_by_shape, reorder_images, @@ -57,12 +56,7 @@ if is_torchvision_available(): from torchvision.transforms import functional as F -class LlavaNextFastImageProcessorInitKwargs(DefaultFastImageProcessorInitKwargs): - image_grid_pinpoints: Optional[List[List[int]]] - do_pad: Optional[bool] - - -class LlavaNextFastImageProcessorPreprocessKwargs(DefaultFastImageProcessorPreprocessKwargs): +class LlavaNextFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): image_grid_pinpoints: Optional[List[List[int]]] do_pad: Optional[bool] @@ -96,10 +90,9 @@ class LlavaNextImageProcessorFast(BaseImageProcessorFast): do_convert_rgb = True do_pad = True image_grid_pinpoints = [[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]] - valid_init_kwargs = LlavaNextFastImageProcessorInitKwargs - valid_preprocess_kwargs = LlavaNextFastImageProcessorPreprocessKwargs + valid_kwargs = LlavaNextFastImageProcessorKwargs - def __init__(self, **kwargs: Unpack[LlavaNextFastImageProcessorInitKwargs]): + def __init__(self, **kwargs: Unpack[LlavaNextFastImageProcessorKwargs]): super().__init__(**kwargs) @add_start_docstrings( @@ -113,9 +106,7 @@ class LlavaNextImageProcessorFast(BaseImageProcessorFast): number of patches in the batch. Padding will be applied to the bottom and right with zeros. """, ) - def preprocess( - self, images: ImageInput, **kwargs: Unpack[LlavaNextFastImageProcessorPreprocessKwargs] - ) -> BatchFeature: + def preprocess(self, images: ImageInput, **kwargs: Unpack[LlavaNextFastImageProcessorKwargs]) -> BatchFeature: return super().preprocess(images, **kwargs) def _prepare_images_structure( diff --git a/src/transformers/models/llava_onevision/image_processing_llava_onevision_fast.py b/src/transformers/models/llava_onevision/image_processing_llava_onevision_fast.py index 13aa265496..598ac78f53 100644 --- a/src/transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +++ b/src/transformers/models/llava_onevision/image_processing_llava_onevision_fast.py @@ -12,8 +12,7 @@ from ...image_processing_utils_fast import ( BASE_IMAGE_PROCESSOR_FAST_DOCSTRING, BASE_IMAGE_PROCESSOR_FAST_DOCSTRING_PREPROCESS, BaseImageProcessorFast, - DefaultFastImageProcessorInitKwargs, - DefaultFastImageProcessorPreprocessKwargs, + DefaultFastImageProcessorKwargs, divide_to_patches, group_images_by_shape, reorder_images, @@ -40,12 +39,7 @@ else: from torchvision.transforms import functional as F -class LlavaOnevisionFastImageProcessorInitKwargs(DefaultFastImageProcessorInitKwargs): - image_grid_pinpoints: Optional[List[List[int]]] - do_pad: Optional[bool] - - -class LlavaOnevisionFastImageProcessorPreprocessKwargs(DefaultFastImageProcessorPreprocessKwargs): +class LlavaOnevisionFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): image_grid_pinpoints: Optional[List[List[int]]] do_pad: Optional[bool] @@ -77,11 +71,10 @@ class LlavaOnevisionImageProcessorFast(BaseImageProcessorFast): do_convert_rgb = True do_pad = True image_grid_pinpoints = [[384, 384], [384, 768], [384, 1152], [384, 1536], [384, 1920], [384, 2304], [768, 384], [768, 768], [768, 1152], [768, 1536], [768, 1920], [768, 2304], [1152, 384], [1152, 768], [1152, 1152], [1152, 1536], [1152, 1920], [1152, 2304], [1536, 384], [1536, 768], [1536, 1152], [1536, 1536], [1536, 1920], [1536, 2304], [1920, 384], [1920, 768], [1920, 1152], [1920, 1536], [1920, 1920], [1920, 2304], [2304, 384], [2304, 768], [2304, 1152], [2304, 1536], [2304, 1920], [2304, 2304]] # fmt: skip - valid_init_kwargs = LlavaOnevisionFastImageProcessorInitKwargs - valid_preprocess_kwargs = LlavaOnevisionFastImageProcessorPreprocessKwargs + valid_kwargs = LlavaOnevisionFastImageProcessorKwargs model_input_names = ["pixel_values_videos"] - def __init__(self, **kwargs: Unpack[LlavaOnevisionFastImageProcessorInitKwargs]): + def __init__(self, **kwargs: Unpack[LlavaOnevisionFastImageProcessorKwargs]): super().__init__(**kwargs) @add_start_docstrings( @@ -95,9 +88,7 @@ class LlavaOnevisionImageProcessorFast(BaseImageProcessorFast): number of patches in the batch. Padding will be applied to the bottom and right with zeros. """, ) - def preprocess( - self, images: ImageInput, **kwargs: Unpack[LlavaOnevisionFastImageProcessorPreprocessKwargs] - ) -> BatchFeature: + def preprocess(self, images: ImageInput, **kwargs: Unpack[LlavaOnevisionFastImageProcessorKwargs]) -> BatchFeature: return super().preprocess(images, **kwargs) def _prepare_images_structure( diff --git a/src/transformers/models/pixtral/image_processing_pixtral_fast.py b/src/transformers/models/pixtral/image_processing_pixtral_fast.py index f76fe4a716..0cb4673038 100644 --- a/src/transformers/models/pixtral/image_processing_pixtral_fast.py +++ b/src/transformers/models/pixtral/image_processing_pixtral_fast.py @@ -21,8 +21,7 @@ from ...image_processing_utils_fast import ( BASE_IMAGE_PROCESSOR_FAST_DOCSTRING, BASE_IMAGE_PROCESSOR_FAST_DOCSTRING_PREPROCESS, BaseImageProcessorFast, - DefaultFastImageProcessorInitKwargs, - DefaultFastImageProcessorPreprocessKwargs, + DefaultFastImageProcessorKwargs, group_images_by_shape, reorder_images, ) @@ -61,11 +60,7 @@ if is_torchvision_available(): from torchvision.transforms import functional as F -class PixtralFastImageProcessorInitKwargs(DefaultFastImageProcessorInitKwargs): - patch_size: Optional[Dict[str, int]] - - -class PixtralFastImageProcessorPreprocessKwargs(DefaultFastImageProcessorPreprocessKwargs): +class PixtralFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): patch_size: Optional[Dict[str, int]] @@ -88,10 +83,9 @@ class PixtralImageProcessorFast(BaseImageProcessorFast): do_rescale = True do_normalize = True do_convert_rgb = True - valid_init_kwargs = PixtralFastImageProcessorInitKwargs - valid_preprocess_kwargs = PixtralFastImageProcessorPreprocessKwargs + valid_kwargs = PixtralFastImageProcessorKwargs - def __init__(self, **kwargs: Unpack[PixtralFastImageProcessorInitKwargs]): + def __init__(self, **kwargs: Unpack[PixtralFastImageProcessorKwargs]): super().__init__(**kwargs) @add_start_docstrings( @@ -101,9 +95,7 @@ class PixtralImageProcessorFast(BaseImageProcessorFast): Size of the patches in the model, used to calculate the output image size. Can be overridden by `patch_size` in the `preprocess` method. """, ) - def preprocess( - self, images: ImageInput, **kwargs: Unpack[PixtralFastImageProcessorPreprocessKwargs] - ) -> BatchFeature: + def preprocess(self, images: ImageInput, **kwargs: Unpack[PixtralFastImageProcessorKwargs]) -> BatchFeature: return super().preprocess(images, **kwargs) def resize( diff --git a/src/transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py b/src/transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py index 8f4c233c0b..b54f86bba6 100644 --- a/src/transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py +++ b/src/transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py @@ -25,7 +25,7 @@ from ...image_processing_utils import BatchFeature from ...image_processing_utils_fast import ( BASE_IMAGE_PROCESSOR_FAST_DOCSTRING, BaseImageProcessorFast, - DefaultFastImageProcessorInitKwargs, + DefaultFastImageProcessorKwargs, group_images_by_shape, reorder_images, ) @@ -69,7 +69,7 @@ elif is_torchvision_available(): logger = logging.get_logger(__name__) -class Qwen2VLFastImageProcessorInitKwargs(DefaultFastImageProcessorInitKwargs): +class Qwen2VLFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): min_pixels: Optional[int] max_pixels: Optional[int] patch_size: Optional[int] @@ -107,10 +107,10 @@ class Qwen2VLImageProcessorFast(BaseImageProcessorFast): merge_size = 2 min_pixels = 56 * 56 max_pixels = 28 * 28 * 1280 - valid_init_kwargs = Qwen2VLFastImageProcessorInitKwargs + valid_kwargs = DefaultFastImageProcessorKwargs model_input_names = ["pixel_values", "image_grid_thw", "pixel_values_videos", "video_grid_thw"] - def __init__(self, **kwargs: Unpack[Qwen2VLFastImageProcessorInitKwargs]): + def __init__(self, **kwargs: Unpack[Qwen2VLFastImageProcessorKwargs]): super().__init__(**kwargs) def _preprocess( diff --git a/src/transformers/models/rt_detr/image_processing_rt_detr_fast.py b/src/transformers/models/rt_detr/image_processing_rt_detr_fast.py index 0c9b4512ad..bd34843645 100644 --- a/src/transformers/models/rt_detr/image_processing_rt_detr_fast.py +++ b/src/transformers/models/rt_detr/image_processing_rt_detr_fast.py @@ -12,8 +12,7 @@ from ...image_processing_utils_fast import ( BASE_IMAGE_PROCESSOR_FAST_DOCSTRING, BASE_IMAGE_PROCESSOR_FAST_DOCSTRING_PREPROCESS, BaseImageProcessorFast, - DefaultFastImageProcessorInitKwargs, - DefaultFastImageProcessorPreprocessKwargs, + DefaultFastImageProcessorKwargs, SizeDict, add_start_docstrings, get_image_size_for_max_height_width, @@ -53,21 +52,12 @@ elif is_torchvision_available(): from torchvision.transforms import functional as F -class RTDetrFastImageProcessorInitKwargs(DefaultFastImageProcessorInitKwargs): +class RTDetrFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): format: Optional[Union[str, AnnotationFormat]] do_convert_annotations: Optional[bool] do_pad: Optional[bool] pad_size: Optional[Dict[str, int]] - - -class RTDetrFastImageProcessorPreprocessKwargs(DefaultFastImageProcessorPreprocessKwargs): - format: Optional[AnnotationFormat] - annotations: Optional[Dict] - do_convert_annotations: Optional[bool] - do_pad: Optional[bool] - pad_size: Optional[Dict[str, int]] return_segmentation_masks: Optional[bool] - masks_path: Optional[Union[str, pathlib.Path]] SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC) @@ -151,6 +141,8 @@ def prepare_coco_detection_annotation( The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest height and width in the batch. + return_segmentation_masks (`bool`, *optional*, defaults to `False`): + Whether to return segmentation masks. """, ) class RTDetrImageProcessorFast(BaseImageProcessorFast): @@ -165,11 +157,10 @@ class RTDetrImageProcessorFast(BaseImageProcessorFast): size = {"height": 640, "width": 640} default_to_square = False model_input_names = ["pixel_values", "pixel_mask"] - valid_init_kwargs = RTDetrFastImageProcessorInitKwargs - valid_preprocess_kwargs = RTDetrFastImageProcessorPreprocessKwargs + valid_kwargs = RTDetrFastImageProcessorKwargs do_convert_annotations = True - def __init__(self, **kwargs: Unpack[RTDetrFastImageProcessorInitKwargs]) -> None: + def __init__(self, **kwargs: Unpack[RTDetrFastImageProcessorKwargs]) -> None: # Backwards compatibility do_convert_annotations = kwargs.get("do_convert_annotations", None) do_normalize = kwargs.get("do_normalize", None) @@ -424,9 +415,13 @@ class RTDetrImageProcessorFast(BaseImageProcessorFast): """, ) def preprocess( - self, images: ImageInput, **kwargs: Unpack[RTDetrFastImageProcessorPreprocessKwargs] + self, + images: ImageInput, + annotations: Optional[Union[AnnotationType, List[AnnotationType]]] = None, + masks_path: Optional[Union[str, pathlib.Path]] = None, + **kwargs: Unpack[RTDetrFastImageProcessorKwargs], ) -> BatchFeature: - return super().preprocess(images, **kwargs) + return super().preprocess(images, annotations=annotations, masks_path=masks_path, **kwargs) def _preprocess( self, diff --git a/src/transformers/models/rt_detr/modular_rt_detr.py b/src/transformers/models/rt_detr/modular_rt_detr.py index 101d02c021..e1ee97b4da 100644 --- a/src/transformers/models/rt_detr/modular_rt_detr.py +++ b/src/transformers/models/rt_detr/modular_rt_detr.py @@ -2,8 +2,7 @@ import pathlib from typing import Dict, List, Optional, Tuple, Union from transformers.models.detr.image_processing_detr_fast import ( - DetrFastImageProcessorInitKwargs, - DetrFastImageProcessorPreprocessKwargs, + DetrFastImageProcessorKwargs, DetrImageProcessorFast, ) @@ -112,11 +111,7 @@ def prepare_coco_detection_annotation( return new_target -class RTDetrFastImageProcessorInitKwargs(DetrFastImageProcessorInitKwargs): - pass - - -class RTDetrFastImageProcessorPreprocessKwargs(DetrFastImageProcessorPreprocessKwargs): +class RTDetrFastImageProcessorKwargs(DetrFastImageProcessorKwargs): pass @@ -133,10 +128,9 @@ class RTDetrImageProcessorFast(DetrImageProcessorFast, BaseImageProcessorFast): size = {"height": 640, "width": 640} default_to_square = False model_input_names = ["pixel_values", "pixel_mask"] - valid_init_kwargs = RTDetrFastImageProcessorInitKwargs - valid_preprocess_kwargs = RTDetrFastImageProcessorPreprocessKwargs + valid_kwargs = RTDetrFastImageProcessorKwargs - def __init__(self, **kwargs: Unpack[RTDetrFastImageProcessorInitKwargs]) -> None: + def __init__(self, **kwargs: Unpack[RTDetrFastImageProcessorKwargs]) -> None: # Backwards compatibility do_convert_annotations = kwargs.get("do_convert_annotations", None) do_normalize = kwargs.get("do_normalize", None) @@ -181,9 +175,13 @@ class RTDetrImageProcessorFast(DetrImageProcessorFast, BaseImageProcessorFast): """, ) def preprocess( - self, images: ImageInput, **kwargs: Unpack[RTDetrFastImageProcessorPreprocessKwargs] + self, + images: ImageInput, + annotations: Optional[Union[AnnotationType, List[AnnotationType]]] = None, + masks_path: Optional[Union[str, pathlib.Path]] = None, + **kwargs: Unpack[RTDetrFastImageProcessorKwargs], ) -> BatchFeature: - return BaseImageProcessorFast().preprocess(images, **kwargs) + return BaseImageProcessorFast().preprocess(images, annotations=annotations, masks_path=masks_path, **kwargs) def prepare_annotation( self, diff --git a/tests/test_image_processing_common.py b/tests/test_image_processing_common.py index 0df30adada..f42cd6847b 100644 --- a/tests/test_image_processing_common.py +++ b/tests/test_image_processing_common.py @@ -311,8 +311,10 @@ class ImageProcessingTestMixin: } dict_slow_0 = {key: dict_slow_0[key] for key in set(dict_slow_0) & set(dict_slow_1)} dict_slow_1 = {key: dict_slow_1[key] for key in set(dict_slow_0) & set(dict_slow_1)} - # check that all additional keys are None, except for `default_to_square` which is only set in fast processors - self.assertTrue(all(value is None for key, value in difference.items() if key not in ["default_to_square"])) + # check that all additional keys are None, except for `default_to_square` and `data_format` which are only set in fast processors + self.assertTrue( + all(value is None for key, value in difference.items() if key not in ["default_to_square", "data_format"]) + ) # check that the remaining keys are the same self.assertEqual(dict_slow_0, dict_slow_1) @@ -324,8 +326,10 @@ class ImageProcessingTestMixin: } dict_fast_0 = {key: dict_fast_0[key] for key in set(dict_fast_0) & set(dict_fast_1)} dict_fast_1 = {key: dict_fast_1[key] for key in set(dict_fast_0) & set(dict_fast_1)} - # check that all additional keys are None, except for `default_to_square` which is only set in fast processors - self.assertTrue(all(value is None for key, value in difference.items() if key not in ["default_to_square"])) + # check that all additional keys are None, except for `default_to_square` and `data_format` which are only set in fast processors + self.assertTrue( + all(value is None for key, value in difference.items() if key not in ["default_to_square", "data_format"]) + ) # check that the remaining keys are the same self.assertEqual(dict_fast_0, dict_fast_1) @@ -357,8 +361,10 @@ class ImageProcessingTestMixin: } dict_slow_0 = {key: dict_slow_0[key] for key in set(dict_slow_0) & set(dict_slow_1)} dict_slow_1 = {key: dict_slow_1[key] for key in set(dict_slow_0) & set(dict_slow_1)} - # check that all additional keys are None, except for `default_to_square` which is only set in fast processors - self.assertTrue(all(value is None for key, value in difference.items() if key not in ["default_to_square"])) + # check that all additional keys are None, except for `default_to_square` and `data_format` which are only set in fast processors + self.assertTrue( + all(value is None for key, value in difference.items() if key not in ["default_to_square", "data_format"]) + ) # check that the remaining keys are the same self.assertEqual(dict_slow_0, dict_slow_1) @@ -370,8 +376,10 @@ class ImageProcessingTestMixin: } dict_fast_0 = {key: dict_fast_0[key] for key in set(dict_fast_0) & set(dict_fast_1)} dict_fast_1 = {key: dict_fast_1[key] for key in set(dict_fast_0) & set(dict_fast_1)} - # check that all additional keys are None, except for `default_to_square` which is only set in fast processors - self.assertTrue(all(value is None for key, value in difference.items() if key not in ["default_to_square"])) + # check that all additional keys are None, except for `default_to_square` and `data_format` which are only set in fast processors + self.assertTrue( + all(value is None for key, value in difference.items() if key not in ["default_to_square", "data_format"]) + ) # check that the remaining keys are the same self.assertEqual(dict_fast_0, dict_fast_1) diff --git a/utils/modular_model_converter.py b/utils/modular_model_converter.py index 728c5628b1..efb8ac0047 100644 --- a/utils/modular_model_converter.py +++ b/utils/modular_model_converter.py @@ -1087,8 +1087,7 @@ TYPE_TO_FILE_TYPE = { "Processor": "processing", "ImageProcessor": "image_processing", "ImageProcessorFast": "image_processing*_fast", # "*" indicates where to insert the model name before the "_fast" suffix - "FastImageProcessorInitKwargs": "image_processing*_fast", - "FastImageProcessorPreprocessKwargs": "image_processing*_fast", + "FastImageProcessorKwargs": "image_processing*_fast", "FeatureExtractor": "feature_extractor", "ProcessorKwargs": "processing", "ImagesKwargs": "processing",