Remove differences between init and preprocess kwargs for fast image processors (#36186)

* Remove differences between init and preprocess kwargs in fast image processors

* make modifs got_ocr2

* update gemma3
This commit is contained in:
Yoni Gozlan
2025-03-12 19:44:05 -04:00
committed by GitHub
parent cc3a361b46
commit ea219ed164
15 changed files with 136 additions and 198 deletions

View File

@@ -126,7 +126,7 @@ def divide_to_patches(
return patches
class DefaultFastImageProcessorInitKwargs(TypedDict, total=False):
class DefaultFastImageProcessorKwargs(TypedDict, total=False):
do_resize: Optional[bool]
size: Optional[Dict[str, int]]
default_to_square: Optional[bool]
@@ -139,9 +139,6 @@ class DefaultFastImageProcessorInitKwargs(TypedDict, total=False):
image_mean: Optional[Union[float, List[float]]]
image_std: Optional[Union[float, List[float]]]
do_convert_rgb: Optional[bool]
class DefaultFastImageProcessorPreprocessKwargs(DefaultFastImageProcessorInitKwargs):
return_tensors: Optional[Union[str, TensorType]]
data_format: Optional[ChannelDimension]
input_data_format: Optional[Union[str, ChannelDimension]]
@@ -185,8 +182,20 @@ BASE_IMAGE_PROCESSOR_FAST_DOCSTRING = r"""
Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
Can be overridden by the `image_std` parameter in the `preprocess` method.
do_convert_rgb (`bool`, *optional*, defaults to `self.image_std`):
Whether to convert the image to RGB."""
do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
Whether to convert the image to RGB.
return_tensors (`str` or `TensorType`, *optional*, defaults to `self.return_tensors`):
Returns stacked tensors if set to `pt, otherwise returns a list of tensors.
data_format (`ChannelDimension` or `str`, *optional*, defaults to `self.data_format`):
Only `ChannelDimension.FIRST` is supported. Added for compatibility with slow processors.
input_data_format (`ChannelDimension` or `str`, *optional*, defaults to `self.input_data_format`):
The channel dimension format for the input image. If unset, the channel dimension format is inferred
from the input image. Can be one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
- `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
device (`torch.device`, *optional*, defaults to `self.device`):
The device to process the images on. If unset, the device is inferred from the input images."""
BASE_IMAGE_PROCESSOR_FAST_DOCSTRING_PREPROCESS = r"""
Preprocess an image or batch of images.
@@ -219,20 +228,17 @@ BASE_IMAGE_PROCESSOR_FAST_DOCSTRING_PREPROCESS = r"""
`True`.
do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
Whether to convert the image to RGB.
return_tensors (`str` or `TensorType`, *optional*):
return_tensors (`str` or `TensorType`, *optional*, defaults to `self.return_tensors`):
Returns stacked tensors if set to `pt, otherwise returns a list of tensors.
data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
The channel dimension format for the output image. Can be one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
- Unset: Use the channel dimension format of the input image.
input_data_format (`ChannelDimension` or `str`, *optional*):
data_format (`ChannelDimension` or `str`, *optional*, defaults to `self.data_format`):
Only `ChannelDimension.FIRST` is supported. Added for compatibility with slow processors.
input_data_format (`ChannelDimension` or `str`, *optional*, defaults to `self.input_data_format`):
The channel dimension format for the input image. If unset, the channel dimension format is inferred
from the input image. Can be one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
- `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
device (`torch.device`, *optional*):
device (`torch.device`, *optional*, defaults to `self.device`):
The device to process the images on. If unset, the device is inferred from the input images."""
@@ -253,13 +259,16 @@ class BaseImageProcessorFast(BaseImageProcessor):
rescale_factor = 1 / 255
do_normalize = None
do_convert_rgb = None
return_tensors = None
data_format = ChannelDimension.FIRST
input_data_format = None
device = None
model_input_names = ["pixel_values"]
valid_init_kwargs = DefaultFastImageProcessorInitKwargs
valid_preprocess_kwargs = DefaultFastImageProcessorPreprocessKwargs
valid_kwargs = DefaultFastImageProcessorKwargs
def __init__(
self,
**kwargs: Unpack[DefaultFastImageProcessorInitKwargs],
**kwargs: Unpack[DefaultFastImageProcessorKwargs],
) -> None:
super().__init__(**kwargs)
size = kwargs.pop("size", self.size)
@@ -270,7 +279,7 @@ class BaseImageProcessorFast(BaseImageProcessor):
)
crop_size = kwargs.pop("crop_size", self.crop_size)
self.crop_size = get_size_dict(crop_size, param_name="crop_size") if crop_size is not None else None
for key in self.valid_init_kwargs.__annotations__.keys():
for key in self.valid_kwargs.__annotations__.keys():
kwarg = kwargs.pop(key, None)
if kwarg is not None:
setattr(self, key, kwarg)
@@ -553,14 +562,12 @@ class BaseImageProcessorFast(BaseImageProcessor):
def preprocess(
self,
images: ImageInput,
**kwargs: Unpack[DefaultFastImageProcessorPreprocessKwargs],
**kwargs: Unpack[DefaultFastImageProcessorKwargs],
) -> BatchFeature:
validate_kwargs(
captured_kwargs=kwargs.keys(), valid_processor_keys=self.valid_preprocess_kwargs.__annotations__.keys()
)
validate_kwargs(captured_kwargs=kwargs.keys(), valid_processor_keys=self.valid_kwargs.__annotations__.keys())
# Set default kwargs from self. This ensures that if a kwarg is not provided
# by the user, it gets its default value from the instance, or is set to None.
for kwarg_name in self.valid_preprocess_kwargs.__annotations__:
for kwarg_name in self.valid_kwargs.__annotations__:
kwargs.setdefault(kwarg_name, getattr(self, kwarg_name, None))
# Extract parameters that are only used for preparing the input images