Add owlv2 fast processor (#39041)

* add owlv2 fast image processor

* add Owlv2ImageProcessorFast to Owlv2Processor image_processor_class

* add Owlv2ImageProcessorFast to Owlv2Processor image_processor_class

* change references to owlVit to owlv2 in docstrings for post process methods

* change type hints from List, Dict, Tuple to list, dict, tuple

* remove unused typing imports

* add disable grouping argument to group images by shape

* run make quality and repo-consistency

* use modular

* fix auto_docstring

---------

Co-authored-by: Lewis Marshall <lewism@elderda.co.uk>
Co-authored-by: yonigozlan <yoni.gozlan@huggingface.co>
This commit is contained in:
lmarshall12
2025-07-24 22:40:11 -04:00
committed by GitHub
parent 5a81d7e0b3
commit 565c035a2e
7 changed files with 737 additions and 61 deletions

View File

@@ -106,6 +106,13 @@ Usage of OWLv2 is identical to [OWL-ViT](owlvit) with a new, updated image proce
- post_process_object_detection - post_process_object_detection
- post_process_image_guided_detection - post_process_image_guided_detection
## Owlv2ImageProcessorFast
[[autodoc]] Owlv2ImageProcessorFast
- preprocess
- post_process_object_detection
- post_process_image_guided_detection
## Owlv2Processor ## Owlv2Processor
[[autodoc]] Owlv2Processor [[autodoc]] Owlv2Processor

View File

@@ -131,7 +131,7 @@ else:
("nat", ("ViTImageProcessor", "ViTImageProcessorFast")), ("nat", ("ViTImageProcessor", "ViTImageProcessorFast")),
("nougat", ("NougatImageProcessor", "NougatImageProcessorFast")), ("nougat", ("NougatImageProcessor", "NougatImageProcessorFast")),
("oneformer", ("OneFormerImageProcessor", "OneFormerImageProcessorFast")), ("oneformer", ("OneFormerImageProcessor", "OneFormerImageProcessorFast")),
("owlv2", ("Owlv2ImageProcessor",)), ("owlv2", ("Owlv2ImageProcessor", "Owlv2ImageProcessorFast")),
("owlvit", ("OwlViTImageProcessor", "OwlViTImageProcessorFast")), ("owlvit", ("OwlViTImageProcessor", "OwlViTImageProcessorFast")),
("paligemma", ("SiglipImageProcessor", "SiglipImageProcessorFast")), ("paligemma", ("SiglipImageProcessor", "SiglipImageProcessorFast")),
("perceiver", ("PerceiverImageProcessor", "PerceiverImageProcessorFast")), ("perceiver", ("PerceiverImageProcessor", "PerceiverImageProcessorFast")),

View File

@@ -20,6 +20,7 @@ from ...utils.import_utils import define_import_structure
if TYPE_CHECKING: if TYPE_CHECKING:
from .configuration_owlv2 import * from .configuration_owlv2 import *
from .image_processing_owlv2 import * from .image_processing_owlv2 import *
from .image_processing_owlv2_fast import *
from .modeling_owlv2 import * from .modeling_owlv2 import *
from .processing_owlv2 import * from .processing_owlv2 import *
else: else:

View File

@@ -0,0 +1,427 @@
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
# This file was automatically generated from src/transformers/models/owlv2/modular_owlv2.py.
# Do NOT edit this file manually as any edits will be overwritten by the generation of
# the file from the modular. If any change should be done, please apply the change to the
# modular_owlv2.py file directly. One of our CI enforces this.
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
# coding=utf-8
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import warnings
from typing import TYPE_CHECKING, Optional, Union
from ...image_processing_utils_fast import BaseImageProcessorFast, BatchFeature, DefaultFastImageProcessorKwargs
from ...image_transforms import center_to_corners_format, group_images_by_shape, reorder_images
from ...image_utils import (
OPENAI_CLIP_MEAN,
OPENAI_CLIP_STD,
ChannelDimension,
ImageInput,
PILImageResampling,
SizeDict,
)
from ...processing_utils import Unpack
from ...utils import (
TensorType,
auto_docstring,
is_torch_available,
is_torchvision_available,
is_torchvision_v2_available,
)
if is_torch_available():
import torch
if is_torchvision_v2_available():
from torchvision.transforms.v2 import functional as F
elif is_torchvision_available():
from torchvision.transforms import functional as F
if TYPE_CHECKING:
from .modeling_owlv2 import Owlv2ObjectDetectionOutput
if is_torch_available():
from .image_processing_owlv2 import _scale_boxes, box_iou
class Owlv2FastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
    r"""
    do_pad (`bool`, *optional*, defaults to `True`):
        Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess`
        method. If `True`, padding will be applied to the bottom and right of the image with grey pixels.
    """

    # The only keyword declared here; everything else comes from `DefaultFastImageProcessorKwargs`.
    do_pad: Optional[bool]
@auto_docstring
class Owlv2ImageProcessorFast(BaseImageProcessorFast):
    # Class-level defaults for the processor.
    resample = PILImageResampling.BILINEAR
    image_mean = OPENAI_CLIP_MEAN
    image_std = OPENAI_CLIP_STD
    # Default output resolution: square 960x960.
    size = {"height": 960, "width": 960}
    default_to_square = True
    # No crop size / centre-crop flag is configured for this processor.
    crop_size = None
    do_resize = True
    do_center_crop = None
    do_rescale = True
    do_normalize = True
    do_convert_rgb = None
    model_input_names = ["pixel_values"]
    rescale_factor = 1 / 255
    # Padding to a square (see `_pad_images`) is enabled by default.
    do_pad = True
    # Keyword schema accepted by `__init__` / `preprocess`.
    valid_kwargs = Owlv2FastImageProcessorKwargs
def post_process(self, outputs, target_sizes):
    """
    Deprecated: turn raw [`Owlv2ForObjectDetection`] outputs into absolute corner-format boxes
    (top_left_x, top_left_y, bottom_right_x, bottom_right_y), one result per image and without any score
    filtering.

    Args:
        outputs ([`Owlv2ObjectDetectionOutput`]):
            Raw outputs of the model; `logits` and `pred_boxes` are read.
        target_sizes (`torch.Tensor` of shape `(batch_size, 2)`):
            Size (h, w) of each image of the batch. For evaluation, this must be the original image size
            (before any data augmentation). For visualization, this should be the image size after data
            augment, but before padding.

    Returns:
        `list[dict]`: One dictionary per image with the keys "scores", "labels" and "boxes".

    Raises:
        ValueError: If the number of target sizes differs from the batch size, or if `target_sizes` does not
            have exactly two columns.
    """
    warnings.warn(
        "`post_process` is deprecated and will be removed in v5 of Transformers, please use"
        " `post_process_object_detection` instead, with `threshold=0.` for equivalent results.",
        FutureWarning,
    )

    raw_logits, raw_boxes = outputs.logits, outputs.pred_boxes

    if len(raw_logits) != len(target_sizes):
        raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the logits")
    if target_sizes.shape[1] != 2:
        raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch")

    # Best class per query: its logit becomes the score (after a sigmoid), its index the label.
    best_logits, best_classes = torch.max(raw_logits, dim=-1)
    confidences = torch.sigmoid(best_logits)

    # Centre format -> [x0, y0, x1, y1].
    corner_boxes = center_to_corners_format(raw_boxes)

    # Relative [0, 1] coordinates -> absolute pixels, using one (w, h, w, h) row per image.
    heights, widths = target_sizes.unbind(1)
    per_image_scale = torch.stack([widths, heights, widths, heights], dim=1).to(corner_boxes.device)
    corner_boxes = corner_boxes * per_image_scale[:, None, :]

    return [
        {"scores": image_scores, "labels": image_labels, "boxes": image_boxes}
        for image_scores, image_labels, image_boxes in zip(confidences, best_classes, corner_boxes)
    ]
def post_process_object_detection(
    self,
    outputs: "Owlv2ObjectDetectionOutput",
    threshold: float = 0.1,
    target_sizes: Optional[Union[TensorType, list[tuple]]] = None,
):
    """
    Turn raw [`Owlv2ForObjectDetection`] outputs into per-image detections with boxes in
    (top_left_x, top_left_y, bottom_right_x, bottom_right_y) format, keeping only predictions whose score is
    strictly greater than `threshold`.

    Args:
        outputs ([`Owlv2ObjectDetectionOutput`]):
            Raw outputs of the model; `logits` and `pred_boxes` are read.
        threshold (`float`, *optional*, defaults to 0.1):
            Score threshold to keep object detection predictions.
        target_sizes (`torch.Tensor` or `list[tuple[int, int]]`, *optional*):
            Tensor of shape `(batch_size, 2)` or list of tuples (`tuple[int, int]`) containing the target size
            `(height, width)` of each image in the batch. If unset, predictions will not be resized.

    Returns:
        `list[dict]`: One dictionary per image with the following keys:
        - "scores": The confidence scores for each predicted box on the image.
        - "labels": Indexes of the classes predicted by the model on the image.
        - "boxes": Image bounding boxes in (top_left_x, top_left_y, bottom_right_x, bottom_right_y) format.

    Raises:
        ValueError: If `target_sizes` is given and its length differs from the batch size.
    """
    all_logits, all_boxes = outputs.logits, outputs.pred_boxes

    if target_sizes is not None and len(target_sizes) != len(all_logits):
        raise ValueError("Make sure that you pass in as many target sizes as images")

    # all_logits has shape (batch_size, num_queries, num_classes): keep the best class per query.
    top_logits, top_classes = torch.max(all_logits, dim=-1)
    all_scores = torch.sigmoid(top_logits)

    # Centre format -> [x0, y0, x1, y1], then optionally relative -> absolute coordinates.
    all_boxes = center_to_corners_format(all_boxes)
    if target_sizes is not None:
        all_boxes = _scale_boxes(all_boxes, target_sizes)

    detections = []
    for image_scores, image_labels, image_boxes in zip(all_scores, top_classes, all_boxes):
        selected = image_scores > threshold
        detections.append(
            {
                "scores": image_scores[selected],
                "labels": image_labels[selected],
                "boxes": image_boxes[selected],
            }
        )
    return detections
def post_process_image_guided_detection(self, outputs, threshold=0.0, nms_threshold=0.3, target_sizes=None):
    """
    Converts the output of [`Owlv2ForObjectDetection.image_guided_detection`] into the format expected by the COCO
    api.

    Args:
        outputs ([`Owlv2ImageGuidedObjectDetectionOutput`]):
            Raw outputs of the model. `logits` and `target_pred_boxes` are read.
        threshold (`float`, *optional*, defaults to 0.0):
            Minimum confidence threshold to use to filter out predicted boxes.
        nms_threshold (`float`, *optional*, defaults to 0.3):
            IoU threshold for non-maximum suppression of overlapping boxes. NMS is skipped entirely when this
            is `>= 1.0`.
        target_sizes (`torch.Tensor`, *optional*):
            Tensor of shape (batch_size, 2) where each entry is the (height, width) of the corresponding image in
            the batch. If set, predicted normalized bounding boxes are rescaled to the target sizes. If left to
            None, predictions will not be unnormalized.

    Returns:
        `list[dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image
        in the batch as predicted by the model. All labels are set to None as
        `Owlv2ForObjectDetection.image_guided_detection` perform one-shot object detection.

        The returned "scores" are the rescaled, clipped alpha values computed below rather than the raw sigmoid
        scores. An image whose scores are all zero after NMS is skipped, so the list can be shorter than the
        batch.

    Raises:
        ValueError: If `target_sizes` is given and its length differs from the batch size, or it does not have
            exactly two columns.
    """
    logits, target_boxes = outputs.logits, outputs.target_pred_boxes

    if target_sizes is not None and len(logits) != len(target_sizes):
        raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the logits")
    if target_sizes is not None and target_sizes.shape[1] != 2:
        raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch")

    # Score of a box = sigmoid of its largest logit along the last dimension.
    probs = torch.max(logits, dim=-1)
    scores = torch.sigmoid(probs.values)

    # Convert to [x0, y0, x1, y1] format
    target_boxes = center_to_corners_format(target_boxes)

    # Apply non-maximum suppression (NMS)
    if nms_threshold < 1.0:
        for idx in range(target_boxes.shape[0]):
            # Visit boxes from highest to lowest score.
            for i in torch.argsort(-scores[idx]):
                # A zero score means the box was already suppressed by a better one.
                if not scores[idx][i]:
                    continue

                # presumably `box_iou` returns an (iou, union) pair, hence the first `[0]` — TODO confirm
                ious = box_iou(target_boxes[idx][i, :].unsqueeze(0), target_boxes[idx])[0][0]
                ious[i] = -1.0  # Mask self-IoU.
                # In-place write: `scores` itself is modified.
                scores[idx][ious > nms_threshold] = 0.0

    # Convert from relative [0, 1] to absolute [0, height] coordinates
    if target_sizes is not None:
        target_boxes = _scale_boxes(target_boxes, target_sizes)

    # Compute box display alphas based on prediction scores
    results = []
    alphas = torch.zeros_like(scores)

    for idx in range(target_boxes.shape[0]):
        # Select scores for boxes matching the current query:
        query_scores = scores[idx]
        # Nothing survived NMS for this image: no result entry is appended for it.
        if not query_scores.nonzero().numel():
            continue

        # Apply threshold on scores before scaling
        # (`query_scores` was obtained by indexing `scores`, and this assignment is in place.)
        query_scores[query_scores < threshold] = 0.0

        # Scale box alpha such that the best box for each query has alpha 1.0 and the worst box has alpha 0.1.
        # All other boxes will either belong to a different query, or will not be shown.
        # The 1e-6 keeps the division below well-defined when every score is zero.
        max_score = torch.max(query_scores) + 1e-6
        query_alphas = (query_scores - (max_score * 0.1)) / (max_score * 0.9)
        query_alphas = torch.clip(query_alphas, 0.0, 1.0)
        alphas[idx] = query_alphas

        # Keep only boxes with a strictly positive alpha.
        mask = alphas[idx] > 0
        box_scores = alphas[idx][mask]
        boxes = target_boxes[idx][mask]
        results.append({"scores": box_scores, "labels": None, "boxes": boxes})

    return results
def __init__(self, **kwargs: Unpack[Owlv2FastImageProcessorKwargs]):
    # Pure delegation to the base class; the override only narrows the accepted keyword type.
    # No docstring is written here on purpose: the class is decorated with `@auto_docstring`,
    # which presumably builds the documentation from the signature — TODO confirm.
    super().__init__(**kwargs)
@auto_docstring
def preprocess(self, images: ImageInput, **kwargs: Unpack[Owlv2FastImageProcessorKwargs]):
    # Thin override: forwards everything to the base implementation. It exists so that the signature
    # advertises `Owlv2FastImageProcessorKwargs` (which adds `do_pad`).
    return super().preprocess(images, **kwargs)
def _pad_images(self, images: "torch.Tensor", constant_value: float = 0.5) -> "torch.Tensor":
    """
    Pad the last two dimensions of `images` to a square whose side is the larger of height and width.

    Padding is added on the right and bottom only and is filled with `constant_value`.
    """
    height, width = images.shape[-2:]
    side = max(height, width)
    # Padding tuple: nothing on the first two sides, then the missing columns and the missing rows.
    return F.pad(images, (0, 0, side - width, side - height), fill=constant_value)
def pad(
    self,
    images: list["torch.Tensor"],
    disable_grouping: Optional[bool],
    constant_value: float = 0.5,
) -> list["torch.Tensor"]:
    """
    Square-pad every image in `images` (see `_pad_images`) and return them in their original order.

    Images are grouped by shape so that each group is padded in a single call; `disable_grouping` is
    forwarded to `group_images_by_shape`.
    """
    batches_by_shape, original_positions = group_images_by_shape(images, disable_grouping=disable_grouping)
    padded_by_shape = {
        shape: self._pad_images(batch, constant_value=constant_value)
        for shape, batch in batches_by_shape.items()
    }
    return reorder_images(padded_by_shape, original_positions)
def resize(
    self,
    image: "torch.Tensor",
    size: SizeDict,
    anti_aliasing: bool = True,
    anti_aliasing_sigma=None,
    **kwargs,
) -> "torch.Tensor":
    """
    Resize an image as per the original implementation.

    When `anti_aliasing` is enabled the input is Gaussian-blurred before being resized, with a sigma derived
    from the down-scaling factor unless one is given explicitly.

    Args:
        image (`Tensor`):
            Image to resize.
        size (`dict[str, int]`):
            Dictionary containing the height and width to resize the image to.
        anti_aliasing (`bool`, *optional*, defaults to `True`):
            Whether to apply anti-aliasing when downsampling the image.
        anti_aliasing_sigma (`float`, *optional*, defaults to `None`):
            Standard deviation for Gaussian kernel when downsampling the image. If `None`, it will be calculated
            automatically.
        **kwargs:
            Accepted but not used by this method.

    Returns:
        `Tensor`: The resized image.

    Raises:
        ValueError: If an explicitly provided `anti_aliasing_sigma` contains a negative value.
    """
    output_shape = (size.height, size.width)
    input_shape = image.shape

    # select height and width from input tensor
    # assumes `image` is a 4-D stack (batch, channels, height, width) so that `[2:]` is (H, W) — TODO confirm
    factors = torch.tensor(input_shape[2:]).to(image.device) / torch.tensor(output_shape).to(image.device)

    if anti_aliasing:
        if anti_aliasing_sigma is None:
            # Automatic sigma: (factor - 1) / 2, floored at zero (i.e. zero when not down-sampling).
            anti_aliasing_sigma = ((factors - 1) / 2).clamp(min=0)
        else:
            # Broadcast a user-provided sigma to one value per spatial axis.
            anti_aliasing_sigma = torch.atleast_1d(anti_aliasing_sigma) * torch.ones_like(factors)
            if torch.any(anti_aliasing_sigma < 0):
                raise ValueError("Anti-aliasing standard deviation must be greater than or equal to zero")
            elif torch.any((anti_aliasing_sigma > 0) & (factors <= 1)):
                warnings.warn(
                    "Anti-aliasing standard deviation greater than zero but not down-sampling along all axes"
                )
        # NOTE(review): a zero sigma on *either* axis disables the blur on *both* axes.
        if torch.any(anti_aliasing_sigma == 0):
            filtered = image
        else:
            # Odd kernel size covering roughly three standard deviations on each side.
            kernel_sizes = 2 * torch.ceil(3 * anti_aliasing_sigma).int() + 1

            # NOTE(review): `kernel_sizes` / `anti_aliasing_sigma` are ordered (height, width) here, while
            # torchvision documents `gaussian_blur` arguments as (kx, ky) / (sigma_x, sigma_y). This only
            # matters when the two scale factors differ — confirm the intended order.
            filtered = F.gaussian_blur(
                image, (kernel_sizes[0], kernel_sizes[1]), sigma=anti_aliasing_sigma.tolist()
            )
    else:
        filtered = image

    # No interpolation mode is passed, so the library default is used; antialiasing is handled above.
    out = F.resize(filtered, size=(size.height, size.width), antialias=False)

    return out
def _preprocess(
    self,
    images: list["torch.Tensor"],
    do_resize: bool,
    size: SizeDict,
    interpolation: Optional["F.InterpolationMode"],
    do_pad: bool,
    do_rescale: bool,
    rescale_factor: float,
    do_normalize: bool,
    image_mean: Optional[Union[float, list[float]]],
    image_std: Optional[Union[float, list[float]]],
    disable_grouping: Optional[bool],
    return_tensors: Optional[Union[str, TensorType]],
    **kwargs,
) -> BatchFeature:
    """
    Run the OWLv2 pipeline on a list of image tensors: rescale -> pad to square -> resize -> normalize.

    Images are regrouped by shape before every stage so that each stage runs once per distinct shape.

    Args:
        images: Image tensors to process.
        do_resize: Whether to resize to `size`. When `False`, images pass through this stage unchanged.
        size: Target height/width for the resize stage.
        interpolation: Forwarded to `resize`.
        do_pad: Whether to pad each image to a square (bottom/right) before resizing.
        do_rescale: Whether to multiply pixel values by `rescale_factor` (done first).
        rescale_factor: Factor used when `do_rescale` is set.
        do_normalize: Whether to normalize with `image_mean` / `image_std` (done last).
        image_mean: Mean used for normalization.
        image_std: Standard deviation used for normalization.
        disable_grouping: Forwarded to `group_images_by_shape`.
        return_tensors: When truthy, the processed images are stacked into one tensor and the value is passed
            to `BatchFeature` as `tensor_type`.
        **kwargs: Accepted but not used by this method.

    Returns:
        `BatchFeature` with a single `"pixel_values"` entry.
    """
    # Stage 1: rescale only (normalization is explicitly disabled here).
    grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping)
    processed_images_grouped = {}
    for shape, stacked_images in grouped_images.items():
        # Rescale images before other operations as done in original implementation
        stacked_images = self.rescale_and_normalize(
            stacked_images, do_rescale, rescale_factor, False, image_mean, image_std
        )
        processed_images_grouped[shape] = stacked_images
    processed_images = reorder_images(processed_images_grouped, grouped_images_index)

    # Stage 2: optional padding to a square.
    if do_pad:
        processed_images = self.pad(processed_images, disable_grouping=disable_grouping)

    # Stage 3: optional resize.
    grouped_images, grouped_images_index = group_images_by_shape(
        processed_images, disable_grouping=disable_grouping
    )
    resized_images_grouped = {}
    for shape, stacked_images in grouped_images.items():
        if do_resize:
            stacked_images = self.resize(
                image=stacked_images,
                size=size,
                interpolation=interpolation,
                input_data_format=ChannelDimension.FIRST,
            )
        # Always record the stack: with `do_resize=False` the images must pass through unchanged,
        # otherwise `reorder_images` below has nothing to reorder.
        resized_images_grouped[shape] = stacked_images
    resized_images = reorder_images(resized_images_grouped, grouped_images_index)

    # Stage 4: normalize only (rescaling already happened in stage 1).
    # Group images by size for further processing
    # Needed in case do_resize is False, or resize returns images with different sizes
    grouped_images, grouped_images_index = group_images_by_shape(resized_images, disable_grouping=disable_grouping)
    processed_images_grouped = {}
    for shape, stacked_images in grouped_images.items():
        stacked_images = self.rescale_and_normalize(
            stacked_images, False, rescale_factor, do_normalize, image_mean, image_std
        )
        processed_images_grouped[shape] = stacked_images
    processed_images = reorder_images(processed_images_grouped, grouped_images_index)

    processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images

    return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors)
__all__ = ["Owlv2ImageProcessorFast"]

View File

@@ -0,0 +1,240 @@
# coding=utf-8
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Fast Image processor class for OWLv2."""
import warnings
from typing import Optional, Union
from transformers.models.owlvit.image_processing_owlvit_fast import OwlViTImageProcessorFast
from ...image_processing_utils_fast import (
BatchFeature,
DefaultFastImageProcessorKwargs,
)
from ...image_transforms import group_images_by_shape, reorder_images
from ...image_utils import (
OPENAI_CLIP_MEAN,
OPENAI_CLIP_STD,
ChannelDimension,
ImageInput,
PILImageResampling,
SizeDict,
)
from ...processing_utils import Unpack
from ...utils import (
TensorType,
auto_docstring,
is_torch_available,
is_torchvision_available,
is_torchvision_v2_available,
)
if is_torch_available():
import torch
if is_torchvision_v2_available():
from torchvision.transforms.v2 import functional as F
elif is_torchvision_available():
from torchvision.transforms import functional as F
class Owlv2FastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
    r"""
    do_pad (`bool`, *optional*, defaults to `True`):
        Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess`
        method. If `True`, padding will be applied to the bottom and right of the image with grey pixels.
    """

    # The only keyword declared here; everything else comes from `DefaultFastImageProcessorKwargs`.
    do_pad: Optional[bool]
@auto_docstring
class Owlv2ImageProcessorFast(OwlViTImageProcessorFast):
    # Class-level defaults; anything not listed here is inherited from `OwlViTImageProcessorFast`.
    resample = PILImageResampling.BILINEAR
    image_mean = OPENAI_CLIP_MEAN
    image_std = OPENAI_CLIP_STD
    # Default output resolution: square 960x960.
    size = {"height": 960, "width": 960}
    rescale_factor = 1 / 255
    do_resize = True
    do_rescale = True
    do_normalize = True
    # Padding to a square (see `_pad_images`) is enabled by default.
    do_pad = True
    # Keyword schema accepted by `__init__` / `preprocess`.
    valid_kwargs = Owlv2FastImageProcessorKwargs
    # No crop size / centre-crop flag is configured for this processor.
    crop_size = None
    do_center_crop = None
def __init__(self, **kwargs: Unpack[Owlv2FastImageProcessorKwargs]):
    # NOTE(review): read as plain Python, this constructs a throwaway `OwlViTImageProcessorFast` and
    # re-initialises *that* object, leaving `self` untouched. The generated
    # `image_processing_owlv2_fast.py` contains `super().__init__(**kwargs)` at this spot, so presumably
    # the modular converter rewrites this call — confirm before changing the spelling.
    OwlViTImageProcessorFast().__init__(**kwargs)
@auto_docstring
def preprocess(self, images: ImageInput, **kwargs: Unpack[Owlv2FastImageProcessorKwargs]):
    # NOTE(review): read as plain Python, this runs `preprocess` on a freshly constructed
    # `OwlViTImageProcessorFast` rather than on `self`, so this instance's configuration is not used.
    # The generated `image_processing_owlv2_fast.py` contains `super().preprocess(images, **kwargs)`
    # here, so presumably the modular converter rewrites this call — confirm before changing it.
    return OwlViTImageProcessorFast().preprocess(images, **kwargs)
def _pad_images(self, images: "torch.Tensor", constant_value: float = 0.5) -> "torch.Tensor":
    """
    Pad the last two dimensions of `images` to a square whose side is the larger of height and width.

    Padding is added on the right and bottom only and is filled with `constant_value`.
    """
    height, width = images.shape[-2:]
    side = max(height, width)
    # Padding tuple: nothing on the first two sides, then the missing columns and the missing rows.
    return F.pad(images, (0, 0, side - width, side - height), fill=constant_value)
def pad(
    self,
    images: list["torch.Tensor"],
    disable_grouping: Optional[bool],
    constant_value: float = 0.5,
) -> list["torch.Tensor"]:
    """
    Square-pad every image in `images` (see `_pad_images`) and return them in their original order.

    Images are grouped by shape so that each group is padded in a single call; `disable_grouping` is
    forwarded to `group_images_by_shape`.
    """
    batches_by_shape, original_positions = group_images_by_shape(images, disable_grouping=disable_grouping)
    padded_by_shape = {
        shape: self._pad_images(batch, constant_value=constant_value)
        for shape, batch in batches_by_shape.items()
    }
    return reorder_images(padded_by_shape, original_positions)
def resize(
    self,
    image: "torch.Tensor",
    size: SizeDict,
    anti_aliasing: bool = True,
    anti_aliasing_sigma=None,
    **kwargs,
) -> "torch.Tensor":
    """
    Resize an image as per the original implementation.

    When `anti_aliasing` is enabled the input is Gaussian-blurred before being resized, with a sigma derived
    from the down-scaling factor unless one is given explicitly.

    Args:
        image (`Tensor`):
            Image to resize.
        size (`dict[str, int]`):
            Dictionary containing the height and width to resize the image to.
        anti_aliasing (`bool`, *optional*, defaults to `True`):
            Whether to apply anti-aliasing when downsampling the image.
        anti_aliasing_sigma (`float`, *optional*, defaults to `None`):
            Standard deviation for Gaussian kernel when downsampling the image. If `None`, it will be calculated
            automatically.
        **kwargs:
            Accepted but not used by this method.

    Returns:
        `Tensor`: The resized image.

    Raises:
        ValueError: If an explicitly provided `anti_aliasing_sigma` contains a negative value.
    """
    output_shape = (size.height, size.width)
    input_shape = image.shape

    # select height and width from input tensor
    # assumes `image` is a 4-D stack (batch, channels, height, width) so that `[2:]` is (H, W) — TODO confirm
    factors = torch.tensor(input_shape[2:]).to(image.device) / torch.tensor(output_shape).to(image.device)

    if anti_aliasing:
        if anti_aliasing_sigma is None:
            # Automatic sigma: (factor - 1) / 2, floored at zero (i.e. zero when not down-sampling).
            anti_aliasing_sigma = ((factors - 1) / 2).clamp(min=0)
        else:
            # Broadcast a user-provided sigma to one value per spatial axis.
            anti_aliasing_sigma = torch.atleast_1d(anti_aliasing_sigma) * torch.ones_like(factors)
            if torch.any(anti_aliasing_sigma < 0):
                raise ValueError("Anti-aliasing standard deviation must be greater than or equal to zero")
            elif torch.any((anti_aliasing_sigma > 0) & (factors <= 1)):
                warnings.warn(
                    "Anti-aliasing standard deviation greater than zero but not down-sampling along all axes"
                )
        # NOTE(review): a zero sigma on *either* axis disables the blur on *both* axes.
        if torch.any(anti_aliasing_sigma == 0):
            filtered = image
        else:
            # Odd kernel size covering roughly three standard deviations on each side.
            kernel_sizes = 2 * torch.ceil(3 * anti_aliasing_sigma).int() + 1

            # NOTE(review): `kernel_sizes` / `anti_aliasing_sigma` are ordered (height, width) here, while
            # torchvision documents `gaussian_blur` arguments as (kx, ky) / (sigma_x, sigma_y). This only
            # matters when the two scale factors differ — confirm the intended order.
            filtered = F.gaussian_blur(
                image, (kernel_sizes[0], kernel_sizes[1]), sigma=anti_aliasing_sigma.tolist()
            )
    else:
        filtered = image

    # No interpolation mode is passed, so the library default is used; antialiasing is handled above.
    out = F.resize(filtered, size=(size.height, size.width), antialias=False)

    return out
def _preprocess(
    self,
    images: list["torch.Tensor"],
    do_resize: bool,
    size: SizeDict,
    interpolation: Optional["F.InterpolationMode"],
    do_pad: bool,
    do_rescale: bool,
    rescale_factor: float,
    do_normalize: bool,
    image_mean: Optional[Union[float, list[float]]],
    image_std: Optional[Union[float, list[float]]],
    disable_grouping: Optional[bool],
    return_tensors: Optional[Union[str, TensorType]],
    **kwargs,
) -> BatchFeature:
    """
    Run the OWLv2 pipeline on a list of image tensors: rescale -> pad to square -> resize -> normalize.

    Images are regrouped by shape before every stage so that each stage runs once per distinct shape.

    Args:
        images: Image tensors to process.
        do_resize: Whether to resize to `size`. When `False`, images pass through this stage unchanged.
        size: Target height/width for the resize stage.
        interpolation: Forwarded to `resize`.
        do_pad: Whether to pad each image to a square (bottom/right) before resizing.
        do_rescale: Whether to multiply pixel values by `rescale_factor` (done first).
        rescale_factor: Factor used when `do_rescale` is set.
        do_normalize: Whether to normalize with `image_mean` / `image_std` (done last).
        image_mean: Mean used for normalization.
        image_std: Standard deviation used for normalization.
        disable_grouping: Forwarded to `group_images_by_shape`.
        return_tensors: When truthy, the processed images are stacked into one tensor and the value is passed
            to `BatchFeature` as `tensor_type`.
        **kwargs: Accepted but not used by this method.

    Returns:
        `BatchFeature` with a single `"pixel_values"` entry.
    """
    # Stage 1: rescale only (normalization is explicitly disabled here).
    grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping)
    processed_images_grouped = {}
    for shape, stacked_images in grouped_images.items():
        # Rescale images before other operations as done in original implementation
        stacked_images = self.rescale_and_normalize(
            stacked_images, do_rescale, rescale_factor, False, image_mean, image_std
        )
        processed_images_grouped[shape] = stacked_images
    processed_images = reorder_images(processed_images_grouped, grouped_images_index)

    # Stage 2: optional padding to a square.
    if do_pad:
        processed_images = self.pad(processed_images, disable_grouping=disable_grouping)

    # Stage 3: optional resize.
    grouped_images, grouped_images_index = group_images_by_shape(
        processed_images, disable_grouping=disable_grouping
    )
    resized_images_grouped = {}
    for shape, stacked_images in grouped_images.items():
        if do_resize:
            stacked_images = self.resize(
                image=stacked_images,
                size=size,
                interpolation=interpolation,
                input_data_format=ChannelDimension.FIRST,
            )
        # Always record the stack: with `do_resize=False` the images must pass through unchanged,
        # otherwise `reorder_images` below has nothing to reorder.
        resized_images_grouped[shape] = stacked_images
    resized_images = reorder_images(resized_images_grouped, grouped_images_index)

    # Stage 4: normalize only (rescaling already happened in stage 1).
    # Group images by size for further processing
    # Needed in case do_resize is False, or resize returns images with different sizes
    grouped_images, grouped_images_index = group_images_by_shape(resized_images, disable_grouping=disable_grouping)
    processed_images_grouped = {}
    for shape, stacked_images in grouped_images.items():
        stacked_images = self.rescale_and_normalize(
            stacked_images, False, rescale_factor, do_normalize, image_mean, image_std
        )
        processed_images_grouped[shape] = stacked_images
    processed_images = reorder_images(processed_images_grouped, grouped_images_index)

    processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images

    return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors)
__all__ = ["Owlv2ImageProcessorFast"]

View File

@@ -56,19 +56,19 @@ class Owlv2ProcessorKwargs(ProcessingKwargs, total=False):
class Owlv2Processor(ProcessorMixin): class Owlv2Processor(ProcessorMixin):
r""" r"""
Constructs an Owlv2 processor which wraps [`Owlv2ImageProcessor`] and [`CLIPTokenizer`]/[`CLIPTokenizerFast`] into Constructs an Owlv2 processor which wraps [`Owlv2ImageProcessor`]/[`Owlv2ImageProcessorFast`] and [`CLIPTokenizer`]/[`CLIPTokenizerFast`] into
a single processor that inherits both the image processor and tokenizer functionalities. See the a single processor that inherits both the image processor and tokenizer functionalities. See the
[`~OwlViTProcessor.__call__`] and [`~OwlViTProcessor.decode`] for more information. [`~OwlViTProcessor.__call__`] and [`~OwlViTProcessor.decode`] for more information.
Args: Args:
image_processor ([`Owlv2ImageProcessor`]): image_processor ([`Owlv2ImageProcessor`, `Owlv2ImageProcessorFast`]):
The image processor is a required input. The image processor is a required input.
tokenizer ([`CLIPTokenizer`, `CLIPTokenizerFast`]): tokenizer ([`CLIPTokenizer`, `CLIPTokenizerFast`]):
The tokenizer is a required input. The tokenizer is a required input.
""" """
attributes = ["image_processor", "tokenizer"] attributes = ["image_processor", "tokenizer"]
image_processor_class = "Owlv2ImageProcessor" image_processor_class = ("Owlv2ImageProcessor", "Owlv2ImageProcessorFast")
tokenizer_class = ("CLIPTokenizer", "CLIPTokenizerFast") tokenizer_class = ("CLIPTokenizer", "CLIPTokenizerFast")
def __init__(self, image_processor, tokenizer, **kwargs): def __init__(self, image_processor, tokenizer, **kwargs):

View File

@@ -16,7 +16,7 @@
import unittest import unittest
from transformers.testing_utils import require_torch, require_vision, slow from transformers.testing_utils import require_torch, require_vision, slow
from transformers.utils import is_torch_available, is_vision_available from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available
from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs
@@ -29,6 +29,8 @@ if is_vision_available():
if is_torch_available(): if is_torch_available():
import torch import torch
from transformers import Owlv2ImageProcessorFast
class Owlv2ImageProcessingTester: class Owlv2ImageProcessingTester:
def __init__( def __init__(
@@ -87,6 +89,7 @@ class Owlv2ImageProcessingTester:
@require_vision @require_vision
class Owlv2ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): class Owlv2ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
image_processing_class = Owlv2ImageProcessor if is_vision_available() else None image_processing_class = Owlv2ImageProcessor if is_vision_available() else None
fast_image_processing_class = Owlv2ImageProcessorFast if is_torchvision_available() else None
def setUp(self): def setUp(self):
super().setUp() super().setUp()
@@ -97,7 +100,8 @@ class Owlv2ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
return self.image_processor_tester.prepare_image_processor_dict() return self.image_processor_tester.prepare_image_processor_dict()
def test_image_processor_properties(self): def test_image_processor_properties(self):
image_processing = self.image_processing_class(**self.image_processor_dict) for image_processing_class in self.image_processor_list:
image_processing = image_processing_class(**self.image_processor_dict)
self.assertTrue(hasattr(image_processing, "do_resize")) self.assertTrue(hasattr(image_processing, "do_resize"))
self.assertTrue(hasattr(image_processing, "size")) self.assertTrue(hasattr(image_processing, "size"))
self.assertTrue(hasattr(image_processing, "do_normalize")) self.assertTrue(hasattr(image_processing, "do_normalize"))
@@ -105,17 +109,19 @@ class Owlv2ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
self.assertTrue(hasattr(image_processing, "image_std")) self.assertTrue(hasattr(image_processing, "image_std"))
def test_image_processor_from_dict_with_kwargs(self): def test_image_processor_from_dict_with_kwargs(self):
image_processor = self.image_processing_class.from_dict(self.image_processor_dict) for image_processing_class in self.image_processor_list:
image_processor = image_processing_class.from_dict(self.image_processor_dict)
self.assertEqual(image_processor.size, {"height": 18, "width": 18}) self.assertEqual(image_processor.size, {"height": 18, "width": 18})
image_processor = self.image_processing_class.from_dict( image_processor = image_processing_class.from_dict(
self.image_processor_dict, size={"height": 42, "width": 42} self.image_processor_dict, size={"height": 42, "width": 42}
) )
self.assertEqual(image_processor.size, {"height": 42, "width": 42}) self.assertEqual(image_processor.size, {"height": 42, "width": 42})
@slow @slow
def test_image_processor_integration_test(self): def test_image_processor_integration_test(self):
processor = Owlv2ImageProcessor() for image_processing_class in self.image_processor_list:
processor = image_processing_class()
image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
pixel_values = processor(image, return_tensors="pt").pixel_values pixel_values = processor(image, return_tensors="pt").pixel_values
@@ -125,8 +131,9 @@ class Owlv2ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
@slow @slow
def test_image_processor_integration_test_resize(self): def test_image_processor_integration_test_resize(self):
for use_fast in [False, True]:
checkpoint = "google/owlv2-base-patch16-ensemble" checkpoint = "google/owlv2-base-patch16-ensemble"
processor = AutoProcessor.from_pretrained(checkpoint) processor = AutoProcessor.from_pretrained(checkpoint, use_fast=use_fast)
model = Owlv2ForObjectDetection.from_pretrained(checkpoint) model = Owlv2ForObjectDetection.from_pretrained(checkpoint)
image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
@@ -147,10 +154,7 @@ class Owlv2ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
results = processor.post_process_object_detection(outputs, threshold=0.2, target_sizes=[target_size])[0] results = processor.post_process_object_detection(outputs, threshold=0.2, target_sizes=[target_size])[0]
boxes = results["boxes"] boxes = results["boxes"]
self.assertTrue( torch.testing.assert_close(boxes, expected_boxes, atol=1e-1, rtol=1e-1)
torch.allclose(boxes, expected_boxes, atol=1e-2),
f"Single image bounding boxes fail. Expected {expected_boxes}, got {boxes}",
)
# batch of images # batch of images
inputs = processor(text=[text, text], images=[image, image], return_tensors="pt") inputs = processor(text=[text, text], images=[image, image], return_tensors="pt")
@@ -162,10 +166,7 @@ class Owlv2ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
for result in results: for result in results:
boxes = result["boxes"] boxes = result["boxes"]
self.assertTrue( torch.testing.assert_close(boxes, expected_boxes, atol=1e-1, rtol=1e-1)
torch.allclose(boxes, expected_boxes, atol=1e-2),
f"Batch image bounding boxes fail. Expected {expected_boxes}, got {boxes}",
)
@unittest.skip(reason="OWLv2 doesn't treat 4 channel PIL and numpy consistently yet") # FIXME Amy @unittest.skip(reason="OWLv2 doesn't treat 4 channel PIL and numpy consistently yet") # FIXME Amy
def test_call_numpy_4_channels(self): def test_call_numpy_4_channels(self):