diff --git a/docs/source/en/model_doc/owlv2.md b/docs/source/en/model_doc/owlv2.md index 1b4e92bc4e..696a1b0377 100644 --- a/docs/source/en/model_doc/owlv2.md +++ b/docs/source/en/model_doc/owlv2.md @@ -50,20 +50,22 @@ OWLv2 is, just like its predecessor [OWL-ViT](owlvit), a zero-shot text-conditio >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" >>> image = Image.open(requests.get(url, stream=True).raw) ->>> texts = [["a photo of a cat", "a photo of a dog"]] ->>> inputs = processor(text=texts, images=image, return_tensors="pt") +>>> text_labels = [["a photo of a cat", "a photo of a dog"]] +>>> inputs = processor(text=text_labels, images=image, return_tensors="pt") >>> outputs = model(**inputs) >>> # Target image sizes (height, width) to rescale box predictions [batch_size, 2] ->>> target_sizes = torch.Tensor([image.size[::-1]]) ->>> # Convert outputs (bounding boxes and class logits) to Pascal VOC Format (xmin, ymin, xmax, ymax) ->>> results = processor.post_process_object_detection(outputs=outputs, target_sizes=target_sizes, threshold=0.1) ->>> i = 0 # Retrieve predictions for the first image for the corresponding text queries ->>> text = texts[i] ->>> boxes, scores, labels = results[i]["boxes"], results[i]["scores"], results[i]["labels"] ->>> for box, score, label in zip(boxes, scores, labels): +>>> target_sizes = torch.tensor([(image.height, image.width)]) +>>> # Convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax) +>>> results = processor.post_process_grounded_object_detection( +... outputs=outputs, target_sizes=target_sizes, threshold=0.1, text_labels=text_labels +... ) +>>> # Retrieve predictions for the first image for the corresponding text queries +>>> result = results[0] +>>> boxes, scores, text_labels = result["boxes"], result["scores"], result["text_labels"] +>>> for box, score, text_label in zip(boxes, scores, text_labels): ... box = [round(i, 2) for i in box.tolist()] -... 
print(f"Detected {text[label]} with confidence {round(score.item(), 3)} at location {box}") +... print(f"Detected {text_label} with confidence {round(score.item(), 3)} at location {box}") Detected a photo of a cat with confidence 0.614 at location [341.67, 23.39, 642.32, 371.35] Detected a photo of a cat with confidence 0.665 at location [6.75, 51.96, 326.62, 473.13] ``` @@ -103,6 +105,9 @@ Usage of OWLv2 is identical to [OWL-ViT](owlvit) with a new, updated image proce ## Owlv2Processor [[autodoc]] Owlv2Processor + - __call__ + - post_process_grounded_object_detection + - post_process_image_guided_detection ## Owlv2Model diff --git a/docs/source/en/model_doc/owlvit.md b/docs/source/en/model_doc/owlvit.md index c40d3a9e7a..519648bbd8 100644 --- a/docs/source/en/model_doc/owlvit.md +++ b/docs/source/en/model_doc/owlvit.md @@ -49,20 +49,22 @@ OWL-ViT is a zero-shot text-conditioned object detection model. OWL-ViT uses [CL >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" >>> image = Image.open(requests.get(url, stream=True).raw) ->>> texts = [["a photo of a cat", "a photo of a dog"]] ->>> inputs = processor(text=texts, images=image, return_tensors="pt") +>>> text_labels = [["a photo of a cat", "a photo of a dog"]] +>>> inputs = processor(text=text_labels, images=image, return_tensors="pt") >>> outputs = model(**inputs) >>> # Target image sizes (height, width) to rescale box predictions [batch_size, 2] ->>> target_sizes = torch.Tensor([image.size[::-1]]) +>>> target_sizes = torch.tensor([(image.height, image.width)]) >>> # Convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax) ->>> results = processor.post_process_object_detection(outputs=outputs, target_sizes=target_sizes, threshold=0.1) ->>> i = 0 # Retrieve predictions for the first image for the corresponding text queries ->>> text = texts[i] ->>> boxes, scores, labels = results[i]["boxes"], results[i]["scores"], results[i]["labels"] ->>> for box, score, 
label in zip(boxes, scores, labels): +>>> results = processor.post_process_grounded_object_detection( +... outputs=outputs, target_sizes=target_sizes, threshold=0.1, text_labels=text_labels +... ) +>>> # Retrieve predictions for the first image for the corresponding text queries +>>> result = results[0] +>>> boxes, scores, text_labels = result["boxes"], result["scores"], result["text_labels"] +>>> for box, score, text_label in zip(boxes, scores, text_labels): ... box = [round(i, 2) for i in box.tolist()] -... print(f"Detected {text[label]} with confidence {round(score.item(), 3)} at location {box}") +... print(f"Detected {text_label} with confidence {round(score.item(), 3)} at location {box}") Detected a photo of a cat with confidence 0.707 at location [324.97, 20.44, 640.58, 373.29] Detected a photo of a cat with confidence 0.717 at location [1.46, 55.26, 315.55, 472.17] ``` @@ -91,16 +93,12 @@ A demo notebook on using OWL-ViT for zero- and one-shot (image-guided) object de - post_process_object_detection - post_process_image_guided_detection -## OwlViTFeatureExtractor - -[[autodoc]] OwlViTFeatureExtractor - - __call__ - - post_process - - post_process_image_guided_detection - ## OwlViTProcessor [[autodoc]] OwlViTProcessor + - __call__ + - post_process_grounded_object_detection + - post_process_image_guided_detection ## OwlViTModel diff --git a/src/transformers/models/grounding_dino/image_processing_grounding_dino.py b/src/transformers/models/grounding_dino/image_processing_grounding_dino.py index f3d2117ca8..e3c99568cd 100644 --- a/src/transformers/models/grounding_dino/image_processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/image_processing_grounding_dino.py @@ -17,7 +17,7 @@ import io import pathlib from collections import defaultdict -from typing import Any, Callable, Dict, Iterable, List, Optional, Set, Tuple, Union +from typing import TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Optional, Set, Tuple, Union import numpy as np @@ 
-77,6 +77,9 @@ if is_scipy_available(): import scipy.special import scipy.stats +if TYPE_CHECKING: + from .modeling_grounding_dino import GroundingDinoObjectDetectionOutput + logger = logging.get_logger(__name__) # pylint: disable=invalid-name @@ -806,6 +809,35 @@ def compute_segments( return segmentation, segments +# Copied from transformers.models.owlvit.image_processing_owlvit._scale_boxes +def _scale_boxes(boxes, target_sizes): + """ + Scale batch of bounding boxes to the target sizes. + + Args: + boxes (`torch.Tensor` of shape `(batch_size, num_boxes, 4)`): + Bounding boxes to scale. Each box is expected to be in (x1, y1, x2, y2) format. + target_sizes (`List[Tuple[int, int]]` or `torch.Tensor` of shape `(batch_size, 2)`): + Target sizes to scale the boxes to. Each target size is expected to be in (height, width) format. + + Returns: + `torch.Tensor` of shape `(batch_size, num_boxes, 4)`: Scaled bounding boxes. + """ + + if isinstance(target_sizes, (list, tuple)): + image_height = torch.tensor([i[0] for i in target_sizes]) + image_width = torch.tensor([i[1] for i in target_sizes]) + elif isinstance(target_sizes, torch.Tensor): + image_height, image_width = target_sizes.unbind(1) + else: + raise ValueError("`target_sizes` must be a list, tuple or torch.Tensor") + + scale_factor = torch.stack([image_width, image_height, image_width, image_height], dim=1) + scale_factor = scale_factor.unsqueeze(1).to(boxes.device) + boxes = boxes * scale_factor + return boxes + + class GroundingDinoImageProcessor(BaseImageProcessor): r""" Constructs a Grounding DINO image processor. 
@@ -1533,7 +1565,10 @@ class GroundingDinoImageProcessor(BaseImageProcessor): # Copied from transformers.models.owlvit.image_processing_owlvit.OwlViTImageProcessor.post_process_object_detection with OwlViT->GroundingDino def post_process_object_detection( - self, outputs, threshold: float = 0.1, target_sizes: Union[TensorType, List[Tuple]] = None + self, + outputs: "GroundingDinoObjectDetectionOutput", + threshold: float = 0.1, + target_sizes: Optional[Union[TensorType, List[Tuple]]] = None, ): """ Converts the raw output of [`GroundingDinoForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y, @@ -1542,48 +1577,43 @@ class GroundingDinoImageProcessor(BaseImageProcessor): Args: outputs ([`GroundingDinoObjectDetectionOutput`]): Raw outputs of the model. - threshold (`float`, *optional*): + threshold (`float`, *optional*, defaults to 0.1): Score threshold to keep object detection predictions. target_sizes (`torch.Tensor` or `List[Tuple[int, int]]`, *optional*): Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size `(height, width)` of each image in the batch. If unset, predictions will not be resized. + Returns: - `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image - in the batch as predicted by the model. + `List[Dict]`: A list of dictionaries, each dictionary containing the following keys: + - "scores": The confidence scores for each predicted box on the image. + - "labels": Indexes of the classes predicted by the model on the image. + - "boxes": Image bounding boxes in (top_left_x, top_left_y, bottom_right_x, bottom_right_y) format. 
""" - # TODO: (amy) add support for other frameworks - logits, boxes = outputs.logits, outputs.pred_boxes + batch_logits, batch_boxes = outputs.logits, outputs.pred_boxes + batch_size = len(batch_logits) - if target_sizes is not None: - if len(logits) != len(target_sizes): - raise ValueError( - "Make sure that you pass in as many target sizes as the batch dimension of the logits" - ) + if target_sizes is not None and len(target_sizes) != batch_size: + raise ValueError("Make sure that you pass in as many target sizes as images") - probs = torch.max(logits, dim=-1) - scores = torch.sigmoid(probs.values) - labels = probs.indices + # batch_logits of shape (batch_size, num_queries, num_classes) + batch_class_logits = torch.max(batch_logits, dim=-1) + batch_scores = torch.sigmoid(batch_class_logits.values) + batch_labels = batch_class_logits.indices # Convert to [x0, y0, x1, y1] format - boxes = center_to_corners_format(boxes) + batch_boxes = center_to_corners_format(batch_boxes) # Convert from relative [0, 1] to absolute [0, height] coordinates if target_sizes is not None: - if isinstance(target_sizes, List): - img_h = torch.Tensor([i[0] for i in target_sizes]) - img_w = torch.Tensor([i[1] for i in target_sizes]) - else: - img_h, img_w = target_sizes.unbind(1) - - scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device) - boxes = boxes * scale_fct[:, None, :] + batch_boxes = _scale_boxes(batch_boxes, target_sizes) results = [] - for s, l, b in zip(scores, labels, boxes): - score = s[s > threshold] - label = l[s > threshold] - box = b[s > threshold] - results.append({"scores": score, "labels": label, "boxes": box}) + for scores, labels, boxes in zip(batch_scores, batch_labels, batch_boxes): + keep = scores > threshold + scores = scores[keep] + labels = labels[keep] + boxes = boxes[keep] + results.append({"scores": scores, "labels": labels, "boxes": boxes}) return results diff --git a/src/transformers/models/owlv2/image_processing_owlv2.py 
b/src/transformers/models/owlv2/image_processing_owlv2.py index ac637d62dd..1dfdfbd1c2 100644 --- a/src/transformers/models/owlv2/image_processing_owlv2.py +++ b/src/transformers/models/owlv2/image_processing_owlv2.py @@ -15,7 +15,7 @@ """Image processor class for OWLv2.""" import warnings -from typing import Dict, List, Optional, Tuple, Union +from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union import numpy as np @@ -60,10 +60,43 @@ if is_vision_available(): if is_scipy_available(): from scipy import ndimage as ndi +if TYPE_CHECKING: + from .modeling_owlv2 import Owlv2ObjectDetectionOutput logger = logging.get_logger(__name__) +def _scale_boxes(boxes, target_sizes): + """ + Scale batch of bounding boxes to the target sizes. + + Args: + boxes (`torch.Tensor` of shape `(batch_size, num_boxes, 4)`): + Bounding boxes to scale. Each box is expected to be in (x1, y1, x2, y2) format. + target_sizes (`List[Tuple[int, int]]` or `torch.Tensor` of shape `(batch_size, 2)`): + Target sizes to scale the boxes to. Each target size is expected to be in (height, width) format. + + Returns: + `torch.Tensor` of shape `(batch_size, num_boxes, 4)`: Scaled bounding boxes. 
+ """ + + if isinstance(target_sizes, (list, tuple)): + image_height = torch.tensor([i[0] for i in target_sizes]) + image_width = torch.tensor([i[1] for i in target_sizes]) + elif isinstance(target_sizes, torch.Tensor): + image_height, image_width = target_sizes.unbind(1) + else: + raise ValueError("`target_sizes` must be a list, tuple or torch.Tensor") + + # for owlv2 image is padded to max size unlike owlvit, that's why we have to scale boxes to max size + max_size = torch.max(image_height, image_width) + + scale_factor = torch.stack([max_size, max_size, max_size, max_size], dim=1) + scale_factor = scale_factor.unsqueeze(1).to(boxes.device) + boxes = boxes * scale_factor + return boxes + + # Copied from transformers.models.owlvit.image_processing_owlvit._upcast def _upcast(t): # Protects from numerical overflows in multiplications by upcasting to the equivalent higher type @@ -466,62 +499,57 @@ class Owlv2ImageProcessor(BaseImageProcessor): data = {"pixel_values": images} return BatchFeature(data=data, tensor_type=return_tensors) + # Copied from transformers.models.owlvit.image_processing_owlvit.OwlViTImageProcessor.post_process_object_detection with OwlViT->Owlv2 def post_process_object_detection( - self, outputs, threshold: float = 0.1, target_sizes: Union[TensorType, List[Tuple]] = None + self, + outputs: "Owlv2ObjectDetectionOutput", + threshold: float = 0.1, + target_sizes: Optional[Union[TensorType, List[Tuple]]] = None, ): """ - Converts the raw output of [`OwlViTForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y, + Converts the raw output of [`Owlv2ForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y, bottom_right_x, bottom_right_y) format. Args: - outputs ([`OwlViTObjectDetectionOutput`]): + outputs ([`Owlv2ObjectDetectionOutput`]): Raw outputs of the model. - threshold (`float`, *optional*): + threshold (`float`, *optional*, defaults to 0.1): Score threshold to keep object detection predictions.
target_sizes (`torch.Tensor` or `List[Tuple[int, int]]`, *optional*): Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size `(height, width)` of each image in the batch. If unset, predictions will not be resized. + Returns: - `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image - in the batch as predicted by the model. + `List[Dict]`: A list of dictionaries, each dictionary containing the following keys: + - "scores": The confidence scores for each predicted box on the image. + - "labels": Indexes of the classes predicted by the model on the image. + - "boxes": Image bounding boxes in (top_left_x, top_left_y, bottom_right_x, bottom_right_y) format. """ - # TODO: (amy) add support for other frameworks - logits, boxes = outputs.logits, outputs.pred_boxes + batch_logits, batch_boxes = outputs.logits, outputs.pred_boxes + batch_size = len(batch_logits) - if target_sizes is not None: - if len(logits) != len(target_sizes): - raise ValueError( - "Make sure that you pass in as many target sizes as the batch dimension of the logits" - ) + if target_sizes is not None and len(target_sizes) != batch_size: + raise ValueError("Make sure that you pass in as many target sizes as images") - probs = torch.max(logits, dim=-1) - scores = torch.sigmoid(probs.values) - labels = probs.indices + # batch_logits of shape (batch_size, num_queries, num_classes) + batch_class_logits = torch.max(batch_logits, dim=-1) + batch_scores = torch.sigmoid(batch_class_logits.values) + batch_labels = batch_class_logits.indices # Convert to [x0, y0, x1, y1] format - boxes = center_to_corners_format(boxes) + batch_boxes = center_to_corners_format(batch_boxes) # Convert from relative [0, 1] to absolute [0, height] coordinates if target_sizes is not None: - if isinstance(target_sizes, List): - img_h = torch.Tensor([i[0] for i in target_sizes]) - img_w = torch.Tensor([i[1] for i in target_sizes]) - else: - 
img_h, img_w = target_sizes.unbind(1) - - # Rescale coordinates, image is padded to square for inference, - # that is why we need to scale boxes to the max size - size = torch.max(img_h, img_w) - scale_fct = torch.stack([size, size, size, size], dim=1).to(boxes.device) - - boxes = boxes * scale_fct[:, None, :] + batch_boxes = _scale_boxes(batch_boxes, target_sizes) results = [] - for s, l, b in zip(scores, labels, boxes): - score = s[s > threshold] - label = l[s > threshold] - box = b[s > threshold] - results.append({"scores": score, "labels": label, "boxes": box}) + for scores, labels, boxes in zip(batch_scores, batch_labels, batch_boxes): + keep = scores > threshold + scores = scores[keep] + labels = labels[keep] + boxes = boxes[keep] + results.append({"scores": scores, "labels": labels, "boxes": boxes}) return results @@ -574,13 +602,7 @@ class Owlv2ImageProcessor(BaseImageProcessor): # Convert from relative [0, 1] to absolute [0, height] coordinates if target_sizes is not None: - if isinstance(target_sizes, List): - img_h = torch.tensor([i[0] for i in target_sizes]) - img_w = torch.tensor([i[1] for i in target_sizes]) - else: - img_h, img_w = target_sizes.unbind(1) - scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(target_boxes.device) - target_boxes = target_boxes * scale_fct[:, None, :] + target_boxes = _scale_boxes(target_boxes, target_sizes) # Compute box display alphas based on prediction scores results = [] diff --git a/src/transformers/models/owlv2/modeling_owlv2.py b/src/transformers/models/owlv2/modeling_owlv2.py index d8a46d0b0f..d69bcaa87f 100644 --- a/src/transformers/models/owlv2/modeling_owlv2.py +++ b/src/transformers/models/owlv2/modeling_owlv2.py @@ -1749,33 +1749,30 @@ class Owlv2ForObjectDetection(Owlv2PreTrainedModel): >>> import requests >>> from PIL import Image >>> import torch - >>> from transformers import AutoProcessor, Owlv2ForObjectDetection - >>> processor = 
AutoProcessor.from_pretrained("google/owlv2-base-patch16-ensemble") + >>> from transformers import Owlv2Processor, Owlv2ForObjectDetection + + >>> processor = Owlv2Processor.from_pretrained("google/owlv2-base-patch16-ensemble") >>> model = Owlv2ForObjectDetection.from_pretrained("google/owlv2-base-patch16-ensemble") >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" >>> image = Image.open(requests.get(url, stream=True).raw) - >>> texts = [["a photo of a cat", "a photo of a dog"]] - >>> inputs = processor(text=texts, images=image, return_tensors="pt") + >>> text_labels = [["a photo of a cat", "a photo of a dog"]] + >>> inputs = processor(text=text_labels, images=image, return_tensors="pt") + >>> outputs = model(**inputs) - >>> # forward pass - >>> with torch.no_grad(): - ... outputs = model(**inputs) - - >>> target_sizes = torch.Tensor([image.size[::-1]]) - >>> # Convert outputs (bounding boxes and class logits) to final bounding boxes and scores - >>> results = processor.post_process_object_detection( - ... outputs=outputs, threshold=0.2, target_sizes=target_sizes + >>> # Target image sizes (height, width) to rescale box predictions [batch_size, 2] + >>> target_sizes = torch.tensor([(image.height, image.width)]) + >>> # Convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax) + >>> results = processor.post_process_grounded_object_detection( + ... outputs=outputs, target_sizes=target_sizes, threshold=0.1, text_labels=text_labels ... 
) - - >>> i = 0 # Retrieve predictions for the first image for the corresponding text queries - >>> text = texts[i] - >>> boxes, scores, labels = results[i]["boxes"], results[i]["scores"], results[i]["labels"] - - >>> for box, score, label in zip(boxes, scores, labels): + >>> # Retrieve predictions for the first image for the corresponding text queries + >>> result = results[0] + >>> boxes, scores, text_labels = result["boxes"], result["scores"], result["text_labels"] + >>> for box, score, text_label in zip(boxes, scores, text_labels): ... box = [round(i, 2) for i in box.tolist()] - ... print(f"Detected {text[label]} with confidence {round(score.item(), 3)} at location {box}") + ... print(f"Detected {text_label} with confidence {round(score.item(), 3)} at location {box}") Detected a photo of a cat with confidence 0.614 at location [341.67, 23.39, 642.32, 371.35] Detected a photo of a cat with confidence 0.665 at location [6.75, 51.96, 326.62, 473.13] ```""" diff --git a/src/transformers/models/owlv2/processing_owlv2.py b/src/transformers/models/owlv2/processing_owlv2.py index 4a0b5a712e..b79ab626f7 100644 --- a/src/transformers/models/owlv2/processing_owlv2.py +++ b/src/transformers/models/owlv2/processing_owlv2.py @@ -16,13 +16,18 @@ Image/Text processor class for OWLv2 """ -from typing import List +import warnings +from typing import TYPE_CHECKING, List, Optional, Tuple, Union import numpy as np from ...processing_utils import ProcessorMixin from ...tokenization_utils_base import BatchEncoding -from ...utils import is_flax_available, is_tf_available, is_torch_available +from ...utils import TensorType, is_flax_available, is_tf_available, is_torch_available + + +if TYPE_CHECKING: + from .modeling_owlv2 import Owlv2ImageGuidedObjectDetectionOutput, Owlv2ObjectDetectionOutput class Owlv2Processor(ProcessorMixin): @@ -45,7 +50,7 @@ class Owlv2Processor(ProcessorMixin): def __init__(self, image_processor, tokenizer, **kwargs): super().__init__(image_processor, 
tokenizer) - # Copied from transformers.models.owlvit.processing_owlvit.OwlViTProcessor.__call__ with OWLViT->OWLv2 + # Copied from transformers.models.owlvit.processing_owlvit.OwlViTProcessor.__call__ with OwlViT->Owlv2 def __call__(self, text=None, images=None, query_images=None, padding="max_length", return_tensors="np", **kwargs): """ Main method to prepare for the model one or several text(s) and image(s). This method forwards the `text` and @@ -157,21 +162,101 @@ class Owlv2Processor(ProcessorMixin): else: return BatchEncoding(data=dict(**image_features), tensor_type=return_tensors) - # Copied from transformers.models.owlvit.processing_owlvit.OwlViTProcessor.post_process_object_detection with OWLViT->OWLv2 + # Copied from transformers.models.owlvit.processing_owlvit.OwlViTProcessor.post_process_object_detection with OwlViT->Owlv2 def post_process_object_detection(self, *args, **kwargs): """ - This method forwards all its arguments to [`OwlViTImageProcessor.post_process_object_detection`]. Please refer + This method forwards all its arguments to [`Owlv2ImageProcessor.post_process_object_detection`]. Please refer to the docstring of this method for more information. """ + warnings.warn( + "`post_process_object_detection` method is deprecated for OwlVitProcessor and will be removed in v5. 
" + "Use `post_process_grounded_object_detection` instead.", + FutureWarning, + ) return self.image_processor.post_process_object_detection(*args, **kwargs) - # Copied from transformers.models.owlvit.processing_owlvit.OwlViTProcessor.post_process_image_guided_detection with OWLViT->OWLv2 - def post_process_image_guided_detection(self, *args, **kwargs): + # Copied from transformers.models.owlvit.processing_owlvit.OwlViTProcessor.post_process_grounded_object_detection with OwlViT->Owlv2 + def post_process_grounded_object_detection( + self, + outputs: "Owlv2ObjectDetectionOutput", + threshold: float = 0.1, + target_sizes: Optional[Union[TensorType, List[Tuple]]] = None, + text_labels: Optional[List[List[str]]] = None, + ): """ - This method forwards all its arguments to [`OwlViTImageProcessor.post_process_one_shot_object_detection`]. - Please refer to the docstring of this method for more information. + Converts the raw output of [`Owlv2ForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y, + bottom_right_x, bottom_right_y) format. + + Args: + outputs ([`Owlv2ObjectDetectionOutput`]): + Raw outputs of the model. + threshold (`float`, *optional*, defaults to 0.1): + Score threshold to keep object detection predictions. + target_sizes (`torch.Tensor` or `List[Tuple[int, int]]`, *optional*): + Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size + `(height, width)` of each image in the batch. If unset, predictions will not be resized. + text_labels (`List[List[str]]`, *optional*): + List of lists of text labels for each image in the batch. If unset, "text_labels" in output will be + set to `None`. + + Returns: + `List[Dict]`: A list of dictionaries, each dictionary containing the following keys: + - "scores": The confidence scores for each predicted box on the image. + - "labels": Indexes of the classes predicted by the model on the image. 
+ - "boxes": Image bounding boxes in (top_left_x, top_left_y, bottom_right_x, bottom_right_y) format. + - "text_labels": The text labels for each predicted bounding box on the image. """ - return self.image_processor.post_process_image_guided_detection(*args, **kwargs) + output = self.image_processor.post_process_object_detection( + outputs=outputs, threshold=threshold, target_sizes=target_sizes + ) + + if text_labels is not None and len(text_labels) != len(output): + raise ValueError("Make sure that you pass in as many lists of text labels as images") + + # adding text labels to the output + if text_labels is not None: + for image_output, image_text_labels in zip(output, text_labels): + object_text_labels = [image_text_labels[i] for i in image_output["labels"]] + image_output["text_labels"] = object_text_labels + else: + for image_output in output: + image_output["text_labels"] = None + + return output + + # Copied from transformers.models.owlvit.processing_owlvit.OwlViTProcessor.post_process_image_guided_detection with OwlViT->Owlv2 + def post_process_image_guided_detection( + self, + outputs: "Owlv2ImageGuidedObjectDetectionOutput", + threshold: float = 0.0, + nms_threshold: float = 0.3, + target_sizes: Optional[Union[TensorType, List[Tuple]]] = None, + ): + """ + Converts the output of [`Owlv2ForObjectDetection.image_guided_detection`] into the format expected by the COCO + api. + + Args: + outputs ([`Owlv2ImageGuidedObjectDetectionOutput`]): + Raw outputs of the model. + threshold (`float`, *optional*, defaults to 0.0): + Minimum confidence threshold to use to filter out predicted boxes. + nms_threshold (`float`, *optional*, defaults to 0.3): + IoU threshold for non-maximum suppression of overlapping boxes. + target_sizes (`torch.Tensor`, *optional*): + Tensor of shape (batch_size, 2) where each entry is the (height, width) of the corresponding image in + the batch. If set, predicted normalized bounding boxes are rescaled to the target sizes. 
If left to + None, predictions will not be unnormalized. + + Returns: + `List[Dict]`: A list of dictionaries, each dictionary containing the following keys: + - "scores": The confidence scores for each predicted box on the image. + - "boxes": Image bounding boxes in (top_left_x, top_left_y, bottom_right_x, bottom_right_y) format. + - "labels": Set to `None`. + """ + return self.image_processor.post_process_image_guided_detection( + outputs=outputs, threshold=threshold, nms_threshold=nms_threshold, target_sizes=target_sizes + ) # Copied from transformers.models.owlvit.processing_owlvit.OwlViTProcessor.batch_decode def batch_decode(self, *args, **kwargs): diff --git a/src/transformers/models/owlvit/image_processing_owlvit.py b/src/transformers/models/owlvit/image_processing_owlvit.py index 05c2fb908c..59c6465734 100644 --- a/src/transformers/models/owlvit/image_processing_owlvit.py +++ b/src/transformers/models/owlvit/image_processing_owlvit.py @@ -15,7 +15,7 @@ """Image processor class for OwlViT""" import warnings -from typing import Dict, List, Optional, Tuple, Union +from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union import numpy as np @@ -43,6 +43,9 @@ from ...image_utils import ( from ...utils import TensorType, filter_out_non_signature_kwargs, is_torch_available, logging +if TYPE_CHECKING: + from .modeling_owlvit import OwlViTObjectDetectionOutput + if is_torch_available(): import torch @@ -58,6 +61,34 @@ def _upcast(t): return t if t.dtype in (torch.int32, torch.int64) else t.int() +def _scale_boxes(boxes, target_sizes): + """ + Scale batch of bounding boxes to the target sizes. + + Args: + boxes (`torch.Tensor` of shape `(batch_size, num_boxes, 4)`): + Bounding boxes to scale. Each box is expected to be in (x1, y1, x2, y2) format. + target_sizes (`List[Tuple[int, int]]` or `torch.Tensor` of shape `(batch_size, 2)`): + Target sizes to scale the boxes to. Each target size is expected to be in (height, width) format. 
+ + Returns: + `torch.Tensor` of shape `(batch_size, num_boxes, 4)`: Scaled bounding boxes. + """ + + if isinstance(target_sizes, (list, tuple)): + image_height = torch.tensor([i[0] for i in target_sizes]) + image_width = torch.tensor([i[1] for i in target_sizes]) + elif isinstance(target_sizes, torch.Tensor): + image_height, image_width = target_sizes.unbind(1) + else: + raise ValueError("`target_sizes` must be a list, tuple or torch.Tensor") + + scale_factor = torch.stack([image_width, image_height, image_width, image_height], dim=1) + scale_factor = scale_factor.unsqueeze(1).to(boxes.device) + boxes = boxes * scale_factor + return boxes + + def box_area(boxes): """ Computes the area of a set of bounding boxes, which are specified by its (x1, y1, x2, y2) coordinates. @@ -459,7 +490,10 @@ class OwlViTImageProcessor(BaseImageProcessor): return results def post_process_object_detection( - self, outputs, threshold: float = 0.1, target_sizes: Union[TensorType, List[Tuple]] = None + self, + outputs: "OwlViTObjectDetectionOutput", + threshold: float = 0.1, + target_sizes: Optional[Union[TensorType, List[Tuple]]] = None, ): """ Converts the raw output of [`OwlViTForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y, @@ -468,52 +502,46 @@ class OwlViTImageProcessor(BaseImageProcessor): Args: outputs ([`OwlViTObjectDetectionOutput`]): Raw outputs of the model. - threshold (`float`, *optional*): + threshold (`float`, *optional*, defaults to 0.1): Score threshold to keep object detection predictions. target_sizes (`torch.Tensor` or `List[Tuple[int, int]]`, *optional*): Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size `(height, width)` of each image in the batch. If unset, predictions will not be resized. + Returns: - `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image - in the batch as predicted by the model. 
+ `List[Dict]`: A list of dictionaries, each dictionary containing the following keys: + - "scores": The confidence scores for each predicted box on the image. + - "labels": Indexes of the classes predicted by the model on the image. + - "boxes": Image bounding boxes in (top_left_x, top_left_y, bottom_right_x, bottom_right_y) format. """ - # TODO: (amy) add support for other frameworks - logits, boxes = outputs.logits, outputs.pred_boxes + batch_logits, batch_boxes = outputs.logits, outputs.pred_boxes + batch_size = len(batch_logits) - if target_sizes is not None: - if len(logits) != len(target_sizes): - raise ValueError( - "Make sure that you pass in as many target sizes as the batch dimension of the logits" - ) + if target_sizes is not None and len(target_sizes) != batch_size: + raise ValueError("Make sure that you pass in as many target sizes as images") - probs = torch.max(logits, dim=-1) - scores = torch.sigmoid(probs.values) - labels = probs.indices + # batch_logits of shape (batch_size, num_queries, num_classes) + batch_class_logits = torch.max(batch_logits, dim=-1) + batch_scores = torch.sigmoid(batch_class_logits.values) + batch_labels = batch_class_logits.indices # Convert to [x0, y0, x1, y1] format - boxes = center_to_corners_format(boxes) + batch_boxes = center_to_corners_format(batch_boxes) # Convert from relative [0, 1] to absolute [0, height] coordinates if target_sizes is not None: - if isinstance(target_sizes, List): - img_h = torch.Tensor([i[0] for i in target_sizes]) - img_w = torch.Tensor([i[1] for i in target_sizes]) - else: - img_h, img_w = target_sizes.unbind(1) - - scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device) - boxes = boxes * scale_fct[:, None, :] + batch_boxes = _scale_boxes(batch_boxes, target_sizes) results = [] - for s, l, b in zip(scores, labels, boxes): - score = s[s > threshold] - label = l[s > threshold] - box = b[s > threshold] - results.append({"scores": score, "labels": label, "boxes": box}) + for 
scores, labels, boxes in zip(batch_scores, batch_labels, batch_boxes): + keep = scores > threshold + scores = scores[keep] + labels = labels[keep] + boxes = boxes[keep] + results.append({"scores": scores, "labels": labels, "boxes": boxes}) return results - # TODO: (Amy) Make compatible with other frameworks def post_process_image_guided_detection(self, outputs, threshold=0.0, nms_threshold=0.3, target_sizes=None): """ Converts the output of [`OwlViTForObjectDetection.image_guided_detection`] into the format expected by the COCO @@ -562,13 +590,7 @@ class OwlViTImageProcessor(BaseImageProcessor): # Convert from relative [0, 1] to absolute [0, height] coordinates if target_sizes is not None: - if isinstance(target_sizes, List): - img_h = torch.tensor([i[0] for i in target_sizes]) - img_w = torch.tensor([i[1] for i in target_sizes]) - else: - img_h, img_w = target_sizes.unbind(1) - scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(target_boxes.device) - target_boxes = target_boxes * scale_fct[:, None, :] + target_boxes = _scale_boxes(target_boxes, target_sizes) # Compute box display alphas based on prediction scores results = [] diff --git a/src/transformers/models/owlvit/modeling_owlvit.py b/src/transformers/models/owlvit/modeling_owlvit.py index 14f41fafac..d9c0e72409 100644 --- a/src/transformers/models/owlvit/modeling_owlvit.py +++ b/src/transformers/models/owlvit/modeling_owlvit.py @@ -1689,31 +1689,30 @@ class OwlViTForObjectDetection(OwlViTPreTrainedModel): >>> import requests >>> from PIL import Image >>> import torch - >>> from transformers import AutoProcessor, OwlViTForObjectDetection - >>> processor = AutoProcessor.from_pretrained("google/owlvit-base-patch32") + >>> from transformers import OwlViTProcessor, OwlViTForObjectDetection + + >>> processor = OwlViTProcessor.from_pretrained("google/owlvit-base-patch32") >>> model = OwlViTForObjectDetection.from_pretrained("google/owlvit-base-patch32") >>> url = 
"http://images.cocodataset.org/val2017/000000039769.jpg" >>> image = Image.open(requests.get(url, stream=True).raw) - >>> texts = [["a photo of a cat", "a photo of a dog"]] - >>> inputs = processor(text=texts, images=image, return_tensors="pt") + >>> text_labels = [["a photo of a cat", "a photo of a dog"]] + >>> inputs = processor(text=text_labels, images=image, return_tensors="pt") >>> outputs = model(**inputs) >>> # Target image sizes (height, width) to rescale box predictions [batch_size, 2] - >>> target_sizes = torch.Tensor([image.size[::-1]]) - >>> # Convert outputs (bounding boxes and class logits) to final bounding boxes and scores - >>> results = processor.post_process_object_detection( - ... outputs=outputs, threshold=0.1, target_sizes=target_sizes + >>> target_sizes = torch.tensor([(image.height, image.width)]) + >>> # Convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax) + >>> results = processor.post_process_grounded_object_detection( + ... outputs=outputs, target_sizes=target_sizes, threshold=0.1, text_labels=text_labels ... ) - - >>> i = 0 # Retrieve predictions for the first image for the corresponding text queries - >>> text = texts[i] - >>> boxes, scores, labels = results[i]["boxes"], results[i]["scores"], results[i]["labels"] - - >>> for box, score, label in zip(boxes, scores, labels): + >>> # Retrieve predictions for the first image for the corresponding text queries + >>> result = results[0] + >>> boxes, scores, text_labels = result["boxes"], result["scores"], result["text_labels"] + >>> for box, score, text_label in zip(boxes, scores, text_labels): ... box = [round(i, 2) for i in box.tolist()] - ... print(f"Detected {text[label]} with confidence {round(score.item(), 3)} at location {box}") + ... 
print(f"Detected {text_label} with confidence {round(score.item(), 3)} at location {box}") Detected a photo of a cat with confidence 0.707 at location [324.97, 20.44, 640.58, 373.29] Detected a photo of a cat with confidence 0.717 at location [1.46, 55.26, 315.55, 472.17] ```""" diff --git a/src/transformers/models/owlvit/processing_owlvit.py b/src/transformers/models/owlvit/processing_owlvit.py index 49e913a384..dd74da5546 100644 --- a/src/transformers/models/owlvit/processing_owlvit.py +++ b/src/transformers/models/owlvit/processing_owlvit.py @@ -17,13 +17,17 @@ Image/Text processor class for OWL-ViT """ import warnings -from typing import List +from typing import TYPE_CHECKING, List, Optional, Tuple, Union import numpy as np from ...processing_utils import ProcessorMixin from ...tokenization_utils_base import BatchEncoding -from ...utils import is_flax_available, is_tf_available, is_torch_available +from ...utils import TensorType, is_flax_available, is_tf_available, is_torch_available + + +if TYPE_CHECKING: + from .modeling_owlvit import OwlViTImageGuidedObjectDetectionOutput, OwlViTObjectDetectionOutput class OwlViTProcessor(ProcessorMixin): @@ -184,14 +188,93 @@ class OwlViTProcessor(ProcessorMixin): This method forwards all its arguments to [`OwlViTImageProcessor.post_process_object_detection`]. Please refer to the docstring of this method for more information. """ + warnings.warn( + "`post_process_object_detection` method is deprecated for OwlVitProcessor and will be removed in v5. 
" + "Use `post_process_grounded_object_detection` instead.", + FutureWarning, + ) return self.image_processor.post_process_object_detection(*args, **kwargs) - def post_process_image_guided_detection(self, *args, **kwargs): + def post_process_grounded_object_detection( + self, + outputs: "OwlViTObjectDetectionOutput", + threshold: float = 0.1, + target_sizes: Optional[Union[TensorType, List[Tuple]]] = None, + text_labels: Optional[List[List[str]]] = None, + ): """ - This method forwards all its arguments to [`OwlViTImageProcessor.post_process_one_shot_object_detection`]. - Please refer to the docstring of this method for more information. + Converts the raw output of [`OwlViTForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y, + bottom_right_x, bottom_right_y) format. + + Args: + outputs ([`OwlViTObjectDetectionOutput`]): + Raw outputs of the model. + threshold (`float`, *optional*, defaults to 0.1): + Score threshold to keep object detection predictions. + target_sizes (`torch.Tensor` or `List[Tuple[int, int]]`, *optional*): + Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size + `(height, width)` of each image in the batch. If unset, predictions will not be resized. + text_labels (`List[List[str]]`, *optional*): + List of lists of text labels for each image in the batch. If unset, "text_labels" in output will be + set to `None`. + + Returns: + `List[Dict]`: A list of dictionaries, each dictionary containing the following keys: + - "scores": The confidence scores for each predicted box on the image. + - "labels": Indexes of the classes predicted by the model on the image. + - "boxes": Image bounding boxes in (top_left_x, top_left_y, bottom_right_x, bottom_right_y) format. + - "text_labels": The text labels for each predicted bounding box on the image. 
""" - return self.image_processor.post_process_image_guided_detection(*args, **kwargs) + output = self.image_processor.post_process_object_detection( + outputs=outputs, threshold=threshold, target_sizes=target_sizes + ) + + if text_labels is not None and len(text_labels) != len(output): + raise ValueError("Make sure that you pass in as many lists of text labels as images") + + # adding text labels to the output + if text_labels is not None: + for image_output, image_text_labels in zip(output, text_labels): + object_text_labels = [image_text_labels[i] for i in image_output["labels"]] + image_output["text_labels"] = object_text_labels + else: + for image_output in output: + image_output["text_labels"] = None + + return output + + def post_process_image_guided_detection( + self, + outputs: "OwlViTImageGuidedObjectDetectionOutput", + threshold: float = 0.0, + nms_threshold: float = 0.3, + target_sizes: Optional[Union[TensorType, List[Tuple]]] = None, + ): + """ + Converts the output of [`OwlViTForObjectDetection.image_guided_detection`] into the format expected by the COCO + api. + + Args: + outputs ([`OwlViTImageGuidedObjectDetectionOutput`]): + Raw outputs of the model. + threshold (`float`, *optional*, defaults to 0.0): + Minimum confidence threshold to use to filter out predicted boxes. + nms_threshold (`float`, *optional*, defaults to 0.3): + IoU threshold for non-maximum suppression of overlapping boxes. + target_sizes (`torch.Tensor`, *optional*): + Tensor of shape (batch_size, 2) where each entry is the (height, width) of the corresponding image in + the batch. If set, predicted normalized bounding boxes are rescaled to the target sizes. If left to + None, predictions will not be unnormalized. + + Returns: + `List[Dict]`: A list of dictionaries, each dictionary containing the following keys: + - "scores": The confidence scores for each predicted box on the image. 
+ - "boxes": Image bounding boxes in (top_left_x, top_left_y, bottom_right_x, bottom_right_y) format. + - "labels": Set to `None`. + """ + return self.image_processor.post_process_image_guided_detection( + outputs=outputs, threshold=threshold, nms_threshold=nms_threshold, target_sizes=target_sizes + ) def batch_decode(self, *args, **kwargs): """ diff --git a/tests/models/owlv2/test_modeling_owlv2.py b/tests/models/owlv2/test_modeling_owlv2.py index b35f58e99a..cab47c2b5e 100644 --- a/tests/models/owlv2/test_modeling_owlv2.py +++ b/tests/models/owlv2/test_modeling_owlv2.py @@ -974,8 +974,9 @@ class Owlv2ModelIntegrationTest(unittest.TestCase): processor = OwlViTProcessor.from_pretrained(model_name) image = prepare_img() + text_labels = [["a photo of a cat", "a photo of a dog"]] inputs = processor( - text=[["a photo of a cat", "a photo of a dog"]], + text=text_labels, images=image, max_length=16, padding="max_length", @@ -991,11 +992,31 @@ class Owlv2ModelIntegrationTest(unittest.TestCase): expected_slice_logits = torch.tensor( [[-21.413497, -21.612638], [-19.008193, -19.548841], [-20.958896, -21.382694]] ).to(torch_device) - self.assertTrue(torch.allclose(outputs.logits[0, :3, :3], expected_slice_logits, atol=1e-4)) + resulted_slice_logits = outputs.logits[0, :3, :3] + max_diff = torch.max(torch.abs(resulted_slice_logits - expected_slice_logits)).item() + self.assertLess(max_diff, 3e-4) + expected_slice_boxes = torch.tensor( [[0.241309, 0.051896, 0.453267], [0.139474, 0.045701, 0.250660], [0.233022, 0.050479, 0.427671]], ).to(torch_device) - self.assertTrue(torch.allclose(outputs.pred_boxes[0, :3, :3], expected_slice_boxes, atol=1e-4)) + resulted_slice_boxes = outputs.pred_boxes[0, :3, :3] + max_diff = torch.max(torch.abs(resulted_slice_boxes - expected_slice_boxes)).item() + self.assertLess(max_diff, 3e-4) + + # test post-processing + post_processed_output = processor.post_process_grounded_object_detection(outputs) + 
self.assertIsNone(post_processed_output[0]["text_labels"]) + + post_processed_output_with_text_labels = processor.post_process_grounded_object_detection( + outputs, text_labels=text_labels + ) + + objects_labels = post_processed_output_with_text_labels[0]["labels"].cpu().tolist() + self.assertListEqual(objects_labels, [0, 0]) + + objects_text_labels = post_processed_output_with_text_labels[0]["text_labels"] + self.assertIsNotNone(objects_text_labels) + self.assertListEqual(objects_text_labels, ["a photo of a cat", "a photo of a cat"]) @slow def test_inference_one_shot_object_detection(self): diff --git a/tests/models/owlvit/test_modeling_owlvit.py b/tests/models/owlvit/test_modeling_owlvit.py index 545fee0c4f..d207135a58 100644 --- a/tests/models/owlvit/test_modeling_owlvit.py +++ b/tests/models/owlvit/test_modeling_owlvit.py @@ -967,8 +967,9 @@ class OwlViTModelIntegrationTest(unittest.TestCase): processor = OwlViTProcessor.from_pretrained(model_name) image = prepare_img() + text_labels = [["a photo of a cat", "a photo of a dog"]] inputs = processor( - text=[["a photo of a cat", "a photo of a dog"]], + text=text_labels, images=image, max_length=16, padding="max_length", @@ -986,6 +987,21 @@ class OwlViTModelIntegrationTest(unittest.TestCase): ).to(torch_device) self.assertTrue(torch.allclose(outputs.pred_boxes[0, :3, :3], expected_slice_boxes, atol=1e-4)) + # test post-processing + post_processed_output = processor.post_process_grounded_object_detection(outputs) + self.assertIsNone(post_processed_output[0]["text_labels"]) + + post_processed_output_with_text_labels = processor.post_process_grounded_object_detection( + outputs, text_labels=text_labels + ) + + objects_labels = post_processed_output_with_text_labels[0]["labels"].cpu().tolist() + self.assertListEqual(objects_labels, [0, 0]) + + objects_text_labels = post_processed_output_with_text_labels[0]["text_labels"] + self.assertIsNotNone(objects_text_labels) + self.assertListEqual(objects_text_labels, ["a photo 
of a cat", "a photo of a cat"]) + @slow def test_inference_one_shot_object_detection(self): model_name = "google/owlvit-base-patch32" diff --git a/utils/check_repo.py b/utils/check_repo.py index d35bf27420..b4aed44062 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -973,6 +973,7 @@ DEPRECATED_OBJECTS = [ "xnli_processors", "xnli_tasks_num_labels", "TFTrainingArguments", + "OwlViTFeatureExtractor", ] # Exceptionally, some objects should not be documented after all rules passed.