OwlViT/Owlv2 post processing standardization (#34929)

* Refactor owlvit post_process_object_detection + add text_labels * Fix copies in grounding dino * Sync with Owlv2 postprocessing * Add post_process_grounded_object_detection method to processor, deprecate post_process_object_detection * Add test cases * Move text_labels to processors only * [run-slow] owlvit owlv2 * [run-slow] owlvit, owlv2 * Update snippets * Update docs structure * Update deprecated objects for check_repo * Update docstring for post processing of image guided object detection
2025-01-17 13:58:28 +00:00
parent add5f0566c
commit 94ae9a8da1
12 changed files with 467 additions and 188 deletions
--- a/docs/source/en/model_doc/owlv2.md
+++ b/docs/source/en/model_doc/owlv2.md
@@ -50,20 +50,22 @@ OWLv2 is, just like its predecessor [OWL-ViT](owlvit), a zero-shot text-conditio
 >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
 >>> image = Image.open(requests.get(url, stream=True).raw)
->>> texts = [["a photo of a cat", "a photo of a dog"]]
+>>> text_labels = [["a photo of a cat", "a photo of a dog"]]
->>> inputs = processor(text=texts, images=image, return_tensors="pt")
+>>> inputs = processor(text=text_labels, images=image, return_tensors="pt")
 >>> outputs = model(**inputs)
 >>> # Target image sizes (height, width) to rescale box predictions [batch_size, 2]
->>> target_sizes = torch.Tensor([image.size[::-1]])
+>>> target_sizes = torch.tensor([(image.height, image.width)])
->>> # Convert outputs (bounding boxes and class logits) to Pascal VOC Format (xmin, ymin, xmax, ymax)
+>>> # Convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax)
->>> results = processor.post_process_object_detection(outputs=outputs, target_sizes=target_sizes, threshold=0.1)
+>>> results = processor.post_process_grounded_object_detection(
->>> i = 0  # Retrieve predictions for the first image for the corresponding text queries
+...     outputs=outputs, target_sizes=target_sizes, threshold=0.1, text_labels=text_labels
->>> text = texts[i]
+... )
->>> boxes, scores, labels = results[i]["boxes"], results[i]["scores"], results[i]["labels"]
+>>> # Retrieve predictions for the first image for the corresponding text queries
->>> for box, score, label in zip(boxes, scores, labels):
+>>> result = results[0]
 >>> boxes, scores, text_labels = result["boxes"], result["scores"], result["text_labels"]
 >>> for box, score, text_label in zip(boxes, scores, text_labels):
 ...     box = [round(i, 2) for i in box.tolist()]
-...     print(f"Detected {text[label]} with confidence {round(score.item(), 3)} at location {box}")
+...     print(f"Detected {text_label} with confidence {round(score.item(), 3)} at location {box}")
 Detected a photo of a cat with confidence 0.614 at location [341.67, 23.39, 642.32, 371.35]
 Detected a photo of a cat with confidence 0.665 at location [6.75, 51.96, 326.62, 473.13]
 ```
@@ -103,6 +105,9 @@ Usage of OWLv2 is identical to [OWL-ViT](owlvit) with a new, updated image proce
 ## Owlv2Processor
 [[autodoc]] Owlv2Processor
    - __call__
    - post_process_grounded_object_detection
    - post_process_image_guided_detection
 ## Owlv2Model
--- a/docs/source/en/model_doc/owlvit.md
+++ b/docs/source/en/model_doc/owlvit.md
@@ -49,20 +49,22 @@ OWL-ViT is a zero-shot text-conditioned object detection model. OWL-ViT uses [CL
 >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
 >>> image = Image.open(requests.get(url, stream=True).raw)
->>> texts = [["a photo of a cat", "a photo of a dog"]]
+>>> text_labels = [["a photo of a cat", "a photo of a dog"]]
->>> inputs = processor(text=texts, images=image, return_tensors="pt")
+>>> inputs = processor(text=text_labels, images=image, return_tensors="pt")
 >>> outputs = model(**inputs)
 >>> # Target image sizes (height, width) to rescale box predictions [batch_size, 2]
->>> target_sizes = torch.Tensor([image.size[::-1]])
+>>> target_sizes = torch.tensor([(image.height, image.width)])
 >>> # Convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax)
->>> results = processor.post_process_object_detection(outputs=outputs, target_sizes=target_sizes, threshold=0.1)
+>>> results = processor.post_process_grounded_object_detection(
->>> i = 0  # Retrieve predictions for the first image for the corresponding text queries
+...     outputs=outputs, target_sizes=target_sizes, threshold=0.1, text_labels=text_labels
->>> text = texts[i]
+... )
->>> boxes, scores, labels = results[i]["boxes"], results[i]["scores"], results[i]["labels"]
+>>> # Retrieve predictions for the first image for the corresponding text queries
->>> for box, score, label in zip(boxes, scores, labels):
+>>> result = results[0]
 >>> boxes, scores, text_labels = result["boxes"], result["scores"], result["text_labels"]
 >>> for box, score, text_label in zip(boxes, scores, text_labels):
 ...     box = [round(i, 2) for i in box.tolist()]
-...     print(f"Detected {text[label]} with confidence {round(score.item(), 3)} at location {box}")
+...     print(f"Detected {text_label} with confidence {round(score.item(), 3)} at location {box}")
 Detected a photo of a cat with confidence 0.707 at location [324.97, 20.44, 640.58, 373.29]
 Detected a photo of a cat with confidence 0.717 at location [1.46, 55.26, 315.55, 472.17]
 ```
@@ -91,16 +93,12 @@ A demo notebook on using OWL-ViT for zero- and one-shot (image-guided) object de
    - post_process_object_detection
    - post_process_image_guided_detection
 ## OwlViTFeatureExtractor
 [[autodoc]] OwlViTFeatureExtractor
    - __call__
    - post_process
    - post_process_image_guided_detection
 ## OwlViTProcessor
 [[autodoc]] OwlViTProcessor
    - __call__
    - post_process_grounded_object_detection
    - post_process_image_guided_detection
 ## OwlViTModel
--- a/src/transformers/models/grounding_dino/image_processing_grounding_dino.py
+++ b/src/transformers/models/grounding_dino/image_processing_grounding_dino.py
@@ -17,7 +17,7 @@
 import io
 import pathlib
 from collections import defaultdict
-from typing import Any, Callable, Dict, Iterable, List, Optional, Set, Tuple, Union
+from typing import TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Optional, Set, Tuple, Union
 import numpy as np
@@ -77,6 +77,9 @@ if is_scipy_available():
    import scipy.special
    import scipy.stats
 if TYPE_CHECKING:
    from .modeling_grounding_dino import GroundingDinoObjectDetectionOutput
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
@@ -806,6 +809,35 @@ def compute_segments(
    return segmentation, segments
 # Copied from transformers.models.owlvit.image_processing_owlvit._scale_boxes
 def _scale_boxes(boxes, target_sizes):
    """
    Scale batch of bounding boxes to the target sizes.
    Args:
        boxes (`torch.Tensor` of shape `(batch_size, num_boxes, 4)`):
            Bounding boxes to scale. Each box is expected to be in (x1, y1, x2, y2) format.
        target_sizes (`List[Tuple[int, int]]` or `torch.Tensor` of shape `(batch_size, 2)`):
            Target sizes to scale the boxes to. Each target size is expected to be in (height, width) format.
    Returns:
        `torch.Tensor` of shape `(batch_size, num_boxes, 4)`: Scaled bounding boxes.
    """
    if isinstance(target_sizes, (list, tuple)):
        image_height = torch.tensor([i[0] for i in target_sizes])
        image_width = torch.tensor([i[1] for i in target_sizes])
    elif isinstance(target_sizes, torch.Tensor):
        image_height, image_width = target_sizes.unbind(1)
    else:
        raise ValueError("`target_sizes` must be a list, tuple or torch.Tensor")
    scale_factor = torch.stack([image_width, image_height, image_width, image_height], dim=1)
    scale_factor = scale_factor.unsqueeze(1).to(boxes.device)
    boxes = boxes * scale_factor
    return boxes
 class GroundingDinoImageProcessor(BaseImageProcessor):
    r"""
    Constructs a Grounding DINO image processor.
@@ -1533,7 +1565,10 @@ class GroundingDinoImageProcessor(BaseImageProcessor):
    # Copied from transformers.models.owlvit.image_processing_owlvit.OwlViTImageProcessor.post_process_object_detection with OwlViT->GroundingDino
    def post_process_object_detection(
-        self, outputs, threshold: float = 0.1, target_sizes: Union[TensorType, List[Tuple]] = None
+        self,
        outputs: "GroundingDinoObjectDetectionOutput",
        threshold: float = 0.1,
        target_sizes: Optional[Union[TensorType, List[Tuple]]] = None,
    ):
        """
        Converts the raw output of [`GroundingDinoForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y,
@@ -1542,48 +1577,43 @@ class GroundingDinoImageProcessor(BaseImageProcessor):
        Args:
            outputs ([`GroundingDinoObjectDetectionOutput`]):
                Raw outputs of the model.
-            threshold (`float`, *optional*):
+            threshold (`float`, *optional*, defaults to 0.1):
                Score threshold to keep object detection predictions.
            target_sizes (`torch.Tensor` or `List[Tuple[int, int]]`, *optional*):
                Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size
                `(height, width)` of each image in the batch. If unset, predictions will not be resized.
        Returns:
-            `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image
+            `List[Dict]`: A list of dictionaries, each dictionary containing the following keys:
-            in the batch as predicted by the model.
+            - "scores": The confidence scores for each predicted box on the image.
            - "labels": Indexes of the classes predicted by the model on the image.
            - "boxes": Image bounding boxes in (top_left_x, top_left_y, bottom_right_x, bottom_right_y) format.
        """
-        # TODO: (amy) add support for other frameworks
+        batch_logits, batch_boxes = outputs.logits, outputs.pred_boxes
-        logits, boxes = outputs.logits, outputs.pred_boxes
+        batch_size = len(batch_logits)
-        if target_sizes is not None:
+        if target_sizes is not None and len(target_sizes) != batch_size:
-            if len(logits) != len(target_sizes):
+            raise ValueError("Make sure that you pass in as many target sizes as images")
                raise ValueError(
                    "Make sure that you pass in as many target sizes as the batch dimension of the logits"
                )
-        probs = torch.max(logits, dim=-1)
+        # batch_logits of shape (batch_size, num_queries, num_classes)
-        scores = torch.sigmoid(probs.values)
+        batch_class_logits = torch.max(batch_logits, dim=-1)
-        labels = probs.indices
+        batch_scores = torch.sigmoid(batch_class_logits.values)
        batch_labels = batch_class_logits.indices
        # Convert to [x0, y0, x1, y1] format
-        boxes = center_to_corners_format(boxes)
+        batch_boxes = center_to_corners_format(batch_boxes)
        # Convert from relative [0, 1] to absolute [0, height] coordinates
        if target_sizes is not None:
-            if isinstance(target_sizes, List):
+            batch_boxes = _scale_boxes(batch_boxes, target_sizes)
                img_h = torch.Tensor([i[0] for i in target_sizes])
                img_w = torch.Tensor([i[1] for i in target_sizes])
            else:
                img_h, img_w = target_sizes.unbind(1)
            scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device)
            boxes = boxes * scale_fct[:, None, :]
        results = []
-        for s, l, b in zip(scores, labels, boxes):
+        for scores, labels, boxes in zip(batch_scores, batch_labels, batch_boxes):
-            score = s[s > threshold]
+            keep = scores > threshold
-            label = l[s > threshold]
+            scores = scores[keep]
-            box = b[s > threshold]
+            labels = labels[keep]
-            results.append({"scores": score, "labels": label, "boxes": box})
+            boxes = boxes[keep]
            results.append({"scores": scores, "labels": labels, "boxes": boxes})
        return results
--- a/src/transformers/models/owlv2/image_processing_owlv2.py
+++ b/src/transformers/models/owlv2/image_processing_owlv2.py
@@ -15,7 +15,7 @@
 """Image processor class for OWLv2."""
 import warnings
-from typing import Dict, List, Optional, Tuple, Union
+from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union
 import numpy as np
@@ -60,10 +60,43 @@ if is_vision_available():
 if is_scipy_available():
    from scipy import ndimage as ndi
 if TYPE_CHECKING:
    from .modeling_owlv2 import Owlv2ObjectDetectionOutput
 logger = logging.get_logger(__name__)
 def _scale_boxes(boxes, target_sizes):
    """
    Scale batch of bounding boxes to the target sizes.
    Args:
        boxes (`torch.Tensor` of shape `(batch_size, num_boxes, 4)`):
            Bounding boxes to scale. Each box is expected to be in (x1, y1, x2, y2) format.
        target_sizes (`List[Tuple[int, int]]` or `torch.Tensor` of shape `(batch_size, 2)`):
            Target sizes to scale the boxes to. Each target size is expected to be in (height, width) format.
    Returns:
        `torch.Tensor` of shape `(batch_size, num_boxes, 4)`: Scaled bounding boxes.
    """
    if isinstance(target_sizes, (list, tuple)):
        image_height = torch.tensor([i[0] for i in target_sizes])
        image_width = torch.tensor([i[1] for i in target_sizes])
    elif isinstance(target_sizes, torch.Tensor):
        image_height, image_width = target_sizes.unbind(1)
    else:
        raise ValueError("`target_sizes` must be a list, tuple or torch.Tensor")
    # for owlv2 image is padded to max size unlike owlvit, thats why we have to scale boxes to max size
    max_size = torch.max(image_height, image_width)
    scale_factor = torch.stack([max_size, max_size, max_size, max_size], dim=1)
    scale_factor = scale_factor.unsqueeze(1).to(boxes.device)
    boxes = boxes * scale_factor
    return boxes
 # Copied from transformers.models.owlvit.image_processing_owlvit._upcast
 def _upcast(t):
    # Protects from numerical overflows in multiplications by upcasting to the equivalent higher type
@@ -466,62 +499,57 @@ class Owlv2ImageProcessor(BaseImageProcessor):
        data = {"pixel_values": images}
        return BatchFeature(data=data, tensor_type=return_tensors)
    # Copied from transformers.models.owlvit.image_processing_owlvit.OwlViTImageProcessor.post_process_object_detection with OwlViT->Owlv2
    def post_process_object_detection(
-        self, outputs, threshold: float = 0.1, target_sizes: Union[TensorType, List[Tuple]] = None
+        self,
        outputs: "Owlv2ObjectDetectionOutput",
        threshold: float = 0.1,
        target_sizes: Optional[Union[TensorType, List[Tuple]]] = None,
    ):
        """
-        Converts the raw output of [`OwlViTForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y,
+        Converts the raw output of [`Owlv2ForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y,
        bottom_right_x, bottom_right_y) format.
        Args:
-            outputs ([`OwlViTObjectDetectionOutput`]):
+            outputs ([`Owlv2ObjectDetectionOutput`]):
                Raw outputs of the model.
-            threshold (`float`, *optional*):
+            threshold (`float`, *optional*, defaults to 0.1):
                Score threshold to keep object detection predictions.
            target_sizes (`torch.Tensor` or `List[Tuple[int, int]]`, *optional*):
                Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size
                `(height, width)` of each image in the batch. If unset, predictions will not be resized.
        Returns:
-            `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image
+            `List[Dict]`: A list of dictionaries, each dictionary containing the following keys:
-            in the batch as predicted by the model.
+            - "scores": The confidence scores for each predicted box on the image.
            - "labels": Indexes of the classes predicted by the model on the image.
            - "boxes": Image bounding boxes in (top_left_x, top_left_y, bottom_right_x, bottom_right_y) format.
        """
-        # TODO: (amy) add support for other frameworks
+        batch_logits, batch_boxes = outputs.logits, outputs.pred_boxes
-        logits, boxes = outputs.logits, outputs.pred_boxes
+        batch_size = len(batch_logits)
-        if target_sizes is not None:
+        if target_sizes is not None and len(target_sizes) != batch_size:
-            if len(logits) != len(target_sizes):
+            raise ValueError("Make sure that you pass in as many target sizes as images")
                raise ValueError(
                    "Make sure that you pass in as many target sizes as the batch dimension of the logits"
                )
-        probs = torch.max(logits, dim=-1)
+        # batch_logits of shape (batch_size, num_queries, num_classes)
-        scores = torch.sigmoid(probs.values)
+        batch_class_logits = torch.max(batch_logits, dim=-1)
-        labels = probs.indices
+        batch_scores = torch.sigmoid(batch_class_logits.values)
        batch_labels = batch_class_logits.indices
        # Convert to [x0, y0, x1, y1] format
-        boxes = center_to_corners_format(boxes)
+        batch_boxes = center_to_corners_format(batch_boxes)
        # Convert from relative [0, 1] to absolute [0, height] coordinates
        if target_sizes is not None:
-            if isinstance(target_sizes, List):
+            batch_boxes = _scale_boxes(batch_boxes, target_sizes)
                img_h = torch.Tensor([i[0] for i in target_sizes])
                img_w = torch.Tensor([i[1] for i in target_sizes])
            else:
                img_h, img_w = target_sizes.unbind(1)
            # Rescale coordinates, image is padded to square for inference,
            # that is why we need to scale boxes to the max size
            size = torch.max(img_h, img_w)
            scale_fct = torch.stack([size, size, size, size], dim=1).to(boxes.device)
            boxes = boxes * scale_fct[:, None, :]
        results = []
-        for s, l, b in zip(scores, labels, boxes):
+        for scores, labels, boxes in zip(batch_scores, batch_labels, batch_boxes):
-            score = s[s > threshold]
+            keep = scores > threshold
-            label = l[s > threshold]
+            scores = scores[keep]
-            box = b[s > threshold]
+            labels = labels[keep]
-            results.append({"scores": score, "labels": label, "boxes": box})
+            boxes = boxes[keep]
            results.append({"scores": scores, "labels": labels, "boxes": boxes})
        return results
@@ -574,13 +602,7 @@ class Owlv2ImageProcessor(BaseImageProcessor):
        # Convert from relative [0, 1] to absolute [0, height] coordinates
        if target_sizes is not None:
-            if isinstance(target_sizes, List):
+            target_boxes = _scale_boxes(target_boxes, target_sizes)
                img_h = torch.tensor([i[0] for i in target_sizes])
                img_w = torch.tensor([i[1] for i in target_sizes])
            else:
                img_h, img_w = target_sizes.unbind(1)
            scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(target_boxes.device)
            target_boxes = target_boxes * scale_fct[:, None, :]
        # Compute box display alphas based on prediction scores
        results = []
--- a/src/transformers/models/owlv2/modeling_owlv2.py
+++ b/src/transformers/models/owlv2/modeling_owlv2.py
@@ -1749,33 +1749,30 @@ class Owlv2ForObjectDetection(Owlv2PreTrainedModel):
        >>> import requests
        >>> from PIL import Image
        >>> import torch
        >>> from transformers import AutoProcessor, Owlv2ForObjectDetection
-        >>> processor = AutoProcessor.from_pretrained("google/owlv2-base-patch16-ensemble")
+        >>> from transformers import Owlv2Processor, Owlv2ForObjectDetection
        >>> processor = Owlv2Processor.from_pretrained("google/owlv2-base-patch16-ensemble")
        >>> model = Owlv2ForObjectDetection.from_pretrained("google/owlv2-base-patch16-ensemble")
        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
-        >>> texts = [["a photo of a cat", "a photo of a dog"]]
+        >>> text_labels = [["a photo of a cat", "a photo of a dog"]]
-        >>> inputs = processor(text=texts, images=image, return_tensors="pt")
+        >>> inputs = processor(text=text_labels, images=image, return_tensors="pt")
        >>> outputs = model(**inputs)
-        >>> # forward pass
+        >>> # Target image sizes (height, width) to rescale box predictions [batch_size, 2]
-        >>> with torch.no_grad():
+        >>> target_sizes = torch.tensor([(image.height, image.width)])
-        ...     outputs = model(**inputs)
+        >>> # Convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax)
-
+        >>> results = processor.post_process_grounded_object_detection(
-        >>> target_sizes = torch.Tensor([image.size[::-1]])
+        ...     outputs=outputs, target_sizes=target_sizes, threshold=0.1, text_labels=text_labels
        >>> # Convert outputs (bounding boxes and class logits) to final bounding boxes and scores
        >>> results = processor.post_process_object_detection(
        ...     outputs=outputs, threshold=0.2, target_sizes=target_sizes
        ... )
-
+        >>> # Retrieve predictions for the first image for the corresponding text queries
-        >>> i = 0  # Retrieve predictions for the first image for the corresponding text queries
+        >>> result = results[0]
-        >>> text = texts[i]
+        >>> boxes, scores, text_labels = result["boxes"], result["scores"], result["text_labels"]
-        >>> boxes, scores, labels = results[i]["boxes"], results[i]["scores"], results[i]["labels"]
+        >>> for box, score, text_label in zip(boxes, scores, text_labels):
        >>> for box, score, label in zip(boxes, scores, labels):
        ...     box = [round(i, 2) for i in box.tolist()]
-        ...     print(f"Detected {text[label]} with confidence {round(score.item(), 3)} at location {box}")
+        ...     print(f"Detected {text_label} with confidence {round(score.item(), 3)} at location {box}")
        Detected a photo of a cat with confidence 0.614 at location [341.67, 23.39, 642.32, 371.35]
        Detected a photo of a cat with confidence 0.665 at location [6.75, 51.96, 326.62, 473.13]
        ```"""
--- a/src/transformers/models/owlv2/processing_owlv2.py
+++ b/src/transformers/models/owlv2/processing_owlv2.py
@@ -16,13 +16,18 @@
 Image/Text processor class for OWLv2
 """
-from typing import List
+import warnings
 from typing import TYPE_CHECKING, List, Optional, Tuple, Union
 import numpy as np
 from ...processing_utils import ProcessorMixin
 from ...tokenization_utils_base import BatchEncoding
-from ...utils import is_flax_available, is_tf_available, is_torch_available
+from ...utils import TensorType, is_flax_available, is_tf_available, is_torch_available
 if TYPE_CHECKING:
    from .modeling_owlv2 import Owlv2ImageGuidedObjectDetectionOutput, Owlv2ObjectDetectionOutput
 class Owlv2Processor(ProcessorMixin):
@@ -45,7 +50,7 @@ class Owlv2Processor(ProcessorMixin):
    def __init__(self, image_processor, tokenizer, **kwargs):
        super().__init__(image_processor, tokenizer)
-    # Copied from transformers.models.owlvit.processing_owlvit.OwlViTProcessor.__call__ with OWLViT->OWLv2
+    # Copied from transformers.models.owlvit.processing_owlvit.OwlViTProcessor.__call__ with OwlViT->Owlv2
    def __call__(self, text=None, images=None, query_images=None, padding="max_length", return_tensors="np", **kwargs):
        """
        Main method to prepare for the model one or several text(s) and image(s). This method forwards the `text` and
@@ -157,21 +162,101 @@ class Owlv2Processor(ProcessorMixin):
        else:
            return BatchEncoding(data=dict(**image_features), tensor_type=return_tensors)
-    # Copied from transformers.models.owlvit.processing_owlvit.OwlViTProcessor.post_process_object_detection with OWLViT->OWLv2
+    # Copied from transformers.models.owlvit.processing_owlvit.OwlViTProcessor.post_process_object_detection with OwlViT->Owlv2
    def post_process_object_detection(self, *args, **kwargs):
        """
-        This method forwards all its arguments to [`OwlViTImageProcessor.post_process_object_detection`]. Please refer
+        This method forwards all its arguments to [`Owlv2ImageProcessor.post_process_object_detection`]. Please refer
        to the docstring of this method for more information.
        """
        warnings.warn(
            "`post_process_object_detection` method is deprecated for OwlVitProcessor and will be removed in v5. "
            "Use `post_process_grounded_object_detection` instead.",
            FutureWarning,
        )
        return self.image_processor.post_process_object_detection(*args, **kwargs)
-    # Copied from transformers.models.owlvit.processing_owlvit.OwlViTProcessor.post_process_image_guided_detection with OWLViT->OWLv2
+    # Copied from transformers.models.owlvit.processing_owlvit.OwlViTProcessor.post_process_grounded_object_detection with OwlViT->Owlv2
-    def post_process_image_guided_detection(self, *args, **kwargs):
+    def post_process_grounded_object_detection(
        self,
        outputs: "Owlv2ObjectDetectionOutput",
        threshold: float = 0.1,
        target_sizes: Optional[Union[TensorType, List[Tuple]]] = None,
        text_labels: Optional[List[List[str]]] = None,
    ):
        """
-        This method forwards all its arguments to [`OwlViTImageProcessor.post_process_one_shot_object_detection`].
+        Converts the raw output of [`Owlv2ForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y,
-        Please refer to the docstring of this method for more information.
+        bottom_right_x, bottom_right_y) format.
        Args:
            outputs ([`Owlv2ObjectDetectionOutput`]):
                Raw outputs of the model.
            threshold (`float`, *optional*, defaults to 0.1):
                Score threshold to keep object detection predictions.
            target_sizes (`torch.Tensor` or `List[Tuple[int, int]]`, *optional*):
                Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size
                `(height, width)` of each image in the batch. If unset, predictions will not be resized.
            text_labels (`List[List[str]]`, *optional*):
                List of lists of text labels for each image in the batch. If unset, "text_labels" in output will be
                set to `None`.
        Returns:
            `List[Dict]`: A list of dictionaries, each dictionary containing the following keys:
            - "scores": The confidence scores for each predicted box on the image.
            - "labels": Indexes of the classes predicted by the model on the image.
            - "boxes": Image bounding boxes in (top_left_x, top_left_y, bottom_right_x, bottom_right_y) format.
            - "text_labels": The text labels for each predicted bounding box on the image.
        """
-        return self.image_processor.post_process_image_guided_detection(*args, **kwargs)
+        output = self.image_processor.post_process_object_detection(
            outputs=outputs, threshold=threshold, target_sizes=target_sizes
        )
        if text_labels is not None and len(text_labels) != len(output):
            raise ValueError("Make sure that you pass in as many lists of text labels as images")
        # adding text labels to the output
        if text_labels is not None:
            for image_output, image_text_labels in zip(output, text_labels):
                object_text_labels = [image_text_labels[i] for i in image_output["labels"]]
                image_output["text_labels"] = object_text_labels
        else:
            for image_output in output:
                image_output["text_labels"] = None
        return output
    # Copied from transformers.models.owlvit.processing_owlvit.OwlViTProcessor.post_process_image_guided_detection with OwlViT->Owlv2
    def post_process_image_guided_detection(
        self,
        outputs: "Owlv2ImageGuidedObjectDetectionOutput",
        threshold: float = 0.0,
        nms_threshold: float = 0.3,
        target_sizes: Optional[Union[TensorType, List[Tuple]]] = None,
    ):
        """
        Converts the output of [`Owlv2ForObjectDetection.image_guided_detection`] into the format expected by the COCO
        api.
        Args:
            outputs ([`Owlv2ImageGuidedObjectDetectionOutput`]):
                Raw outputs of the model.
            threshold (`float`, *optional*, defaults to 0.0):
                Minimum confidence threshold to use to filter out predicted boxes.
            nms_threshold (`float`, *optional*, defaults to 0.3):
                IoU threshold for non-maximum suppression of overlapping boxes.
            target_sizes (`torch.Tensor`, *optional*):
                Tensor of shape (batch_size, 2) where each entry is the (height, width) of the corresponding image in
                the batch. If set, predicted normalized bounding boxes are rescaled to the target sizes. If left to
                None, predictions will not be unnormalized.
        Returns:
            `List[Dict]`: A list of dictionaries, each dictionary containing the following keys:
            - "scores": The confidence scores for each predicted box on the image.
            - "boxes": Image bounding boxes in (top_left_x, top_left_y, bottom_right_x, bottom_right_y) format.
            - "labels": Set to `None`.
        """
        return self.image_processor.post_process_image_guided_detection(
            outputs=outputs, threshold=threshold, nms_threshold=nms_threshold, target_sizes=target_sizes
        )
    # Copied from transformers.models.owlvit.processing_owlvit.OwlViTProcessor.batch_decode
    def batch_decode(self, *args, **kwargs):
--- a/src/transformers/models/owlvit/image_processing_owlvit.py
+++ b/src/transformers/models/owlvit/image_processing_owlvit.py
@@ -15,7 +15,7 @@
 """Image processor class for OwlViT"""
 import warnings
-from typing import Dict, List, Optional, Tuple, Union
+from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union
 import numpy as np
@@ -43,6 +43,9 @@ from ...image_utils import (
 from ...utils import TensorType, filter_out_non_signature_kwargs, is_torch_available, logging
 if TYPE_CHECKING:
    from .modeling_owlvit import OwlViTObjectDetectionOutput
 if is_torch_available():
    import torch
@@ -58,6 +61,34 @@ def _upcast(t):
        return t if t.dtype in (torch.int32, torch.int64) else t.int()
 def _scale_boxes(boxes, target_sizes):
    """
    Scale batch of bounding boxes to the target sizes.
    Args:
        boxes (`torch.Tensor` of shape `(batch_size, num_boxes, 4)`):
            Bounding boxes to scale. Each box is expected to be in (x1, y1, x2, y2) format.
        target_sizes (`List[Tuple[int, int]]` or `torch.Tensor` of shape `(batch_size, 2)`):
            Target sizes to scale the boxes to. Each target size is expected to be in (height, width) format.
    Returns:
        `torch.Tensor` of shape `(batch_size, num_boxes, 4)`: Scaled bounding boxes.
    """
    if isinstance(target_sizes, (list, tuple)):
        image_height = torch.tensor([i[0] for i in target_sizes])
        image_width = torch.tensor([i[1] for i in target_sizes])
    elif isinstance(target_sizes, torch.Tensor):
        image_height, image_width = target_sizes.unbind(1)
    else:
        raise ValueError("`target_sizes` must be a list, tuple or torch.Tensor")
    scale_factor = torch.stack([image_width, image_height, image_width, image_height], dim=1)
    scale_factor = scale_factor.unsqueeze(1).to(boxes.device)
    boxes = boxes * scale_factor
    return boxes
 def box_area(boxes):
    """
    Computes the area of a set of bounding boxes, which are specified by its (x1, y1, x2, y2) coordinates.
@@ -459,7 +490,10 @@ class OwlViTImageProcessor(BaseImageProcessor):
        return results
    def post_process_object_detection(
-        self, outputs, threshold: float = 0.1, target_sizes: Union[TensorType, List[Tuple]] = None
+        self,
        outputs: "OwlViTObjectDetectionOutput",
        threshold: float = 0.1,
        target_sizes: Optional[Union[TensorType, List[Tuple]]] = None,
    ):
        """
        Converts the raw output of [`OwlViTForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y,
@@ -468,52 +502,46 @@ class OwlViTImageProcessor(BaseImageProcessor):
        Args:
            outputs ([`OwlViTObjectDetectionOutput`]):
                Raw outputs of the model.
-            threshold (`float`, *optional*):
+            threshold (`float`, *optional*, defaults to 0.1):
                Score threshold to keep object detection predictions.
            target_sizes (`torch.Tensor` or `List[Tuple[int, int]]`, *optional*):
                Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size
                `(height, width)` of each image in the batch. If unset, predictions will not be resized.
        Returns:
-            `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image
+            `List[Dict]`: A list of dictionaries, each dictionary containing the following keys:
-            in the batch as predicted by the model.
+            - "scores": The confidence scores for each predicted box on the image.
            - "labels": Indexes of the classes predicted by the model on the image.
            - "boxes": Image bounding boxes in (top_left_x, top_left_y, bottom_right_x, bottom_right_y) format.
        """
-        # TODO: (amy) add support for other frameworks
+        batch_logits, batch_boxes = outputs.logits, outputs.pred_boxes
-        logits, boxes = outputs.logits, outputs.pred_boxes
+        batch_size = len(batch_logits)
-        if target_sizes is not None:
+        if target_sizes is not None and len(target_sizes) != batch_size:
-            if len(logits) != len(target_sizes):
+            raise ValueError("Make sure that you pass in as many target sizes as images")
                raise ValueError(
                    "Make sure that you pass in as many target sizes as the batch dimension of the logits"
                )
-        probs = torch.max(logits, dim=-1)
+        # batch_logits of shape (batch_size, num_queries, num_classes)
-        scores = torch.sigmoid(probs.values)
+        batch_class_logits = torch.max(batch_logits, dim=-1)
-        labels = probs.indices
+        batch_scores = torch.sigmoid(batch_class_logits.values)
        batch_labels = batch_class_logits.indices
        # Convert to [x0, y0, x1, y1] format
-        boxes = center_to_corners_format(boxes)
+        batch_boxes = center_to_corners_format(batch_boxes)
        # Convert from relative [0, 1] to absolute [0, height] coordinates
        if target_sizes is not None:
-            if isinstance(target_sizes, List):
+            batch_boxes = _scale_boxes(batch_boxes, target_sizes)
                img_h = torch.Tensor([i[0] for i in target_sizes])
                img_w = torch.Tensor([i[1] for i in target_sizes])
            else:
                img_h, img_w = target_sizes.unbind(1)
            scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device)
            boxes = boxes * scale_fct[:, None, :]
        results = []
-        for s, l, b in zip(scores, labels, boxes):
+        for scores, labels, boxes in zip(batch_scores, batch_labels, batch_boxes):
-            score = s[s > threshold]
+            keep = scores > threshold
-            label = l[s > threshold]
+            scores = scores[keep]
-            box = b[s > threshold]
+            labels = labels[keep]
-            results.append({"scores": score, "labels": label, "boxes": box})
+            boxes = boxes[keep]
            results.append({"scores": scores, "labels": labels, "boxes": boxes})
        return results
    # TODO: (Amy) Make compatible with other frameworks
    def post_process_image_guided_detection(self, outputs, threshold=0.0, nms_threshold=0.3, target_sizes=None):
        """
        Converts the output of [`OwlViTForObjectDetection.image_guided_detection`] into the format expected by the COCO
@@ -562,13 +590,7 @@ class OwlViTImageProcessor(BaseImageProcessor):
        # Convert from relative [0, 1] to absolute [0, height] coordinates
        if target_sizes is not None:
-            if isinstance(target_sizes, List):
+            target_boxes = _scale_boxes(target_boxes, target_sizes)
                img_h = torch.tensor([i[0] for i in target_sizes])
                img_w = torch.tensor([i[1] for i in target_sizes])
            else:
                img_h, img_w = target_sizes.unbind(1)
            scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(target_boxes.device)
            target_boxes = target_boxes * scale_fct[:, None, :]
        # Compute box display alphas based on prediction scores
        results = []
--- a/src/transformers/models/owlvit/modeling_owlvit.py
+++ b/src/transformers/models/owlvit/modeling_owlvit.py
@@ -1689,31 +1689,30 @@ class OwlViTForObjectDetection(OwlViTPreTrainedModel):
        >>> import requests
        >>> from PIL import Image
        >>> import torch
        >>> from transformers import AutoProcessor, OwlViTForObjectDetection
-        >>> processor = AutoProcessor.from_pretrained("google/owlvit-base-patch32")
+        >>> from transformers import OwlViTProcessor, OwlViTForObjectDetection
        >>> processor = OwlViTProcessor.from_pretrained("google/owlvit-base-patch32")
        >>> model = OwlViTForObjectDetection.from_pretrained("google/owlvit-base-patch32")
        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
-        >>> texts = [["a photo of a cat", "a photo of a dog"]]
+        >>> text_labels = [["a photo of a cat", "a photo of a dog"]]
-        >>> inputs = processor(text=texts, images=image, return_tensors="pt")
+        >>> inputs = processor(text=text_labels, images=image, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> # Target image sizes (height, width) to rescale box predictions [batch_size, 2]
-        >>> target_sizes = torch.Tensor([image.size[::-1]])
+        >>> target_sizes = torch.tensor([(image.height, image.width)])
-        >>> # Convert outputs (bounding boxes and class logits) to final bounding boxes and scores
+        >>> # Convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax)
-        >>> results = processor.post_process_object_detection(
+        >>> results = processor.post_process_grounded_object_detection(
-        ...     outputs=outputs, threshold=0.1, target_sizes=target_sizes
+        ...     outputs=outputs, target_sizes=target_sizes, threshold=0.1, text_labels=text_labels
        ... )
-
+        >>> # Retrieve predictions for the first image for the corresponding text queries
-        >>> i = 0  # Retrieve predictions for the first image for the corresponding text queries
+        >>> result = results[0]
-        >>> text = texts[i]
+        >>> boxes, scores, text_labels = result["boxes"], result["scores"], result["text_labels"]
-        >>> boxes, scores, labels = results[i]["boxes"], results[i]["scores"], results[i]["labels"]
+        >>> for box, score, text_label in zip(boxes, scores, text_labels):
        >>> for box, score, label in zip(boxes, scores, labels):
        ...     box = [round(i, 2) for i in box.tolist()]
-        ...     print(f"Detected {text[label]} with confidence {round(score.item(), 3)} at location {box}")
+        ...     print(f"Detected {text_label} with confidence {round(score.item(), 3)} at location {box}")
        Detected a photo of a cat with confidence 0.707 at location [324.97, 20.44, 640.58, 373.29]
        Detected a photo of a cat with confidence 0.717 at location [1.46, 55.26, 315.55, 472.17]
        ```"""
--- a/src/transformers/models/owlvit/processing_owlvit.py
+++ b/src/transformers/models/owlvit/processing_owlvit.py
@@ -17,13 +17,17 @@ Image/Text processor class for OWL-ViT
 """
 import warnings
-from typing import List
+from typing import TYPE_CHECKING, List, Optional, Tuple, Union
 import numpy as np
 from ...processing_utils import ProcessorMixin
 from ...tokenization_utils_base import BatchEncoding
-from ...utils import is_flax_available, is_tf_available, is_torch_available
+from ...utils import TensorType, is_flax_available, is_tf_available, is_torch_available
 if TYPE_CHECKING:
    from .modeling_owlvit import OwlViTImageGuidedObjectDetectionOutput, OwlViTObjectDetectionOutput
 class OwlViTProcessor(ProcessorMixin):
@@ -184,14 +188,93 @@ class OwlViTProcessor(ProcessorMixin):
        This method forwards all its arguments to [`OwlViTImageProcessor.post_process_object_detection`]. Please refer
        to the docstring of this method for more information.
        """
        warnings.warn(
            "`post_process_object_detection` method is deprecated for OwlVitProcessor and will be removed in v5. "
            "Use `post_process_grounded_object_detection` instead.",
            FutureWarning,
        )
        return self.image_processor.post_process_object_detection(*args, **kwargs)
-    def post_process_image_guided_detection(self, *args, **kwargs):
+    def post_process_grounded_object_detection(
        self,
        outputs: "OwlViTObjectDetectionOutput",
        threshold: float = 0.1,
        target_sizes: Optional[Union[TensorType, List[Tuple]]] = None,
        text_labels: Optional[List[List[str]]] = None,
    ):
        """
-        This method forwards all its arguments to [`OwlViTImageProcessor.post_process_one_shot_object_detection`].
+        Converts the raw output of [`OwlViTForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y,
-        Please refer to the docstring of this method for more information.
+        bottom_right_x, bottom_right_y) format.
        Args:
            outputs ([`OwlViTObjectDetectionOutput`]):
                Raw outputs of the model.
            threshold (`float`, *optional*, defaults to 0.1):
                Score threshold to keep object detection predictions.
            target_sizes (`torch.Tensor` or `List[Tuple[int, int]]`, *optional*):
                Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size
                `(height, width)` of each image in the batch. If unset, predictions will not be resized.
            text_labels (`List[List[str]]`, *optional*):
                List of lists of text labels for each image in the batch. If unset, "text_labels" in output will be
                set to `None`.
        Returns:
            `List[Dict]`: A list of dictionaries, each dictionary containing the following keys:
            - "scores": The confidence scores for each predicted box on the image.
            - "labels": Indexes of the classes predicted by the model on the image.
            - "boxes": Image bounding boxes in (top_left_x, top_left_y, bottom_right_x, bottom_right_y) format.
            - "text_labels": The text labels for each predicted bounding box on the image.
        """
-        return self.image_processor.post_process_image_guided_detection(*args, **kwargs)
+        output = self.image_processor.post_process_object_detection(
            outputs=outputs, threshold=threshold, target_sizes=target_sizes
        )
        if text_labels is not None and len(text_labels) != len(output):
            raise ValueError("Make sure that you pass in as many lists of text labels as images")
        # adding text labels to the output
        if text_labels is not None:
            for image_output, image_text_labels in zip(output, text_labels):
                object_text_labels = [image_text_labels[i] for i in image_output["labels"]]
                image_output["text_labels"] = object_text_labels
        else:
            for image_output in output:
                image_output["text_labels"] = None
        return output
    def post_process_image_guided_detection(
        self,
        outputs: "OwlViTImageGuidedObjectDetectionOutput",
        threshold: float = 0.0,
        nms_threshold: float = 0.3,
        target_sizes: Optional[Union[TensorType, List[Tuple]]] = None,
    ):
        """
        Converts the output of [`OwlViTForObjectDetection.image_guided_detection`] into the format expected by the COCO
        api.

        This is a thin wrapper: all arguments are forwarded by keyword to
        `self.image_processor.post_process_image_guided_detection` and its result is returned unchanged.

        Args:
            outputs ([`OwlViTImageGuidedObjectDetectionOutput`]):
                Raw outputs of the model.
            threshold (`float`, *optional*, defaults to 0.0):
                Minimum confidence threshold to use to filter out predicted boxes.
            nms_threshold (`float`, *optional*, defaults to 0.3):
                IoU threshold for non-maximum suppression of overlapping boxes.
            target_sizes (`torch.Tensor` or `List[Tuple[int, int]]`, *optional*):
                Tensor of shape (batch_size, 2) or list of tuples where each entry is the (height, width) of the
                corresponding image in the batch. If set, predicted normalized bounding boxes are rescaled to the
                target sizes. If left to None, predictions will not be unnormalized.

        Returns:
            `List[Dict]`: A list of dictionaries, each dictionary containing the following keys:
            - "scores": The confidence scores for each predicted box on the image.
            - "boxes": Image bounding boxes in (top_left_x, top_left_y, bottom_right_x, bottom_right_y) format.
            - "labels": Set to `None`.
        """
        # Forward explicitly by keyword so the call does not depend on the image processor's parameter order.
        forwarded_kwargs = {
            "outputs": outputs,
            "threshold": threshold,
            "nms_threshold": nms_threshold,
            "target_sizes": target_sizes,
        }
        return self.image_processor.post_process_image_guided_detection(**forwarded_kwargs)
    def batch_decode(self, *args, **kwargs):
        """
--- a/tests/models/owlv2/test_modeling_owlv2.py
+++ b/tests/models/owlv2/test_modeling_owlv2.py
@@ -974,8 +974,9 @@ class Owlv2ModelIntegrationTest(unittest.TestCase):
        processor = OwlViTProcessor.from_pretrained(model_name)
        image = prepare_img()
        text_labels = [["a photo of a cat", "a photo of a dog"]]
        inputs = processor(
-            text=[["a photo of a cat", "a photo of a dog"]],
+            text=text_labels,
            images=image,
            max_length=16,
            padding="max_length",
@@ -991,11 +992,31 @@ class Owlv2ModelIntegrationTest(unittest.TestCase):
        expected_slice_logits = torch.tensor(
            [[-21.413497, -21.612638], [-19.008193, -19.548841], [-20.958896, -21.382694]]
        ).to(torch_device)
-        self.assertTrue(torch.allclose(outputs.logits[0, :3, :3], expected_slice_logits, atol=1e-4))
+        resulted_slice_logits = outputs.logits[0, :3, :3]
        max_diff = torch.max(torch.abs(resulted_slice_logits - expected_slice_logits)).item()
        self.assertLess(max_diff, 3e-4)
        expected_slice_boxes = torch.tensor(
            [[0.241309, 0.051896, 0.453267], [0.139474, 0.045701, 0.250660], [0.233022, 0.050479, 0.427671]],
        ).to(torch_device)
-        self.assertTrue(torch.allclose(outputs.pred_boxes[0, :3, :3], expected_slice_boxes, atol=1e-4))
+        resulted_slice_boxes = outputs.pred_boxes[0, :3, :3]
        max_diff = torch.max(torch.abs(resulted_slice_boxes - expected_slice_boxes)).item()
        self.assertLess(max_diff, 3e-4)
        # test post-processing
        post_processed_output = processor.post_process_grounded_object_detection(outputs)
        self.assertIsNone(post_processed_output[0]["text_labels"])
        post_processed_output_with_text_labels = processor.post_process_grounded_object_detection(
            outputs, text_labels=text_labels
        )
        objects_labels = post_processed_output_with_text_labels[0]["labels"].cpu().tolist()
        self.assertListEqual(objects_labels, [0, 0])
        objects_text_labels = post_processed_output_with_text_labels[0]["text_labels"]
        self.assertIsNotNone(objects_text_labels)
        self.assertListEqual(objects_text_labels, ["a photo of a cat", "a photo of a cat"])
    @slow
    def test_inference_one_shot_object_detection(self):
--- a/tests/models/owlvit/test_modeling_owlvit.py
+++ b/tests/models/owlvit/test_modeling_owlvit.py
@@ -967,8 +967,9 @@ class OwlViTModelIntegrationTest(unittest.TestCase):
        processor = OwlViTProcessor.from_pretrained(model_name)
        image = prepare_img()
        text_labels = [["a photo of a cat", "a photo of a dog"]]
        inputs = processor(
-            text=[["a photo of a cat", "a photo of a dog"]],
+            text=text_labels,
            images=image,
            max_length=16,
            padding="max_length",
@@ -986,6 +987,21 @@ class OwlViTModelIntegrationTest(unittest.TestCase):
        ).to(torch_device)
        self.assertTrue(torch.allclose(outputs.pred_boxes[0, :3, :3], expected_slice_boxes, atol=1e-4))
        # test post-processing
        post_processed_output = processor.post_process_grounded_object_detection(outputs)
        self.assertIsNone(post_processed_output[0]["text_labels"])
        post_processed_output_with_text_labels = processor.post_process_grounded_object_detection(
            outputs, text_labels=text_labels
        )
        objects_labels = post_processed_output_with_text_labels[0]["labels"].cpu().tolist()
        self.assertListEqual(objects_labels, [0, 0])
        objects_text_labels = post_processed_output_with_text_labels[0]["text_labels"]
        self.assertIsNotNone(objects_text_labels)
        self.assertListEqual(objects_text_labels, ["a photo of a cat", "a photo of a cat"])
    @slow
    def test_inference_one_shot_object_detection(self):
        model_name = "google/owlvit-base-patch32"
--- a/utils/check_repo.py
+++ b/utils/check_repo.py
@@ -973,6 +973,7 @@ DEPRECATED_OBJECTS = [
    "xnli_processors",
    "xnli_tasks_num_labels",
    "TFTrainingArguments",
    "OwlViTFeatureExtractor",
 ]
 # Exceptionally, some objects should not be documented after all rules passed.