diff --git a/docs/source/en/model_doc/owlv2.md b/docs/source/en/model_doc/owlv2.md
index 1b4e92bc4e..696a1b0377 100644
--- a/docs/source/en/model_doc/owlv2.md
+++ b/docs/source/en/model_doc/owlv2.md
@@ -50,20 +50,22 @@ OWLv2 is, just like its predecessor [OWL-ViT](owlvit), a zero-shot text-conditio
 
 >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
 >>> image = Image.open(requests.get(url, stream=True).raw)
->>> texts = [["a photo of a cat", "a photo of a dog"]]
->>> inputs = processor(text=texts, images=image, return_tensors="pt")
+>>> text_labels = [["a photo of a cat", "a photo of a dog"]]
+>>> inputs = processor(text=text_labels, images=image, return_tensors="pt")
 >>> outputs = model(**inputs)
 
 >>> # Target image sizes (height, width) to rescale box predictions [batch_size, 2]
->>> target_sizes = torch.Tensor([image.size[::-1]])
->>> # Convert outputs (bounding boxes and class logits) to Pascal VOC Format (xmin, ymin, xmax, ymax)
->>> results = processor.post_process_object_detection(outputs=outputs, target_sizes=target_sizes, threshold=0.1)
->>> i = 0  # Retrieve predictions for the first image for the corresponding text queries
->>> text = texts[i]
->>> boxes, scores, labels = results[i]["boxes"], results[i]["scores"], results[i]["labels"]
->>> for box, score, label in zip(boxes, scores, labels):
+>>> target_sizes = torch.tensor([(image.height, image.width)])
+>>> # Convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax)
+>>> results = processor.post_process_grounded_object_detection(
+...     outputs=outputs, target_sizes=target_sizes, threshold=0.1, text_labels=text_labels
+... )
+>>> # Retrieve predictions for the first image for the corresponding text queries
+>>> result = results[0]
+>>> boxes, scores, text_labels = result["boxes"], result["scores"], result["text_labels"]
+>>> for box, score, text_label in zip(boxes, scores, text_labels):
 ...     box = [round(i, 2) for i in box.tolist()]
-...     print(f"Detected {text[label]} with confidence {round(score.item(), 3)} at location {box}")
+...     print(f"Detected {text_label} with confidence {round(score.item(), 3)} at location {box}")
 Detected a photo of a cat with confidence 0.614 at location [341.67, 23.39, 642.32, 371.35]
 Detected a photo of a cat with confidence 0.665 at location [6.75, 51.96, 326.62, 473.13]
 ```
@@ -103,6 +105,9 @@ Usage of OWLv2 is identical to [OWL-ViT](owlvit) with a new, updated image proce
 ## Owlv2Processor
 
 [[autodoc]] Owlv2Processor
+    - __call__
+    - post_process_grounded_object_detection
+    - post_process_image_guided_detection
 
 ## Owlv2Model
 
diff --git a/docs/source/en/model_doc/owlvit.md b/docs/source/en/model_doc/owlvit.md
index c40d3a9e7a..519648bbd8 100644
--- a/docs/source/en/model_doc/owlvit.md
+++ b/docs/source/en/model_doc/owlvit.md
@@ -49,20 +49,22 @@ OWL-ViT is a zero-shot text-conditioned object detection model. OWL-ViT uses [CL
 
 >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
 >>> image = Image.open(requests.get(url, stream=True).raw)
->>> texts = [["a photo of a cat", "a photo of a dog"]]
->>> inputs = processor(text=texts, images=image, return_tensors="pt")
+>>> text_labels = [["a photo of a cat", "a photo of a dog"]]
+>>> inputs = processor(text=text_labels, images=image, return_tensors="pt")
 >>> outputs = model(**inputs)
 
 >>> # Target image sizes (height, width) to rescale box predictions [batch_size, 2]
->>> target_sizes = torch.Tensor([image.size[::-1]])
+>>> target_sizes = torch.tensor([(image.height, image.width)])
 >>> # Convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax)
->>> results = processor.post_process_object_detection(outputs=outputs, target_sizes=target_sizes, threshold=0.1)
->>> i = 0  # Retrieve predictions for the first image for the corresponding text queries
->>> text = texts[i]
->>> boxes, scores, labels = results[i]["boxes"], results[i]["scores"], results[i]["labels"]
->>> for box, score, label in zip(boxes, scores, labels):
+>>> results = processor.post_process_grounded_object_detection(
+...     outputs=outputs, target_sizes=target_sizes, threshold=0.1, text_labels=text_labels
+... )
+>>> # Retrieve predictions for the first image for the corresponding text queries
+>>> result = results[0]
+>>> boxes, scores, text_labels = result["boxes"], result["scores"], result["text_labels"]
+>>> for box, score, text_label in zip(boxes, scores, text_labels):
 ...     box = [round(i, 2) for i in box.tolist()]
-...     print(f"Detected {text[label]} with confidence {round(score.item(), 3)} at location {box}")
+...     print(f"Detected {text_label} with confidence {round(score.item(), 3)} at location {box}")
 Detected a photo of a cat with confidence 0.707 at location [324.97, 20.44, 640.58, 373.29]
 Detected a photo of a cat with confidence 0.717 at location [1.46, 55.26, 315.55, 472.17]
 ```
@@ -91,16 +93,12 @@ A demo notebook on using OWL-ViT for zero- and one-shot (image-guided) object de
     - post_process_object_detection
     - post_process_image_guided_detection
 
-## OwlViTFeatureExtractor
-
-[[autodoc]] OwlViTFeatureExtractor
-    - __call__
-    - post_process
-    - post_process_image_guided_detection
-
 ## OwlViTProcessor
 
 [[autodoc]] OwlViTProcessor
+    - __call__
+    - post_process_grounded_object_detection
+    - post_process_image_guided_detection
 
 ## OwlViTModel
 
diff --git a/src/transformers/models/grounding_dino/image_processing_grounding_dino.py b/src/transformers/models/grounding_dino/image_processing_grounding_dino.py
index f3d2117ca8..e3c99568cd 100644
--- a/src/transformers/models/grounding_dino/image_processing_grounding_dino.py
+++ b/src/transformers/models/grounding_dino/image_processing_grounding_dino.py
@@ -17,7 +17,7 @@
 import io
 import pathlib
 from collections import defaultdict
-from typing import Any, Callable, Dict, Iterable, List, Optional, Set, Tuple, Union
+from typing import TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Optional, Set, Tuple, Union
 
 import numpy as np
 
@@ -77,6 +77,9 @@ if is_scipy_available():
     import scipy.special
     import scipy.stats
 
+if TYPE_CHECKING:
+    from .modeling_grounding_dino import GroundingDinoObjectDetectionOutput
+
 
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 
@@ -806,6 +809,35 @@ def compute_segments(
     return segmentation, segments
 
 
+# Copied from transformers.models.owlvit.image_processing_owlvit._scale_boxes
+def _scale_boxes(boxes, target_sizes):
+    """
+    Scale batch of bounding boxes to the target sizes.
+
+    Args:
+        boxes (`torch.Tensor` of shape `(batch_size, num_boxes, 4)`):
+            Bounding boxes to scale. Each box is expected to be in (x1, y1, x2, y2) format.
+        target_sizes (`List[Tuple[int, int]]` or `torch.Tensor` of shape `(batch_size, 2)`):
+            Target sizes to scale the boxes to. Each target size is expected to be in (height, width) format.
+
+    Returns:
+        `torch.Tensor` of shape `(batch_size, num_boxes, 4)`: Scaled bounding boxes.
+    """
+
+    if isinstance(target_sizes, (list, tuple)):
+        image_height = torch.tensor([i[0] for i in target_sizes])
+        image_width = torch.tensor([i[1] for i in target_sizes])
+    elif isinstance(target_sizes, torch.Tensor):
+        image_height, image_width = target_sizes.unbind(1)
+    else:
+        raise ValueError("`target_sizes` must be a list, tuple or torch.Tensor")
+
+    scale_factor = torch.stack([image_width, image_height, image_width, image_height], dim=1)
+    scale_factor = scale_factor.unsqueeze(1).to(boxes.device)
+    boxes = boxes * scale_factor
+    return boxes
+
+
 class GroundingDinoImageProcessor(BaseImageProcessor):
     r"""
     Constructs a Grounding DINO image processor.
@@ -1533,7 +1565,10 @@ class GroundingDinoImageProcessor(BaseImageProcessor):
 
     # Copied from transformers.models.owlvit.image_processing_owlvit.OwlViTImageProcessor.post_process_object_detection with OwlViT->GroundingDino
     def post_process_object_detection(
-        self, outputs, threshold: float = 0.1, target_sizes: Union[TensorType, List[Tuple]] = None
+        self,
+        outputs: "GroundingDinoObjectDetectionOutput",
+        threshold: float = 0.1,
+        target_sizes: Optional[Union[TensorType, List[Tuple]]] = None,
     ):
         """
         Converts the raw output of [`GroundingDinoForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y,
@@ -1542,48 +1577,43 @@ class GroundingDinoImageProcessor(BaseImageProcessor):
         Args:
             outputs ([`GroundingDinoObjectDetectionOutput`]):
                 Raw outputs of the model.
-            threshold (`float`, *optional*):
+            threshold (`float`, *optional*, defaults to 0.1):
                 Score threshold to keep object detection predictions.
             target_sizes (`torch.Tensor` or `List[Tuple[int, int]]`, *optional*):
                 Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size
                 `(height, width)` of each image in the batch. If unset, predictions will not be resized.
+
         Returns:
-            `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image
-            in the batch as predicted by the model.
+            `List[Dict]`: A list of dictionaries, each dictionary containing the following keys:
+            - "scores": The confidence scores for each predicted box on the image.
+            - "labels": Indexes of the classes predicted by the model on the image.
+            - "boxes": Image bounding boxes in (top_left_x, top_left_y, bottom_right_x, bottom_right_y) format.
         """
-        # TODO: (amy) add support for other frameworks
-        logits, boxes = outputs.logits, outputs.pred_boxes
+        batch_logits, batch_boxes = outputs.logits, outputs.pred_boxes
+        batch_size = len(batch_logits)
 
-        if target_sizes is not None:
-            if len(logits) != len(target_sizes):
-                raise ValueError(
-                    "Make sure that you pass in as many target sizes as the batch dimension of the logits"
-                )
+        if target_sizes is not None and len(target_sizes) != batch_size:
+            raise ValueError("Make sure that you pass in as many target sizes as images")
 
-        probs = torch.max(logits, dim=-1)
-        scores = torch.sigmoid(probs.values)
-        labels = probs.indices
+        # batch_logits of shape (batch_size, num_queries, num_classes)
+        batch_class_logits = torch.max(batch_logits, dim=-1)
+        batch_scores = torch.sigmoid(batch_class_logits.values)
+        batch_labels = batch_class_logits.indices
 
         # Convert to [x0, y0, x1, y1] format
-        boxes = center_to_corners_format(boxes)
+        batch_boxes = center_to_corners_format(batch_boxes)
 
         # Convert from relative [0, 1] to absolute [0, height] coordinates
         if target_sizes is not None:
-            if isinstance(target_sizes, List):
-                img_h = torch.Tensor([i[0] for i in target_sizes])
-                img_w = torch.Tensor([i[1] for i in target_sizes])
-            else:
-                img_h, img_w = target_sizes.unbind(1)
-
-            scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device)
-            boxes = boxes * scale_fct[:, None, :]
+            batch_boxes = _scale_boxes(batch_boxes, target_sizes)
 
         results = []
-        for s, l, b in zip(scores, labels, boxes):
-            score = s[s > threshold]
-            label = l[s > threshold]
-            box = b[s > threshold]
-            results.append({"scores": score, "labels": label, "boxes": box})
+        for scores, labels, boxes in zip(batch_scores, batch_labels, batch_boxes):
+            keep = scores > threshold
+            scores = scores[keep]
+            labels = labels[keep]
+            boxes = boxes[keep]
+            results.append({"scores": scores, "labels": labels, "boxes": boxes})
 
         return results
 
diff --git a/src/transformers/models/owlv2/image_processing_owlv2.py b/src/transformers/models/owlv2/image_processing_owlv2.py
index ac637d62dd..1dfdfbd1c2 100644
--- a/src/transformers/models/owlv2/image_processing_owlv2.py
+++ b/src/transformers/models/owlv2/image_processing_owlv2.py
@@ -15,7 +15,7 @@
 """Image processor class for OWLv2."""
 
 import warnings
-from typing import Dict, List, Optional, Tuple, Union
+from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union
 
 import numpy as np
 
@@ -60,10 +60,43 @@ if is_vision_available():
 if is_scipy_available():
     from scipy import ndimage as ndi
 
+if TYPE_CHECKING:
+    from .modeling_owlv2 import Owlv2ObjectDetectionOutput
 
 logger = logging.get_logger(__name__)
 
 
+def _scale_boxes(boxes, target_sizes):
+    """
+    Scale batch of bounding boxes to the target sizes.
+
+    Args:
+        boxes (`torch.Tensor` of shape `(batch_size, num_boxes, 4)`):
+            Bounding boxes to scale. Each box is expected to be in (x1, y1, x2, y2) format.
+        target_sizes (`List[Tuple[int, int]]` or `torch.Tensor` of shape `(batch_size, 2)`):
+            Target sizes to scale the boxes to. Each target size is expected to be in (height, width) format.
+
+    Returns:
+        `torch.Tensor` of shape `(batch_size, num_boxes, 4)`: Scaled bounding boxes.
+    """
+
+    if isinstance(target_sizes, (list, tuple)):
+        image_height = torch.tensor([i[0] for i in target_sizes])
+        image_width = torch.tensor([i[1] for i in target_sizes])
+    elif isinstance(target_sizes, torch.Tensor):
+        image_height, image_width = target_sizes.unbind(1)
+    else:
+        raise ValueError("`target_sizes` must be a list, tuple or torch.Tensor")
+
+    # for owlv2 image is padded to max size unlike owlvit, thats why we have to scale boxes to max size
+    max_size = torch.max(image_height, image_width)
+
+    scale_factor = torch.stack([max_size, max_size, max_size, max_size], dim=1)
+    scale_factor = scale_factor.unsqueeze(1).to(boxes.device)
+    boxes = boxes * scale_factor
+    return boxes
+
+
 # Copied from transformers.models.owlvit.image_processing_owlvit._upcast
 def _upcast(t):
     # Protects from numerical overflows in multiplications by upcasting to the equivalent higher type
@@ -466,62 +499,57 @@ class Owlv2ImageProcessor(BaseImageProcessor):
         data = {"pixel_values": images}
         return BatchFeature(data=data, tensor_type=return_tensors)
 
+    # Copied from transformers.models.owlvit.image_processing_owlvit.OwlViTImageProcessor.post_process_object_detection with OwlViT->Owlv2
     def post_process_object_detection(
-        self, outputs, threshold: float = 0.1, target_sizes: Union[TensorType, List[Tuple]] = None
+        self,
+        outputs: "Owlv2ObjectDetectionOutput",
+        threshold: float = 0.1,
+        target_sizes: Optional[Union[TensorType, List[Tuple]]] = None,
     ):
         """
-        Converts the raw output of [`OwlViTForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y,
+        Converts the raw output of [`Owlv2ForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y,
         bottom_right_x, bottom_right_y) format.
 
         Args:
-            outputs ([`OwlViTObjectDetectionOutput`]):
+            outputs ([`Owlv2ObjectDetectionOutput`]):
                 Raw outputs of the model.
-            threshold (`float`, *optional*):
+            threshold (`float`, *optional*, defaults to 0.1):
                 Score threshold to keep object detection predictions.
             target_sizes (`torch.Tensor` or `List[Tuple[int, int]]`, *optional*):
                 Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size
                 `(height, width)` of each image in the batch. If unset, predictions will not be resized.
+
         Returns:
-            `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image
-            in the batch as predicted by the model.
+            `List[Dict]`: A list of dictionaries, each dictionary containing the following keys:
+            - "scores": The confidence scores for each predicted box on the image.
+            - "labels": Indexes of the classes predicted by the model on the image.
+            - "boxes": Image bounding boxes in (top_left_x, top_left_y, bottom_right_x, bottom_right_y) format.
         """
-        # TODO: (amy) add support for other frameworks
-        logits, boxes = outputs.logits, outputs.pred_boxes
+        batch_logits, batch_boxes = outputs.logits, outputs.pred_boxes
+        batch_size = len(batch_logits)
 
-        if target_sizes is not None:
-            if len(logits) != len(target_sizes):
-                raise ValueError(
-                    "Make sure that you pass in as many target sizes as the batch dimension of the logits"
-                )
+        if target_sizes is not None and len(target_sizes) != batch_size:
+            raise ValueError("Make sure that you pass in as many target sizes as images")
 
-        probs = torch.max(logits, dim=-1)
-        scores = torch.sigmoid(probs.values)
-        labels = probs.indices
+        # batch_logits of shape (batch_size, num_queries, num_classes)
+        batch_class_logits = torch.max(batch_logits, dim=-1)
+        batch_scores = torch.sigmoid(batch_class_logits.values)
+        batch_labels = batch_class_logits.indices
 
         # Convert to [x0, y0, x1, y1] format
-        boxes = center_to_corners_format(boxes)
+        batch_boxes = center_to_corners_format(batch_boxes)
 
         # Convert from relative [0, 1] to absolute [0, height] coordinates
         if target_sizes is not None:
-            if isinstance(target_sizes, List):
-                img_h = torch.Tensor([i[0] for i in target_sizes])
-                img_w = torch.Tensor([i[1] for i in target_sizes])
-            else:
-                img_h, img_w = target_sizes.unbind(1)
-
-            # Rescale coordinates, image is padded to square for inference,
-            # that is why we need to scale boxes to the max size
-            size = torch.max(img_h, img_w)
-            scale_fct = torch.stack([size, size, size, size], dim=1).to(boxes.device)
-
-            boxes = boxes * scale_fct[:, None, :]
+            batch_boxes = _scale_boxes(batch_boxes, target_sizes)
 
         results = []
-        for s, l, b in zip(scores, labels, boxes):
-            score = s[s > threshold]
-            label = l[s > threshold]
-            box = b[s > threshold]
-            results.append({"scores": score, "labels": label, "boxes": box})
+        for scores, labels, boxes in zip(batch_scores, batch_labels, batch_boxes):
+            keep = scores > threshold
+            scores = scores[keep]
+            labels = labels[keep]
+            boxes = boxes[keep]
+            results.append({"scores": scores, "labels": labels, "boxes": boxes})
 
         return results
 
@@ -574,13 +602,7 @@ class Owlv2ImageProcessor(BaseImageProcessor):
 
         # Convert from relative [0, 1] to absolute [0, height] coordinates
         if target_sizes is not None:
-            if isinstance(target_sizes, List):
-                img_h = torch.tensor([i[0] for i in target_sizes])
-                img_w = torch.tensor([i[1] for i in target_sizes])
-            else:
-                img_h, img_w = target_sizes.unbind(1)
-            scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(target_boxes.device)
-            target_boxes = target_boxes * scale_fct[:, None, :]
+            target_boxes = _scale_boxes(target_boxes, target_sizes)
 
         # Compute box display alphas based on prediction scores
         results = []
diff --git a/src/transformers/models/owlv2/modeling_owlv2.py b/src/transformers/models/owlv2/modeling_owlv2.py
index d8a46d0b0f..d69bcaa87f 100644
--- a/src/transformers/models/owlv2/modeling_owlv2.py
+++ b/src/transformers/models/owlv2/modeling_owlv2.py
@@ -1749,33 +1749,30 @@ class Owlv2ForObjectDetection(Owlv2PreTrainedModel):
         >>> import requests
         >>> from PIL import Image
         >>> import torch
-        >>> from transformers import AutoProcessor, Owlv2ForObjectDetection
 
-        >>> processor = AutoProcessor.from_pretrained("google/owlv2-base-patch16-ensemble")
+        >>> from transformers import Owlv2Processor, Owlv2ForObjectDetection
+
+        >>> processor = Owlv2Processor.from_pretrained("google/owlv2-base-patch16-ensemble")
         >>> model = Owlv2ForObjectDetection.from_pretrained("google/owlv2-base-patch16-ensemble")
 
         >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
         >>> image = Image.open(requests.get(url, stream=True).raw)
-        >>> texts = [["a photo of a cat", "a photo of a dog"]]
-        >>> inputs = processor(text=texts, images=image, return_tensors="pt")
+        >>> text_labels = [["a photo of a cat", "a photo of a dog"]]
+        >>> inputs = processor(text=text_labels, images=image, return_tensors="pt")
+        >>> outputs = model(**inputs)
 
-        >>> # forward pass
-        >>> with torch.no_grad():
-        ...     outputs = model(**inputs)
-
-        >>> target_sizes = torch.Tensor([image.size[::-1]])
-        >>> # Convert outputs (bounding boxes and class logits) to final bounding boxes and scores
-        >>> results = processor.post_process_object_detection(
-        ...     outputs=outputs, threshold=0.2, target_sizes=target_sizes
+        >>> # Target image sizes (height, width) to rescale box predictions [batch_size, 2]
+        >>> target_sizes = torch.tensor([(image.height, image.width)])
+        >>> # Convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax)
+        >>> results = processor.post_process_grounded_object_detection(
+        ...     outputs=outputs, target_sizes=target_sizes, threshold=0.1, text_labels=text_labels
         ... )
-
-        >>> i = 0  # Retrieve predictions for the first image for the corresponding text queries
-        >>> text = texts[i]
-        >>> boxes, scores, labels = results[i]["boxes"], results[i]["scores"], results[i]["labels"]
-
-        >>> for box, score, label in zip(boxes, scores, labels):
+        >>> # Retrieve predictions for the first image for the corresponding text queries
+        >>> result = results[0]
+        >>> boxes, scores, text_labels = result["boxes"], result["scores"], result["text_labels"]
+        >>> for box, score, text_label in zip(boxes, scores, text_labels):
         ...     box = [round(i, 2) for i in box.tolist()]
-        ...     print(f"Detected {text[label]} with confidence {round(score.item(), 3)} at location {box}")
+        ...     print(f"Detected {text_label} with confidence {round(score.item(), 3)} at location {box}")
         Detected a photo of a cat with confidence 0.614 at location [341.67, 23.39, 642.32, 371.35]
         Detected a photo of a cat with confidence 0.665 at location [6.75, 51.96, 326.62, 473.13]
         ```"""
diff --git a/src/transformers/models/owlv2/processing_owlv2.py b/src/transformers/models/owlv2/processing_owlv2.py
index 4a0b5a712e..b79ab626f7 100644
--- a/src/transformers/models/owlv2/processing_owlv2.py
+++ b/src/transformers/models/owlv2/processing_owlv2.py
@@ -16,13 +16,18 @@
 Image/Text processor class for OWLv2
 """
 
-from typing import List
+import warnings
+from typing import TYPE_CHECKING, List, Optional, Tuple, Union
 
 import numpy as np
 
 from ...processing_utils import ProcessorMixin
 from ...tokenization_utils_base import BatchEncoding
-from ...utils import is_flax_available, is_tf_available, is_torch_available
+from ...utils import TensorType, is_flax_available, is_tf_available, is_torch_available
+
+
+if TYPE_CHECKING:
+    from .modeling_owlv2 import Owlv2ImageGuidedObjectDetectionOutput, Owlv2ObjectDetectionOutput
 
 
 class Owlv2Processor(ProcessorMixin):
@@ -45,7 +50,7 @@ class Owlv2Processor(ProcessorMixin):
     def __init__(self, image_processor, tokenizer, **kwargs):
         super().__init__(image_processor, tokenizer)
 
-    # Copied from transformers.models.owlvit.processing_owlvit.OwlViTProcessor.__call__ with OWLViT->OWLv2
+    # Copied from transformers.models.owlvit.processing_owlvit.OwlViTProcessor.__call__ with OwlViT->Owlv2
     def __call__(self, text=None, images=None, query_images=None, padding="max_length", return_tensors="np", **kwargs):
         """
         Main method to prepare for the model one or several text(s) and image(s). This method forwards the `text` and
@@ -157,21 +162,101 @@ class Owlv2Processor(ProcessorMixin):
         else:
             return BatchEncoding(data=dict(**image_features), tensor_type=return_tensors)
 
-    # Copied from transformers.models.owlvit.processing_owlvit.OwlViTProcessor.post_process_object_detection with OWLViT->OWLv2
+    # Copied from transformers.models.owlvit.processing_owlvit.OwlViTProcessor.post_process_object_detection with OwlViT->Owlv2
     def post_process_object_detection(self, *args, **kwargs):
         """
-        This method forwards all its arguments to [`OwlViTImageProcessor.post_process_object_detection`]. Please refer
+        This method forwards all its arguments to [`Owlv2ImageProcessor.post_process_object_detection`]. Please refer
         to the docstring of this method for more information.
         """
+        warnings.warn(
+            "`post_process_object_detection` method is deprecated for OwlVitProcessor and will be removed in v5. "
+            "Use `post_process_grounded_object_detection` instead.",
+            FutureWarning,
+        )
         return self.image_processor.post_process_object_detection(*args, **kwargs)
 
-    # Copied from transformers.models.owlvit.processing_owlvit.OwlViTProcessor.post_process_image_guided_detection with OWLViT->OWLv2
-    def post_process_image_guided_detection(self, *args, **kwargs):
+    # Copied from transformers.models.owlvit.processing_owlvit.OwlViTProcessor.post_process_grounded_object_detection with OwlViT->Owlv2
+    def post_process_grounded_object_detection(
+        self,
+        outputs: "Owlv2ObjectDetectionOutput",
+        threshold: float = 0.1,
+        target_sizes: Optional[Union[TensorType, List[Tuple]]] = None,
+        text_labels: Optional[List[List[str]]] = None,
+    ):
         """
-        This method forwards all its arguments to [`OwlViTImageProcessor.post_process_one_shot_object_detection`].
-        Please refer to the docstring of this method for more information.
+        Converts the raw output of [`Owlv2ForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y,
+        bottom_right_x, bottom_right_y) format.
+
+        Args:
+            outputs ([`Owlv2ObjectDetectionOutput`]):
+                Raw outputs of the model.
+            threshold (`float`, *optional*, defaults to 0.1):
+                Score threshold to keep object detection predictions.
+            target_sizes (`torch.Tensor` or `List[Tuple[int, int]]`, *optional*):
+                Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size
+                `(height, width)` of each image in the batch. If unset, predictions will not be resized.
+            text_labels (`List[List[str]]`, *optional*):
+                List of lists of text labels for each image in the batch. If unset, "text_labels" in output will be
+                set to `None`.
+
+        Returns:
+            `List[Dict]`: A list of dictionaries, each dictionary containing the following keys:
+            - "scores": The confidence scores for each predicted box on the image.
+            - "labels": Indexes of the classes predicted by the model on the image.
+            - "boxes": Image bounding boxes in (top_left_x, top_left_y, bottom_right_x, bottom_right_y) format.
+            - "text_labels": The text labels for each predicted bounding box on the image.
         """
-        return self.image_processor.post_process_image_guided_detection(*args, **kwargs)
+        output = self.image_processor.post_process_object_detection(
+            outputs=outputs, threshold=threshold, target_sizes=target_sizes
+        )
+
+        if text_labels is not None and len(text_labels) != len(output):
+            raise ValueError("Make sure that you pass in as many lists of text labels as images")
+
+        # adding text labels to the output
+        if text_labels is not None:
+            for image_output, image_text_labels in zip(output, text_labels):
+                object_text_labels = [image_text_labels[i] for i in image_output["labels"]]
+                image_output["text_labels"] = object_text_labels
+        else:
+            for image_output in output:
+                image_output["text_labels"] = None
+
+        return output
+
+    # Copied from transformers.models.owlvit.processing_owlvit.OwlViTProcessor.post_process_image_guided_detection with OwlViT->Owlv2
+    def post_process_image_guided_detection(
+        self,
+        outputs: "Owlv2ImageGuidedObjectDetectionOutput",
+        threshold: float = 0.0,
+        nms_threshold: float = 0.3,
+        target_sizes: Optional[Union[TensorType, List[Tuple]]] = None,
+    ):
+        """
+        Converts the output of [`Owlv2ForObjectDetection.image_guided_detection`] into the format expected by the COCO
+        api.
+
+        Args:
+            outputs ([`Owlv2ImageGuidedObjectDetectionOutput`]):
+                Raw outputs of the model.
+            threshold (`float`, *optional*, defaults to 0.0):
+                Minimum confidence threshold to use to filter out predicted boxes.
+            nms_threshold (`float`, *optional*, defaults to 0.3):
+                IoU threshold for non-maximum suppression of overlapping boxes.
+            target_sizes (`torch.Tensor`, *optional*):
+                Tensor of shape (batch_size, 2) where each entry is the (height, width) of the corresponding image in
+                the batch. If set, predicted normalized bounding boxes are rescaled to the target sizes. If left to
+                None, predictions will not be unnormalized.
+
+        Returns:
+            `List[Dict]`: A list of dictionaries, each dictionary containing the following keys:
+            - "scores": The confidence scores for each predicted box on the image.
+            - "boxes": Image bounding boxes in (top_left_x, top_left_y, bottom_right_x, bottom_right_y) format.
+            - "labels": Set to `None`.
+        """
+        return self.image_processor.post_process_image_guided_detection(
+            outputs=outputs, threshold=threshold, nms_threshold=nms_threshold, target_sizes=target_sizes
+        )
 
     # Copied from transformers.models.owlvit.processing_owlvit.OwlViTProcessor.batch_decode
     def batch_decode(self, *args, **kwargs):
diff --git a/src/transformers/models/owlvit/image_processing_owlvit.py b/src/transformers/models/owlvit/image_processing_owlvit.py
index 05c2fb908c..59c6465734 100644
--- a/src/transformers/models/owlvit/image_processing_owlvit.py
+++ b/src/transformers/models/owlvit/image_processing_owlvit.py
@@ -15,7 +15,7 @@
 """Image processor class for OwlViT"""
 
 import warnings
-from typing import Dict, List, Optional, Tuple, Union
+from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union
 
 import numpy as np
 
@@ -43,6 +43,9 @@ from ...image_utils import (
 from ...utils import TensorType, filter_out_non_signature_kwargs, is_torch_available, logging
 
 
+if TYPE_CHECKING:
+    from .modeling_owlvit import OwlViTObjectDetectionOutput
+
 if is_torch_available():
     import torch
 
@@ -58,6 +61,34 @@ def _upcast(t):
         return t if t.dtype in (torch.int32, torch.int64) else t.int()
 
 
+def _scale_boxes(boxes, target_sizes):
+    """
+    Scale batch of bounding boxes to the target sizes.
+
+    Args:
+        boxes (`torch.Tensor` of shape `(batch_size, num_boxes, 4)`):
+            Bounding boxes to scale. Each box is expected to be in (x1, y1, x2, y2) format.
+        target_sizes (`List[Tuple[int, int]]` or `torch.Tensor` of shape `(batch_size, 2)`):
+            Target sizes to scale the boxes to. Each target size is expected to be in (height, width) format.
+
+    Returns:
+        `torch.Tensor` of shape `(batch_size, num_boxes, 4)`: Scaled bounding boxes.
+    """
+
+    if isinstance(target_sizes, (list, tuple)):
+        image_height = torch.tensor([i[0] for i in target_sizes])
+        image_width = torch.tensor([i[1] for i in target_sizes])
+    elif isinstance(target_sizes, torch.Tensor):
+        image_height, image_width = target_sizes.unbind(1)
+    else:
+        raise ValueError("`target_sizes` must be a list, tuple or torch.Tensor")
+
+    scale_factor = torch.stack([image_width, image_height, image_width, image_height], dim=1)
+    scale_factor = scale_factor.unsqueeze(1).to(boxes.device)
+    boxes = boxes * scale_factor
+    return boxes
+
+
 def box_area(boxes):
     """
     Computes the area of a set of bounding boxes, which are specified by its (x1, y1, x2, y2) coordinates.
@@ -459,7 +490,10 @@ class OwlViTImageProcessor(BaseImageProcessor):
         return results
 
     def post_process_object_detection(
-        self, outputs, threshold: float = 0.1, target_sizes: Union[TensorType, List[Tuple]] = None
+        self,
+        outputs: "OwlViTObjectDetectionOutput",
+        threshold: float = 0.1,
+        target_sizes: Optional[Union[TensorType, List[Tuple]]] = None,
     ):
         """
         Converts the raw output of [`OwlViTForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y,
@@ -468,52 +502,46 @@ class OwlViTImageProcessor(BaseImageProcessor):
         Args:
             outputs ([`OwlViTObjectDetectionOutput`]):
                 Raw outputs of the model.
-            threshold (`float`, *optional*):
+            threshold (`float`, *optional*, defaults to 0.1):
                 Score threshold to keep object detection predictions.
             target_sizes (`torch.Tensor` or `List[Tuple[int, int]]`, *optional*):
                 Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size
                 `(height, width)` of each image in the batch. If unset, predictions will not be resized.
+
         Returns:
-            `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image
-            in the batch as predicted by the model.
+            `List[Dict]`: A list of dictionaries, each dictionary containing the following keys:
+            - "scores": The confidence scores for each predicted box on the image.
+            - "labels": Indexes of the classes predicted by the model on the image.
+            - "boxes": Image bounding boxes in (top_left_x, top_left_y, bottom_right_x, bottom_right_y) format.
         """
-        # TODO: (amy) add support for other frameworks
-        logits, boxes = outputs.logits, outputs.pred_boxes
+        batch_logits, batch_boxes = outputs.logits, outputs.pred_boxes
+        batch_size = len(batch_logits)
 
-        if target_sizes is not None:
-            if len(logits) != len(target_sizes):
-                raise ValueError(
-                    "Make sure that you pass in as many target sizes as the batch dimension of the logits"
-                )
+        if target_sizes is not None and len(target_sizes) != batch_size:
+            raise ValueError("Make sure that you pass in as many target sizes as images")
 
-        probs = torch.max(logits, dim=-1)
-        scores = torch.sigmoid(probs.values)
-        labels = probs.indices
+        # batch_logits of shape (batch_size, num_queries, num_classes)
+        batch_class_logits = torch.max(batch_logits, dim=-1)
+        batch_scores = torch.sigmoid(batch_class_logits.values)
+        batch_labels = batch_class_logits.indices
 
         # Convert to [x0, y0, x1, y1] format
-        boxes = center_to_corners_format(boxes)
+        batch_boxes = center_to_corners_format(batch_boxes)
 
         # Convert from relative [0, 1] to absolute [0, height] coordinates
         if target_sizes is not None:
-            if isinstance(target_sizes, List):
-                img_h = torch.Tensor([i[0] for i in target_sizes])
-                img_w = torch.Tensor([i[1] for i in target_sizes])
-            else:
-                img_h, img_w = target_sizes.unbind(1)
-
-            scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device)
-            boxes = boxes * scale_fct[:, None, :]
+            batch_boxes = _scale_boxes(batch_boxes, target_sizes)
 
         results = []
-        for s, l, b in zip(scores, labels, boxes):
-            score = s[s > threshold]
-            label = l[s > threshold]
-            box = b[s > threshold]
-            results.append({"scores": score, "labels": label, "boxes": box})
+        for scores, labels, boxes in zip(batch_scores, batch_labels, batch_boxes):
+            keep = scores > threshold
+            scores = scores[keep]
+            labels = labels[keep]
+            boxes = boxes[keep]
+            results.append({"scores": scores, "labels": labels, "boxes": boxes})
 
         return results
 
-    # TODO: (Amy) Make compatible with other frameworks
     def post_process_image_guided_detection(self, outputs, threshold=0.0, nms_threshold=0.3, target_sizes=None):
         """
         Converts the output of [`OwlViTForObjectDetection.image_guided_detection`] into the format expected by the COCO
@@ -562,13 +590,7 @@ class OwlViTImageProcessor(BaseImageProcessor):
 
         # Convert from relative [0, 1] to absolute [0, height] coordinates
         if target_sizes is not None:
-            if isinstance(target_sizes, List):
-                img_h = torch.tensor([i[0] for i in target_sizes])
-                img_w = torch.tensor([i[1] for i in target_sizes])
-            else:
-                img_h, img_w = target_sizes.unbind(1)
-            scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(target_boxes.device)
-            target_boxes = target_boxes * scale_fct[:, None, :]
+            target_boxes = _scale_boxes(target_boxes, target_sizes)
 
         # Compute box display alphas based on prediction scores
         results = []
diff --git a/src/transformers/models/owlvit/modeling_owlvit.py b/src/transformers/models/owlvit/modeling_owlvit.py
index 14f41fafac..d9c0e72409 100644
--- a/src/transformers/models/owlvit/modeling_owlvit.py
+++ b/src/transformers/models/owlvit/modeling_owlvit.py
@@ -1689,31 +1689,30 @@ class OwlViTForObjectDetection(OwlViTPreTrainedModel):
         >>> import requests
         >>> from PIL import Image
         >>> import torch
-        >>> from transformers import AutoProcessor, OwlViTForObjectDetection
 
-        >>> processor = AutoProcessor.from_pretrained("google/owlvit-base-patch32")
+        >>> from transformers import OwlViTProcessor, OwlViTForObjectDetection
+
+        >>> processor = OwlViTProcessor.from_pretrained("google/owlvit-base-patch32")
         >>> model = OwlViTForObjectDetection.from_pretrained("google/owlvit-base-patch32")
 
         >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
         >>> image = Image.open(requests.get(url, stream=True).raw)
-        >>> texts = [["a photo of a cat", "a photo of a dog"]]
-        >>> inputs = processor(text=texts, images=image, return_tensors="pt")
+        >>> text_labels = [["a photo of a cat", "a photo of a dog"]]
+        >>> inputs = processor(text=text_labels, images=image, return_tensors="pt")
         >>> outputs = model(**inputs)
 
         >>> # Target image sizes (height, width) to rescale box predictions [batch_size, 2]
-        >>> target_sizes = torch.Tensor([image.size[::-1]])
-        >>> # Convert outputs (bounding boxes and class logits) to final bounding boxes and scores
-        >>> results = processor.post_process_object_detection(
-        ...     outputs=outputs, threshold=0.1, target_sizes=target_sizes
+        >>> target_sizes = torch.tensor([(image.height, image.width)])
+        >>> # Convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax)
+        >>> results = processor.post_process_grounded_object_detection(
+        ...     outputs=outputs, target_sizes=target_sizes, threshold=0.1, text_labels=text_labels
         ... )
-
-        >>> i = 0  # Retrieve predictions for the first image for the corresponding text queries
-        >>> text = texts[i]
-        >>> boxes, scores, labels = results[i]["boxes"], results[i]["scores"], results[i]["labels"]
-
-        >>> for box, score, label in zip(boxes, scores, labels):
+        >>> # Retrieve predictions for the first image for the corresponding text queries
+        >>> result = results[0]
+        >>> boxes, scores, text_labels = result["boxes"], result["scores"], result["text_labels"]
+        >>> for box, score, text_label in zip(boxes, scores, text_labels):
         ...     box = [round(i, 2) for i in box.tolist()]
-        ...     print(f"Detected {text[label]} with confidence {round(score.item(), 3)} at location {box}")
+        ...     print(f"Detected {text_label} with confidence {round(score.item(), 3)} at location {box}")
         Detected a photo of a cat with confidence 0.707 at location [324.97, 20.44, 640.58, 373.29]
         Detected a photo of a cat with confidence 0.717 at location [1.46, 55.26, 315.55, 472.17]
         ```"""
diff --git a/src/transformers/models/owlvit/processing_owlvit.py b/src/transformers/models/owlvit/processing_owlvit.py
index 49e913a384..dd74da5546 100644
--- a/src/transformers/models/owlvit/processing_owlvit.py
+++ b/src/transformers/models/owlvit/processing_owlvit.py
@@ -17,13 +17,17 @@ Image/Text processor class for OWL-ViT
 """
 
 import warnings
-from typing import List
+from typing import TYPE_CHECKING, List, Optional, Tuple, Union
 
 import numpy as np
 
 from ...processing_utils import ProcessorMixin
 from ...tokenization_utils_base import BatchEncoding
-from ...utils import is_flax_available, is_tf_available, is_torch_available
+from ...utils import TensorType, is_flax_available, is_tf_available, is_torch_available
+
+
+if TYPE_CHECKING:
+    from .modeling_owlvit import OwlViTImageGuidedObjectDetectionOutput, OwlViTObjectDetectionOutput
 
 
 class OwlViTProcessor(ProcessorMixin):
@@ -184,14 +188,93 @@ class OwlViTProcessor(ProcessorMixin):
         This method forwards all its arguments to [`OwlViTImageProcessor.post_process_object_detection`]. Please refer
         to the docstring of this method for more information.
         """
+        warnings.warn(
+            "`post_process_object_detection` method is deprecated for OwlVitProcessor and will be removed in v5. "
+            "Use `post_process_grounded_object_detection` instead.",
+            FutureWarning,
+        )
         return self.image_processor.post_process_object_detection(*args, **kwargs)
 
-    def post_process_image_guided_detection(self, *args, **kwargs):
+    def post_process_grounded_object_detection(
+        self,
+        outputs: "OwlViTObjectDetectionOutput",
+        threshold: float = 0.1,
+        target_sizes: Optional[Union[TensorType, List[Tuple]]] = None,
+        text_labels: Optional[List[List[str]]] = None,
+    ):
         """
-        This method forwards all its arguments to [`OwlViTImageProcessor.post_process_one_shot_object_detection`].
-        Please refer to the docstring of this method for more information.
+        Converts the raw output of [`OwlViTForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y,
+        bottom_right_x, bottom_right_y) format.
+
+        Args:
+            outputs ([`OwlViTObjectDetectionOutput`]):
+                Raw outputs of the model.
+            threshold (`float`, *optional*, defaults to 0.1):
+                Score threshold to keep object detection predictions.
+            target_sizes (`torch.Tensor` or `List[Tuple[int, int]]`, *optional*):
+                Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size
+                `(height, width)` of each image in the batch. If unset, predictions will not be resized.
+            text_labels (`List[List[str]]`, *optional*):
+                List of lists of text labels for each image in the batch. If unset, "text_labels" in output will be
+                set to `None`.
+
+        Returns:
+            `List[Dict]`: A list of dictionaries, each dictionary containing the following keys:
+            - "scores": The confidence scores for each predicted box on the image.
+            - "labels": Indexes of the classes predicted by the model on the image.
+            - "boxes": Image bounding boxes in (top_left_x, top_left_y, bottom_right_x, bottom_right_y) format.
+            - "text_labels": The text labels for each predicted bounding box on the image.
         """
-        return self.image_processor.post_process_image_guided_detection(*args, **kwargs)
+        output = self.image_processor.post_process_object_detection(
+            outputs=outputs, threshold=threshold, target_sizes=target_sizes
+        )
+
+        if text_labels is not None and len(text_labels) != len(output):
+            raise ValueError("Make sure that you pass in as many lists of text labels as images")
+
+        # adding text labels to the output
+        if text_labels is not None:
+            for image_output, image_text_labels in zip(output, text_labels):
+                object_text_labels = [image_text_labels[i] for i in image_output["labels"]]
+                image_output["text_labels"] = object_text_labels
+        else:
+            for image_output in output:
+                image_output["text_labels"] = None
+
+        return output
+
+    def post_process_image_guided_detection(
+        self,
+        outputs: "OwlViTImageGuidedObjectDetectionOutput",
+        threshold: float = 0.0,
+        nms_threshold: float = 0.3,
+        target_sizes: Optional[Union[TensorType, List[Tuple]]] = None,
+    ):
+        """
+        Converts the output of [`OwlViTForObjectDetection.image_guided_detection`] into the format expected by the COCO
+        api.
+
+        Args:
+            outputs ([`OwlViTImageGuidedObjectDetectionOutput`]):
+                Raw outputs of the model.
+            threshold (`float`, *optional*, defaults to 0.0):
+                Minimum confidence threshold to use to filter out predicted boxes.
+            nms_threshold (`float`, *optional*, defaults to 0.3):
+                IoU threshold for non-maximum suppression of overlapping boxes.
+            target_sizes (`torch.Tensor`, *optional*):
+                Tensor of shape (batch_size, 2) where each entry is the (height, width) of the corresponding image in
+                the batch. If set, predicted normalized bounding boxes are rescaled to the target sizes. If left to
+                None, predictions will not be unnormalized.
+
+        Returns:
+            `List[Dict]`: A list of dictionaries, each dictionary containing the following keys:
+            - "scores": The confidence scores for each predicted box on the image.
+            - "boxes": Image bounding boxes in (top_left_x, top_left_y, bottom_right_x, bottom_right_y) format.
+            - "labels": Set to `None`.
+        """
+        return self.image_processor.post_process_image_guided_detection(
+            outputs=outputs, threshold=threshold, nms_threshold=nms_threshold, target_sizes=target_sizes
+        )
 
     def batch_decode(self, *args, **kwargs):
         """
diff --git a/tests/models/owlv2/test_modeling_owlv2.py b/tests/models/owlv2/test_modeling_owlv2.py
index b35f58e99a..cab47c2b5e 100644
--- a/tests/models/owlv2/test_modeling_owlv2.py
+++ b/tests/models/owlv2/test_modeling_owlv2.py
@@ -974,8 +974,9 @@ class Owlv2ModelIntegrationTest(unittest.TestCase):
         processor = OwlViTProcessor.from_pretrained(model_name)
 
         image = prepare_img()
+        text_labels = [["a photo of a cat", "a photo of a dog"]]
         inputs = processor(
-            text=[["a photo of a cat", "a photo of a dog"]],
+            text=text_labels,
             images=image,
             max_length=16,
             padding="max_length",
@@ -991,11 +992,31 @@ class Owlv2ModelIntegrationTest(unittest.TestCase):
         expected_slice_logits = torch.tensor(
             [[-21.413497, -21.612638], [-19.008193, -19.548841], [-20.958896, -21.382694]]
         ).to(torch_device)
-        self.assertTrue(torch.allclose(outputs.logits[0, :3, :3], expected_slice_logits, atol=1e-4))
+        resulted_slice_logits = outputs.logits[0, :3, :3]
+        max_diff = torch.max(torch.abs(resulted_slice_logits - expected_slice_logits)).item()
+        self.assertLess(max_diff, 3e-4)
+
         expected_slice_boxes = torch.tensor(
             [[0.241309, 0.051896, 0.453267], [0.139474, 0.045701, 0.250660], [0.233022, 0.050479, 0.427671]],
         ).to(torch_device)
-        self.assertTrue(torch.allclose(outputs.pred_boxes[0, :3, :3], expected_slice_boxes, atol=1e-4))
+        resulted_slice_boxes = outputs.pred_boxes[0, :3, :3]
+        max_diff = torch.max(torch.abs(resulted_slice_boxes - expected_slice_boxes)).item()
+        self.assertLess(max_diff, 3e-4)
+
+        # test post-processing
+        post_processed_output = processor.post_process_grounded_object_detection(outputs)
+        self.assertIsNone(post_processed_output[0]["text_labels"])
+
+        post_processed_output_with_text_labels = processor.post_process_grounded_object_detection(
+            outputs, text_labels=text_labels
+        )
+
+        objects_labels = post_processed_output_with_text_labels[0]["labels"].cpu().tolist()
+        self.assertListEqual(objects_labels, [0, 0])
+
+        objects_text_labels = post_processed_output_with_text_labels[0]["text_labels"]
+        self.assertIsNotNone(objects_text_labels)
+        self.assertListEqual(objects_text_labels, ["a photo of a cat", "a photo of a cat"])
 
     @slow
     def test_inference_one_shot_object_detection(self):
diff --git a/tests/models/owlvit/test_modeling_owlvit.py b/tests/models/owlvit/test_modeling_owlvit.py
index 545fee0c4f..d207135a58 100644
--- a/tests/models/owlvit/test_modeling_owlvit.py
+++ b/tests/models/owlvit/test_modeling_owlvit.py
@@ -967,8 +967,9 @@ class OwlViTModelIntegrationTest(unittest.TestCase):
         processor = OwlViTProcessor.from_pretrained(model_name)
 
         image = prepare_img()
+        text_labels = [["a photo of a cat", "a photo of a dog"]]
         inputs = processor(
-            text=[["a photo of a cat", "a photo of a dog"]],
+            text=text_labels,
             images=image,
             max_length=16,
             padding="max_length",
@@ -986,6 +987,21 @@ class OwlViTModelIntegrationTest(unittest.TestCase):
         ).to(torch_device)
         self.assertTrue(torch.allclose(outputs.pred_boxes[0, :3, :3], expected_slice_boxes, atol=1e-4))
 
+        # test post-processing
+        post_processed_output = processor.post_process_grounded_object_detection(outputs)
+        self.assertIsNone(post_processed_output[0]["text_labels"])
+
+        post_processed_output_with_text_labels = processor.post_process_grounded_object_detection(
+            outputs, text_labels=text_labels
+        )
+
+        objects_labels = post_processed_output_with_text_labels[0]["labels"].cpu().tolist()
+        self.assertListEqual(objects_labels, [0, 0])
+
+        objects_text_labels = post_processed_output_with_text_labels[0]["text_labels"]
+        self.assertIsNotNone(objects_text_labels)
+        self.assertListEqual(objects_text_labels, ["a photo of a cat", "a photo of a cat"])
+
     @slow
     def test_inference_one_shot_object_detection(self):
         model_name = "google/owlvit-base-patch32"
diff --git a/utils/check_repo.py b/utils/check_repo.py
index d35bf27420..b4aed44062 100644
--- a/utils/check_repo.py
+++ b/utils/check_repo.py
@@ -973,6 +973,7 @@ DEPRECATED_OBJECTS = [
     "xnli_processors",
     "xnli_tasks_num_labels",
     "TFTrainingArguments",
+    "OwlViTFeatureExtractor",
 ]
 
 # Exceptionally, some objects should not be documented after all rules passed.