OwlViT/Owlv2 post processing standardization (#34929)

* Refactor owlvit post_process_object_detection + add text_labels

* Fix copies in grounding dino

* Sync with Owlv2 postprocessing

* Add post_process_grounded_object_detection method to processor, deprecate post_process_object_detection

* Add test cases

* Move text_labels to processors only

* [run-slow] owlvit owlv2

* [run-slow] owlvit, owlv2

* Update snippets

* Update docs structure

* Update deprecated objects for check_repo

* Update docstring for post processing of image guided object detection
This commit is contained in:
Pavel Iakubovskii
2025-01-17 13:58:28 +00:00
committed by GitHub
parent add5f0566c
commit 94ae9a8da1
12 changed files with 467 additions and 188 deletions

View File

@@ -50,20 +50,22 @@ OWLv2 is, just like its predecessor [OWL-ViT](owlvit), a zero-shot text-conditio
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw) >>> image = Image.open(requests.get(url, stream=True).raw)
>>> texts = [["a photo of a cat", "a photo of a dog"]] >>> text_labels = [["a photo of a cat", "a photo of a dog"]]
>>> inputs = processor(text=texts, images=image, return_tensors="pt") >>> inputs = processor(text=text_labels, images=image, return_tensors="pt")
>>> outputs = model(**inputs) >>> outputs = model(**inputs)
>>> # Target image sizes (height, width) to rescale box predictions [batch_size, 2] >>> # Target image sizes (height, width) to rescale box predictions [batch_size, 2]
>>> target_sizes = torch.Tensor([image.size[::-1]]) >>> target_sizes = torch.tensor([(image.height, image.width)])
>>> # Convert outputs (bounding boxes and class logits) to Pascal VOC Format (xmin, ymin, xmax, ymax) >>> # Convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax)
>>> results = processor.post_process_object_detection(outputs=outputs, target_sizes=target_sizes, threshold=0.1) >>> results = processor.post_process_grounded_object_detection(
>>> i = 0 # Retrieve predictions for the first image for the corresponding text queries ... outputs=outputs, target_sizes=target_sizes, threshold=0.1, text_labels=text_labels
>>> text = texts[i] ... )
>>> boxes, scores, labels = results[i]["boxes"], results[i]["scores"], results[i]["labels"] >>> # Retrieve predictions for the first image for the corresponding text queries
>>> for box, score, label in zip(boxes, scores, labels): >>> result = results[0]
>>> boxes, scores, text_labels = result["boxes"], result["scores"], result["text_labels"]
>>> for box, score, text_label in zip(boxes, scores, text_labels):
... box = [round(i, 2) for i in box.tolist()] ... box = [round(i, 2) for i in box.tolist()]
... print(f"Detected {text[label]} with confidence {round(score.item(), 3)} at location {box}") ... print(f"Detected {text_label} with confidence {round(score.item(), 3)} at location {box}")
Detected a photo of a cat with confidence 0.614 at location [341.67, 23.39, 642.32, 371.35] Detected a photo of a cat with confidence 0.614 at location [341.67, 23.39, 642.32, 371.35]
Detected a photo of a cat with confidence 0.665 at location [6.75, 51.96, 326.62, 473.13] Detected a photo of a cat with confidence 0.665 at location [6.75, 51.96, 326.62, 473.13]
``` ```
@@ -103,6 +105,9 @@ Usage of OWLv2 is identical to [OWL-ViT](owlvit) with a new, updated image proce
## Owlv2Processor ## Owlv2Processor
[[autodoc]] Owlv2Processor [[autodoc]] Owlv2Processor
- __call__
- post_process_grounded_object_detection
- post_process_image_guided_detection
## Owlv2Model ## Owlv2Model

View File

@@ -49,20 +49,22 @@ OWL-ViT is a zero-shot text-conditioned object detection model. OWL-ViT uses [CL
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw) >>> image = Image.open(requests.get(url, stream=True).raw)
>>> texts = [["a photo of a cat", "a photo of a dog"]] >>> text_labels = [["a photo of a cat", "a photo of a dog"]]
>>> inputs = processor(text=texts, images=image, return_tensors="pt") >>> inputs = processor(text=text_labels, images=image, return_tensors="pt")
>>> outputs = model(**inputs) >>> outputs = model(**inputs)
>>> # Target image sizes (height, width) to rescale box predictions [batch_size, 2] >>> # Target image sizes (height, width) to rescale box predictions [batch_size, 2]
>>> target_sizes = torch.Tensor([image.size[::-1]]) >>> target_sizes = torch.tensor([(image.height, image.width)])
>>> # Convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax) >>> # Convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax)
>>> results = processor.post_process_object_detection(outputs=outputs, target_sizes=target_sizes, threshold=0.1) >>> results = processor.post_process_grounded_object_detection(
>>> i = 0 # Retrieve predictions for the first image for the corresponding text queries ... outputs=outputs, target_sizes=target_sizes, threshold=0.1, text_labels=text_labels
>>> text = texts[i] ... )
>>> boxes, scores, labels = results[i]["boxes"], results[i]["scores"], results[i]["labels"] >>> # Retrieve predictions for the first image for the corresponding text queries
>>> for box, score, label in zip(boxes, scores, labels): >>> result = results[0]
>>> boxes, scores, text_labels = result["boxes"], result["scores"], result["text_labels"]
>>> for box, score, text_label in zip(boxes, scores, text_labels):
... box = [round(i, 2) for i in box.tolist()] ... box = [round(i, 2) for i in box.tolist()]
... print(f"Detected {text[label]} with confidence {round(score.item(), 3)} at location {box}") ... print(f"Detected {text_label} with confidence {round(score.item(), 3)} at location {box}")
Detected a photo of a cat with confidence 0.707 at location [324.97, 20.44, 640.58, 373.29] Detected a photo of a cat with confidence 0.707 at location [324.97, 20.44, 640.58, 373.29]
Detected a photo of a cat with confidence 0.717 at location [1.46, 55.26, 315.55, 472.17] Detected a photo of a cat with confidence 0.717 at location [1.46, 55.26, 315.55, 472.17]
``` ```
@@ -91,16 +93,12 @@ A demo notebook on using OWL-ViT for zero- and one-shot (image-guided) object de
- post_process_object_detection - post_process_object_detection
- post_process_image_guided_detection - post_process_image_guided_detection
## OwlViTFeatureExtractor
[[autodoc]] OwlViTFeatureExtractor
- __call__
- post_process
- post_process_image_guided_detection
## OwlViTProcessor ## OwlViTProcessor
[[autodoc]] OwlViTProcessor [[autodoc]] OwlViTProcessor
- __call__
- post_process_grounded_object_detection
- post_process_image_guided_detection
## OwlViTModel ## OwlViTModel

View File

@@ -17,7 +17,7 @@
import io import io
import pathlib import pathlib
from collections import defaultdict from collections import defaultdict
from typing import Any, Callable, Dict, Iterable, List, Optional, Set, Tuple, Union from typing import TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Optional, Set, Tuple, Union
import numpy as np import numpy as np
@@ -77,6 +77,9 @@ if is_scipy_available():
import scipy.special import scipy.special
import scipy.stats import scipy.stats
if TYPE_CHECKING:
from .modeling_grounding_dino import GroundingDinoObjectDetectionOutput
logger = logging.get_logger(__name__) # pylint: disable=invalid-name logger = logging.get_logger(__name__) # pylint: disable=invalid-name
@@ -806,6 +809,35 @@ def compute_segments(
return segmentation, segments return segmentation, segments
# Copied from transformers.models.owlvit.image_processing_owlvit._scale_boxes
def _scale_boxes(boxes, target_sizes):
"""
Scale batch of bounding boxes to the target sizes.
Args:
boxes (`torch.Tensor` of shape `(batch_size, num_boxes, 4)`):
Bounding boxes to scale. Each box is expected to be in (x1, y1, x2, y2) format.
target_sizes (`List[Tuple[int, int]]` or `torch.Tensor` of shape `(batch_size, 2)`):
Target sizes to scale the boxes to. Each target size is expected to be in (height, width) format.
Returns:
`torch.Tensor` of shape `(batch_size, num_boxes, 4)`: Scaled bounding boxes.
"""
if isinstance(target_sizes, (list, tuple)):
image_height = torch.tensor([i[0] for i in target_sizes])
image_width = torch.tensor([i[1] for i in target_sizes])
elif isinstance(target_sizes, torch.Tensor):
image_height, image_width = target_sizes.unbind(1)
else:
raise ValueError("`target_sizes` must be a list, tuple or torch.Tensor")
scale_factor = torch.stack([image_width, image_height, image_width, image_height], dim=1)
scale_factor = scale_factor.unsqueeze(1).to(boxes.device)
boxes = boxes * scale_factor
return boxes
class GroundingDinoImageProcessor(BaseImageProcessor): class GroundingDinoImageProcessor(BaseImageProcessor):
r""" r"""
Constructs a Grounding DINO image processor. Constructs a Grounding DINO image processor.
@@ -1533,7 +1565,10 @@ class GroundingDinoImageProcessor(BaseImageProcessor):
# Copied from transformers.models.owlvit.image_processing_owlvit.OwlViTImageProcessor.post_process_object_detection with OwlViT->GroundingDino # Copied from transformers.models.owlvit.image_processing_owlvit.OwlViTImageProcessor.post_process_object_detection with OwlViT->GroundingDino
def post_process_object_detection( def post_process_object_detection(
self, outputs, threshold: float = 0.1, target_sizes: Union[TensorType, List[Tuple]] = None self,
outputs: "GroundingDinoObjectDetectionOutput",
threshold: float = 0.1,
target_sizes: Optional[Union[TensorType, List[Tuple]]] = None,
): ):
""" """
Converts the raw output of [`GroundingDinoForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y, Converts the raw output of [`GroundingDinoForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y,
@@ -1542,48 +1577,43 @@ class GroundingDinoImageProcessor(BaseImageProcessor):
Args: Args:
outputs ([`GroundingDinoObjectDetectionOutput`]): outputs ([`GroundingDinoObjectDetectionOutput`]):
Raw outputs of the model. Raw outputs of the model.
threshold (`float`, *optional*): threshold (`float`, *optional*, defaults to 0.1):
Score threshold to keep object detection predictions. Score threshold to keep object detection predictions.
target_sizes (`torch.Tensor` or `List[Tuple[int, int]]`, *optional*): target_sizes (`torch.Tensor` or `List[Tuple[int, int]]`, *optional*):
Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size
`(height, width)` of each image in the batch. If unset, predictions will not be resized. `(height, width)` of each image in the batch. If unset, predictions will not be resized.
Returns: Returns:
`List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image `List[Dict]`: A list of dictionaries, each dictionary containing the following keys:
in the batch as predicted by the model. - "scores": The confidence scores for each predicted box on the image.
- "labels": Indexes of the classes predicted by the model on the image.
- "boxes": Image bounding boxes in (top_left_x, top_left_y, bottom_right_x, bottom_right_y) format.
""" """
# TODO: (amy) add support for other frameworks batch_logits, batch_boxes = outputs.logits, outputs.pred_boxes
logits, boxes = outputs.logits, outputs.pred_boxes batch_size = len(batch_logits)
if target_sizes is not None: if target_sizes is not None and len(target_sizes) != batch_size:
if len(logits) != len(target_sizes): raise ValueError("Make sure that you pass in as many target sizes as images")
raise ValueError(
"Make sure that you pass in as many target sizes as the batch dimension of the logits"
)
probs = torch.max(logits, dim=-1) # batch_logits of shape (batch_size, num_queries, num_classes)
scores = torch.sigmoid(probs.values) batch_class_logits = torch.max(batch_logits, dim=-1)
labels = probs.indices batch_scores = torch.sigmoid(batch_class_logits.values)
batch_labels = batch_class_logits.indices
# Convert to [x0, y0, x1, y1] format # Convert to [x0, y0, x1, y1] format
boxes = center_to_corners_format(boxes) batch_boxes = center_to_corners_format(batch_boxes)
# Convert from relative [0, 1] to absolute [0, height] coordinates # Convert from relative [0, 1] to absolute [0, height] coordinates
if target_sizes is not None: if target_sizes is not None:
if isinstance(target_sizes, List): batch_boxes = _scale_boxes(batch_boxes, target_sizes)
img_h = torch.Tensor([i[0] for i in target_sizes])
img_w = torch.Tensor([i[1] for i in target_sizes])
else:
img_h, img_w = target_sizes.unbind(1)
scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device)
boxes = boxes * scale_fct[:, None, :]
results = [] results = []
for s, l, b in zip(scores, labels, boxes): for scores, labels, boxes in zip(batch_scores, batch_labels, batch_boxes):
score = s[s > threshold] keep = scores > threshold
label = l[s > threshold] scores = scores[keep]
box = b[s > threshold] labels = labels[keep]
results.append({"scores": score, "labels": label, "boxes": box}) boxes = boxes[keep]
results.append({"scores": scores, "labels": labels, "boxes": boxes})
return results return results

View File

@@ -15,7 +15,7 @@
"""Image processor class for OWLv2.""" """Image processor class for OWLv2."""
import warnings import warnings
from typing import Dict, List, Optional, Tuple, Union from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union
import numpy as np import numpy as np
@@ -60,10 +60,43 @@ if is_vision_available():
if is_scipy_available(): if is_scipy_available():
from scipy import ndimage as ndi from scipy import ndimage as ndi
if TYPE_CHECKING:
from .modeling_owlv2 import Owlv2ObjectDetectionOutput
logger = logging.get_logger(__name__) logger = logging.get_logger(__name__)
def _scale_boxes(boxes, target_sizes):
"""
Scale batch of bounding boxes to the target sizes.
Args:
boxes (`torch.Tensor` of shape `(batch_size, num_boxes, 4)`):
Bounding boxes to scale. Each box is expected to be in (x1, y1, x2, y2) format.
target_sizes (`List[Tuple[int, int]]` or `torch.Tensor` of shape `(batch_size, 2)`):
Target sizes to scale the boxes to. Each target size is expected to be in (height, width) format.
Returns:
`torch.Tensor` of shape `(batch_size, num_boxes, 4)`: Scaled bounding boxes.
"""
if isinstance(target_sizes, (list, tuple)):
image_height = torch.tensor([i[0] for i in target_sizes])
image_width = torch.tensor([i[1] for i in target_sizes])
elif isinstance(target_sizes, torch.Tensor):
image_height, image_width = target_sizes.unbind(1)
else:
raise ValueError("`target_sizes` must be a list, tuple or torch.Tensor")
# for owlv2 image is padded to max size unlike owlvit, thats why we have to scale boxes to max size
max_size = torch.max(image_height, image_width)
scale_factor = torch.stack([max_size, max_size, max_size, max_size], dim=1)
scale_factor = scale_factor.unsqueeze(1).to(boxes.device)
boxes = boxes * scale_factor
return boxes
# Copied from transformers.models.owlvit.image_processing_owlvit._upcast # Copied from transformers.models.owlvit.image_processing_owlvit._upcast
def _upcast(t): def _upcast(t):
# Protects from numerical overflows in multiplications by upcasting to the equivalent higher type # Protects from numerical overflows in multiplications by upcasting to the equivalent higher type
@@ -466,62 +499,57 @@ class Owlv2ImageProcessor(BaseImageProcessor):
data = {"pixel_values": images} data = {"pixel_values": images}
return BatchFeature(data=data, tensor_type=return_tensors) return BatchFeature(data=data, tensor_type=return_tensors)
# Copied from transformers.models.owlvit.image_processing_owlvit.OwlViTImageProcessor.post_process_object_detection with OwlViT->Owlv2
def post_process_object_detection( def post_process_object_detection(
self, outputs, threshold: float = 0.1, target_sizes: Union[TensorType, List[Tuple]] = None self,
outputs: "Owlv2ObjectDetectionOutput",
threshold: float = 0.1,
target_sizes: Optional[Union[TensorType, List[Tuple]]] = None,
): ):
""" """
Converts the raw output of [`OwlViTForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y, Converts the raw output of [`Owlv2ForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y,
bottom_right_x, bottom_right_y) format. bottom_right_x, bottom_right_y) format.
Args: Args:
outputs ([`OwlViTObjectDetectionOutput`]): outputs ([`Owlv2ObjectDetectionOutput`]):
Raw outputs of the model. Raw outputs of the model.
threshold (`float`, *optional*): threshold (`float`, *optional*, defaults to 0.1):
Score threshold to keep object detection predictions. Score threshold to keep object detection predictions.
target_sizes (`torch.Tensor` or `List[Tuple[int, int]]`, *optional*): target_sizes (`torch.Tensor` or `List[Tuple[int, int]]`, *optional*):
Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size
`(height, width)` of each image in the batch. If unset, predictions will not be resized. `(height, width)` of each image in the batch. If unset, predictions will not be resized.
Returns: Returns:
`List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image `List[Dict]`: A list of dictionaries, each dictionary containing the following keys:
in the batch as predicted by the model. - "scores": The confidence scores for each predicted box on the image.
- "labels": Indexes of the classes predicted by the model on the image.
- "boxes": Image bounding boxes in (top_left_x, top_left_y, bottom_right_x, bottom_right_y) format.
""" """
# TODO: (amy) add support for other frameworks batch_logits, batch_boxes = outputs.logits, outputs.pred_boxes
logits, boxes = outputs.logits, outputs.pred_boxes batch_size = len(batch_logits)
if target_sizes is not None: if target_sizes is not None and len(target_sizes) != batch_size:
if len(logits) != len(target_sizes): raise ValueError("Make sure that you pass in as many target sizes as images")
raise ValueError(
"Make sure that you pass in as many target sizes as the batch dimension of the logits"
)
probs = torch.max(logits, dim=-1) # batch_logits of shape (batch_size, num_queries, num_classes)
scores = torch.sigmoid(probs.values) batch_class_logits = torch.max(batch_logits, dim=-1)
labels = probs.indices batch_scores = torch.sigmoid(batch_class_logits.values)
batch_labels = batch_class_logits.indices
# Convert to [x0, y0, x1, y1] format # Convert to [x0, y0, x1, y1] format
boxes = center_to_corners_format(boxes) batch_boxes = center_to_corners_format(batch_boxes)
# Convert from relative [0, 1] to absolute [0, height] coordinates # Convert from relative [0, 1] to absolute [0, height] coordinates
if target_sizes is not None: if target_sizes is not None:
if isinstance(target_sizes, List): batch_boxes = _scale_boxes(batch_boxes, target_sizes)
img_h = torch.Tensor([i[0] for i in target_sizes])
img_w = torch.Tensor([i[1] for i in target_sizes])
else:
img_h, img_w = target_sizes.unbind(1)
# Rescale coordinates, image is padded to square for inference,
# that is why we need to scale boxes to the max size
size = torch.max(img_h, img_w)
scale_fct = torch.stack([size, size, size, size], dim=1).to(boxes.device)
boxes = boxes * scale_fct[:, None, :]
results = [] results = []
for s, l, b in zip(scores, labels, boxes): for scores, labels, boxes in zip(batch_scores, batch_labels, batch_boxes):
score = s[s > threshold] keep = scores > threshold
label = l[s > threshold] scores = scores[keep]
box = b[s > threshold] labels = labels[keep]
results.append({"scores": score, "labels": label, "boxes": box}) boxes = boxes[keep]
results.append({"scores": scores, "labels": labels, "boxes": boxes})
return results return results
@@ -574,13 +602,7 @@ class Owlv2ImageProcessor(BaseImageProcessor):
# Convert from relative [0, 1] to absolute [0, height] coordinates # Convert from relative [0, 1] to absolute [0, height] coordinates
if target_sizes is not None: if target_sizes is not None:
if isinstance(target_sizes, List): target_boxes = _scale_boxes(target_boxes, target_sizes)
img_h = torch.tensor([i[0] for i in target_sizes])
img_w = torch.tensor([i[1] for i in target_sizes])
else:
img_h, img_w = target_sizes.unbind(1)
scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(target_boxes.device)
target_boxes = target_boxes * scale_fct[:, None, :]
# Compute box display alphas based on prediction scores # Compute box display alphas based on prediction scores
results = [] results = []

View File

@@ -1749,33 +1749,30 @@ class Owlv2ForObjectDetection(Owlv2PreTrainedModel):
>>> import requests >>> import requests
>>> from PIL import Image >>> from PIL import Image
>>> import torch >>> import torch
>>> from transformers import AutoProcessor, Owlv2ForObjectDetection
>>> processor = AutoProcessor.from_pretrained("google/owlv2-base-patch16-ensemble") >>> from transformers import Owlv2Processor, Owlv2ForObjectDetection
>>> processor = Owlv2Processor.from_pretrained("google/owlv2-base-patch16-ensemble")
>>> model = Owlv2ForObjectDetection.from_pretrained("google/owlv2-base-patch16-ensemble") >>> model = Owlv2ForObjectDetection.from_pretrained("google/owlv2-base-patch16-ensemble")
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw) >>> image = Image.open(requests.get(url, stream=True).raw)
>>> texts = [["a photo of a cat", "a photo of a dog"]] >>> text_labels = [["a photo of a cat", "a photo of a dog"]]
>>> inputs = processor(text=texts, images=image, return_tensors="pt") >>> inputs = processor(text=text_labels, images=image, return_tensors="pt")
>>> outputs = model(**inputs)
>>> # forward pass >>> # Target image sizes (height, width) to rescale box predictions [batch_size, 2]
>>> with torch.no_grad(): >>> target_sizes = torch.tensor([(image.height, image.width)])
... outputs = model(**inputs) >>> # Convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax)
>>> results = processor.post_process_grounded_object_detection(
>>> target_sizes = torch.Tensor([image.size[::-1]]) ... outputs=outputs, target_sizes=target_sizes, threshold=0.1, text_labels=text_labels
>>> # Convert outputs (bounding boxes and class logits) to final bounding boxes and scores
>>> results = processor.post_process_object_detection(
... outputs=outputs, threshold=0.2, target_sizes=target_sizes
... ) ... )
>>> # Retrieve predictions for the first image for the corresponding text queries
>>> i = 0 # Retrieve predictions for the first image for the corresponding text queries >>> result = results[0]
>>> text = texts[i] >>> boxes, scores, text_labels = result["boxes"], result["scores"], result["text_labels"]
>>> boxes, scores, labels = results[i]["boxes"], results[i]["scores"], results[i]["labels"] >>> for box, score, text_label in zip(boxes, scores, text_labels):
>>> for box, score, label in zip(boxes, scores, labels):
... box = [round(i, 2) for i in box.tolist()] ... box = [round(i, 2) for i in box.tolist()]
... print(f"Detected {text[label]} with confidence {round(score.item(), 3)} at location {box}") ... print(f"Detected {text_label} with confidence {round(score.item(), 3)} at location {box}")
Detected a photo of a cat with confidence 0.614 at location [341.67, 23.39, 642.32, 371.35] Detected a photo of a cat with confidence 0.614 at location [341.67, 23.39, 642.32, 371.35]
Detected a photo of a cat with confidence 0.665 at location [6.75, 51.96, 326.62, 473.13] Detected a photo of a cat with confidence 0.665 at location [6.75, 51.96, 326.62, 473.13]
```""" ```"""

View File

@@ -16,13 +16,18 @@
Image/Text processor class for OWLv2 Image/Text processor class for OWLv2
""" """
from typing import List import warnings
from typing import TYPE_CHECKING, List, Optional, Tuple, Union
import numpy as np import numpy as np
from ...processing_utils import ProcessorMixin from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import BatchEncoding from ...tokenization_utils_base import BatchEncoding
from ...utils import is_flax_available, is_tf_available, is_torch_available from ...utils import TensorType, is_flax_available, is_tf_available, is_torch_available
if TYPE_CHECKING:
from .modeling_owlv2 import Owlv2ImageGuidedObjectDetectionOutput, Owlv2ObjectDetectionOutput
class Owlv2Processor(ProcessorMixin): class Owlv2Processor(ProcessorMixin):
@@ -45,7 +50,7 @@ class Owlv2Processor(ProcessorMixin):
def __init__(self, image_processor, tokenizer, **kwargs): def __init__(self, image_processor, tokenizer, **kwargs):
super().__init__(image_processor, tokenizer) super().__init__(image_processor, tokenizer)
# Copied from transformers.models.owlvit.processing_owlvit.OwlViTProcessor.__call__ with OWLViT->OWLv2 # Copied from transformers.models.owlvit.processing_owlvit.OwlViTProcessor.__call__ with OwlViT->Owlv2
def __call__(self, text=None, images=None, query_images=None, padding="max_length", return_tensors="np", **kwargs): def __call__(self, text=None, images=None, query_images=None, padding="max_length", return_tensors="np", **kwargs):
""" """
Main method to prepare for the model one or several text(s) and image(s). This method forwards the `text` and Main method to prepare for the model one or several text(s) and image(s). This method forwards the `text` and
@@ -157,21 +162,101 @@ class Owlv2Processor(ProcessorMixin):
else: else:
return BatchEncoding(data=dict(**image_features), tensor_type=return_tensors) return BatchEncoding(data=dict(**image_features), tensor_type=return_tensors)
# Copied from transformers.models.owlvit.processing_owlvit.OwlViTProcessor.post_process_object_detection with OWLViT->OWLv2 # Copied from transformers.models.owlvit.processing_owlvit.OwlViTProcessor.post_process_object_detection with OwlViT->Owlv2
def post_process_object_detection(self, *args, **kwargs): def post_process_object_detection(self, *args, **kwargs):
""" """
This method forwards all its arguments to [`OwlViTImageProcessor.post_process_object_detection`]. Please refer This method forwards all its arguments to [`Owlv2ImageProcessor.post_process_object_detection`]. Please refer
to the docstring of this method for more information. to the docstring of this method for more information.
""" """
warnings.warn(
"`post_process_object_detection` method is deprecated for Owlv2Processor and will be removed in v5. "
"Use `post_process_grounded_object_detection` instead.",
FutureWarning,
)
return self.image_processor.post_process_object_detection(*args, **kwargs) return self.image_processor.post_process_object_detection(*args, **kwargs)
# Copied from transformers.models.owlvit.processing_owlvit.OwlViTProcessor.post_process_image_guided_detection with OWLViT->OWLv2 # Copied from transformers.models.owlvit.processing_owlvit.OwlViTProcessor.post_process_grounded_object_detection with OwlViT->Owlv2
def post_process_image_guided_detection(self, *args, **kwargs): def post_process_grounded_object_detection(
self,
outputs: "Owlv2ObjectDetectionOutput",
threshold: float = 0.1,
target_sizes: Optional[Union[TensorType, List[Tuple]]] = None,
text_labels: Optional[List[List[str]]] = None,
):
""" """
This method forwards all its arguments to [`OwlViTImageProcessor.post_process_one_shot_object_detection`]. Converts the raw output of [`Owlv2ForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y,
Please refer to the docstring of this method for more information. bottom_right_x, bottom_right_y) format.
Args:
outputs ([`Owlv2ObjectDetectionOutput`]):
Raw outputs of the model.
threshold (`float`, *optional*, defaults to 0.1):
Score threshold to keep object detection predictions.
target_sizes (`torch.Tensor` or `List[Tuple[int, int]]`, *optional*):
Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size
`(height, width)` of each image in the batch. If unset, predictions will not be resized.
text_labels (`List[List[str]]`, *optional*):
List of lists of text labels for each image in the batch. If unset, "text_labels" in output will be
set to `None`.
Returns:
`List[Dict]`: A list of dictionaries, each dictionary containing the following keys:
- "scores": The confidence scores for each predicted box on the image.
- "labels": Indexes of the classes predicted by the model on the image.
- "boxes": Image bounding boxes in (top_left_x, top_left_y, bottom_right_x, bottom_right_y) format.
- "text_labels": The text labels for each predicted bounding box on the image.
""" """
return self.image_processor.post_process_image_guided_detection(*args, **kwargs) output = self.image_processor.post_process_object_detection(
outputs=outputs, threshold=threshold, target_sizes=target_sizes
)
if text_labels is not None and len(text_labels) != len(output):
raise ValueError("Make sure that you pass in as many lists of text labels as images")
# adding text labels to the output
if text_labels is not None:
for image_output, image_text_labels in zip(output, text_labels):
object_text_labels = [image_text_labels[i] for i in image_output["labels"]]
image_output["text_labels"] = object_text_labels
else:
for image_output in output:
image_output["text_labels"] = None
return output
# Copied from transformers.models.owlvit.processing_owlvit.OwlViTProcessor.post_process_image_guided_detection with OwlViT->Owlv2
def post_process_image_guided_detection(
    self,
    outputs: "Owlv2ImageGuidedObjectDetectionOutput",
    threshold: float = 0.0,
    nms_threshold: float = 0.3,
    target_sizes: Optional[Union[TensorType, List[Tuple]]] = None,
):
    """
    Converts the output of [`Owlv2ForObjectDetection.image_guided_detection`] into the format expected by the COCO
    api.

    This is a thin wrapper: every argument is forwarded by keyword to the image processor's
    `post_process_image_guided_detection`, and its result is returned unchanged.

    Args:
        outputs ([`Owlv2ImageGuidedObjectDetectionOutput`]):
            Raw outputs of the model.
        threshold (`float`, *optional*, defaults to 0.0):
            Minimum confidence threshold to use to filter out predicted boxes.
        nms_threshold (`float`, *optional*, defaults to 0.3):
            IoU threshold for non-maximum suppression of overlapping boxes.
        target_sizes (`torch.Tensor` or `List[Tuple[int, int]]`, *optional*):
            The (height, width) of each image in the batch, as a tensor of shape (batch_size, 2) or a list of
            tuples. If set, predicted normalized bounding boxes are rescaled to the target sizes. If left to
            None, predictions will not be unnormalized.

    Returns:
        `List[Dict]`: A list of dictionaries, each dictionary containing the following keys:
        - "scores": The confidence scores for each predicted box on the image.
        - "boxes": Image bounding boxes in (top_left_x, top_left_y, bottom_right_x, bottom_right_y) format.
        - "labels": Set to `None`.
    """
    delegate = self.image_processor.post_process_image_guided_detection
    forwarded = {
        "outputs": outputs,
        "threshold": threshold,
        "nms_threshold": nms_threshold,
        "target_sizes": target_sizes,
    }
    return delegate(**forwarded)
# Copied from transformers.models.owlvit.processing_owlvit.OwlViTProcessor.batch_decode # Copied from transformers.models.owlvit.processing_owlvit.OwlViTProcessor.batch_decode
def batch_decode(self, *args, **kwargs): def batch_decode(self, *args, **kwargs):

View File

@@ -15,7 +15,7 @@
"""Image processor class for OwlViT""" """Image processor class for OwlViT"""
import warnings import warnings
from typing import Dict, List, Optional, Tuple, Union from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union
import numpy as np import numpy as np
@@ -43,6 +43,9 @@ from ...image_utils import (
from ...utils import TensorType, filter_out_non_signature_kwargs, is_torch_available, logging from ...utils import TensorType, filter_out_non_signature_kwargs, is_torch_available, logging
if TYPE_CHECKING:
from .modeling_owlvit import OwlViTObjectDetectionOutput
if is_torch_available(): if is_torch_available():
import torch import torch
@@ -58,6 +61,34 @@ def _upcast(t):
return t if t.dtype in (torch.int32, torch.int64) else t.int() return t if t.dtype in (torch.int32, torch.int64) else t.int()
def _scale_boxes(boxes, target_sizes):
"""
Scale batch of bounding boxes to the target sizes.
Args:
boxes (`torch.Tensor` of shape `(batch_size, num_boxes, 4)`):
Bounding boxes to scale. Each box is expected to be in (x1, y1, x2, y2) format.
target_sizes (`List[Tuple[int, int]]` or `torch.Tensor` of shape `(batch_size, 2)`):
Target sizes to scale the boxes to. Each target size is expected to be in (height, width) format.
Returns:
`torch.Tensor` of shape `(batch_size, num_boxes, 4)`: Scaled bounding boxes.
"""
if isinstance(target_sizes, (list, tuple)):
image_height = torch.tensor([i[0] for i in target_sizes])
image_width = torch.tensor([i[1] for i in target_sizes])
elif isinstance(target_sizes, torch.Tensor):
image_height, image_width = target_sizes.unbind(1)
else:
raise ValueError("`target_sizes` must be a list, tuple or torch.Tensor")
scale_factor = torch.stack([image_width, image_height, image_width, image_height], dim=1)
scale_factor = scale_factor.unsqueeze(1).to(boxes.device)
boxes = boxes * scale_factor
return boxes
def box_area(boxes): def box_area(boxes):
""" """
Computes the area of a set of bounding boxes, which are specified by its (x1, y1, x2, y2) coordinates. Computes the area of a set of bounding boxes, which are specified by its (x1, y1, x2, y2) coordinates.
@@ -459,7 +490,10 @@ class OwlViTImageProcessor(BaseImageProcessor):
return results return results
def post_process_object_detection( def post_process_object_detection(
self, outputs, threshold: float = 0.1, target_sizes: Union[TensorType, List[Tuple]] = None self,
outputs: "OwlViTObjectDetectionOutput",
threshold: float = 0.1,
target_sizes: Optional[Union[TensorType, List[Tuple]]] = None,
): ):
""" """
Converts the raw output of [`OwlViTForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y, Converts the raw output of [`OwlViTForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y,
@@ -468,52 +502,46 @@ class OwlViTImageProcessor(BaseImageProcessor):
Args: Args:
outputs ([`OwlViTObjectDetectionOutput`]): outputs ([`OwlViTObjectDetectionOutput`]):
Raw outputs of the model. Raw outputs of the model.
threshold (`float`, *optional*): threshold (`float`, *optional*, defaults to 0.1):
Score threshold to keep object detection predictions. Score threshold to keep object detection predictions.
target_sizes (`torch.Tensor` or `List[Tuple[int, int]]`, *optional*): target_sizes (`torch.Tensor` or `List[Tuple[int, int]]`, *optional*):
Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size
`(height, width)` of each image in the batch. If unset, predictions will not be resized. `(height, width)` of each image in the batch. If unset, predictions will not be resized.
Returns: Returns:
`List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image `List[Dict]`: A list of dictionaries, each dictionary containing the following keys:
in the batch as predicted by the model. - "scores": The confidence scores for each predicted box on the image.
- "labels": Indexes of the classes predicted by the model on the image.
- "boxes": Image bounding boxes in (top_left_x, top_left_y, bottom_right_x, bottom_right_y) format.
""" """
# TODO: (amy) add support for other frameworks batch_logits, batch_boxes = outputs.logits, outputs.pred_boxes
logits, boxes = outputs.logits, outputs.pred_boxes batch_size = len(batch_logits)
if target_sizes is not None: if target_sizes is not None and len(target_sizes) != batch_size:
if len(logits) != len(target_sizes): raise ValueError("Make sure that you pass in as many target sizes as images")
raise ValueError(
"Make sure that you pass in as many target sizes as the batch dimension of the logits"
)
probs = torch.max(logits, dim=-1) # batch_logits of shape (batch_size, num_queries, num_classes)
scores = torch.sigmoid(probs.values) batch_class_logits = torch.max(batch_logits, dim=-1)
labels = probs.indices batch_scores = torch.sigmoid(batch_class_logits.values)
batch_labels = batch_class_logits.indices
# Convert to [x0, y0, x1, y1] format # Convert to [x0, y0, x1, y1] format
boxes = center_to_corners_format(boxes) batch_boxes = center_to_corners_format(batch_boxes)
# Convert from relative [0, 1] to absolute [0, height] coordinates # Convert from relative [0, 1] to absolute [0, height] coordinates
if target_sizes is not None: if target_sizes is not None:
if isinstance(target_sizes, List): batch_boxes = _scale_boxes(batch_boxes, target_sizes)
img_h = torch.Tensor([i[0] for i in target_sizes])
img_w = torch.Tensor([i[1] for i in target_sizes])
else:
img_h, img_w = target_sizes.unbind(1)
scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device)
boxes = boxes * scale_fct[:, None, :]
results = [] results = []
for s, l, b in zip(scores, labels, boxes): for scores, labels, boxes in zip(batch_scores, batch_labels, batch_boxes):
score = s[s > threshold] keep = scores > threshold
label = l[s > threshold] scores = scores[keep]
box = b[s > threshold] labels = labels[keep]
results.append({"scores": score, "labels": label, "boxes": box}) boxes = boxes[keep]
results.append({"scores": scores, "labels": labels, "boxes": boxes})
return results return results
# TODO: (Amy) Make compatible with other frameworks
def post_process_image_guided_detection(self, outputs, threshold=0.0, nms_threshold=0.3, target_sizes=None): def post_process_image_guided_detection(self, outputs, threshold=0.0, nms_threshold=0.3, target_sizes=None):
""" """
Converts the output of [`OwlViTForObjectDetection.image_guided_detection`] into the format expected by the COCO Converts the output of [`OwlViTForObjectDetection.image_guided_detection`] into the format expected by the COCO
@@ -562,13 +590,7 @@ class OwlViTImageProcessor(BaseImageProcessor):
# Convert from relative [0, 1] to absolute [0, height] coordinates # Convert from relative [0, 1] to absolute [0, height] coordinates
if target_sizes is not None: if target_sizes is not None:
if isinstance(target_sizes, List): target_boxes = _scale_boxes(target_boxes, target_sizes)
img_h = torch.tensor([i[0] for i in target_sizes])
img_w = torch.tensor([i[1] for i in target_sizes])
else:
img_h, img_w = target_sizes.unbind(1)
scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(target_boxes.device)
target_boxes = target_boxes * scale_fct[:, None, :]
# Compute box display alphas based on prediction scores # Compute box display alphas based on prediction scores
results = [] results = []

View File

@@ -1689,31 +1689,30 @@ class OwlViTForObjectDetection(OwlViTPreTrainedModel):
>>> import requests >>> import requests
>>> from PIL import Image >>> from PIL import Image
>>> import torch >>> import torch
>>> from transformers import AutoProcessor, OwlViTForObjectDetection
>>> processor = AutoProcessor.from_pretrained("google/owlvit-base-patch32") >>> from transformers import OwlViTProcessor, OwlViTForObjectDetection
>>> processor = OwlViTProcessor.from_pretrained("google/owlvit-base-patch32")
>>> model = OwlViTForObjectDetection.from_pretrained("google/owlvit-base-patch32") >>> model = OwlViTForObjectDetection.from_pretrained("google/owlvit-base-patch32")
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw) >>> image = Image.open(requests.get(url, stream=True).raw)
>>> texts = [["a photo of a cat", "a photo of a dog"]] >>> text_labels = [["a photo of a cat", "a photo of a dog"]]
>>> inputs = processor(text=texts, images=image, return_tensors="pt") >>> inputs = processor(text=text_labels, images=image, return_tensors="pt")
>>> outputs = model(**inputs) >>> outputs = model(**inputs)
>>> # Target image sizes (height, width) to rescale box predictions [batch_size, 2] >>> # Target image sizes (height, width) to rescale box predictions [batch_size, 2]
>>> target_sizes = torch.Tensor([image.size[::-1]]) >>> target_sizes = torch.tensor([(image.height, image.width)])
>>> # Convert outputs (bounding boxes and class logits) to final bounding boxes and scores >>> # Convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax)
>>> results = processor.post_process_object_detection( >>> results = processor.post_process_grounded_object_detection(
... outputs=outputs, threshold=0.1, target_sizes=target_sizes ... outputs=outputs, target_sizes=target_sizes, threshold=0.1, text_labels=text_labels
... ) ... )
>>> # Retrieve predictions for the first image for the corresponding text queries
>>> i = 0 # Retrieve predictions for the first image for the corresponding text queries >>> result = results[0]
>>> text = texts[i] >>> boxes, scores, text_labels = result["boxes"], result["scores"], result["text_labels"]
>>> boxes, scores, labels = results[i]["boxes"], results[i]["scores"], results[i]["labels"] >>> for box, score, text_label in zip(boxes, scores, text_labels):
>>> for box, score, label in zip(boxes, scores, labels):
... box = [round(i, 2) for i in box.tolist()] ... box = [round(i, 2) for i in box.tolist()]
... print(f"Detected {text[label]} with confidence {round(score.item(), 3)} at location {box}") ... print(f"Detected {text_label} with confidence {round(score.item(), 3)} at location {box}")
Detected a photo of a cat with confidence 0.707 at location [324.97, 20.44, 640.58, 373.29] Detected a photo of a cat with confidence 0.707 at location [324.97, 20.44, 640.58, 373.29]
Detected a photo of a cat with confidence 0.717 at location [1.46, 55.26, 315.55, 472.17] Detected a photo of a cat with confidence 0.717 at location [1.46, 55.26, 315.55, 472.17]
```""" ```"""

View File

@@ -17,13 +17,17 @@ Image/Text processor class for OWL-ViT
""" """
import warnings import warnings
from typing import List from typing import TYPE_CHECKING, List, Optional, Tuple, Union
import numpy as np import numpy as np
from ...processing_utils import ProcessorMixin from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import BatchEncoding from ...tokenization_utils_base import BatchEncoding
from ...utils import is_flax_available, is_tf_available, is_torch_available from ...utils import TensorType, is_flax_available, is_tf_available, is_torch_available
if TYPE_CHECKING:
from .modeling_owlvit import OwlViTImageGuidedObjectDetectionOutput, OwlViTObjectDetectionOutput
class OwlViTProcessor(ProcessorMixin): class OwlViTProcessor(ProcessorMixin):
@@ -184,14 +188,93 @@ class OwlViTProcessor(ProcessorMixin):
This method forwards all its arguments to [`OwlViTImageProcessor.post_process_object_detection`]. Please refer This method forwards all its arguments to [`OwlViTImageProcessor.post_process_object_detection`]. Please refer
to the docstring of this method for more information. to the docstring of this method for more information.
""" """
warnings.warn(
"`post_process_object_detection` method is deprecated for OwlVitProcessor and will be removed in v5. "
"Use `post_process_grounded_object_detection` instead.",
FutureWarning,
)
return self.image_processor.post_process_object_detection(*args, **kwargs) return self.image_processor.post_process_object_detection(*args, **kwargs)
def post_process_image_guided_detection(self, *args, **kwargs): def post_process_grounded_object_detection(
self,
outputs: "OwlViTObjectDetectionOutput",
threshold: float = 0.1,
target_sizes: Optional[Union[TensorType, List[Tuple]]] = None,
text_labels: Optional[List[List[str]]] = None,
):
""" """
This method forwards all its arguments to [`OwlViTImageProcessor.post_process_one_shot_object_detection`]. Converts the raw output of [`OwlViTForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y,
Please refer to the docstring of this method for more information. bottom_right_x, bottom_right_y) format.
Args:
outputs ([`OwlViTObjectDetectionOutput`]):
Raw outputs of the model.
threshold (`float`, *optional*, defaults to 0.1):
Score threshold to keep object detection predictions.
target_sizes (`torch.Tensor` or `List[Tuple[int, int]]`, *optional*):
Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size
`(height, width)` of each image in the batch. If unset, predictions will not be resized.
text_labels (`List[List[str]]`, *optional*):
List of lists of text labels for each image in the batch. If unset, "text_labels" in output will be
set to `None`.
Returns:
`List[Dict]`: A list of dictionaries, each dictionary containing the following keys:
- "scores": The confidence scores for each predicted box on the image.
- "labels": Indexes of the classes predicted by the model on the image.
- "boxes": Image bounding boxes in (top_left_x, top_left_y, bottom_right_x, bottom_right_y) format.
- "text_labels": The text labels for each predicted bounding box on the image.
""" """
return self.image_processor.post_process_image_guided_detection(*args, **kwargs) output = self.image_processor.post_process_object_detection(
outputs=outputs, threshold=threshold, target_sizes=target_sizes
)
if text_labels is not None and len(text_labels) != len(output):
raise ValueError("Make sure that you pass in as many lists of text labels as images")
# adding text labels to the output
if text_labels is not None:
for image_output, image_text_labels in zip(output, text_labels):
object_text_labels = [image_text_labels[i] for i in image_output["labels"]]
image_output["text_labels"] = object_text_labels
else:
for image_output in output:
image_output["text_labels"] = None
return output
def post_process_image_guided_detection(
    self,
    outputs: "OwlViTImageGuidedObjectDetectionOutput",
    threshold: float = 0.0,
    nms_threshold: float = 0.3,
    target_sizes: Optional[Union[TensorType, List[Tuple]]] = None,
):
    """
    Post-processes the output of [`OwlViTForObjectDetection.image_guided_detection`] by delegating to the image
    processor's `post_process_image_guided_detection`, passing every argument through by keyword.

    Args:
        outputs ([`OwlViTImageGuidedObjectDetectionOutput`]):
            Raw outputs of the model.
        threshold (`float`, *optional*, defaults to 0.0):
            Minimum confidence a predicted box needs in order to be kept.
        nms_threshold (`float`, *optional*, defaults to 0.3):
            IoU threshold used for non-maximum suppression of overlapping boxes.
        target_sizes (`torch.Tensor`, *optional*):
            Tensor of shape (batch_size, 2) holding the (height, width) of each image in the batch. When given,
            the normalized predicted boxes are rescaled to these sizes; when `None`, boxes stay normalized.

    Returns:
        `List[Dict]`: One dictionary per image with the following keys:
        - "scores": The confidence scores for each predicted box on the image.
        - "boxes": Image bounding boxes in (top_left_x, top_left_y, bottom_right_x, bottom_right_y) format.
        - "labels": Set to `None`.
    """
    image_processor = self.image_processor
    results = image_processor.post_process_image_guided_detection(
        outputs=outputs,
        threshold=threshold,
        nms_threshold=nms_threshold,
        target_sizes=target_sizes,
    )
    return results
def batch_decode(self, *args, **kwargs): def batch_decode(self, *args, **kwargs):
""" """

View File

@@ -974,8 +974,9 @@ class Owlv2ModelIntegrationTest(unittest.TestCase):
processor = OwlViTProcessor.from_pretrained(model_name) processor = OwlViTProcessor.from_pretrained(model_name)
image = prepare_img() image = prepare_img()
text_labels = [["a photo of a cat", "a photo of a dog"]]
inputs = processor( inputs = processor(
text=[["a photo of a cat", "a photo of a dog"]], text=text_labels,
images=image, images=image,
max_length=16, max_length=16,
padding="max_length", padding="max_length",
@@ -991,11 +992,31 @@ class Owlv2ModelIntegrationTest(unittest.TestCase):
expected_slice_logits = torch.tensor( expected_slice_logits = torch.tensor(
[[-21.413497, -21.612638], [-19.008193, -19.548841], [-20.958896, -21.382694]] [[-21.413497, -21.612638], [-19.008193, -19.548841], [-20.958896, -21.382694]]
).to(torch_device) ).to(torch_device)
self.assertTrue(torch.allclose(outputs.logits[0, :3, :3], expected_slice_logits, atol=1e-4)) resulted_slice_logits = outputs.logits[0, :3, :3]
max_diff = torch.max(torch.abs(resulted_slice_logits - expected_slice_logits)).item()
self.assertLess(max_diff, 3e-4)
expected_slice_boxes = torch.tensor( expected_slice_boxes = torch.tensor(
[[0.241309, 0.051896, 0.453267], [0.139474, 0.045701, 0.250660], [0.233022, 0.050479, 0.427671]], [[0.241309, 0.051896, 0.453267], [0.139474, 0.045701, 0.250660], [0.233022, 0.050479, 0.427671]],
).to(torch_device) ).to(torch_device)
self.assertTrue(torch.allclose(outputs.pred_boxes[0, :3, :3], expected_slice_boxes, atol=1e-4)) resulted_slice_boxes = outputs.pred_boxes[0, :3, :3]
max_diff = torch.max(torch.abs(resulted_slice_boxes - expected_slice_boxes)).item()
self.assertLess(max_diff, 3e-4)
# test post-processing
post_processed_output = processor.post_process_grounded_object_detection(outputs)
self.assertIsNone(post_processed_output[0]["text_labels"])
post_processed_output_with_text_labels = processor.post_process_grounded_object_detection(
outputs, text_labels=text_labels
)
objects_labels = post_processed_output_with_text_labels[0]["labels"].cpu().tolist()
self.assertListEqual(objects_labels, [0, 0])
objects_text_labels = post_processed_output_with_text_labels[0]["text_labels"]
self.assertIsNotNone(objects_text_labels)
self.assertListEqual(objects_text_labels, ["a photo of a cat", "a photo of a cat"])
@slow @slow
def test_inference_one_shot_object_detection(self): def test_inference_one_shot_object_detection(self):

View File

@@ -967,8 +967,9 @@ class OwlViTModelIntegrationTest(unittest.TestCase):
processor = OwlViTProcessor.from_pretrained(model_name) processor = OwlViTProcessor.from_pretrained(model_name)
image = prepare_img() image = prepare_img()
text_labels = [["a photo of a cat", "a photo of a dog"]]
inputs = processor( inputs = processor(
text=[["a photo of a cat", "a photo of a dog"]], text=text_labels,
images=image, images=image,
max_length=16, max_length=16,
padding="max_length", padding="max_length",
@@ -986,6 +987,21 @@ class OwlViTModelIntegrationTest(unittest.TestCase):
).to(torch_device) ).to(torch_device)
self.assertTrue(torch.allclose(outputs.pred_boxes[0, :3, :3], expected_slice_boxes, atol=1e-4)) self.assertTrue(torch.allclose(outputs.pred_boxes[0, :3, :3], expected_slice_boxes, atol=1e-4))
# test post-processing
post_processed_output = processor.post_process_grounded_object_detection(outputs)
self.assertIsNone(post_processed_output[0]["text_labels"])
post_processed_output_with_text_labels = processor.post_process_grounded_object_detection(
outputs, text_labels=text_labels
)
objects_labels = post_processed_output_with_text_labels[0]["labels"].cpu().tolist()
self.assertListEqual(objects_labels, [0, 0])
objects_text_labels = post_processed_output_with_text_labels[0]["text_labels"]
self.assertIsNotNone(objects_text_labels)
self.assertListEqual(objects_text_labels, ["a photo of a cat", "a photo of a cat"])
@slow @slow
def test_inference_one_shot_object_detection(self): def test_inference_one_shot_object_detection(self):
model_name = "google/owlvit-base-patch32" model_name = "google/owlvit-base-patch32"

View File

@@ -973,6 +973,7 @@ DEPRECATED_OBJECTS = [
"xnli_processors", "xnli_processors",
"xnli_tasks_num_labels", "xnli_tasks_num_labels",
"TFTrainingArguments", "TFTrainingArguments",
"OwlViTFeatureExtractor",
] ]
# Exceptionally, some objects should not be documented after all rules passed. # Exceptionally, some objects should not be documented after all rules passed.