OwlViT/Owlv2 post processing standardization (#34929)
* Refactor owlvit post_process_object_detection + add text_labels
* Fix copies in grounding dino
* Sync with Owlv2 postprocessing
* Add post_process_grounded_object_detection method to processor, deprecate post_process_object_detection
* Add test cases
* Move text_labels to processors only
* [run-slow] owlvit owlv2
* [run-slow] owlvit, owlv2
* Update snippets
* Update docs structure
* Update deprecated objects for check_repo
* Update docstring for post processing of image guided object detection
This commit is contained in:
committed by
GitHub
parent
add5f0566c
commit
94ae9a8da1
@@ -50,20 +50,22 @@ OWLv2 is, just like its predecessor [OWL-ViT](owlvit), a zero-shot text-conditio
|
|||||||
|
|
||||||
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
|
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
|
||||||
>>> image = Image.open(requests.get(url, stream=True).raw)
|
>>> image = Image.open(requests.get(url, stream=True).raw)
|
||||||
>>> texts = [["a photo of a cat", "a photo of a dog"]]
|
>>> text_labels = [["a photo of a cat", "a photo of a dog"]]
|
||||||
>>> inputs = processor(text=texts, images=image, return_tensors="pt")
|
>>> inputs = processor(text=text_labels, images=image, return_tensors="pt")
|
||||||
>>> outputs = model(**inputs)
|
>>> outputs = model(**inputs)
|
||||||
|
|
||||||
>>> # Target image sizes (height, width) to rescale box predictions [batch_size, 2]
|
>>> # Target image sizes (height, width) to rescale box predictions [batch_size, 2]
|
||||||
>>> target_sizes = torch.Tensor([image.size[::-1]])
|
>>> target_sizes = torch.tensor([(image.height, image.width)])
|
||||||
>>> # Convert outputs (bounding boxes and class logits) to Pascal VOC Format (xmin, ymin, xmax, ymax)
|
>>> # Convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax)
|
||||||
>>> results = processor.post_process_object_detection(outputs=outputs, target_sizes=target_sizes, threshold=0.1)
|
>>> results = processor.post_process_grounded_object_detection(
|
||||||
>>> i = 0 # Retrieve predictions for the first image for the corresponding text queries
|
... outputs=outputs, target_sizes=target_sizes, threshold=0.1, text_labels=text_labels
|
||||||
>>> text = texts[i]
|
... )
|
||||||
>>> boxes, scores, labels = results[i]["boxes"], results[i]["scores"], results[i]["labels"]
|
>>> # Retrieve predictions for the first image for the corresponding text queries
|
||||||
>>> for box, score, label in zip(boxes, scores, labels):
|
>>> result = results[0]
|
||||||
|
>>> boxes, scores, text_labels = result["boxes"], result["scores"], result["text_labels"]
|
||||||
|
>>> for box, score, text_label in zip(boxes, scores, text_labels):
|
||||||
... box = [round(i, 2) for i in box.tolist()]
|
... box = [round(i, 2) for i in box.tolist()]
|
||||||
... print(f"Detected {text[label]} with confidence {round(score.item(), 3)} at location {box}")
|
... print(f"Detected {text_label} with confidence {round(score.item(), 3)} at location {box}")
|
||||||
Detected a photo of a cat with confidence 0.614 at location [341.67, 23.39, 642.32, 371.35]
|
Detected a photo of a cat with confidence 0.614 at location [341.67, 23.39, 642.32, 371.35]
|
||||||
Detected a photo of a cat with confidence 0.665 at location [6.75, 51.96, 326.62, 473.13]
|
Detected a photo of a cat with confidence 0.665 at location [6.75, 51.96, 326.62, 473.13]
|
||||||
```
|
```
|
||||||
@@ -103,6 +105,9 @@ Usage of OWLv2 is identical to [OWL-ViT](owlvit) with a new, updated image proce
|
|||||||
## Owlv2Processor
|
## Owlv2Processor
|
||||||
|
|
||||||
[[autodoc]] Owlv2Processor
|
[[autodoc]] Owlv2Processor
|
||||||
|
- __call__
|
||||||
|
- post_process_grounded_object_detection
|
||||||
|
- post_process_image_guided_detection
|
||||||
|
|
||||||
## Owlv2Model
|
## Owlv2Model
|
||||||
|
|
||||||
|
|||||||
@@ -49,20 +49,22 @@ OWL-ViT is a zero-shot text-conditioned object detection model. OWL-ViT uses [CL
|
|||||||
|
|
||||||
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
|
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
|
||||||
>>> image = Image.open(requests.get(url, stream=True).raw)
|
>>> image = Image.open(requests.get(url, stream=True).raw)
|
||||||
>>> texts = [["a photo of a cat", "a photo of a dog"]]
|
>>> text_labels = [["a photo of a cat", "a photo of a dog"]]
|
||||||
>>> inputs = processor(text=texts, images=image, return_tensors="pt")
|
>>> inputs = processor(text=text_labels, images=image, return_tensors="pt")
|
||||||
>>> outputs = model(**inputs)
|
>>> outputs = model(**inputs)
|
||||||
|
|
||||||
>>> # Target image sizes (height, width) to rescale box predictions [batch_size, 2]
|
>>> # Target image sizes (height, width) to rescale box predictions [batch_size, 2]
|
||||||
>>> target_sizes = torch.Tensor([image.size[::-1]])
|
>>> target_sizes = torch.tensor([(image.height, image.width)])
|
||||||
>>> # Convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax)
|
>>> # Convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax)
|
||||||
>>> results = processor.post_process_object_detection(outputs=outputs, target_sizes=target_sizes, threshold=0.1)
|
>>> results = processor.post_process_grounded_object_detection(
|
||||||
>>> i = 0 # Retrieve predictions for the first image for the corresponding text queries
|
... outputs=outputs, target_sizes=target_sizes, threshold=0.1, text_labels=text_labels
|
||||||
>>> text = texts[i]
|
... )
|
||||||
>>> boxes, scores, labels = results[i]["boxes"], results[i]["scores"], results[i]["labels"]
|
>>> # Retrieve predictions for the first image for the corresponding text queries
|
||||||
>>> for box, score, label in zip(boxes, scores, labels):
|
>>> result = results[0]
|
||||||
|
>>> boxes, scores, text_labels = result["boxes"], result["scores"], result["text_labels"]
|
||||||
|
>>> for box, score, text_label in zip(boxes, scores, text_labels):
|
||||||
... box = [round(i, 2) for i in box.tolist()]
|
... box = [round(i, 2) for i in box.tolist()]
|
||||||
... print(f"Detected {text[label]} with confidence {round(score.item(), 3)} at location {box}")
|
... print(f"Detected {text_label} with confidence {round(score.item(), 3)} at location {box}")
|
||||||
Detected a photo of a cat with confidence 0.707 at location [324.97, 20.44, 640.58, 373.29]
|
Detected a photo of a cat with confidence 0.707 at location [324.97, 20.44, 640.58, 373.29]
|
||||||
Detected a photo of a cat with confidence 0.717 at location [1.46, 55.26, 315.55, 472.17]
|
Detected a photo of a cat with confidence 0.717 at location [1.46, 55.26, 315.55, 472.17]
|
||||||
```
|
```
|
||||||
@@ -91,16 +93,12 @@ A demo notebook on using OWL-ViT for zero- and one-shot (image-guided) object de
|
|||||||
- post_process_object_detection
|
- post_process_object_detection
|
||||||
- post_process_image_guided_detection
|
- post_process_image_guided_detection
|
||||||
|
|
||||||
## OwlViTFeatureExtractor
|
|
||||||
|
|
||||||
[[autodoc]] OwlViTFeatureExtractor
|
|
||||||
- __call__
|
|
||||||
- post_process
|
|
||||||
- post_process_image_guided_detection
|
|
||||||
|
|
||||||
## OwlViTProcessor
|
## OwlViTProcessor
|
||||||
|
|
||||||
[[autodoc]] OwlViTProcessor
|
[[autodoc]] OwlViTProcessor
|
||||||
|
- __call__
|
||||||
|
- post_process_grounded_object_detection
|
||||||
|
- post_process_image_guided_detection
|
||||||
|
|
||||||
## OwlViTModel
|
## OwlViTModel
|
||||||
|
|
||||||
|
|||||||
@@ -17,7 +17,7 @@
|
|||||||
import io
|
import io
|
||||||
import pathlib
|
import pathlib
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
from typing import Any, Callable, Dict, Iterable, List, Optional, Set, Tuple, Union
|
from typing import TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Optional, Set, Tuple, Union
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
@@ -77,6 +77,9 @@ if is_scipy_available():
|
|||||||
import scipy.special
|
import scipy.special
|
||||||
import scipy.stats
|
import scipy.stats
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from .modeling_grounding_dino import GroundingDinoObjectDetectionOutput
|
||||||
|
|
||||||
|
|
||||||
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
|
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
|
||||||
|
|
||||||
@@ -806,6 +809,35 @@ def compute_segments(
|
|||||||
return segmentation, segments
|
return segmentation, segments
|
||||||
|
|
||||||
|
|
||||||
|
# Copied from transformers.models.owlvit.image_processing_owlvit._scale_boxes
|
||||||
|
def _scale_boxes(boxes, target_sizes):
|
||||||
|
"""
|
||||||
|
Scale batch of bounding boxes to the target sizes.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
boxes (`torch.Tensor` of shape `(batch_size, num_boxes, 4)`):
|
||||||
|
Bounding boxes to scale. Each box is expected to be in (x1, y1, x2, y2) format.
|
||||||
|
target_sizes (`List[Tuple[int, int]]` or `torch.Tensor` of shape `(batch_size, 2)`):
|
||||||
|
Target sizes to scale the boxes to. Each target size is expected to be in (height, width) format.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
`torch.Tensor` of shape `(batch_size, num_boxes, 4)`: Scaled bounding boxes.
|
||||||
|
"""
|
||||||
|
|
||||||
|
if isinstance(target_sizes, (list, tuple)):
|
||||||
|
image_height = torch.tensor([i[0] for i in target_sizes])
|
||||||
|
image_width = torch.tensor([i[1] for i in target_sizes])
|
||||||
|
elif isinstance(target_sizes, torch.Tensor):
|
||||||
|
image_height, image_width = target_sizes.unbind(1)
|
||||||
|
else:
|
||||||
|
raise ValueError("`target_sizes` must be a list, tuple or torch.Tensor")
|
||||||
|
|
||||||
|
scale_factor = torch.stack([image_width, image_height, image_width, image_height], dim=1)
|
||||||
|
scale_factor = scale_factor.unsqueeze(1).to(boxes.device)
|
||||||
|
boxes = boxes * scale_factor
|
||||||
|
return boxes
|
||||||
|
|
||||||
|
|
||||||
class GroundingDinoImageProcessor(BaseImageProcessor):
|
class GroundingDinoImageProcessor(BaseImageProcessor):
|
||||||
r"""
|
r"""
|
||||||
Constructs a Grounding DINO image processor.
|
Constructs a Grounding DINO image processor.
|
||||||
@@ -1533,7 +1565,10 @@ class GroundingDinoImageProcessor(BaseImageProcessor):
|
|||||||
|
|
||||||
# Copied from transformers.models.owlvit.image_processing_owlvit.OwlViTImageProcessor.post_process_object_detection with OwlViT->GroundingDino
|
# Copied from transformers.models.owlvit.image_processing_owlvit.OwlViTImageProcessor.post_process_object_detection with OwlViT->GroundingDino
|
||||||
def post_process_object_detection(
|
def post_process_object_detection(
|
||||||
self, outputs, threshold: float = 0.1, target_sizes: Union[TensorType, List[Tuple]] = None
|
self,
|
||||||
|
outputs: "GroundingDinoObjectDetectionOutput",
|
||||||
|
threshold: float = 0.1,
|
||||||
|
target_sizes: Optional[Union[TensorType, List[Tuple]]] = None,
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Converts the raw output of [`GroundingDinoForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y,
|
Converts the raw output of [`GroundingDinoForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y,
|
||||||
@@ -1542,48 +1577,43 @@ class GroundingDinoImageProcessor(BaseImageProcessor):
|
|||||||
Args:
|
Args:
|
||||||
outputs ([`GroundingDinoObjectDetectionOutput`]):
|
outputs ([`GroundingDinoObjectDetectionOutput`]):
|
||||||
Raw outputs of the model.
|
Raw outputs of the model.
|
||||||
threshold (`float`, *optional*):
|
threshold (`float`, *optional*, defaults to 0.1):
|
||||||
Score threshold to keep object detection predictions.
|
Score threshold to keep object detection predictions.
|
||||||
target_sizes (`torch.Tensor` or `List[Tuple[int, int]]`, *optional*):
|
target_sizes (`torch.Tensor` or `List[Tuple[int, int]]`, *optional*):
|
||||||
Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size
|
Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size
|
||||||
`(height, width)` of each image in the batch. If unset, predictions will not be resized.
|
`(height, width)` of each image in the batch. If unset, predictions will not be resized.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
`List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image
|
`List[Dict]`: A list of dictionaries, each dictionary containing the following keys:
|
||||||
in the batch as predicted by the model.
|
- "scores": The confidence scores for each predicted box on the image.
|
||||||
|
- "labels": Indexes of the classes predicted by the model on the image.
|
||||||
|
- "boxes": Image bounding boxes in (top_left_x, top_left_y, bottom_right_x, bottom_right_y) format.
|
||||||
"""
|
"""
|
||||||
# TODO: (amy) add support for other frameworks
|
batch_logits, batch_boxes = outputs.logits, outputs.pred_boxes
|
||||||
logits, boxes = outputs.logits, outputs.pred_boxes
|
batch_size = len(batch_logits)
|
||||||
|
|
||||||
if target_sizes is not None:
|
if target_sizes is not None and len(target_sizes) != batch_size:
|
||||||
if len(logits) != len(target_sizes):
|
raise ValueError("Make sure that you pass in as many target sizes as images")
|
||||||
raise ValueError(
|
|
||||||
"Make sure that you pass in as many target sizes as the batch dimension of the logits"
|
|
||||||
)
|
|
||||||
|
|
||||||
probs = torch.max(logits, dim=-1)
|
# batch_logits of shape (batch_size, num_queries, num_classes)
|
||||||
scores = torch.sigmoid(probs.values)
|
batch_class_logits = torch.max(batch_logits, dim=-1)
|
||||||
labels = probs.indices
|
batch_scores = torch.sigmoid(batch_class_logits.values)
|
||||||
|
batch_labels = batch_class_logits.indices
|
||||||
|
|
||||||
# Convert to [x0, y0, x1, y1] format
|
# Convert to [x0, y0, x1, y1] format
|
||||||
boxes = center_to_corners_format(boxes)
|
batch_boxes = center_to_corners_format(batch_boxes)
|
||||||
|
|
||||||
# Convert from relative [0, 1] to absolute [0, height] coordinates
|
# Convert from relative [0, 1] to absolute [0, height] coordinates
|
||||||
if target_sizes is not None:
|
if target_sizes is not None:
|
||||||
if isinstance(target_sizes, List):
|
batch_boxes = _scale_boxes(batch_boxes, target_sizes)
|
||||||
img_h = torch.Tensor([i[0] for i in target_sizes])
|
|
||||||
img_w = torch.Tensor([i[1] for i in target_sizes])
|
|
||||||
else:
|
|
||||||
img_h, img_w = target_sizes.unbind(1)
|
|
||||||
|
|
||||||
scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device)
|
|
||||||
boxes = boxes * scale_fct[:, None, :]
|
|
||||||
|
|
||||||
results = []
|
results = []
|
||||||
for s, l, b in zip(scores, labels, boxes):
|
for scores, labels, boxes in zip(batch_scores, batch_labels, batch_boxes):
|
||||||
score = s[s > threshold]
|
keep = scores > threshold
|
||||||
label = l[s > threshold]
|
scores = scores[keep]
|
||||||
box = b[s > threshold]
|
labels = labels[keep]
|
||||||
results.append({"scores": score, "labels": label, "boxes": box})
|
boxes = boxes[keep]
|
||||||
|
results.append({"scores": scores, "labels": labels, "boxes": boxes})
|
||||||
|
|
||||||
return results
|
return results
|
||||||
|
|
||||||
|
|||||||
@@ -15,7 +15,7 @@
|
|||||||
"""Image processor class for OWLv2."""
|
"""Image processor class for OWLv2."""
|
||||||
|
|
||||||
import warnings
|
import warnings
|
||||||
from typing import Dict, List, Optional, Tuple, Union
|
from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
@@ -60,10 +60,43 @@ if is_vision_available():
|
|||||||
if is_scipy_available():
|
if is_scipy_available():
|
||||||
from scipy import ndimage as ndi
|
from scipy import ndimage as ndi
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from .modeling_owlv2 import Owlv2ObjectDetectionOutput
|
||||||
|
|
||||||
logger = logging.get_logger(__name__)
|
logger = logging.get_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def _scale_boxes(boxes, target_sizes):
|
||||||
|
"""
|
||||||
|
Scale batch of bounding boxes to the target sizes.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
boxes (`torch.Tensor` of shape `(batch_size, num_boxes, 4)`):
|
||||||
|
Bounding boxes to scale. Each box is expected to be in (x1, y1, x2, y2) format.
|
||||||
|
target_sizes (`List[Tuple[int, int]]` or `torch.Tensor` of shape `(batch_size, 2)`):
|
||||||
|
Target sizes to scale the boxes to. Each target size is expected to be in (height, width) format.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
`torch.Tensor` of shape `(batch_size, num_boxes, 4)`: Scaled bounding boxes.
|
||||||
|
"""
|
||||||
|
|
||||||
|
if isinstance(target_sizes, (list, tuple)):
|
||||||
|
image_height = torch.tensor([i[0] for i in target_sizes])
|
||||||
|
image_width = torch.tensor([i[1] for i in target_sizes])
|
||||||
|
elif isinstance(target_sizes, torch.Tensor):
|
||||||
|
image_height, image_width = target_sizes.unbind(1)
|
||||||
|
else:
|
||||||
|
raise ValueError("`target_sizes` must be a list, tuple or torch.Tensor")
|
||||||
|
|
||||||
|
# Unlike OWL-ViT, OWLv2 pads the image to a square for inference, which is why boxes are scaled by the max of (height, width)
|
||||||
|
max_size = torch.max(image_height, image_width)
|
||||||
|
|
||||||
|
scale_factor = torch.stack([max_size, max_size, max_size, max_size], dim=1)
|
||||||
|
scale_factor = scale_factor.unsqueeze(1).to(boxes.device)
|
||||||
|
boxes = boxes * scale_factor
|
||||||
|
return boxes
|
||||||
|
|
||||||
|
|
||||||
# Copied from transformers.models.owlvit.image_processing_owlvit._upcast
|
# Copied from transformers.models.owlvit.image_processing_owlvit._upcast
|
||||||
def _upcast(t):
|
def _upcast(t):
|
||||||
# Protects from numerical overflows in multiplications by upcasting to the equivalent higher type
|
# Protects from numerical overflows in multiplications by upcasting to the equivalent higher type
|
||||||
@@ -466,62 +499,57 @@ class Owlv2ImageProcessor(BaseImageProcessor):
|
|||||||
data = {"pixel_values": images}
|
data = {"pixel_values": images}
|
||||||
return BatchFeature(data=data, tensor_type=return_tensors)
|
return BatchFeature(data=data, tensor_type=return_tensors)
|
||||||
|
|
||||||
|
# Copied from transformers.models.owlvit.image_processing_owlvit.OwlViTImageProcessor.post_process_object_detection with OwlViT->Owlv2
|
||||||
def post_process_object_detection(
|
def post_process_object_detection(
|
||||||
self, outputs, threshold: float = 0.1, target_sizes: Union[TensorType, List[Tuple]] = None
|
self,
|
||||||
|
outputs: "Owlv2ObjectDetectionOutput",
|
||||||
|
threshold: float = 0.1,
|
||||||
|
target_sizes: Optional[Union[TensorType, List[Tuple]]] = None,
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Converts the raw output of [`OwlViTForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y,
|
Converts the raw output of [`Owlv2ForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y,
|
||||||
bottom_right_x, bottom_right_y) format.
|
bottom_right_x, bottom_right_y) format.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
outputs ([`OwlViTObjectDetectionOutput`]):
|
outputs ([`Owlv2ObjectDetectionOutput`]):
|
||||||
Raw outputs of the model.
|
Raw outputs of the model.
|
||||||
threshold (`float`, *optional*):
|
threshold (`float`, *optional*, defaults to 0.1):
|
||||||
Score threshold to keep object detection predictions.
|
Score threshold to keep object detection predictions.
|
||||||
target_sizes (`torch.Tensor` or `List[Tuple[int, int]]`, *optional*):
|
target_sizes (`torch.Tensor` or `List[Tuple[int, int]]`, *optional*):
|
||||||
Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size
|
Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size
|
||||||
`(height, width)` of each image in the batch. If unset, predictions will not be resized.
|
`(height, width)` of each image in the batch. If unset, predictions will not be resized.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
`List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image
|
`List[Dict]`: A list of dictionaries, each dictionary containing the following keys:
|
||||||
in the batch as predicted by the model.
|
- "scores": The confidence scores for each predicted box on the image.
|
||||||
|
- "labels": Indexes of the classes predicted by the model on the image.
|
||||||
|
- "boxes": Image bounding boxes in (top_left_x, top_left_y, bottom_right_x, bottom_right_y) format.
|
||||||
"""
|
"""
|
||||||
# TODO: (amy) add support for other frameworks
|
batch_logits, batch_boxes = outputs.logits, outputs.pred_boxes
|
||||||
logits, boxes = outputs.logits, outputs.pred_boxes
|
batch_size = len(batch_logits)
|
||||||
|
|
||||||
if target_sizes is not None:
|
if target_sizes is not None and len(target_sizes) != batch_size:
|
||||||
if len(logits) != len(target_sizes):
|
raise ValueError("Make sure that you pass in as many target sizes as images")
|
||||||
raise ValueError(
|
|
||||||
"Make sure that you pass in as many target sizes as the batch dimension of the logits"
|
|
||||||
)
|
|
||||||
|
|
||||||
probs = torch.max(logits, dim=-1)
|
# batch_logits of shape (batch_size, num_queries, num_classes)
|
||||||
scores = torch.sigmoid(probs.values)
|
batch_class_logits = torch.max(batch_logits, dim=-1)
|
||||||
labels = probs.indices
|
batch_scores = torch.sigmoid(batch_class_logits.values)
|
||||||
|
batch_labels = batch_class_logits.indices
|
||||||
|
|
||||||
# Convert to [x0, y0, x1, y1] format
|
# Convert to [x0, y0, x1, y1] format
|
||||||
boxes = center_to_corners_format(boxes)
|
batch_boxes = center_to_corners_format(batch_boxes)
|
||||||
|
|
||||||
# Convert from relative [0, 1] to absolute [0, height] coordinates
|
# Convert from relative [0, 1] to absolute pixel coordinates
|
||||||
if target_sizes is not None:
|
if target_sizes is not None:
|
||||||
if isinstance(target_sizes, List):
|
batch_boxes = _scale_boxes(batch_boxes, target_sizes)
|
||||||
img_h = torch.Tensor([i[0] for i in target_sizes])
|
|
||||||
img_w = torch.Tensor([i[1] for i in target_sizes])
|
|
||||||
else:
|
|
||||||
img_h, img_w = target_sizes.unbind(1)
|
|
||||||
|
|
||||||
# Rescale coordinates, image is padded to square for inference,
|
|
||||||
# that is why we need to scale boxes to the max size
|
|
||||||
size = torch.max(img_h, img_w)
|
|
||||||
scale_fct = torch.stack([size, size, size, size], dim=1).to(boxes.device)
|
|
||||||
|
|
||||||
boxes = boxes * scale_fct[:, None, :]
|
|
||||||
|
|
||||||
results = []
|
results = []
|
||||||
for s, l, b in zip(scores, labels, boxes):
|
for scores, labels, boxes in zip(batch_scores, batch_labels, batch_boxes):
|
||||||
score = s[s > threshold]
|
keep = scores > threshold
|
||||||
label = l[s > threshold]
|
scores = scores[keep]
|
||||||
box = b[s > threshold]
|
labels = labels[keep]
|
||||||
results.append({"scores": score, "labels": label, "boxes": box})
|
boxes = boxes[keep]
|
||||||
|
results.append({"scores": scores, "labels": labels, "boxes": boxes})
|
||||||
|
|
||||||
return results
|
return results
|
||||||
|
|
||||||
@@ -574,13 +602,7 @@ class Owlv2ImageProcessor(BaseImageProcessor):
|
|||||||
|
|
||||||
# Convert from relative [0, 1] to absolute [0, height] coordinates
|
# Convert from relative [0, 1] to absolute [0, height] coordinates
|
||||||
if target_sizes is not None:
|
if target_sizes is not None:
|
||||||
if isinstance(target_sizes, List):
|
target_boxes = _scale_boxes(target_boxes, target_sizes)
|
||||||
img_h = torch.tensor([i[0] for i in target_sizes])
|
|
||||||
img_w = torch.tensor([i[1] for i in target_sizes])
|
|
||||||
else:
|
|
||||||
img_h, img_w = target_sizes.unbind(1)
|
|
||||||
scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(target_boxes.device)
|
|
||||||
target_boxes = target_boxes * scale_fct[:, None, :]
|
|
||||||
|
|
||||||
# Compute box display alphas based on prediction scores
|
# Compute box display alphas based on prediction scores
|
||||||
results = []
|
results = []
|
||||||
|
|||||||
@@ -1749,33 +1749,30 @@ class Owlv2ForObjectDetection(Owlv2PreTrainedModel):
|
|||||||
>>> import requests
|
>>> import requests
|
||||||
>>> from PIL import Image
|
>>> from PIL import Image
|
||||||
>>> import torch
|
>>> import torch
|
||||||
>>> from transformers import AutoProcessor, Owlv2ForObjectDetection
|
|
||||||
|
|
||||||
>>> processor = AutoProcessor.from_pretrained("google/owlv2-base-patch16-ensemble")
|
>>> from transformers import Owlv2Processor, Owlv2ForObjectDetection
|
||||||
|
|
||||||
|
>>> processor = Owlv2Processor.from_pretrained("google/owlv2-base-patch16-ensemble")
|
||||||
>>> model = Owlv2ForObjectDetection.from_pretrained("google/owlv2-base-patch16-ensemble")
|
>>> model = Owlv2ForObjectDetection.from_pretrained("google/owlv2-base-patch16-ensemble")
|
||||||
|
|
||||||
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
|
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
|
||||||
>>> image = Image.open(requests.get(url, stream=True).raw)
|
>>> image = Image.open(requests.get(url, stream=True).raw)
|
||||||
>>> texts = [["a photo of a cat", "a photo of a dog"]]
|
>>> text_labels = [["a photo of a cat", "a photo of a dog"]]
|
||||||
>>> inputs = processor(text=texts, images=image, return_tensors="pt")
|
>>> inputs = processor(text=text_labels, images=image, return_tensors="pt")
|
||||||
|
>>> outputs = model(**inputs)
|
||||||
|
|
||||||
>>> # forward pass
|
>>> # Target image sizes (height, width) to rescale box predictions [batch_size, 2]
|
||||||
>>> with torch.no_grad():
|
>>> target_sizes = torch.tensor([(image.height, image.width)])
|
||||||
... outputs = model(**inputs)
|
>>> # Convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax)
|
||||||
|
>>> results = processor.post_process_grounded_object_detection(
|
||||||
>>> target_sizes = torch.Tensor([image.size[::-1]])
|
... outputs=outputs, target_sizes=target_sizes, threshold=0.1, text_labels=text_labels
|
||||||
>>> # Convert outputs (bounding boxes and class logits) to final bounding boxes and scores
|
|
||||||
>>> results = processor.post_process_object_detection(
|
|
||||||
... outputs=outputs, threshold=0.2, target_sizes=target_sizes
|
|
||||||
... )
|
... )
|
||||||
|
>>> # Retrieve predictions for the first image for the corresponding text queries
|
||||||
>>> i = 0 # Retrieve predictions for the first image for the corresponding text queries
|
>>> result = results[0]
|
||||||
>>> text = texts[i]
|
>>> boxes, scores, text_labels = result["boxes"], result["scores"], result["text_labels"]
|
||||||
>>> boxes, scores, labels = results[i]["boxes"], results[i]["scores"], results[i]["labels"]
|
>>> for box, score, text_label in zip(boxes, scores, text_labels):
|
||||||
|
|
||||||
>>> for box, score, label in zip(boxes, scores, labels):
|
|
||||||
... box = [round(i, 2) for i in box.tolist()]
|
... box = [round(i, 2) for i in box.tolist()]
|
||||||
... print(f"Detected {text[label]} with confidence {round(score.item(), 3)} at location {box}")
|
... print(f"Detected {text_label} with confidence {round(score.item(), 3)} at location {box}")
|
||||||
Detected a photo of a cat with confidence 0.614 at location [341.67, 23.39, 642.32, 371.35]
|
Detected a photo of a cat with confidence 0.614 at location [341.67, 23.39, 642.32, 371.35]
|
||||||
Detected a photo of a cat with confidence 0.665 at location [6.75, 51.96, 326.62, 473.13]
|
Detected a photo of a cat with confidence 0.665 at location [6.75, 51.96, 326.62, 473.13]
|
||||||
```"""
|
```"""
|
||||||
|
|||||||
@@ -16,13 +16,18 @@
|
|||||||
Image/Text processor class for OWLv2
|
Image/Text processor class for OWLv2
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from typing import List
|
import warnings
|
||||||
|
from typing import TYPE_CHECKING, List, Optional, Tuple, Union
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
from ...processing_utils import ProcessorMixin
|
from ...processing_utils import ProcessorMixin
|
||||||
from ...tokenization_utils_base import BatchEncoding
|
from ...tokenization_utils_base import BatchEncoding
|
||||||
from ...utils import is_flax_available, is_tf_available, is_torch_available
|
from ...utils import TensorType, is_flax_available, is_tf_available, is_torch_available
|
||||||
|
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from .modeling_owlv2 import Owlv2ImageGuidedObjectDetectionOutput, Owlv2ObjectDetectionOutput
|
||||||
|
|
||||||
|
|
||||||
class Owlv2Processor(ProcessorMixin):
|
class Owlv2Processor(ProcessorMixin):
|
||||||
@@ -45,7 +50,7 @@ class Owlv2Processor(ProcessorMixin):
|
|||||||
def __init__(self, image_processor, tokenizer, **kwargs):
|
def __init__(self, image_processor, tokenizer, **kwargs):
|
||||||
super().__init__(image_processor, tokenizer)
|
super().__init__(image_processor, tokenizer)
|
||||||
|
|
||||||
# Copied from transformers.models.owlvit.processing_owlvit.OwlViTProcessor.__call__ with OWLViT->OWLv2
|
# Copied from transformers.models.owlvit.processing_owlvit.OwlViTProcessor.__call__ with OwlViT->Owlv2
|
||||||
def __call__(self, text=None, images=None, query_images=None, padding="max_length", return_tensors="np", **kwargs):
|
def __call__(self, text=None, images=None, query_images=None, padding="max_length", return_tensors="np", **kwargs):
|
||||||
"""
|
"""
|
||||||
Main method to prepare for the model one or several text(s) and image(s). This method forwards the `text` and
|
Main method to prepare for the model one or several text(s) and image(s). This method forwards the `text` and
|
||||||
@@ -157,21 +162,101 @@ class Owlv2Processor(ProcessorMixin):
|
|||||||
else:
|
else:
|
||||||
return BatchEncoding(data=dict(**image_features), tensor_type=return_tensors)
|
return BatchEncoding(data=dict(**image_features), tensor_type=return_tensors)
|
||||||
|
|
||||||
# Copied from transformers.models.owlvit.processing_owlvit.OwlViTProcessor.post_process_object_detection with OWLViT->OWLv2
|
# Copied from transformers.models.owlvit.processing_owlvit.OwlViTProcessor.post_process_object_detection with OwlViT->Owlv2
|
||||||
def post_process_object_detection(self, *args, **kwargs):
|
def post_process_object_detection(self, *args, **kwargs):
|
||||||
"""
|
"""
|
||||||
This method forwards all its arguments to [`OwlViTImageProcessor.post_process_object_detection`]. Please refer
|
This method forwards all its arguments to [`Owlv2ImageProcessor.post_process_object_detection`]. Please refer
|
||||||
to the docstring of this method for more information.
|
to the docstring of this method for more information.
|
||||||
"""
|
"""
|
||||||
|
warnings.warn(
|
||||||
|
"`post_process_object_detection` method is deprecated for OwlVitProcessor and will be removed in v5. "
|
||||||
|
"Use `post_process_grounded_object_detection` instead.",
|
||||||
|
FutureWarning,
|
||||||
|
)
|
||||||
return self.image_processor.post_process_object_detection(*args, **kwargs)
|
return self.image_processor.post_process_object_detection(*args, **kwargs)
|
||||||
|
|
||||||
# Copied from transformers.models.owlvit.processing_owlvit.OwlViTProcessor.post_process_image_guided_detection with OWLViT->OWLv2
|
# Copied from transformers.models.owlvit.processing_owlvit.OwlViTProcessor.post_process_grounded_object_detection with OwlViT->Owlv2
|
||||||
def post_process_image_guided_detection(self, *args, **kwargs):
|
def post_process_grounded_object_detection(
|
||||||
|
self,
|
||||||
|
outputs: "Owlv2ObjectDetectionOutput",
|
||||||
|
threshold: float = 0.1,
|
||||||
|
target_sizes: Optional[Union[TensorType, List[Tuple]]] = None,
|
||||||
|
text_labels: Optional[List[List[str]]] = None,
|
||||||
|
):
|
||||||
"""
|
"""
|
||||||
This method forwards all its arguments to [`OwlViTImageProcessor.post_process_one_shot_object_detection`].
|
Converts the raw output of [`Owlv2ForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y,
|
||||||
Please refer to the docstring of this method for more information.
|
bottom_right_x, bottom_right_y) format.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
outputs ([`Owlv2ObjectDetectionOutput`]):
|
||||||
|
Raw outputs of the model.
|
||||||
|
threshold (`float`, *optional*, defaults to 0.1):
|
||||||
|
Score threshold to keep object detection predictions.
|
||||||
|
target_sizes (`torch.Tensor` or `List[Tuple[int, int]]`, *optional*):
|
||||||
|
Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size
|
||||||
|
`(height, width)` of each image in the batch. If unset, predictions will not be resized.
|
||||||
|
text_labels (`List[List[str]]`, *optional*):
|
||||||
|
List of lists of text labels for each image in the batch. If unset, "text_labels" in output will be
|
||||||
|
set to `None`.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
`List[Dict]`: A list of dictionaries, each dictionary containing the following keys:
|
||||||
|
- "scores": The confidence scores for each predicted box on the image.
|
||||||
|
- "labels": Indexes of the classes predicted by the model on the image.
|
||||||
|
- "boxes": Image bounding boxes in (top_left_x, top_left_y, bottom_right_x, bottom_right_y) format.
|
||||||
|
- "text_labels": The text labels for each predicted bounding box on the image.
|
||||||
"""
|
"""
|
||||||
return self.image_processor.post_process_image_guided_detection(*args, **kwargs)
|
output = self.image_processor.post_process_object_detection(
|
||||||
|
outputs=outputs, threshold=threshold, target_sizes=target_sizes
|
||||||
|
)
|
||||||
|
|
||||||
|
if text_labels is not None and len(text_labels) != len(output):
|
||||||
|
raise ValueError("Make sure that you pass in as many lists of text labels as images")
|
||||||
|
|
||||||
|
# adding text labels to the output
|
||||||
|
if text_labels is not None:
|
||||||
|
for image_output, image_text_labels in zip(output, text_labels):
|
||||||
|
object_text_labels = [image_text_labels[i] for i in image_output["labels"]]
|
||||||
|
image_output["text_labels"] = object_text_labels
|
||||||
|
else:
|
||||||
|
for image_output in output:
|
||||||
|
image_output["text_labels"] = None
|
||||||
|
|
||||||
|
return output
|
||||||
|
|
||||||
|
# Copied from transformers.models.owlvit.processing_owlvit.OwlViTProcessor.post_process_image_guided_detection with OwlViT->Owlv2
|
||||||
|
def post_process_image_guided_detection(
|
||||||
|
self,
|
||||||
|
outputs: "Owlv2ImageGuidedObjectDetectionOutput",
|
||||||
|
threshold: float = 0.0,
|
||||||
|
nms_threshold: float = 0.3,
|
||||||
|
target_sizes: Optional[Union[TensorType, List[Tuple]]] = None,
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
Converts the output of [`Owlv2ForObjectDetection.image_guided_detection`] into the format expected by the COCO
|
||||||
|
api.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
outputs ([`Owlv2ImageGuidedObjectDetectionOutput`]):
|
||||||
|
Raw outputs of the model.
|
||||||
|
threshold (`float`, *optional*, defaults to 0.0):
|
||||||
|
Minimum confidence threshold to use to filter out predicted boxes.
|
||||||
|
nms_threshold (`float`, *optional*, defaults to 0.3):
|
||||||
|
IoU threshold for non-maximum suppression of overlapping boxes.
|
||||||
|
target_sizes (`torch.Tensor`, *optional*):
|
||||||
|
Tensor of shape (batch_size, 2) where each entry is the (height, width) of the corresponding image in
|
||||||
|
the batch. If set, predicted normalized bounding boxes are rescaled to the target sizes. If left to
|
||||||
|
None, predictions will not be unnormalized.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
`List[Dict]`: A list of dictionaries, each dictionary containing the following keys:
|
||||||
|
- "scores": The confidence scores for each predicted box on the image.
|
||||||
|
- "boxes": Image bounding boxes in (top_left_x, top_left_y, bottom_right_x, bottom_right_y) format.
|
||||||
|
- "labels": Set to `None`.
|
||||||
|
"""
|
||||||
|
return self.image_processor.post_process_image_guided_detection(
|
||||||
|
outputs=outputs, threshold=threshold, nms_threshold=nms_threshold, target_sizes=target_sizes
|
||||||
|
)
|
||||||
|
|
||||||
# Copied from transformers.models.owlvit.processing_owlvit.OwlViTProcessor.batch_decode
|
# Copied from transformers.models.owlvit.processing_owlvit.OwlViTProcessor.batch_decode
|
||||||
def batch_decode(self, *args, **kwargs):
|
def batch_decode(self, *args, **kwargs):
|
||||||
|
|||||||
@@ -15,7 +15,7 @@
|
|||||||
"""Image processor class for OwlViT"""
|
"""Image processor class for OwlViT"""
|
||||||
|
|
||||||
import warnings
|
import warnings
|
||||||
from typing import Dict, List, Optional, Tuple, Union
|
from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
@@ -43,6 +43,9 @@ from ...image_utils import (
|
|||||||
from ...utils import TensorType, filter_out_non_signature_kwargs, is_torch_available, logging
|
from ...utils import TensorType, filter_out_non_signature_kwargs, is_torch_available, logging
|
||||||
|
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from .modeling_owlvit import OwlViTObjectDetectionOutput
|
||||||
|
|
||||||
if is_torch_available():
|
if is_torch_available():
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
@@ -58,6 +61,34 @@ def _upcast(t):
|
|||||||
return t if t.dtype in (torch.int32, torch.int64) else t.int()
|
return t if t.dtype in (torch.int32, torch.int64) else t.int()
|
||||||
|
|
||||||
|
|
||||||
|
def _scale_boxes(boxes, target_sizes):
|
||||||
|
"""
|
||||||
|
Scale batch of bounding boxes to the target sizes.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
boxes (`torch.Tensor` of shape `(batch_size, num_boxes, 4)`):
|
||||||
|
Bounding boxes to scale. Each box is expected to be in (x1, y1, x2, y2) format.
|
||||||
|
target_sizes (`List[Tuple[int, int]]` or `torch.Tensor` of shape `(batch_size, 2)`):
|
||||||
|
Target sizes to scale the boxes to. Each target size is expected to be in (height, width) format.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
`torch.Tensor` of shape `(batch_size, num_boxes, 4)`: Scaled bounding boxes.
|
||||||
|
"""
|
||||||
|
|
||||||
|
if isinstance(target_sizes, (list, tuple)):
|
||||||
|
image_height = torch.tensor([i[0] for i in target_sizes])
|
||||||
|
image_width = torch.tensor([i[1] for i in target_sizes])
|
||||||
|
elif isinstance(target_sizes, torch.Tensor):
|
||||||
|
image_height, image_width = target_sizes.unbind(1)
|
||||||
|
else:
|
||||||
|
raise ValueError("`target_sizes` must be a list, tuple or torch.Tensor")
|
||||||
|
|
||||||
|
scale_factor = torch.stack([image_width, image_height, image_width, image_height], dim=1)
|
||||||
|
scale_factor = scale_factor.unsqueeze(1).to(boxes.device)
|
||||||
|
boxes = boxes * scale_factor
|
||||||
|
return boxes
|
||||||
|
|
||||||
|
|
||||||
def box_area(boxes):
|
def box_area(boxes):
|
||||||
"""
|
"""
|
||||||
Computes the area of a set of bounding boxes, which are specified by its (x1, y1, x2, y2) coordinates.
|
Computes the area of a set of bounding boxes, which are specified by its (x1, y1, x2, y2) coordinates.
|
||||||
@@ -459,7 +490,10 @@ class OwlViTImageProcessor(BaseImageProcessor):
|
|||||||
return results
|
return results
|
||||||
|
|
||||||
def post_process_object_detection(
|
def post_process_object_detection(
|
||||||
self, outputs, threshold: float = 0.1, target_sizes: Union[TensorType, List[Tuple]] = None
|
self,
|
||||||
|
outputs: "OwlViTObjectDetectionOutput",
|
||||||
|
threshold: float = 0.1,
|
||||||
|
target_sizes: Optional[Union[TensorType, List[Tuple]]] = None,
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Converts the raw output of [`OwlViTForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y,
|
Converts the raw output of [`OwlViTForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y,
|
||||||
@@ -468,52 +502,46 @@ class OwlViTImageProcessor(BaseImageProcessor):
|
|||||||
Args:
|
Args:
|
||||||
outputs ([`OwlViTObjectDetectionOutput`]):
|
outputs ([`OwlViTObjectDetectionOutput`]):
|
||||||
Raw outputs of the model.
|
Raw outputs of the model.
|
||||||
threshold (`float`, *optional*):
|
threshold (`float`, *optional*, defaults to 0.1):
|
||||||
Score threshold to keep object detection predictions.
|
Score threshold to keep object detection predictions.
|
||||||
target_sizes (`torch.Tensor` or `List[Tuple[int, int]]`, *optional*):
|
target_sizes (`torch.Tensor` or `List[Tuple[int, int]]`, *optional*):
|
||||||
Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size
|
Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size
|
||||||
`(height, width)` of each image in the batch. If unset, predictions will not be resized.
|
`(height, width)` of each image in the batch. If unset, predictions will not be resized.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
`List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image
|
`List[Dict]`: A list of dictionaries, each dictionary containing the following keys:
|
||||||
in the batch as predicted by the model.
|
- "scores": The confidence scores for each predicted box on the image.
|
||||||
|
- "labels": Indexes of the classes predicted by the model on the image.
|
||||||
|
- "boxes": Image bounding boxes in (top_left_x, top_left_y, bottom_right_x, bottom_right_y) format.
|
||||||
"""
|
"""
|
||||||
# TODO: (amy) add support for other frameworks
|
batch_logits, batch_boxes = outputs.logits, outputs.pred_boxes
|
||||||
logits, boxes = outputs.logits, outputs.pred_boxes
|
batch_size = len(batch_logits)
|
||||||
|
|
||||||
if target_sizes is not None:
|
if target_sizes is not None and len(target_sizes) != batch_size:
|
||||||
if len(logits) != len(target_sizes):
|
raise ValueError("Make sure that you pass in as many target sizes as images")
|
||||||
raise ValueError(
|
|
||||||
"Make sure that you pass in as many target sizes as the batch dimension of the logits"
|
|
||||||
)
|
|
||||||
|
|
||||||
probs = torch.max(logits, dim=-1)
|
# batch_logits of shape (batch_size, num_queries, num_classes)
|
||||||
scores = torch.sigmoid(probs.values)
|
batch_class_logits = torch.max(batch_logits, dim=-1)
|
||||||
labels = probs.indices
|
batch_scores = torch.sigmoid(batch_class_logits.values)
|
||||||
|
batch_labels = batch_class_logits.indices
|
||||||
|
|
||||||
# Convert to [x0, y0, x1, y1] format
|
# Convert to [x0, y0, x1, y1] format
|
||||||
boxes = center_to_corners_format(boxes)
|
batch_boxes = center_to_corners_format(batch_boxes)
|
||||||
|
|
||||||
# Convert from relative [0, 1] to absolute [0, height] coordinates
|
# Convert from relative [0, 1] to absolute [0, height] coordinates
|
||||||
if target_sizes is not None:
|
if target_sizes is not None:
|
||||||
if isinstance(target_sizes, List):
|
batch_boxes = _scale_boxes(batch_boxes, target_sizes)
|
||||||
img_h = torch.Tensor([i[0] for i in target_sizes])
|
|
||||||
img_w = torch.Tensor([i[1] for i in target_sizes])
|
|
||||||
else:
|
|
||||||
img_h, img_w = target_sizes.unbind(1)
|
|
||||||
|
|
||||||
scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device)
|
|
||||||
boxes = boxes * scale_fct[:, None, :]
|
|
||||||
|
|
||||||
results = []
|
results = []
|
||||||
for s, l, b in zip(scores, labels, boxes):
|
for scores, labels, boxes in zip(batch_scores, batch_labels, batch_boxes):
|
||||||
score = s[s > threshold]
|
keep = scores > threshold
|
||||||
label = l[s > threshold]
|
scores = scores[keep]
|
||||||
box = b[s > threshold]
|
labels = labels[keep]
|
||||||
results.append({"scores": score, "labels": label, "boxes": box})
|
boxes = boxes[keep]
|
||||||
|
results.append({"scores": scores, "labels": labels, "boxes": boxes})
|
||||||
|
|
||||||
return results
|
return results
|
||||||
|
|
||||||
# TODO: (Amy) Make compatible with other frameworks
|
|
||||||
def post_process_image_guided_detection(self, outputs, threshold=0.0, nms_threshold=0.3, target_sizes=None):
|
def post_process_image_guided_detection(self, outputs, threshold=0.0, nms_threshold=0.3, target_sizes=None):
|
||||||
"""
|
"""
|
||||||
Converts the output of [`OwlViTForObjectDetection.image_guided_detection`] into the format expected by the COCO
|
Converts the output of [`OwlViTForObjectDetection.image_guided_detection`] into the format expected by the COCO
|
||||||
@@ -562,13 +590,7 @@ class OwlViTImageProcessor(BaseImageProcessor):
|
|||||||
|
|
||||||
# Convert from relative [0, 1] to absolute [0, height] coordinates
|
# Convert from relative [0, 1] to absolute [0, height] coordinates
|
||||||
if target_sizes is not None:
|
if target_sizes is not None:
|
||||||
if isinstance(target_sizes, List):
|
target_boxes = _scale_boxes(target_boxes, target_sizes)
|
||||||
img_h = torch.tensor([i[0] for i in target_sizes])
|
|
||||||
img_w = torch.tensor([i[1] for i in target_sizes])
|
|
||||||
else:
|
|
||||||
img_h, img_w = target_sizes.unbind(1)
|
|
||||||
scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(target_boxes.device)
|
|
||||||
target_boxes = target_boxes * scale_fct[:, None, :]
|
|
||||||
|
|
||||||
# Compute box display alphas based on prediction scores
|
# Compute box display alphas based on prediction scores
|
||||||
results = []
|
results = []
|
||||||
|
|||||||
@@ -1689,31 +1689,30 @@ class OwlViTForObjectDetection(OwlViTPreTrainedModel):
|
|||||||
>>> import requests
|
>>> import requests
|
||||||
>>> from PIL import Image
|
>>> from PIL import Image
|
||||||
>>> import torch
|
>>> import torch
|
||||||
>>> from transformers import AutoProcessor, OwlViTForObjectDetection
|
|
||||||
|
|
||||||
>>> processor = AutoProcessor.from_pretrained("google/owlvit-base-patch32")
|
>>> from transformers import OwlViTProcessor, OwlViTForObjectDetection
|
||||||
|
|
||||||
|
>>> processor = OwlViTProcessor.from_pretrained("google/owlvit-base-patch32")
|
||||||
>>> model = OwlViTForObjectDetection.from_pretrained("google/owlvit-base-patch32")
|
>>> model = OwlViTForObjectDetection.from_pretrained("google/owlvit-base-patch32")
|
||||||
|
|
||||||
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
|
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
|
||||||
>>> image = Image.open(requests.get(url, stream=True).raw)
|
>>> image = Image.open(requests.get(url, stream=True).raw)
|
||||||
>>> texts = [["a photo of a cat", "a photo of a dog"]]
|
>>> text_labels = [["a photo of a cat", "a photo of a dog"]]
|
||||||
>>> inputs = processor(text=texts, images=image, return_tensors="pt")
|
>>> inputs = processor(text=text_labels, images=image, return_tensors="pt")
|
||||||
>>> outputs = model(**inputs)
|
>>> outputs = model(**inputs)
|
||||||
|
|
||||||
>>> # Target image sizes (height, width) to rescale box predictions [batch_size, 2]
|
>>> # Target image sizes (height, width) to rescale box predictions [batch_size, 2]
|
||||||
>>> target_sizes = torch.Tensor([image.size[::-1]])
|
>>> target_sizes = torch.tensor([(image.height, image.width)])
|
||||||
>>> # Convert outputs (bounding boxes and class logits) to final bounding boxes and scores
|
>>> # Convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax)
|
||||||
>>> results = processor.post_process_object_detection(
|
>>> results = processor.post_process_grounded_object_detection(
|
||||||
... outputs=outputs, threshold=0.1, target_sizes=target_sizes
|
... outputs=outputs, target_sizes=target_sizes, threshold=0.1, text_labels=text_labels
|
||||||
... )
|
... )
|
||||||
|
>>> # Retrieve predictions for the first image for the corresponding text queries
|
||||||
>>> i = 0 # Retrieve predictions for the first image for the corresponding text queries
|
>>> result = results[0]
|
||||||
>>> text = texts[i]
|
>>> boxes, scores, text_labels = result["boxes"], result["scores"], result["text_labels"]
|
||||||
>>> boxes, scores, labels = results[i]["boxes"], results[i]["scores"], results[i]["labels"]
|
>>> for box, score, text_label in zip(boxes, scores, text_labels):
|
||||||
|
|
||||||
>>> for box, score, label in zip(boxes, scores, labels):
|
|
||||||
... box = [round(i, 2) for i in box.tolist()]
|
... box = [round(i, 2) for i in box.tolist()]
|
||||||
... print(f"Detected {text[label]} with confidence {round(score.item(), 3)} at location {box}")
|
... print(f"Detected {text_label} with confidence {round(score.item(), 3)} at location {box}")
|
||||||
Detected a photo of a cat with confidence 0.707 at location [324.97, 20.44, 640.58, 373.29]
|
Detected a photo of a cat with confidence 0.707 at location [324.97, 20.44, 640.58, 373.29]
|
||||||
Detected a photo of a cat with confidence 0.717 at location [1.46, 55.26, 315.55, 472.17]
|
Detected a photo of a cat with confidence 0.717 at location [1.46, 55.26, 315.55, 472.17]
|
||||||
```"""
|
```"""
|
||||||
|
|||||||
@@ -17,13 +17,17 @@ Image/Text processor class for OWL-ViT
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
import warnings
|
import warnings
|
||||||
from typing import List
|
from typing import TYPE_CHECKING, List, Optional, Tuple, Union
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
from ...processing_utils import ProcessorMixin
|
from ...processing_utils import ProcessorMixin
|
||||||
from ...tokenization_utils_base import BatchEncoding
|
from ...tokenization_utils_base import BatchEncoding
|
||||||
from ...utils import is_flax_available, is_tf_available, is_torch_available
|
from ...utils import TensorType, is_flax_available, is_tf_available, is_torch_available
|
||||||
|
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from .modeling_owlvit import OwlViTImageGuidedObjectDetectionOutput, OwlViTObjectDetectionOutput
|
||||||
|
|
||||||
|
|
||||||
class OwlViTProcessor(ProcessorMixin):
|
class OwlViTProcessor(ProcessorMixin):
|
||||||
@@ -184,14 +188,93 @@ class OwlViTProcessor(ProcessorMixin):
|
|||||||
This method forwards all its arguments to [`OwlViTImageProcessor.post_process_object_detection`]. Please refer
|
This method forwards all its arguments to [`OwlViTImageProcessor.post_process_object_detection`]. Please refer
|
||||||
to the docstring of this method for more information.
|
to the docstring of this method for more information.
|
||||||
"""
|
"""
|
||||||
|
warnings.warn(
|
||||||
|
"`post_process_object_detection` method is deprecated for OwlViTProcessor and will be removed in v5. "
|
||||||
|
"Use `post_process_grounded_object_detection` instead.",
|
||||||
|
FutureWarning,
|
||||||
|
)
|
||||||
return self.image_processor.post_process_object_detection(*args, **kwargs)
|
return self.image_processor.post_process_object_detection(*args, **kwargs)
|
||||||
|
|
||||||
def post_process_image_guided_detection(self, *args, **kwargs):
|
def post_process_grounded_object_detection(
|
||||||
|
self,
|
||||||
|
outputs: "OwlViTObjectDetectionOutput",
|
||||||
|
threshold: float = 0.1,
|
||||||
|
target_sizes: Optional[Union[TensorType, List[Tuple]]] = None,
|
||||||
|
text_labels: Optional[List[List[str]]] = None,
|
||||||
|
):
|
||||||
"""
|
"""
|
||||||
This method forwards all its arguments to [`OwlViTImageProcessor.post_process_one_shot_object_detection`].
|
Converts the raw output of [`OwlViTForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y,
|
||||||
Please refer to the docstring of this method for more information.
|
bottom_right_x, bottom_right_y) format.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
outputs ([`OwlViTObjectDetectionOutput`]):
|
||||||
|
Raw outputs of the model.
|
||||||
|
threshold (`float`, *optional*, defaults to 0.1):
|
||||||
|
Score threshold to keep object detection predictions.
|
||||||
|
target_sizes (`torch.Tensor` or `List[Tuple[int, int]]`, *optional*):
|
||||||
|
Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size
|
||||||
|
`(height, width)` of each image in the batch. If unset, predictions will not be resized.
|
||||||
|
text_labels (`List[List[str]]`, *optional*):
|
||||||
|
List of lists of text labels for each image in the batch. If unset, "text_labels" in output will be
|
||||||
|
set to `None`.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
`List[Dict]`: A list of dictionaries, each dictionary containing the following keys:
|
||||||
|
- "scores": The confidence scores for each predicted box on the image.
|
||||||
|
- "labels": Indexes of the classes predicted by the model on the image.
|
||||||
|
- "boxes": Image bounding boxes in (top_left_x, top_left_y, bottom_right_x, bottom_right_y) format.
|
||||||
|
- "text_labels": The text labels for each predicted bounding box on the image.
|
||||||
"""
|
"""
|
||||||
return self.image_processor.post_process_image_guided_detection(*args, **kwargs)
|
output = self.image_processor.post_process_object_detection(
|
||||||
|
outputs=outputs, threshold=threshold, target_sizes=target_sizes
|
||||||
|
)
|
||||||
|
|
||||||
|
if text_labels is not None and len(text_labels) != len(output):
|
||||||
|
raise ValueError("Make sure that you pass in as many lists of text labels as images")
|
||||||
|
|
||||||
|
# adding text labels to the output
|
||||||
|
if text_labels is not None:
|
||||||
|
for image_output, image_text_labels in zip(output, text_labels):
|
||||||
|
object_text_labels = [image_text_labels[i] for i in image_output["labels"]]
|
||||||
|
image_output["text_labels"] = object_text_labels
|
||||||
|
else:
|
||||||
|
for image_output in output:
|
||||||
|
image_output["text_labels"] = None
|
||||||
|
|
||||||
|
return output
|
||||||
|
|
||||||
|
def post_process_image_guided_detection(
|
||||||
|
self,
|
||||||
|
outputs: "OwlViTImageGuidedObjectDetectionOutput",
|
||||||
|
threshold: float = 0.0,
|
||||||
|
nms_threshold: float = 0.3,
|
||||||
|
target_sizes: Optional[Union[TensorType, List[Tuple]]] = None,
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
Converts the output of [`OwlViTForObjectDetection.image_guided_detection`] into the format expected by the COCO
|
||||||
|
api.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
outputs ([`OwlViTImageGuidedObjectDetectionOutput`]):
|
||||||
|
Raw outputs of the model.
|
||||||
|
threshold (`float`, *optional*, defaults to 0.0):
|
||||||
|
Minimum confidence threshold to use to filter out predicted boxes.
|
||||||
|
nms_threshold (`float`, *optional*, defaults to 0.3):
|
||||||
|
IoU threshold for non-maximum suppression of overlapping boxes.
|
||||||
|
target_sizes (`torch.Tensor`, *optional*):
|
||||||
|
Tensor of shape (batch_size, 2) where each entry is the (height, width) of the corresponding image in
|
||||||
|
the batch. If set, predicted normalized bounding boxes are rescaled to the target sizes. If left to
|
||||||
|
None, predictions will not be unnormalized.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
`List[Dict]`: A list of dictionaries, each dictionary containing the following keys:
|
||||||
|
- "scores": The confidence scores for each predicted box on the image.
|
||||||
|
- "boxes": Image bounding boxes in (top_left_x, top_left_y, bottom_right_x, bottom_right_y) format.
|
||||||
|
- "labels": Set to `None`.
|
||||||
|
"""
|
||||||
|
return self.image_processor.post_process_image_guided_detection(
|
||||||
|
outputs=outputs, threshold=threshold, nms_threshold=nms_threshold, target_sizes=target_sizes
|
||||||
|
)
|
||||||
|
|
||||||
def batch_decode(self, *args, **kwargs):
|
def batch_decode(self, *args, **kwargs):
|
||||||
"""
|
"""
|
||||||
|
|||||||
@@ -974,8 +974,9 @@ class Owlv2ModelIntegrationTest(unittest.TestCase):
|
|||||||
processor = OwlViTProcessor.from_pretrained(model_name)
|
processor = OwlViTProcessor.from_pretrained(model_name)
|
||||||
|
|
||||||
image = prepare_img()
|
image = prepare_img()
|
||||||
|
text_labels = [["a photo of a cat", "a photo of a dog"]]
|
||||||
inputs = processor(
|
inputs = processor(
|
||||||
text=[["a photo of a cat", "a photo of a dog"]],
|
text=text_labels,
|
||||||
images=image,
|
images=image,
|
||||||
max_length=16,
|
max_length=16,
|
||||||
padding="max_length",
|
padding="max_length",
|
||||||
@@ -991,11 +992,31 @@ class Owlv2ModelIntegrationTest(unittest.TestCase):
|
|||||||
expected_slice_logits = torch.tensor(
|
expected_slice_logits = torch.tensor(
|
||||||
[[-21.413497, -21.612638], [-19.008193, -19.548841], [-20.958896, -21.382694]]
|
[[-21.413497, -21.612638], [-19.008193, -19.548841], [-20.958896, -21.382694]]
|
||||||
).to(torch_device)
|
).to(torch_device)
|
||||||
self.assertTrue(torch.allclose(outputs.logits[0, :3, :3], expected_slice_logits, atol=1e-4))
|
resulted_slice_logits = outputs.logits[0, :3, :3]
|
||||||
|
max_diff = torch.max(torch.abs(resulted_slice_logits - expected_slice_logits)).item()
|
||||||
|
self.assertLess(max_diff, 3e-4)
|
||||||
|
|
||||||
expected_slice_boxes = torch.tensor(
|
expected_slice_boxes = torch.tensor(
|
||||||
[[0.241309, 0.051896, 0.453267], [0.139474, 0.045701, 0.250660], [0.233022, 0.050479, 0.427671]],
|
[[0.241309, 0.051896, 0.453267], [0.139474, 0.045701, 0.250660], [0.233022, 0.050479, 0.427671]],
|
||||||
).to(torch_device)
|
).to(torch_device)
|
||||||
self.assertTrue(torch.allclose(outputs.pred_boxes[0, :3, :3], expected_slice_boxes, atol=1e-4))
|
resulted_slice_boxes = outputs.pred_boxes[0, :3, :3]
|
||||||
|
max_diff = torch.max(torch.abs(resulted_slice_boxes - expected_slice_boxes)).item()
|
||||||
|
self.assertLess(max_diff, 3e-4)
|
||||||
|
|
||||||
|
# test post-processing
|
||||||
|
post_processed_output = processor.post_process_grounded_object_detection(outputs)
|
||||||
|
self.assertIsNone(post_processed_output[0]["text_labels"])
|
||||||
|
|
||||||
|
post_processed_output_with_text_labels = processor.post_process_grounded_object_detection(
|
||||||
|
outputs, text_labels=text_labels
|
||||||
|
)
|
||||||
|
|
||||||
|
objects_labels = post_processed_output_with_text_labels[0]["labels"].cpu().tolist()
|
||||||
|
self.assertListEqual(objects_labels, [0, 0])
|
||||||
|
|
||||||
|
objects_text_labels = post_processed_output_with_text_labels[0]["text_labels"]
|
||||||
|
self.assertIsNotNone(objects_text_labels)
|
||||||
|
self.assertListEqual(objects_text_labels, ["a photo of a cat", "a photo of a cat"])
|
||||||
|
|
||||||
@slow
|
@slow
|
||||||
def test_inference_one_shot_object_detection(self):
|
def test_inference_one_shot_object_detection(self):
|
||||||
|
|||||||
@@ -967,8 +967,9 @@ class OwlViTModelIntegrationTest(unittest.TestCase):
|
|||||||
processor = OwlViTProcessor.from_pretrained(model_name)
|
processor = OwlViTProcessor.from_pretrained(model_name)
|
||||||
|
|
||||||
image = prepare_img()
|
image = prepare_img()
|
||||||
|
text_labels = [["a photo of a cat", "a photo of a dog"]]
|
||||||
inputs = processor(
|
inputs = processor(
|
||||||
text=[["a photo of a cat", "a photo of a dog"]],
|
text=text_labels,
|
||||||
images=image,
|
images=image,
|
||||||
max_length=16,
|
max_length=16,
|
||||||
padding="max_length",
|
padding="max_length",
|
||||||
@@ -986,6 +987,21 @@ class OwlViTModelIntegrationTest(unittest.TestCase):
|
|||||||
).to(torch_device)
|
).to(torch_device)
|
||||||
self.assertTrue(torch.allclose(outputs.pred_boxes[0, :3, :3], expected_slice_boxes, atol=1e-4))
|
self.assertTrue(torch.allclose(outputs.pred_boxes[0, :3, :3], expected_slice_boxes, atol=1e-4))
|
||||||
|
|
||||||
|
# test post-processing
|
||||||
|
post_processed_output = processor.post_process_grounded_object_detection(outputs)
|
||||||
|
self.assertIsNone(post_processed_output[0]["text_labels"])
|
||||||
|
|
||||||
|
post_processed_output_with_text_labels = processor.post_process_grounded_object_detection(
|
||||||
|
outputs, text_labels=text_labels
|
||||||
|
)
|
||||||
|
|
||||||
|
objects_labels = post_processed_output_with_text_labels[0]["labels"].cpu().tolist()
|
||||||
|
self.assertListEqual(objects_labels, [0, 0])
|
||||||
|
|
||||||
|
objects_text_labels = post_processed_output_with_text_labels[0]["text_labels"]
|
||||||
|
self.assertIsNotNone(objects_text_labels)
|
||||||
|
self.assertListEqual(objects_text_labels, ["a photo of a cat", "a photo of a cat"])
|
||||||
|
|
||||||
@slow
|
@slow
|
||||||
def test_inference_one_shot_object_detection(self):
|
def test_inference_one_shot_object_detection(self):
|
||||||
model_name = "google/owlvit-base-patch32"
|
model_name = "google/owlvit-base-patch32"
|
||||||
|
|||||||
@@ -973,6 +973,7 @@ DEPRECATED_OBJECTS = [
|
|||||||
"xnli_processors",
|
"xnli_processors",
|
||||||
"xnli_tasks_num_labels",
|
"xnli_tasks_num_labels",
|
||||||
"TFTrainingArguments",
|
"TFTrainingArguments",
|
||||||
|
"OwlViTFeatureExtractor",
|
||||||
]
|
]
|
||||||
|
|
||||||
# Exceptionally, some objects should not be documented after all rules passed.
|
# Exceptionally, some objects should not be documented after all rules passed.
|
||||||
|
|||||||
Reference in New Issue
Block a user