OmDet Turbo processor standardization (#34937)
* Fix docstring
* Fix docstring
* Add `classes_structure` to model output
* Update omdet postprocessing
* Adjust tests
* Update code example in docs
* Add deprecation to "classes" key in output
* Types, docs
* Fixing test
* Fix missed clip_boxes
* [run-slow] omdet_turbo
* Apply suggestions from code review

  Co-authored-by: Yoni Gozlan <74535834+yonigozlan@users.noreply.github.com>
* Make CamelCase class

---------

Co-authored-by: Yoni Gozlan <74535834+yonigozlan@users.noreply.github.com>
This commit is contained in:
committed by
GitHub
parent
94ae9a8da1
commit
42b2857b01
@@ -44,37 +44,40 @@ One unique property of OmDet-Turbo compared to other zero-shot object detection
|
|||||||
Here's how to load the model and prepare the inputs to perform zero-shot object detection on a single image:
|
Here's how to load the model and prepare the inputs to perform zero-shot object detection on a single image:
|
||||||
|
|
||||||
```python
|
```python
|
||||||
import requests
|
>>> import torch
|
||||||
from PIL import Image
|
>>> import requests
|
||||||
|
>>> from PIL import Image
|
||||||
|
|
||||||
from transformers import AutoProcessor, OmDetTurboForObjectDetection
|
>>> from transformers import AutoProcessor, OmDetTurboForObjectDetection
|
||||||
|
|
||||||
processor = AutoProcessor.from_pretrained("omlab/omdet-turbo-swin-tiny-hf")
|
>>> processor = AutoProcessor.from_pretrained("omlab/omdet-turbo-swin-tiny-hf")
|
||||||
model = OmDetTurboForObjectDetection.from_pretrained("omlab/omdet-turbo-swin-tiny-hf")
|
>>> model = OmDetTurboForObjectDetection.from_pretrained("omlab/omdet-turbo-swin-tiny-hf")
|
||||||
|
|
||||||
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
|
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
|
||||||
image = Image.open(requests.get(url, stream=True).raw)
|
>>> image = Image.open(requests.get(url, stream=True).raw)
|
||||||
classes = ["cat", "remote"]
|
>>> text_labels = ["cat", "remote"]
|
||||||
inputs = processor(image, text=classes, return_tensors="pt")
|
>>> inputs = processor(image, text=text_labels, return_tensors="pt")
|
||||||
|
|
||||||
outputs = model(**inputs)
|
>>> with torch.no_grad():
|
||||||
|
... outputs = model(**inputs)
|
||||||
|
|
||||||
# convert outputs (bounding boxes and class logits)
|
>>> # convert outputs (bounding boxes and class logits)
|
||||||
results = processor.post_process_grounded_object_detection(
|
>>> results = processor.post_process_grounded_object_detection(
|
||||||
outputs,
|
... outputs,
|
||||||
classes=classes,
|
... target_sizes=[(image.height, image.width)],
|
||||||
target_sizes=[image.size[::-1]],
|
... text_labels=text_labels,
|
||||||
score_threshold=0.3,
|
... threshold=0.3,
|
||||||
nms_threshold=0.3,
|
... nms_threshold=0.3,
|
||||||
)[0]
|
... )
|
||||||
for score, class_name, box in zip(
|
>>> result = results[0]
|
||||||
results["scores"], results["classes"], results["boxes"]
|
>>> boxes, scores, text_labels = result["boxes"], result["scores"], result["text_labels"]
|
||||||
):
|
>>> for box, score, text_label in zip(boxes, scores, text_labels):
|
||||||
box = [round(i, 1) for i in box.tolist()]
|
... box = [round(i, 2) for i in box.tolist()]
|
||||||
print(
|
... print(f"Detected {text_label} with confidence {round(score.item(), 3)} at location {box}")
|
||||||
f"Detected {class_name} with confidence "
|
Detected remote with confidence 0.768 at location [39.89, 70.35, 176.74, 118.04]
|
||||||
f"{round(score.item(), 2)} at location {box}"
|
Detected cat with confidence 0.72 at location [11.6, 54.19, 314.8, 473.95]
|
||||||
)
|
Detected remote with confidence 0.563 at location [333.38, 75.77, 370.7, 187.03]
|
||||||
|
Detected cat with confidence 0.552 at location [345.15, 23.95, 639.75, 371.67]
|
||||||
```
|
```
|
||||||
|
|
||||||
### Multi image inference
|
### Multi image inference
|
||||||
@@ -93,22 +96,22 @@ OmDet-Turbo can perform batched multi-image inference, with support for differen
|
|||||||
|
|
||||||
>>> url1 = "http://images.cocodataset.org/val2017/000000039769.jpg"
|
>>> url1 = "http://images.cocodataset.org/val2017/000000039769.jpg"
|
||||||
>>> image1 = Image.open(BytesIO(requests.get(url1).content)).convert("RGB")
|
>>> image1 = Image.open(BytesIO(requests.get(url1).content)).convert("RGB")
|
||||||
>>> classes1 = ["cat", "remote"]
|
>>> text_labels1 = ["cat", "remote"]
|
||||||
>>> task1 = "Detect {}.".format(", ".join(classes1))
|
>>> task1 = "Detect {}.".format(", ".join(text_labels1))
|
||||||
|
|
||||||
>>> url2 = "http://images.cocodataset.org/train2017/000000257813.jpg"
|
>>> url2 = "http://images.cocodataset.org/train2017/000000257813.jpg"
|
||||||
>>> image2 = Image.open(BytesIO(requests.get(url2).content)).convert("RGB")
|
>>> image2 = Image.open(BytesIO(requests.get(url2).content)).convert("RGB")
|
||||||
>>> classes2 = ["boat"]
|
>>> text_labels2 = ["boat"]
|
||||||
>>> task2 = "Detect everything that looks like a boat."
|
>>> task2 = "Detect everything that looks like a boat."
|
||||||
|
|
||||||
>>> url3 = "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
|
>>> url3 = "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
|
||||||
>>> image3 = Image.open(BytesIO(requests.get(url3).content)).convert("RGB")
|
>>> image3 = Image.open(BytesIO(requests.get(url3).content)).convert("RGB")
|
||||||
>>> classes3 = ["statue", "trees"]
|
>>> text_labels3 = ["statue", "trees"]
|
||||||
>>> task3 = "Focus on the foreground, detect statue and trees."
|
>>> task3 = "Focus on the foreground, detect statue and trees."
|
||||||
|
|
||||||
>>> inputs = processor(
|
>>> inputs = processor(
|
||||||
... images=[image1, image2, image3],
|
... images=[image1, image2, image3],
|
||||||
... text=[classes1, classes2, classes3],
|
... text=[text_labels1, text_labels2, text_labels3],
|
||||||
... task=[task1, task2, task3],
|
... task=[task1, task2, task3],
|
||||||
... return_tensors="pt",
|
... return_tensors="pt",
|
||||||
... )
|
... )
|
||||||
@@ -119,19 +122,19 @@ OmDet-Turbo can perform batched multi-image inference, with support for differen
|
|||||||
>>> # convert outputs (bounding boxes and class logits)
|
>>> # convert outputs (bounding boxes and class logits)
|
||||||
>>> results = processor.post_process_grounded_object_detection(
|
>>> results = processor.post_process_grounded_object_detection(
|
||||||
... outputs,
|
... outputs,
|
||||||
... classes=[classes1, classes2, classes3],
|
... text_labels=[text_labels1, text_labels2, text_labels3],
|
||||||
... target_sizes=[image1.size[::-1], image2.size[::-1], image3.size[::-1]],
|
... target_sizes=[(image.height, image.width) for image in [image1, image2, image3]],
|
||||||
... score_threshold=0.2,
|
... threshold=0.2,
|
||||||
... nms_threshold=0.3,
|
... nms_threshold=0.3,
|
||||||
... )
|
... )
|
||||||
|
|
||||||
>>> for i, result in enumerate(results):
|
>>> for i, result in enumerate(results):
|
||||||
... for score, class_name, box in zip(
|
... for score, text_label, box in zip(
|
||||||
... result["scores"], result["classes"], result["boxes"]
|
... result["scores"], result["text_labels"], result["boxes"]
|
||||||
... ):
|
... ):
|
||||||
... box = [round(i, 1) for i in box.tolist()]
|
... box = [round(i, 1) for i in box.tolist()]
|
||||||
... print(
|
... print(
|
||||||
... f"Detected {class_name} with confidence "
|
... f"Detected {text_label} with confidence "
|
||||||
... f"{round(score.item(), 2)} at location {box} in image {i}"
|
... f"{round(score.item(), 2)} at location {box} in image {i}"
|
||||||
... )
|
... )
|
||||||
Detected remote with confidence 0.77 at location [39.9, 70.4, 176.7, 118.0] in image 0
|
Detected remote with confidence 0.77 at location [39.9, 70.4, 176.7, 118.0] in image 0
|
||||||
|
|||||||
@@ -143,22 +143,24 @@ class OmDetTurboObjectDetectionOutput(ModelOutput):
|
|||||||
The predicted class of the objects from the encoder.
|
The predicted class of the objects from the encoder.
|
||||||
encoder_extracted_states (`torch.FloatTensor`):
|
encoder_extracted_states (`torch.FloatTensor`):
|
||||||
The extracted states from the Feature Pyramid Network (FPN) and Path Aggregation Network (PAN) of the encoder.
|
The extracted states from the Feature Pyramid Network (FPN) and Path Aggregation Network (PAN) of the encoder.
|
||||||
decoder_hidden_states (`Optional[Tuple[torch.FloatTensor]]`):
|
decoder_hidden_states (`Tuple[torch.FloatTensor]`, *optional*):
|
||||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape
|
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape
|
||||||
`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
|
`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
|
||||||
plus the initial embedding outputs.
|
plus the initial embedding outputs.
|
||||||
decoder_attentions (`Optional[Tuple[Tuple[torch.FloatTensor]]]`):
|
decoder_attentions (`Tuple[Tuple[torch.FloatTensor]]`, *optional*):
|
||||||
Tuple of tuples of `torch.FloatTensor` (one for attention for each layer) of shape `(batch_size, num_heads,
|
Tuple of tuples of `torch.FloatTensor` (one for attention for each layer) of shape `(batch_size, num_heads,
|
||||||
sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the
|
sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the
|
||||||
weighted average in the self-attention, cross-attention and multi-scale deformable attention heads.
|
weighted average in the self-attention, cross-attention and multi-scale deformable attention heads.
|
||||||
encoder_hidden_states (`Optional[Tuple[torch.FloatTensor]]`):
|
encoder_hidden_states (`Tuple[torch.FloatTensor]`, *optional*):
|
||||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape
|
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape
|
||||||
`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
|
`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
|
||||||
plus the initial embedding outputs.
|
plus the initial embedding outputs.
|
||||||
encoder_attentions (`Optional[Tuple[Tuple[torch.FloatTensor]]]`):
|
encoder_attentions (`Tuple[Tuple[torch.FloatTensor]]`, *optional*):
|
||||||
Tuple of tuples of `torch.FloatTensor` (one for attention for each layer) of shape `(batch_size, num_heads,
|
Tuple of tuples of `torch.FloatTensor` (one for attention for each layer) of shape `(batch_size, num_heads,
|
||||||
sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the
|
sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the
|
||||||
weighted average in the self-attention, cross-attention and multi-scale deformable attention heads.
|
weighted average in the self-attention, cross-attention and multi-scale deformable attention heads.
|
||||||
|
classes_structure (`torch.LongTensor`, *optional*):
|
||||||
|
The number of queried classes for each image.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
loss: torch.FloatTensor = None
|
loss: torch.FloatTensor = None
|
||||||
@@ -173,6 +175,7 @@ class OmDetTurboObjectDetectionOutput(ModelOutput):
|
|||||||
decoder_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
|
decoder_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
|
||||||
encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
|
encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
|
||||||
encoder_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
|
encoder_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
|
||||||
|
classes_structure: Optional[torch.LongTensor] = None
|
||||||
|
|
||||||
|
|
||||||
# Copied from models.deformable_detr.load_cuda_kernels
|
# Copied from models.deformable_detr.load_cuda_kernels
|
||||||
@@ -1667,16 +1670,16 @@ class OmDetTurboForObjectDetection(OmDetTurboPreTrainedModel):
|
|||||||
@replace_return_docstrings(output_type=OmDetTurboObjectDetectionOutput, config_class=_CONFIG_FOR_DOC)
|
@replace_return_docstrings(output_type=OmDetTurboObjectDetectionOutput, config_class=_CONFIG_FOR_DOC)
|
||||||
def forward(
|
def forward(
|
||||||
self,
|
self,
|
||||||
pixel_values: Tensor,
|
pixel_values: torch.FloatTensor,
|
||||||
classes_input_ids: Tensor,
|
classes_input_ids: torch.LongTensor,
|
||||||
classes_attention_mask: Tensor,
|
classes_attention_mask: torch.LongTensor,
|
||||||
tasks_input_ids: Tensor,
|
tasks_input_ids: torch.LongTensor,
|
||||||
tasks_attention_mask: Tensor,
|
tasks_attention_mask: torch.LongTensor,
|
||||||
classes_structure: Tensor,
|
classes_structure: torch.LongTensor,
|
||||||
labels: Optional[Tensor] = None,
|
labels: Optional[torch.LongTensor] = None,
|
||||||
output_attentions=None,
|
output_attentions: Optional[bool] = None,
|
||||||
output_hidden_states=None,
|
output_hidden_states: Optional[bool] = None,
|
||||||
return_dict=None,
|
return_dict: Optional[bool] = None,
|
||||||
) -> Union[Tuple[torch.FloatTensor], OmDetTurboObjectDetectionOutput]:
|
) -> Union[Tuple[torch.FloatTensor], OmDetTurboObjectDetectionOutput]:
|
||||||
r"""
|
r"""
|
||||||
Returns:
|
Returns:
|
||||||
@@ -1770,6 +1773,7 @@ class OmDetTurboForObjectDetection(OmDetTurboPreTrainedModel):
|
|||||||
decoder_outputs[2],
|
decoder_outputs[2],
|
||||||
encoder_outputs[1],
|
encoder_outputs[1],
|
||||||
encoder_outputs[2],
|
encoder_outputs[2],
|
||||||
|
classes_structure,
|
||||||
]
|
]
|
||||||
if output is not None
|
if output is not None
|
||||||
)
|
)
|
||||||
@@ -1787,6 +1791,7 @@ class OmDetTurboForObjectDetection(OmDetTurboPreTrainedModel):
|
|||||||
decoder_attentions=decoder_outputs.attentions,
|
decoder_attentions=decoder_outputs.attentions,
|
||||||
encoder_hidden_states=encoder_outputs.hidden_states,
|
encoder_hidden_states=encoder_outputs.hidden_states,
|
||||||
encoder_attentions=encoder_outputs.attentions,
|
encoder_attentions=encoder_outputs.attentions,
|
||||||
|
classes_structure=classes_structure,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -16,7 +16,8 @@
|
|||||||
Processor class for OmDet-Turbo.
|
Processor class for OmDet-Turbo.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from typing import List, Optional, Tuple, Union
|
import warnings
|
||||||
|
from typing import TYPE_CHECKING, List, Optional, Tuple, Union
|
||||||
|
|
||||||
from ...feature_extraction_utils import BatchFeature
|
from ...feature_extraction_utils import BatchFeature
|
||||||
from ...image_transforms import center_to_corners_format
|
from ...image_transforms import center_to_corners_format
|
||||||
@@ -28,12 +29,25 @@ from ...utils import (
|
|||||||
is_torch_available,
|
is_torch_available,
|
||||||
is_torchvision_available,
|
is_torchvision_available,
|
||||||
)
|
)
|
||||||
|
from ...utils.deprecation import deprecate_kwarg
|
||||||
|
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from .modeling_omdet_turbo import OmDetTurboObjectDetectionOutput
|
||||||
|
|
||||||
|
|
||||||
class OmDetTurboTextKwargs(TextKwargs, total=False):
|
class OmDetTurboTextKwargs(TextKwargs, total=False):
|
||||||
task: Optional[Union[str, List[str], TextInput, PreTokenizedInput]]
|
task: Optional[Union[str, List[str], TextInput, PreTokenizedInput]]
|
||||||
|
|
||||||
|
|
||||||
|
if is_torch_available():
|
||||||
|
import torch
|
||||||
|
|
||||||
|
|
||||||
|
if is_torchvision_available():
|
||||||
|
from torchvision.ops.boxes import batched_nms
|
||||||
|
|
||||||
|
|
||||||
class OmDetTurboProcessorKwargs(ProcessingKwargs, total=False):
|
class OmDetTurboProcessorKwargs(ProcessingKwargs, total=False):
|
||||||
text_kwargs: OmDetTurboTextKwargs
|
text_kwargs: OmDetTurboTextKwargs
|
||||||
_defaults = {
|
_defaults = {
|
||||||
@@ -55,11 +69,23 @@ class OmDetTurboProcessorKwargs(ProcessingKwargs, total=False):
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
if is_torch_available():
|
class DictWithDeprecationWarning(dict):
|
||||||
import torch
|
message = (
|
||||||
|
"The `classes` key is deprecated for `OmDetTurboProcessor.post_process_grounded_object_detection` "
|
||||||
|
"output dict and will be removed in a 4.51.0 version. Please use `text_labels` instead."
|
||||||
|
)
|
||||||
|
|
||||||
if is_torchvision_available():
|
def __getitem__(self, key):
|
||||||
from torchvision.ops.boxes import batched_nms
|
if key == "classes":
|
||||||
|
warnings.warn(self.message, FutureWarning)
|
||||||
|
return super().__getitem__("text_labels")
|
||||||
|
return super().__getitem__(key)
|
||||||
|
|
||||||
|
def get(self, key, *args, **kwargs):
|
||||||
|
if key == "classes":
|
||||||
|
warnings.warn(self.message, FutureWarning)
|
||||||
|
return super().get("text_labels", *args, **kwargs)
|
||||||
|
return super().get(key, *args, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
def clip_boxes(box, box_size: Tuple[int, int]):
|
def clip_boxes(box, box_size: Tuple[int, int]):
|
||||||
@@ -97,76 +123,80 @@ def compute_score(boxes):
|
|||||||
|
|
||||||
|
|
||||||
def _post_process_boxes_for_image(
|
def _post_process_boxes_for_image(
|
||||||
boxes: TensorType,
|
boxes: "torch.Tensor",
|
||||||
scores: TensorType,
|
scores: "torch.Tensor",
|
||||||
predicted_classes: TensorType,
|
labels: "torch.Tensor",
|
||||||
classes: List[str],
|
image_num_classes: int,
|
||||||
image_size: Tuple[int, int],
|
image_size: Tuple[int, int],
|
||||||
num_classes: int,
|
threshold: float,
|
||||||
score_threshold: float,
|
|
||||||
nms_threshold: float,
|
nms_threshold: float,
|
||||||
max_num_det: int = None,
|
max_num_det: Optional[int] = None,
|
||||||
) -> dict:
|
) -> Tuple["torch.Tensor", "torch.Tensor", "torch.Tensor"]:
|
||||||
"""
|
"""
|
||||||
Filter predicted results using given thresholds and NMS.
|
Filter predicted results using given thresholds and NMS.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
boxes (torch.Tensor): A Tensor of predicted class-specific or class-agnostic
|
boxes (`torch.Tensor`):
|
||||||
boxes for the image. Shape : (num_queries, max_num_classes_in_batch * 4) if doing
|
A Tensor of predicted class-specific or class-agnostic boxes for the image.
|
||||||
class-specific regression, or (num_queries, 4) if doing class-agnostic
|
Shape (num_queries, max_num_classes_in_batch * 4) if doing class-specific regression,
|
||||||
regression.
|
or (num_queries, 4) if doing class-agnostic regression.
|
||||||
scores (torch.Tensor): A Tensor of predicted class scores for the image.
|
scores (`torch.Tensor` of shape (num_queries, max_num_classes_in_batch + 1)):
|
||||||
Shape : (num_queries, max_num_classes_in_batch + 1)
|
A Tensor of predicted class scores for the image.
|
||||||
predicted_classes (torch.Tensor): A Tensor of predicted classes for the image.
|
labels (`torch.Tensor` of shape (num_queries * (max_num_classes_in_batch + 1),)):
|
||||||
Shape : (num_queries * (max_num_classes_in_batch + 1),)
|
A Tensor of predicted labels for the image.
|
||||||
classes (List[str]): The input classes names.
|
image_num_classes (`int`):
|
||||||
image_size (tuple): A tuple of (height, width) for the image.
|
The number of classes queried for detection on the image.
|
||||||
num_classes (int): The number of classes given for this image.
|
image_size (`Tuple[int, int]`):
|
||||||
score_threshold (float): Only return detections with a confidence score exceeding this
|
A tuple of (height, width) for the image.
|
||||||
threshold.
|
threshold (`float`):
|
||||||
nms_threshold (float): The threshold to use for box non-maximum suppression. Value in [0, 1].
|
Only return detections with a confidence score exceeding this threshold.
|
||||||
max_num_det (int, optional): The maximum number of detections to return. Default is None.
|
nms_threshold (`float`):
|
||||||
|
The threshold to use for box non-maximum suppression. Value in [0, 1].
|
||||||
|
max_num_det (`int`, *optional*):
|
||||||
|
The maximum number of detections to return. Default is None.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
dict: A dictionary the following keys:
|
Tuple: A tuple with the following:
|
||||||
"boxes" (Tensor): A tensor of shape (num_filtered_objects, 4), containing the predicted boxes in (x1, y1, x2, y2) format.
|
"boxes" (Tensor): A tensor of shape (num_filtered_objects, 4), containing the predicted boxes in (x1, y1, x2, y2) format.
|
||||||
"scores" (Tensor): A tensor of shape (num_filtered_objects,), containing the predicted confidence scores for each detection.
|
"scores" (Tensor): A tensor of shape (num_filtered_objects,), containing the predicted confidence scores for each detection.
|
||||||
"classes" (List[str]): A list of strings, where each string is the predicted class for the
|
"labels" (Tensor): A tensor of ids, where each id is the predicted class id for the corresponding detection
|
||||||
corresponding detection
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
# Filter by max number of detections
|
||||||
proposal_num = len(boxes) if max_num_det is None else max_num_det
|
proposal_num = len(boxes) if max_num_det is None else max_num_det
|
||||||
scores_per_image, topk_indices = scores.flatten(0, 1).topk(proposal_num, sorted=False)
|
scores_per_image, topk_indices = scores.flatten(0, 1).topk(proposal_num, sorted=False)
|
||||||
classes_per_image = predicted_classes[topk_indices]
|
labels_per_image = labels[topk_indices]
|
||||||
box_pred_per_image = boxes.view(-1, 1, 4).repeat(1, num_classes, 1).view(-1, 4)
|
boxes_per_image = boxes.view(-1, 1, 4).repeat(1, scores.shape[1], 1).view(-1, 4)
|
||||||
box_pred_per_image = box_pred_per_image[topk_indices]
|
boxes_per_image = boxes_per_image[topk_indices]
|
||||||
|
|
||||||
# Score filtering
|
# Convert and scale boxes to original image size
|
||||||
box_pred_per_image = center_to_corners_format(box_pred_per_image)
|
boxes_per_image = center_to_corners_format(boxes_per_image)
|
||||||
box_pred_per_image = box_pred_per_image * torch.tensor(image_size[::-1]).repeat(2).to(box_pred_per_image.device)
|
boxes_per_image = boxes_per_image * torch.tensor(image_size[::-1]).repeat(2).to(boxes_per_image.device)
|
||||||
filter_mask = scores_per_image > score_threshold # R x K
|
|
||||||
|
# Filtering by confidence score
|
||||||
|
filter_mask = scores_per_image > threshold # R x K
|
||||||
score_keep = filter_mask.nonzero(as_tuple=False).view(-1)
|
score_keep = filter_mask.nonzero(as_tuple=False).view(-1)
|
||||||
box_pred_per_image = box_pred_per_image[score_keep]
|
boxes_per_image = boxes_per_image[score_keep]
|
||||||
scores_per_image = scores_per_image[score_keep]
|
scores_per_image = scores_per_image[score_keep]
|
||||||
classes_per_image = classes_per_image[score_keep]
|
labels_per_image = labels_per_image[score_keep]
|
||||||
|
|
||||||
filter_classes_mask = classes_per_image < len(classes)
|
# Ensure we did not overflow to non existing classes
|
||||||
|
filter_classes_mask = labels_per_image < image_num_classes
|
||||||
classes_keep = filter_classes_mask.nonzero(as_tuple=False).view(-1)
|
classes_keep = filter_classes_mask.nonzero(as_tuple=False).view(-1)
|
||||||
box_pred_per_image = box_pred_per_image[classes_keep]
|
boxes_per_image = boxes_per_image[classes_keep]
|
||||||
scores_per_image = scores_per_image[classes_keep]
|
scores_per_image = scores_per_image[classes_keep]
|
||||||
classes_per_image = classes_per_image[classes_keep]
|
labels_per_image = labels_per_image[classes_keep]
|
||||||
|
|
||||||
# NMS
|
# NMS
|
||||||
keep = batched_nms(box_pred_per_image, scores_per_image, classes_per_image, nms_threshold)
|
keep = batched_nms(boxes_per_image, scores_per_image, labels_per_image, nms_threshold)
|
||||||
box_pred_per_image = box_pred_per_image[keep]
|
boxes_per_image = boxes_per_image[keep]
|
||||||
scores_per_image = scores_per_image[keep]
|
scores_per_image = scores_per_image[keep]
|
||||||
classes_per_image = classes_per_image[keep]
|
labels_per_image = labels_per_image[keep]
|
||||||
classes_per_image = [classes[i] for i in classes_per_image]
|
|
||||||
|
|
||||||
# create an instance
|
# Clip to image size
|
||||||
result = {}
|
boxes_per_image = clip_boxes(boxes_per_image, image_size)
|
||||||
result["boxes"] = clip_boxes(box_pred_per_image, image_size)
|
|
||||||
result["scores"] = scores_per_image
|
|
||||||
result["classes"] = classes_per_image
|
|
||||||
|
|
||||||
return result
|
return boxes_per_image, scores_per_image, labels_per_image
|
||||||
|
|
||||||
|
|
||||||
class OmDetTurboProcessor(ProcessorMixin):
|
class OmDetTurboProcessor(ProcessorMixin):
|
||||||
@@ -274,11 +304,26 @@ class OmDetTurboProcessor(ProcessorMixin):
|
|||||||
"""
|
"""
|
||||||
return self.tokenizer.decode(*args, **kwargs)
|
return self.tokenizer.decode(*args, **kwargs)
|
||||||
|
|
||||||
|
def _get_default_image_size(self) -> Tuple[int, int]:
|
||||||
|
height = (
|
||||||
|
self.image_processor.size["height"]
|
||||||
|
if "height" in self.image_processor.size
|
||||||
|
else self.image_processor.size["shortest_edge"]
|
||||||
|
)
|
||||||
|
width = (
|
||||||
|
self.image_processor.size["width"]
|
||||||
|
if "width" in self.image_processor.size
|
||||||
|
else self.image_processor.size["longest_edge"]
|
||||||
|
)
|
||||||
|
return height, width
|
||||||
|
|
||||||
|
@deprecate_kwarg("score_threshold", new_name="threshold", version="4.51.0")
|
||||||
|
@deprecate_kwarg("classes", new_name="text_labels", version="4.51.0")
|
||||||
def post_process_grounded_object_detection(
|
def post_process_grounded_object_detection(
|
||||||
self,
|
self,
|
||||||
outputs,
|
outputs: "OmDetTurboObjectDetectionOutput",
|
||||||
classes: Union[List[str], List[List[str]]],
|
text_labels: Optional[Union[List[str], List[List[str]]]] = None,
|
||||||
score_threshold: float = 0.3,
|
threshold: float = 0.3,
|
||||||
nms_threshold: float = 0.5,
|
nms_threshold: float = 0.5,
|
||||||
target_sizes: Optional[Union[TensorType, List[Tuple]]] = None,
|
target_sizes: Optional[Union[TensorType, List[Tuple]]] = None,
|
||||||
max_num_det: Optional[int] = None,
|
max_num_det: Optional[int] = None,
|
||||||
@@ -290,67 +335,77 @@ class OmDetTurboProcessor(ProcessorMixin):
|
|||||||
Args:
|
Args:
|
||||||
outputs ([`OmDetTurboObjectDetectionOutput`]):
|
outputs ([`OmDetTurboObjectDetectionOutput`]):
|
||||||
Raw outputs of the model.
|
Raw outputs of the model.
|
||||||
classes (Union[List[str], List[List[str]]]): The input classes names.
|
text_labels (Union[List[str], List[List[str]]], *optional*):
|
||||||
score_threshold (float, defaults to 0.3): Only return detections with a confidence score exceeding this
|
The input classes names. If not provided, `text_labels` will be set to `None` in `outputs`.
|
||||||
threshold.
|
threshold (float, defaults to 0.3):
|
||||||
nms_threshold (float, defaults to 0.5): The threshold to use for box non-maximum suppression. Value in [0, 1].
|
Only return detections with a confidence score exceeding this threshold.
|
||||||
target_sizes (`torch.Tensor` or `List[Tuple[int, int]]`, *optional*, defaults to None):
|
nms_threshold (float, defaults to 0.5):
|
||||||
|
The threshold to use for box non-maximum suppression. Value in [0, 1].
|
||||||
|
target_sizes (`torch.Tensor` or `List[Tuple[int, int]]`, *optional*):
|
||||||
Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size
|
Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size
|
||||||
`(height, width)` of each image in the batch. If unset, predictions will not be resized.
|
`(height, width)` of each image in the batch. If unset, predictions will not be resized.
|
||||||
max_num_det (int, *optional*, defaults to None): The maximum number of detections to return.
|
max_num_det (`int`, *optional*):
|
||||||
|
The maximum number of detections to return.
|
||||||
Returns:
|
Returns:
|
||||||
`List[Dict]`: A list of dictionaries, each dictionary containing the scores, classes and boxes for an image
|
`List[Dict]`: A list of dictionaries, each dictionary containing the scores, classes and boxes for an image
|
||||||
in the batch as predicted by the model.
|
in the batch as predicted by the model.
|
||||||
"""
|
"""
|
||||||
if isinstance(classes[0], str):
|
|
||||||
classes = [classes]
|
|
||||||
|
|
||||||
boxes_logits = outputs.decoder_coord_logits
|
batch_size = len(outputs.decoder_coord_logits)
|
||||||
scores_logits = outputs.decoder_class_logits
|
|
||||||
|
|
||||||
# Inputs consistency check
|
# Inputs consistency check for target sizes
|
||||||
if target_sizes is None:
|
if target_sizes is None:
|
||||||
height = (
|
height, width = self._get_default_image_size()
|
||||||
self.image_processor.size["height"]
|
target_sizes = [(height, width)] * batch_size
|
||||||
if "height" in self.image_processor.size
|
|
||||||
else self.image_processor.size["shortest_edge"]
|
if any(len(image_size) != 2 for image_size in target_sizes):
|
||||||
)
|
|
||||||
width = (
|
|
||||||
self.image_processor.size["width"]
|
|
||||||
if "width" in self.image_processor.size
|
|
||||||
else self.image_processor.size["longest_edge"]
|
|
||||||
)
|
|
||||||
target_sizes = ((height, width),) * len(boxes_logits)
|
|
||||||
elif len(target_sizes[0]) != 2:
|
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"Each element of target_sizes must contain the size (height, width) of each image of the batch"
|
"Each element of target_sizes must contain the size (height, width) of each image of the batch"
|
||||||
)
|
)
|
||||||
if len(target_sizes) != len(boxes_logits):
|
|
||||||
|
if len(target_sizes) != batch_size:
|
||||||
raise ValueError("Make sure that you pass in as many target sizes as output sequences")
|
raise ValueError("Make sure that you pass in as many target sizes as output sequences")
|
||||||
if len(classes) != len(boxes_logits):
|
|
||||||
|
# Inputs consistency check for text labels
|
||||||
|
if text_labels is not None and isinstance(text_labels[0], str):
|
||||||
|
text_labels = [text_labels]
|
||||||
|
|
||||||
|
if text_labels is not None and len(text_labels) != batch_size:
|
||||||
raise ValueError("Make sure that you pass in as many classes group as output sequences")
|
raise ValueError("Make sure that you pass in as many classes group as output sequences")
|
||||||
|
|
||||||
# Convert target_sizes to list for easier handling
|
# Convert target_sizes to list for easier handling
|
||||||
if isinstance(target_sizes, torch.Tensor):
|
if isinstance(target_sizes, torch.Tensor):
|
||||||
target_sizes = target_sizes.tolist()
|
target_sizes = target_sizes.tolist()
|
||||||
|
|
||||||
scores, predicted_classes = compute_score(scores_logits)
|
batch_boxes = outputs.decoder_coord_logits
|
||||||
num_classes = scores_logits.shape[2]
|
batch_logits = outputs.decoder_class_logits
|
||||||
|
batch_num_classes = outputs.classes_structure
|
||||||
|
|
||||||
|
batch_scores, batch_labels = compute_score(batch_logits)
|
||||||
|
|
||||||
results = []
|
results = []
|
||||||
for scores_img, box_per_img, image_size, class_names in zip(scores, boxes_logits, target_sizes, classes):
|
for boxes, scores, image_size, image_num_classes in zip(
|
||||||
results.append(
|
batch_boxes, batch_scores, target_sizes, batch_num_classes
|
||||||
_post_process_boxes_for_image(
|
):
|
||||||
box_per_img,
|
boxes, scores, labels = _post_process_boxes_for_image(
|
||||||
scores_img,
|
boxes=boxes,
|
||||||
predicted_classes,
|
scores=scores,
|
||||||
class_names,
|
labels=batch_labels,
|
||||||
image_size,
|
image_num_classes=image_num_classes,
|
||||||
num_classes,
|
image_size=image_size,
|
||||||
score_threshold=score_threshold,
|
threshold=threshold,
|
||||||
nms_threshold=nms_threshold,
|
nms_threshold=nms_threshold,
|
||||||
max_num_det=max_num_det,
|
max_num_det=max_num_det,
|
||||||
)
|
)
|
||||||
|
result = DictWithDeprecationWarning(
|
||||||
|
{"boxes": boxes, "scores": scores, "labels": labels, "text_labels": None}
|
||||||
)
|
)
|
||||||
|
results.append(result)
|
||||||
|
|
||||||
|
# Add text labels
|
||||||
|
if text_labels is not None:
|
||||||
|
for result, image_text_labels in zip(results, text_labels):
|
||||||
|
result["text_labels"] = [image_text_labels[idx] for idx in result["labels"]]
|
||||||
|
|
||||||
return results
|
return results
|
||||||
|
|
||||||
|
|||||||
@@ -646,9 +646,9 @@ def prepare_img():
|
|||||||
|
|
||||||
|
|
||||||
def prepare_text():
|
def prepare_text():
|
||||||
classes = ["cat", "remote"]
|
text_labels = ["cat", "remote"]
|
||||||
task = "Detect {}.".format(", ".join(classes))
|
task = "Detect {}.".format(", ".join(text_labels))
|
||||||
return classes, task
|
return text_labels, task
|
||||||
|
|
||||||
|
|
||||||
def prepare_img_batched():
|
def prepare_img_batched():
|
||||||
@@ -660,14 +660,14 @@ def prepare_img_batched():
|
|||||||
|
|
||||||
|
|
||||||
def prepare_text_batched():
|
def prepare_text_batched():
|
||||||
classes1 = ["cat", "remote"]
|
text_labels1 = ["cat", "remote"]
|
||||||
classes2 = ["boat"]
|
text_labels2 = ["boat"]
|
||||||
classes3 = ["statue", "trees", "torch"]
|
text_labels3 = ["statue", "trees", "torch"]
|
||||||
|
|
||||||
task1 = "Detect {}.".format(", ".join(classes1))
|
task1 = "Detect {}.".format(", ".join(text_labels1))
|
||||||
task2 = "Detect all the boat in the image."
|
task2 = "Detect all the boat in the image."
|
||||||
task3 = "Focus on the foreground, detect statue, torch and trees."
|
task3 = "Focus on the foreground, detect statue, torch and trees."
|
||||||
return [classes1, classes2, classes3], [task1, task2, task3]
|
return [text_labels1, text_labels2, text_labels3], [task1, task2, task3]
|
||||||
|
|
||||||
|
|
||||||
@require_timm
|
@require_timm
|
||||||
@@ -683,8 +683,8 @@ class OmDetTurboModelIntegrationTests(unittest.TestCase):
|
|||||||
|
|
||||||
processor = self.default_processor
|
processor = self.default_processor
|
||||||
image = prepare_img()
|
image = prepare_img()
|
||||||
classes, task = prepare_text()
|
text_labels, task = prepare_text()
|
||||||
encoding = processor(images=image, text=classes, task=task, return_tensors="pt").to(torch_device)
|
encoding = processor(images=image, text=text_labels, task=task, return_tensors="pt").to(torch_device)
|
||||||
|
|
||||||
with torch.no_grad():
|
with torch.no_grad():
|
||||||
outputs = model(**encoding)
|
outputs = model(**encoding)
|
||||||
@@ -706,7 +706,7 @@ class OmDetTurboModelIntegrationTests(unittest.TestCase):
|
|||||||
|
|
||||||
# verify grounded postprocessing
|
# verify grounded postprocessing
|
||||||
results = processor.post_process_grounded_object_detection(
|
results = processor.post_process_grounded_object_detection(
|
||||||
outputs, classes=[classes], target_sizes=[image.size[::-1]]
|
outputs, text_labels=[text_labels], target_sizes=[image.size[::-1]]
|
||||||
)[0]
|
)[0]
|
||||||
expected_scores = torch.tensor([0.7675, 0.7196, 0.5634, 0.5524]).to(torch_device)
|
expected_scores = torch.tensor([0.7675, 0.7196, 0.5634, 0.5524]).to(torch_device)
|
||||||
expected_slice_boxes = torch.tensor([39.8870, 70.3522, 176.7424, 118.0354]).to(torch_device)
|
expected_slice_boxes = torch.tensor([39.8870, 70.3522, 176.7424, 118.0354]).to(torch_device)
|
||||||
@@ -715,8 +715,8 @@ class OmDetTurboModelIntegrationTests(unittest.TestCase):
|
|||||||
self.assertTrue(torch.allclose(results["scores"], expected_scores, atol=1e-2))
|
self.assertTrue(torch.allclose(results["scores"], expected_scores, atol=1e-2))
|
||||||
self.assertTrue(torch.allclose(results["boxes"][0, :], expected_slice_boxes, atol=1e-2))
|
self.assertTrue(torch.allclose(results["boxes"][0, :], expected_slice_boxes, atol=1e-2))
|
||||||
|
|
||||||
expected_classes = ["remote", "cat", "remote", "cat"]
|
expected_text_labels = ["remote", "cat", "remote", "cat"]
|
||||||
self.assertListEqual(results["classes"], expected_classes)
|
self.assertListEqual(results["text_labels"], expected_text_labels)
|
||||||
|
|
||||||
def test_inference_object_detection_head_fp16(self):
|
def test_inference_object_detection_head_fp16(self):
|
||||||
model = OmDetTurboForObjectDetection.from_pretrained("omlab/omdet-turbo-swin-tiny-hf").to(
|
model = OmDetTurboForObjectDetection.from_pretrained("omlab/omdet-turbo-swin-tiny-hf").to(
|
||||||
@@ -725,8 +725,8 @@ class OmDetTurboModelIntegrationTests(unittest.TestCase):
|
|||||||
|
|
||||||
processor = self.default_processor
|
processor = self.default_processor
|
||||||
image = prepare_img()
|
image = prepare_img()
|
||||||
classes, task = prepare_text()
|
text_labels, task = prepare_text()
|
||||||
encoding = processor(images=image, text=classes, task=task, return_tensors="pt").to(
|
encoding = processor(images=image, text=text_labels, task=task, return_tensors="pt").to(
|
||||||
torch_device, dtype=torch.float16
|
torch_device, dtype=torch.float16
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -750,7 +750,7 @@ class OmDetTurboModelIntegrationTests(unittest.TestCase):
|
|||||||
|
|
||||||
# verify grounded postprocessing
|
# verify grounded postprocessing
|
||||||
results = processor.post_process_grounded_object_detection(
|
results = processor.post_process_grounded_object_detection(
|
||||||
outputs, classes=[classes], target_sizes=[image.size[::-1]]
|
outputs, text_labels=[text_labels], target_sizes=[image.size[::-1]]
|
||||||
)[0]
|
)[0]
|
||||||
expected_scores = torch.tensor([0.7675, 0.7196, 0.5634, 0.5524]).to(torch_device, dtype=torch.float16)
|
expected_scores = torch.tensor([0.7675, 0.7196, 0.5634, 0.5524]).to(torch_device, dtype=torch.float16)
|
||||||
expected_slice_boxes = torch.tensor([39.8870, 70.3522, 176.7424, 118.0354]).to(
|
expected_slice_boxes = torch.tensor([39.8870, 70.3522, 176.7424, 118.0354]).to(
|
||||||
@@ -761,16 +761,16 @@ class OmDetTurboModelIntegrationTests(unittest.TestCase):
|
|||||||
self.assertTrue(torch.allclose(results["scores"], expected_scores, atol=1e-2))
|
self.assertTrue(torch.allclose(results["scores"], expected_scores, atol=1e-2))
|
||||||
self.assertTrue(torch.allclose(results["boxes"][0, :], expected_slice_boxes, atol=1e-1))
|
self.assertTrue(torch.allclose(results["boxes"][0, :], expected_slice_boxes, atol=1e-1))
|
||||||
|
|
||||||
expected_classes = ["remote", "cat", "remote", "cat"]
|
expected_text_labels = ["remote", "cat", "remote", "cat"]
|
||||||
self.assertListEqual(results["classes"], expected_classes)
|
self.assertListEqual(results["text_labels"], expected_text_labels)
|
||||||
|
|
||||||
def test_inference_object_detection_head_no_task(self):
|
def test_inference_object_detection_head_no_task(self):
|
||||||
model = OmDetTurboForObjectDetection.from_pretrained("omlab/omdet-turbo-swin-tiny-hf").to(torch_device)
|
model = OmDetTurboForObjectDetection.from_pretrained("omlab/omdet-turbo-swin-tiny-hf").to(torch_device)
|
||||||
|
|
||||||
processor = self.default_processor
|
processor = self.default_processor
|
||||||
image = prepare_img()
|
image = prepare_img()
|
||||||
classes, _ = prepare_text()
|
text_labels, _ = prepare_text()
|
||||||
encoding = processor(images=image, text=classes, return_tensors="pt").to(torch_device)
|
encoding = processor(images=image, text=text_labels, return_tensors="pt").to(torch_device)
|
||||||
|
|
||||||
with torch.no_grad():
|
with torch.no_grad():
|
||||||
outputs = model(**encoding)
|
outputs = model(**encoding)
|
||||||
@@ -792,7 +792,7 @@ class OmDetTurboModelIntegrationTests(unittest.TestCase):
|
|||||||
|
|
||||||
# verify grounded postprocessing
|
# verify grounded postprocessing
|
||||||
results = processor.post_process_grounded_object_detection(
|
results = processor.post_process_grounded_object_detection(
|
||||||
outputs, classes=[classes], target_sizes=[image.size[::-1]]
|
outputs, text_labels=[text_labels], target_sizes=[image.size[::-1]]
|
||||||
)[0]
|
)[0]
|
||||||
expected_scores = torch.tensor([0.7675, 0.7196, 0.5634, 0.5524]).to(torch_device)
|
expected_scores = torch.tensor([0.7675, 0.7196, 0.5634, 0.5524]).to(torch_device)
|
||||||
expected_slice_boxes = torch.tensor([39.8870, 70.3522, 176.7424, 118.0354]).to(torch_device)
|
expected_slice_boxes = torch.tensor([39.8870, 70.3522, 176.7424, 118.0354]).to(torch_device)
|
||||||
@@ -801,8 +801,8 @@ class OmDetTurboModelIntegrationTests(unittest.TestCase):
|
|||||||
self.assertTrue(torch.allclose(results["scores"], expected_scores, atol=1e-2))
|
self.assertTrue(torch.allclose(results["scores"], expected_scores, atol=1e-2))
|
||||||
self.assertTrue(torch.allclose(results["boxes"][0, :], expected_slice_boxes, atol=1e-2))
|
self.assertTrue(torch.allclose(results["boxes"][0, :], expected_slice_boxes, atol=1e-2))
|
||||||
|
|
||||||
expected_classes = ["remote", "cat", "remote", "cat"]
|
expected_text_labels = ["remote", "cat", "remote", "cat"]
|
||||||
self.assertListEqual(results["classes"], expected_classes)
|
self.assertListEqual(results["text_labels"], expected_text_labels)
|
||||||
|
|
||||||
def test_inference_object_detection_head_batched(self):
|
def test_inference_object_detection_head_batched(self):
|
||||||
torch_device = "cpu"
|
torch_device = "cpu"
|
||||||
@@ -810,10 +810,10 @@ class OmDetTurboModelIntegrationTests(unittest.TestCase):
|
|||||||
|
|
||||||
processor = self.default_processor
|
processor = self.default_processor
|
||||||
images_batched = prepare_img_batched()
|
images_batched = prepare_img_batched()
|
||||||
classes_batched, tasks_batched = prepare_text_batched()
|
text_labels_batched, tasks_batched = prepare_text_batched()
|
||||||
encoding = processor(images=images_batched, text=classes_batched, task=tasks_batched, return_tensors="pt").to(
|
encoding = processor(
|
||||||
torch_device
|
images=images_batched, text=text_labels_batched, task=tasks_batched, return_tensors="pt"
|
||||||
)
|
).to(torch_device)
|
||||||
|
|
||||||
with torch.no_grad():
|
with torch.no_grad():
|
||||||
outputs = model(**encoding)
|
outputs = model(**encoding)
|
||||||
@@ -837,7 +837,7 @@ class OmDetTurboModelIntegrationTests(unittest.TestCase):
|
|||||||
# verify grounded postprocessing
|
# verify grounded postprocessing
|
||||||
results = processor.post_process_grounded_object_detection(
|
results = processor.post_process_grounded_object_detection(
|
||||||
outputs,
|
outputs,
|
||||||
classes=classes_batched,
|
text_labels=text_labels_batched,
|
||||||
target_sizes=[image.size[::-1] for image in images_batched],
|
target_sizes=[image.size[::-1] for image in images_batched],
|
||||||
score_threshold=0.2,
|
score_threshold=0.2,
|
||||||
)
|
)
|
||||||
@@ -858,19 +858,19 @@ class OmDetTurboModelIntegrationTests(unittest.TestCase):
|
|||||||
torch.allclose(torch.stack([result["boxes"][0, :] for result in results]), expected_slice_boxes, atol=1e-2)
|
torch.allclose(torch.stack([result["boxes"][0, :] for result in results]), expected_slice_boxes, atol=1e-2)
|
||||||
)
|
)
|
||||||
|
|
||||||
expected_classes = [
|
expected_text_labels = [
|
||||||
["remote", "cat", "remote", "cat"],
|
["remote", "cat", "remote", "cat"],
|
||||||
["boat", "boat", "boat", "boat"],
|
["boat", "boat", "boat", "boat"],
|
||||||
["statue", "trees", "trees", "torch", "statue", "statue"],
|
["statue", "trees", "trees", "torch", "statue", "statue"],
|
||||||
]
|
]
|
||||||
self.assertListEqual([result["classes"] for result in results], expected_classes)
|
self.assertListEqual([result["text_labels"] for result in results], expected_text_labels)
|
||||||
|
|
||||||
@require_torch_accelerator
|
@require_torch_accelerator
|
||||||
def test_inference_object_detection_head_equivalence_cpu_gpu(self):
|
def test_inference_object_detection_head_equivalence_cpu_gpu(self):
|
||||||
processor = self.default_processor
|
processor = self.default_processor
|
||||||
image = prepare_img()
|
image = prepare_img()
|
||||||
classes, task = prepare_text()
|
text_labels, task = prepare_text()
|
||||||
encoding = processor(images=image, text=classes, task=task, return_tensors="pt")
|
encoding = processor(images=image, text=text_labels, task=task, return_tensors="pt")
|
||||||
# 1. run model on CPU
|
# 1. run model on CPU
|
||||||
model = OmDetTurboForObjectDetection.from_pretrained("omlab/omdet-turbo-swin-tiny-hf")
|
model = OmDetTurboForObjectDetection.from_pretrained("omlab/omdet-turbo-swin-tiny-hf")
|
||||||
|
|
||||||
@@ -894,10 +894,10 @@ class OmDetTurboModelIntegrationTests(unittest.TestCase):
|
|||||||
|
|
||||||
# verify grounded postprocessing
|
# verify grounded postprocessing
|
||||||
results_cpu = processor.post_process_grounded_object_detection(
|
results_cpu = processor.post_process_grounded_object_detection(
|
||||||
cpu_outputs, classes=[classes], target_sizes=[image.size[::-1]]
|
cpu_outputs, text_labels=[text_labels], target_sizes=[image.size[::-1]]
|
||||||
)[0]
|
)[0]
|
||||||
result_gpu = processor.post_process_grounded_object_detection(
|
result_gpu = processor.post_process_grounded_object_detection(
|
||||||
gpu_outputs, classes=[classes], target_sizes=[image.size[::-1]]
|
gpu_outputs, text_labels=[text_labels], target_sizes=[image.size[::-1]]
|
||||||
)[0]
|
)[0]
|
||||||
|
|
||||||
self.assertTrue(torch.allclose(results_cpu["scores"], result_gpu["scores"].cpu(), atol=1e-2))
|
self.assertTrue(torch.allclose(results_cpu["scores"], result_gpu["scores"].cpu(), atol=1e-2))
|
||||||
|
|||||||
@@ -76,10 +76,13 @@ class OmDetTurboProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
|||||||
shutil.rmtree(self.tmpdirname)
|
shutil.rmtree(self.tmpdirname)
|
||||||
|
|
||||||
def get_fake_omdet_turbo_output(self):
|
def get_fake_omdet_turbo_output(self):
|
||||||
|
classes = self.get_fake_omdet_turbo_classes()
|
||||||
|
classes_structure = torch.tensor([len(sublist) for sublist in classes])
|
||||||
torch.manual_seed(42)
|
torch.manual_seed(42)
|
||||||
return OmDetTurboObjectDetectionOutput(
|
return OmDetTurboObjectDetectionOutput(
|
||||||
decoder_coord_logits=torch.rand(self.batch_size, self.num_queries, 4),
|
decoder_coord_logits=torch.rand(self.batch_size, self.num_queries, 4),
|
||||||
decoder_class_logits=torch.rand(self.batch_size, self.num_queries, self.embed_dim),
|
decoder_class_logits=torch.rand(self.batch_size, self.num_queries, self.embed_dim),
|
||||||
|
classes_structure=classes_structure,
|
||||||
)
|
)
|
||||||
|
|
||||||
def get_fake_omdet_turbo_classes(self):
|
def get_fake_omdet_turbo_classes(self):
|
||||||
@@ -99,7 +102,7 @@ class OmDetTurboProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
|||||||
)
|
)
|
||||||
|
|
||||||
self.assertEqual(len(post_processed), self.batch_size)
|
self.assertEqual(len(post_processed), self.batch_size)
|
||||||
self.assertEqual(list(post_processed[0].keys()), ["boxes", "scores", "classes"])
|
self.assertEqual(list(post_processed[0].keys()), ["boxes", "scores", "labels", "text_labels"])
|
||||||
self.assertEqual(post_processed[0]["boxes"].shape, (self.num_queries, 4))
|
self.assertEqual(post_processed[0]["boxes"].shape, (self.num_queries, 4))
|
||||||
self.assertEqual(post_processed[0]["scores"].shape, (self.num_queries,))
|
self.assertEqual(post_processed[0]["scores"].shape, (self.num_queries,))
|
||||||
expected_scores = torch.tensor([0.7310, 0.6579, 0.6513, 0.6444, 0.6252])
|
expected_scores = torch.tensor([0.7310, 0.6579, 0.6513, 0.6444, 0.6252])
|
||||||
|
|||||||
Reference in New Issue
Block a user