Grounding DINO Processor standardization (#34853)
* Add input ids to model output * Add text preprocessing for processor * Fix snippet * Add test for equivalence * Add type checking guard * Fixing typehint * Fix test for added `input_ids` in output * Add deprecations and "text_labels" to output * Adjust tests * Fix test * Update code examples * Minor docs and code improvement * Remove one-liner functions and rename class to CamelCase * Update docstring * Fixup
This commit is contained in:
committed by
GitHub
parent
42b2857b01
commit
099d93d2e9
@@ -56,25 +56,26 @@ Here's how to use the model for zero-shot object detection:
|
|||||||
>>> image_url = "http://images.cocodataset.org/val2017/000000039769.jpg"
|
>>> image_url = "http://images.cocodataset.org/val2017/000000039769.jpg"
|
||||||
>>> image = Image.open(requests.get(image_url, stream=True).raw)
|
>>> image = Image.open(requests.get(image_url, stream=True).raw)
|
||||||
>>> # Check for cats and remote controls
|
>>> # Check for cats and remote controls
|
||||||
>>> text = "a cat. a remote control."
|
>>> text_labels = [["a cat", "a remote control"]]
|
||||||
|
|
||||||
>>> inputs = processor(images=image, text=text, return_tensors="pt").to(device)
|
>>> inputs = processor(images=image, text=text_labels, return_tensors="pt").to(device)
|
||||||
>>> with torch.no_grad():
|
>>> with torch.no_grad():
|
||||||
... outputs = model(**inputs)
|
... outputs = model(**inputs)
|
||||||
|
|
||||||
>>> results = processor.post_process_grounded_object_detection(
|
>>> results = processor.post_process_grounded_object_detection(
|
||||||
... outputs,
|
... outputs,
|
||||||
... inputs.input_ids,
|
... threshold=0.4,
|
||||||
... box_threshold=0.4,
|
|
||||||
... text_threshold=0.3,
|
... text_threshold=0.3,
|
||||||
... target_sizes=[image.size[::-1]]
|
... target_sizes=[(image.height, image.width)]
|
||||||
... )
|
... )
|
||||||
>>> print(results)
|
>>> # Retrieve the first image result
|
||||||
[{'boxes': tensor([[344.6959, 23.1090, 637.1833, 374.2751],
|
>>> result = results[0]
|
||||||
[ 12.2666, 51.9145, 316.8582, 472.4392],
|
>>> for box, score, text_label in zip(result["boxes"], result["scores"], result["text_labels"]):
|
||||||
[ 38.5742, 70.0015, 176.7838, 118.1806]], device='cuda:0'),
|
... box = [round(x, 2) for x in box.tolist()]
|
||||||
'labels': ['a cat', 'a cat', 'a remote control'],
|
... print(f"Detected {text_label} with confidence {round(score.item(), 3)} at location {box}")
|
||||||
'scores': tensor([0.4785, 0.4381, 0.4776], device='cuda:0')}]
|
Detected a cat with confidence 0.479 at location [344.7, 23.11, 637.18, 374.28]
|
||||||
|
Detected a cat with confidence 0.438 at location [12.27, 51.91, 316.86, 472.44]
|
||||||
|
Detected a remote control with confidence 0.478 at location [38.57, 70.0, 176.78, 118.18]
|
||||||
```
|
```
|
||||||
|
|
||||||
## Grounded SAM
|
## Grounded SAM
|
||||||
|
|||||||
@@ -286,7 +286,7 @@ class GroundingDinoObjectDetectionOutput(ModelOutput):
|
|||||||
pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
|
pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
|
||||||
Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These
|
Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These
|
||||||
values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding
|
values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding
|
||||||
possible padding). You can use [`~GroundingDinoProcessor.post_process_object_detection`] to retrieve the
|
possible padding). You can use [`~GroundingDinoProcessor.post_process_grounded_object_detection`] to retrieve the
|
||||||
unnormalized bounding boxes.
|
unnormalized bounding boxes.
|
||||||
auxiliary_outputs (`List[Dict]`, *optional*):
|
auxiliary_outputs (`List[Dict]`, *optional*):
|
||||||
Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to `True`)
|
Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to `True`)
|
||||||
@@ -331,6 +331,8 @@ class GroundingDinoObjectDetectionOutput(ModelOutput):
|
|||||||
background).
|
background).
|
||||||
enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.two_stage=True`):
|
enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.two_stage=True`):
|
||||||
Logits of predicted bounding boxes coordinates in the first stage.
|
Logits of predicted bounding boxes coordinates in the first stage.
|
||||||
|
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
|
||||||
|
Encoded candidate labels sequence. Used in processor to post process object detection result.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
loss: Optional[torch.FloatTensor] = None
|
loss: Optional[torch.FloatTensor] = None
|
||||||
@@ -351,6 +353,7 @@ class GroundingDinoObjectDetectionOutput(ModelOutput):
|
|||||||
encoder_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
|
encoder_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
|
||||||
enc_outputs_class: Optional[torch.FloatTensor] = None
|
enc_outputs_class: Optional[torch.FloatTensor] = None
|
||||||
enc_outputs_coord_logits: Optional[torch.FloatTensor] = None
|
enc_outputs_coord_logits: Optional[torch.FloatTensor] = None
|
||||||
|
input_ids: Optional[torch.LongTensor] = None
|
||||||
|
|
||||||
|
|
||||||
# Copied from transformers.models.detr.modeling_detr.DetrFrozenBatchNorm2d with Detr->GroundingDino
|
# Copied from transformers.models.detr.modeling_detr.DetrFrozenBatchNorm2d with Detr->GroundingDino
|
||||||
@@ -2546,30 +2549,41 @@ class GroundingDinoForObjectDetection(GroundingDinoPreTrainedModel):
|
|||||||
Examples:
|
Examples:
|
||||||
|
|
||||||
```python
|
```python
|
||||||
>>> from transformers import AutoProcessor, GroundingDinoForObjectDetection
|
|
||||||
>>> from PIL import Image
|
|
||||||
>>> import requests
|
>>> import requests
|
||||||
|
|
||||||
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
|
>>> import torch
|
||||||
>>> image = Image.open(requests.get(url, stream=True).raw)
|
>>> from PIL import Image
|
||||||
>>> text = "a cat."
|
>>> from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection
|
||||||
|
|
||||||
>>> processor = AutoProcessor.from_pretrained("IDEA-Research/grounding-dino-tiny")
|
>>> model_id = "IDEA-Research/grounding-dino-tiny"
|
||||||
>>> model = GroundingDinoForObjectDetection.from_pretrained("IDEA-Research/grounding-dino-tiny")
|
>>> device = "cuda"
|
||||||
|
|
||||||
>>> inputs = processor(images=image, text=text, return_tensors="pt")
|
>>> processor = AutoProcessor.from_pretrained(model_id)
|
||||||
>>> outputs = model(**inputs)
|
>>> model = AutoModelForZeroShotObjectDetection.from_pretrained(model_id).to(device)
|
||||||
|
|
||||||
>>> # convert outputs (bounding boxes and class logits) to COCO API
|
>>> image_url = "http://images.cocodataset.org/val2017/000000039769.jpg"
|
||||||
>>> target_sizes = torch.tensor([image.size[::-1]])
|
>>> image = Image.open(requests.get(image_url, stream=True).raw)
|
||||||
>>> results = processor.image_processor.post_process_object_detection(
|
>>> # Check for cats and remote controls
|
||||||
... outputs, threshold=0.35, target_sizes=target_sizes
|
>>> text_labels = [["a cat", "a remote control"]]
|
||||||
... )[0]
|
|
||||||
>>> for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
|
>>> inputs = processor(images=image, text=text_labels, return_tensors="pt").to(device)
|
||||||
... box = [round(i, 1) for i in box.tolist()]
|
>>> with torch.no_grad():
|
||||||
... print(f"Detected {label.item()} with confidence " f"{round(score.item(), 2)} at location {box}")
|
... outputs = model(**inputs)
|
||||||
Detected 1 with confidence 0.45 at location [344.8, 23.2, 637.4, 373.8]
|
|
||||||
Detected 1 with confidence 0.41 at location [11.9, 51.6, 316.6, 472.9]
|
>>> results = processor.post_process_grounded_object_detection(
|
||||||
|
... outputs,
|
||||||
|
... threshold=0.4,
|
||||||
|
... text_threshold=0.3,
|
||||||
|
... target_sizes=[(image.height, image.width)]
|
||||||
|
... )
|
||||||
|
>>> # Retrieve the first image result
|
||||||
|
>>> result = results[0]
|
||||||
|
>>> for box, score, text_label in zip(result["boxes"], result["scores"], result["text_labels"]):
|
||||||
|
... box = [round(x, 2) for x in box.tolist()]
|
||||||
|
... print(f"Detected {text_label} with confidence {round(score.item(), 3)} at location {box}")
|
||||||
|
Detected a cat with confidence 0.479 at location [344.7, 23.11, 637.18, 374.28]
|
||||||
|
Detected a cat with confidence 0.438 at location [12.27, 51.91, 316.86, 472.44]
|
||||||
|
Detected a remote control with confidence 0.478 at location [38.57, 70.0, 176.78, 118.18]
|
||||||
```"""
|
```"""
|
||||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||||
|
|
||||||
@@ -2639,13 +2653,10 @@ class GroundingDinoForObjectDetection(GroundingDinoPreTrainedModel):
|
|||||||
)
|
)
|
||||||
|
|
||||||
if not return_dict:
|
if not return_dict:
|
||||||
if auxiliary_outputs is not None:
|
auxiliary_outputs = auxiliary_outputs if auxiliary_outputs is not None else []
|
||||||
output = (logits, pred_boxes) + auxiliary_outputs + outputs
|
output = [loss, loss_dict, logits, pred_boxes, *auxiliary_outputs, *outputs, input_ids]
|
||||||
else:
|
output = tuple(out for out in output if out is not None)
|
||||||
output = (logits, pred_boxes) + outputs
|
return output
|
||||||
tuple_outputs = ((loss, loss_dict) + output) if loss is not None else output
|
|
||||||
|
|
||||||
return tuple_outputs
|
|
||||||
|
|
||||||
dict_outputs = GroundingDinoObjectDetectionOutput(
|
dict_outputs = GroundingDinoObjectDetectionOutput(
|
||||||
loss=loss,
|
loss=loss,
|
||||||
@@ -2666,6 +2677,7 @@ class GroundingDinoForObjectDetection(GroundingDinoPreTrainedModel):
|
|||||||
init_reference_points=outputs.init_reference_points,
|
init_reference_points=outputs.init_reference_points,
|
||||||
enc_outputs_class=outputs.enc_outputs_class,
|
enc_outputs_class=outputs.enc_outputs_class,
|
||||||
enc_outputs_coord_logits=outputs.enc_outputs_coord_logits,
|
enc_outputs_coord_logits=outputs.enc_outputs_coord_logits,
|
||||||
|
input_ids=input_ids,
|
||||||
)
|
)
|
||||||
|
|
||||||
return dict_outputs
|
return dict_outputs
|
||||||
|
|||||||
@@ -17,7 +17,8 @@ Processor class for Grounding DINO.
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
import pathlib
|
import pathlib
|
||||||
from typing import Dict, List, Optional, Tuple, Union
|
import warnings
|
||||||
|
from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union
|
||||||
|
|
||||||
from ...image_processing_utils import BatchFeature
|
from ...image_processing_utils import BatchFeature
|
||||||
from ...image_transforms import center_to_corners_format
|
from ...image_transforms import center_to_corners_format
|
||||||
@@ -25,11 +26,15 @@ from ...image_utils import AnnotationFormat, ImageInput
|
|||||||
from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin, Unpack
|
from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin, Unpack
|
||||||
from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput
|
from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput
|
||||||
from ...utils import TensorType, is_torch_available
|
from ...utils import TensorType, is_torch_available
|
||||||
|
from ...utils.deprecation import deprecate_kwarg
|
||||||
|
|
||||||
|
|
||||||
if is_torch_available():
|
if is_torch_available():
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from .modeling_grounding_dino import GroundingDinoObjectDetectionOutput
|
||||||
|
|
||||||
|
|
||||||
AnnotationType = Dict[str, Union[int, str, List[Dict]]]
|
AnnotationType = Dict[str, Union[int, str, List[Dict]]]
|
||||||
|
|
||||||
@@ -60,6 +65,42 @@ def get_phrases_from_posmap(posmaps, input_ids):
|
|||||||
return token_ids
|
return token_ids
|
||||||
|
|
||||||
|
|
||||||
|
def _is_list_of_candidate_labels(text) -> bool:
|
||||||
|
"""Check that text is list/tuple of strings and each string is a candidate label and not merged candidate labels text.
|
||||||
|
Merged candidate labels text is a string with candidate labels separated by a dot.
|
||||||
|
"""
|
||||||
|
if isinstance(text, (list, tuple)):
|
||||||
|
return all(isinstance(t, str) and "." not in t for t in text)
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def _merge_candidate_labels_text(text: List[str]) -> str:
|
||||||
|
"""
|
||||||
|
Merge candidate labels text into a single string. Ensure all labels are lowercase.
|
||||||
|
For example, ["A cat", "a dog"] -> "a cat. a dog."
|
||||||
|
"""
|
||||||
|
labels = [t.strip().lower() for t in text] # ensure lowercase
|
||||||
|
merged_labels_str = ". ".join(labels) + "." # join with dot and add a dot at the end
|
||||||
|
return merged_labels_str
|
||||||
|
|
||||||
|
|
||||||
|
class DictWithDeprecationWarning(dict):
    """`dict` subclass that emits a `FutureWarning` when the `labels` key is read.

    The warning is raised for item access (`d["labels"]`) and for `d.get("labels")`.
    All other keys, and all other dict operations, behave exactly like a plain `dict`.
    """

    message = (
        "The key `labels` will return integer ids in `GroundingDinoProcessor.post_process_grounded_object_detection` "
        "output since v4.51.0. Use `text_labels` instead to retrieve string object names."
    )

    def __getitem__(self, key):
        """Return `self[key]`, warning first if `key` is the deprecated `labels` key."""
        if key == "labels":
            # stacklevel=2 attributes the warning to the caller's line instead of this method
            warnings.warn(self.message, FutureWarning, stacklevel=2)
        return super().__getitem__(key)

    def get(self, key, *args, **kwargs):
        """Same as `dict.get`, warning first if `key` is the deprecated `labels` key."""
        if key == "labels":
            # stacklevel=2 attributes the warning to the caller's line instead of this method
            warnings.warn(self.message, FutureWarning, stacklevel=2)
        return super().get(key, *args, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
class GroundingDinoImagesKwargs(ImagesKwargs, total=False):
|
class GroundingDinoImagesKwargs(ImagesKwargs, total=False):
|
||||||
annotations: Optional[Union[AnnotationType, List[AnnotationType]]]
|
annotations: Optional[Union[AnnotationType, List[AnnotationType]]]
|
||||||
return_segmentation_masks: Optional[bool]
|
return_segmentation_masks: Optional[bool]
|
||||||
@@ -120,7 +161,15 @@ class GroundingDinoProcessor(ProcessorMixin):
|
|||||||
This method uses [`GroundingDinoImageProcessor.__call__`] method to prepare image(s) for the model, and
|
This method uses [`GroundingDinoImageProcessor.__call__`] method to prepare image(s) for the model, and
|
||||||
[`BertTokenizerFast.__call__`] to prepare text for the model.
|
[`BertTokenizerFast.__call__`] to prepare text for the model.
|
||||||
|
|
||||||
Please refer to the docstring of the above two methods for more information.
|
Args:
|
||||||
|
images (`ImageInput`, `List[ImageInput]`, *optional*):
|
||||||
|
The image or batch of images to be processed. The image might be either PIL image, numpy array or a torch tensor.
|
||||||
|
text (`TextInput`, `PreTokenizedInput`, `List[TextInput]`, `List[PreTokenizedInput]`, *optional*):
|
||||||
|
Candidate labels to be detected on the image. The text might be one of the following:
|
||||||
|
- A list of candidate labels (strings) to be detected on the image (e.g. ["a cat", "a dog"]).
|
||||||
|
- A batch of candidate labels to be detected on the batch of images (e.g. [["a cat", "a dog"], ["a car", "a person"]]).
|
||||||
|
- A merged candidate labels string to be detected on the image, separated by "." (e.g. "a cat. a dog.").
|
||||||
|
- A batch of merged candidate labels text to be detected on the batch of images (e.g. ["a cat. a dog.", "a car. a person."]).
|
||||||
"""
|
"""
|
||||||
if images is None and text is None:
|
if images is None and text is None:
|
||||||
raise ValueError("You must specify either text or images.")
|
raise ValueError("You must specify either text or images.")
|
||||||
@@ -138,6 +187,7 @@ class GroundingDinoProcessor(ProcessorMixin):
|
|||||||
encoding_image_processor = BatchFeature()
|
encoding_image_processor = BatchFeature()
|
||||||
|
|
||||||
if text is not None:
|
if text is not None:
|
||||||
|
text = self._preprocess_input_text(text)
|
||||||
text_encoding = self.tokenizer(
|
text_encoding = self.tokenizer(
|
||||||
text=text,
|
text=text,
|
||||||
**output_kwargs["text_kwargs"],
|
**output_kwargs["text_kwargs"],
|
||||||
@@ -149,6 +199,23 @@ class GroundingDinoProcessor(ProcessorMixin):
|
|||||||
|
|
||||||
return text_encoding
|
return text_encoding
|
||||||
|
|
||||||
|
def _preprocess_input_text(self, text):
    """
    Normalize candidate labels passed as lists into merged "a cat. a dog." text.

    A flat list/tuple of candidate labels (e.g. ["a cat", "a dog"]) becomes one merged string, and a
    batch of such lists becomes a list of merged strings. Any other input, including text that is
    already in the form "a cat. a dog.", is returned as is.
    """
    # single sample: a flat list of candidate labels
    if _is_list_of_candidate_labels(text):
        return _merge_candidate_labels_text(text)

    # batched input: every sample is itself a list of candidate labels
    is_sequence = isinstance(text, (list, tuple))
    if is_sequence and all(_is_list_of_candidate_labels(sample) for sample in text):
        return [_merge_candidate_labels_text(sample) for sample in text]

    return text
|
||||||
|
|
||||||
# Copied from transformers.models.blip.processing_blip.BlipProcessor.batch_decode with BertTokenizerFast->PreTrainedTokenizer
|
# Copied from transformers.models.blip.processing_blip.BlipProcessor.batch_decode with BertTokenizerFast->PreTrainedTokenizer
|
||||||
def batch_decode(self, *args, **kwargs):
|
def batch_decode(self, *args, **kwargs):
|
||||||
"""
|
"""
|
||||||
@@ -172,13 +239,15 @@ class GroundingDinoProcessor(ProcessorMixin):
|
|||||||
image_processor_input_names = self.image_processor.model_input_names
|
image_processor_input_names = self.image_processor.model_input_names
|
||||||
return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
|
return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
|
||||||
|
|
||||||
|
@deprecate_kwarg("box_threshold", new_name="threshold", version="4.51.0")
|
||||||
def post_process_grounded_object_detection(
|
def post_process_grounded_object_detection(
|
||||||
self,
|
self,
|
||||||
outputs,
|
outputs: "GroundingDinoObjectDetectionOutput",
|
||||||
input_ids,
|
input_ids: Optional[TensorType] = None,
|
||||||
box_threshold: float = 0.25,
|
threshold: float = 0.25,
|
||||||
text_threshold: float = 0.25,
|
text_threshold: float = 0.25,
|
||||||
target_sizes: Union[TensorType, List[Tuple]] = None,
|
target_sizes: Optional[Union[TensorType, List[Tuple]]] = None,
|
||||||
|
text_labels: Optional[List[List[str]]] = None,
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Converts the raw output of [`GroundingDinoForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y,
|
Converts the raw output of [`GroundingDinoForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y,
|
||||||
@@ -187,32 +256,38 @@ class GroundingDinoProcessor(ProcessorMixin):
|
|||||||
Args:
|
Args:
|
||||||
outputs ([`GroundingDinoObjectDetectionOutput`]):
|
outputs ([`GroundingDinoObjectDetectionOutput`]):
|
||||||
Raw outputs of the model.
|
Raw outputs of the model.
|
||||||
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
|
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
|
||||||
The token ids of the input text.
|
The token ids of the input text. If not provided will be taken from the model output.
|
||||||
box_threshold (`float`, *optional*, defaults to 0.25):
|
threshold (`float`, *optional*, defaults to 0.25):
|
||||||
Score threshold to keep object detection predictions.
|
Threshold to keep object detection predictions based on confidence score.
|
||||||
text_threshold (`float`, *optional*, defaults to 0.25):
|
text_threshold (`float`, *optional*, defaults to 0.25):
|
||||||
Score threshold to keep text detection predictions.
|
Score threshold to keep text detection predictions.
|
||||||
target_sizes (`torch.Tensor` or `List[Tuple[int, int]]`, *optional*):
|
target_sizes (`torch.Tensor` or `List[Tuple[int, int]]`, *optional*):
|
||||||
Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size
|
Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size
|
||||||
`(height, width)` of each image in the batch. If unset, predictions will not be resized.
|
`(height, width)` of each image in the batch. If unset, predictions will not be resized.
|
||||||
|
text_labels (`List[List[str]]`, *optional*):
|
||||||
|
List of candidate labels to be detected on each image. At the moment it's *NOT used*, but required
|
||||||
|
to be in signature for the zero-shot object detection pipeline. Text labels are instead extracted
|
||||||
|
from the `input_ids` tensor provided in `outputs`.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
`List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image
|
`List[Dict]`: A list of dictionaries, each dictionary containing the following keys:
|
||||||
in the batch as predicted by the model.
|
- **scores**: tensor of confidence scores for detected objects
|
||||||
|
- **boxes**: tensor of bounding boxes in [x0, y0, x1, y1] format
|
||||||
|
- **labels**: list of text labels for each detected object (will be replaced with integer ids in v4.51.0)
|
||||||
|
- **text_labels**: list of text labels for detected objects
|
||||||
"""
|
"""
|
||||||
logits, boxes = outputs.logits, outputs.pred_boxes
|
batch_logits, batch_boxes = outputs.logits, outputs.pred_boxes
|
||||||
|
input_ids = input_ids if input_ids is not None else outputs.input_ids
|
||||||
|
|
||||||
if target_sizes is not None:
|
if target_sizes is not None and len(target_sizes) != len(batch_logits):
|
||||||
if len(logits) != len(target_sizes):
|
raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the logits")
|
||||||
raise ValueError(
|
|
||||||
"Make sure that you pass in as many target sizes as the batch dimension of the logits"
|
|
||||||
)
|
|
||||||
|
|
||||||
probs = torch.sigmoid(logits) # (batch_size, num_queries, 256)
|
batch_probs = torch.sigmoid(batch_logits) # (batch_size, num_queries, 256)
|
||||||
scores = torch.max(probs, dim=-1)[0] # (batch_size, num_queries)
|
batch_scores = torch.max(batch_probs, dim=-1)[0] # (batch_size, num_queries)
|
||||||
|
|
||||||
# Convert to [x0, y0, x1, y1] format
|
# Convert to [x0, y0, x1, y1] format
|
||||||
boxes = center_to_corners_format(boxes)
|
batch_boxes = center_to_corners_format(batch_boxes)
|
||||||
|
|
||||||
# Convert from relative [0, 1] to absolute [0, height] coordinates
|
# Convert from relative [0, 1] to absolute [0, height] coordinates
|
||||||
if target_sizes is not None:
|
if target_sizes is not None:
|
||||||
@@ -222,17 +297,30 @@ class GroundingDinoProcessor(ProcessorMixin):
|
|||||||
else:
|
else:
|
||||||
img_h, img_w = target_sizes.unbind(1)
|
img_h, img_w = target_sizes.unbind(1)
|
||||||
|
|
||||||
scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device)
|
scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(batch_boxes.device)
|
||||||
boxes = boxes * scale_fct[:, None, :]
|
batch_boxes = batch_boxes * scale_fct[:, None, :]
|
||||||
|
|
||||||
results = []
|
results = []
|
||||||
for idx, (s, b, p) in enumerate(zip(scores, boxes, probs)):
|
for idx, (scores, boxes, probs) in enumerate(zip(batch_scores, batch_boxes, batch_probs)):
|
||||||
score = s[s > box_threshold]
|
keep = scores > threshold
|
||||||
box = b[s > box_threshold]
|
scores = scores[keep]
|
||||||
prob = p[s > box_threshold]
|
boxes = boxes[keep]
|
||||||
|
|
||||||
|
# extract text labels
|
||||||
|
prob = probs[keep]
|
||||||
label_ids = get_phrases_from_posmap(prob > text_threshold, input_ids[idx])
|
label_ids = get_phrases_from_posmap(prob > text_threshold, input_ids[idx])
|
||||||
label = self.batch_decode(label_ids)
|
objects_text_labels = self.batch_decode(label_ids)
|
||||||
results.append({"scores": score, "labels": label, "boxes": box})
|
|
||||||
|
result = DictWithDeprecationWarning(
|
||||||
|
{
|
||||||
|
"scores": scores,
|
||||||
|
"boxes": boxes,
|
||||||
|
"text_labels": objects_text_labels,
|
||||||
|
# TODO: @pavel, set labels to None since v4.51.0 or find a way to extract ids
|
||||||
|
"labels": objects_text_labels,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
results.append(result)
|
||||||
|
|
||||||
return results
|
return results
|
||||||
|
|
||||||
|
|||||||
@@ -322,9 +322,9 @@ class GroundingDinoModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.Tes
|
|||||||
# loss is at first position
|
# loss is at first position
|
||||||
if "labels" in inputs_dict:
|
if "labels" in inputs_dict:
|
||||||
correct_outlen += 1 # loss is added to beginning
|
correct_outlen += 1 # loss is added to beginning
|
||||||
# Object Detection model returns pred_logits and pred_boxes
|
# Object Detection model returns pred_logits and pred_boxes and input_ids
|
||||||
if model_class.__name__ == "GroundingDinoForObjectDetection":
|
if model_class.__name__ == "GroundingDinoForObjectDetection":
|
||||||
correct_outlen += 2
|
correct_outlen += 3
|
||||||
|
|
||||||
self.assertEqual(out_len, correct_outlen)
|
self.assertEqual(out_len, correct_outlen)
|
||||||
|
|
||||||
@@ -653,7 +653,7 @@ class GroundingDinoModelIntegrationTests(unittest.TestCase):
|
|||||||
|
|
||||||
# verify postprocessing
|
# verify postprocessing
|
||||||
results = processor.image_processor.post_process_object_detection(
|
results = processor.image_processor.post_process_object_detection(
|
||||||
outputs, threshold=0.35, target_sizes=[image.size[::-1]]
|
outputs, threshold=0.35, target_sizes=[(image.height, image.width)]
|
||||||
)[0]
|
)[0]
|
||||||
expected_scores = torch.tensor([0.4526, 0.4082]).to(torch_device)
|
expected_scores = torch.tensor([0.4526, 0.4082]).to(torch_device)
|
||||||
expected_slice_boxes = torch.tensor([344.8143, 23.1796, 637.4004, 373.8295]).to(torch_device)
|
expected_slice_boxes = torch.tensor([344.8143, 23.1796, 637.4004, 373.8295]).to(torch_device)
|
||||||
@@ -667,14 +667,14 @@ class GroundingDinoModelIntegrationTests(unittest.TestCase):
|
|||||||
results = processor.post_process_grounded_object_detection(
|
results = processor.post_process_grounded_object_detection(
|
||||||
outputs=outputs,
|
outputs=outputs,
|
||||||
input_ids=encoding.input_ids,
|
input_ids=encoding.input_ids,
|
||||||
box_threshold=0.35,
|
threshold=0.35,
|
||||||
text_threshold=0.3,
|
text_threshold=0.3,
|
||||||
target_sizes=[image.size[::-1]],
|
target_sizes=[(image.height, image.width)],
|
||||||
)[0]
|
)[0]
|
||||||
|
|
||||||
self.assertTrue(torch.allclose(results["scores"], expected_scores, atol=1e-3))
|
self.assertTrue(torch.allclose(results["scores"], expected_scores, atol=1e-3))
|
||||||
self.assertTrue(torch.allclose(results["boxes"][0, :], expected_slice_boxes, atol=1e-2))
|
self.assertTrue(torch.allclose(results["boxes"][0, :], expected_slice_boxes, atol=1e-2))
|
||||||
self.assertListEqual(results["labels"], expected_labels)
|
self.assertListEqual(results["text_labels"], expected_labels)
|
||||||
|
|
||||||
@require_torch_accelerator
|
@require_torch_accelerator
|
||||||
def test_inference_object_detection_head_equivalence_cpu_gpu(self):
|
def test_inference_object_detection_head_equivalence_cpu_gpu(self):
|
||||||
@@ -706,11 +706,11 @@ class GroundingDinoModelIntegrationTests(unittest.TestCase):
|
|||||||
|
|
||||||
# assert postprocessing
|
# assert postprocessing
|
||||||
results_cpu = processor.image_processor.post_process_object_detection(
|
results_cpu = processor.image_processor.post_process_object_detection(
|
||||||
cpu_outputs, threshold=0.35, target_sizes=[image.size[::-1]]
|
cpu_outputs, threshold=0.35, target_sizes=[(image.height, image.width)]
|
||||||
)[0]
|
)[0]
|
||||||
|
|
||||||
result_gpu = processor.image_processor.post_process_object_detection(
|
result_gpu = processor.image_processor.post_process_object_detection(
|
||||||
gpu_outputs, threshold=0.35, target_sizes=[image.size[::-1]]
|
gpu_outputs, threshold=0.35, target_sizes=[(image.height, image.width)]
|
||||||
)[0]
|
)[0]
|
||||||
|
|
||||||
self.assertTrue(torch.allclose(results_cpu["scores"], result_gpu["scores"].cpu(), atol=1e-3))
|
self.assertTrue(torch.allclose(results_cpu["scores"], result_gpu["scores"].cpu(), atol=1e-3))
|
||||||
|
|||||||
@@ -17,6 +17,7 @@ import os
|
|||||||
import shutil
|
import shutil
|
||||||
import tempfile
|
import tempfile
|
||||||
import unittest
|
import unittest
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
@@ -77,6 +78,20 @@ class GroundingDinoProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
|||||||
self.embed_dim = 5
|
self.embed_dim = 5
|
||||||
self.seq_length = 5
|
self.seq_length = 5
|
||||||
|
|
||||||
|
def prepare_text_inputs(self, batch_size: Optional[int] = None):
    """Build candidate-label text inputs for the processor tests.

    Returns a flat list of labels when `batch_size` is None, otherwise a list with `batch_size`
    label lists. Raises `ValueError` when `batch_size` is smaller than 1.
    """
    short_labels = ["a cat", "remote control"]
    long_labels = ["a person", "a car", "a dog", "a cat"]

    if batch_size is None:
        return short_labels
    if batch_size < 1:
        raise ValueError("batch_size must be greater than 0")
    if batch_size == 1:
        return [short_labels]

    # the second sample uses the longer label list; any remaining samples repeat the short one
    batch = [short_labels, long_labels]
    batch.extend(short_labels for _ in range(batch_size - 2))
    return batch
|
||||||
|
|
||||||
# Copied from tests.models.clip.test_processor_clip.CLIPProcessorTest.get_tokenizer with CLIP->Bert
|
# Copied from tests.models.clip.test_processor_clip.CLIPProcessorTest.get_tokenizer with CLIP->Bert
|
||||||
def get_tokenizer(self, **kwargs):
|
def get_tokenizer(self, **kwargs):
|
||||||
return BertTokenizer.from_pretrained(self.tmpdirname, **kwargs)
|
return BertTokenizer.from_pretrained(self.tmpdirname, **kwargs)
|
||||||
@@ -98,6 +113,7 @@ class GroundingDinoProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
|||||||
return GroundingDinoObjectDetectionOutput(
|
return GroundingDinoObjectDetectionOutput(
|
||||||
pred_boxes=torch.rand(self.batch_size, self.num_queries, 4),
|
pred_boxes=torch.rand(self.batch_size, self.num_queries, 4),
|
||||||
logits=torch.rand(self.batch_size, self.num_queries, self.embed_dim),
|
logits=torch.rand(self.batch_size, self.num_queries, self.embed_dim),
|
||||||
|
input_ids=self.get_fake_grounding_dino_input_ids(),
|
||||||
)
|
)
|
||||||
|
|
||||||
def get_fake_grounding_dino_input_ids(self):
|
def get_fake_grounding_dino_input_ids(self):
|
||||||
@@ -111,14 +127,11 @@ class GroundingDinoProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
|||||||
processor = GroundingDinoProcessor(tokenizer=tokenizer, image_processor=image_processor)
|
processor = GroundingDinoProcessor(tokenizer=tokenizer, image_processor=image_processor)
|
||||||
|
|
||||||
grounding_dino_output = self.get_fake_grounding_dino_output()
|
grounding_dino_output = self.get_fake_grounding_dino_output()
|
||||||
grounding_dino_input_ids = self.get_fake_grounding_dino_input_ids()
|
|
||||||
|
|
||||||
post_processed = processor.post_process_grounded_object_detection(
|
post_processed = processor.post_process_grounded_object_detection(grounding_dino_output)
|
||||||
grounding_dino_output, grounding_dino_input_ids
|
|
||||||
)
|
|
||||||
|
|
||||||
self.assertEqual(len(post_processed), self.batch_size)
|
self.assertEqual(len(post_processed), self.batch_size)
|
||||||
self.assertEqual(list(post_processed[0].keys()), ["scores", "labels", "boxes"])
|
self.assertEqual(list(post_processed[0].keys()), ["scores", "boxes", "text_labels", "labels"])
|
||||||
self.assertEqual(post_processed[0]["boxes"].shape, (self.num_queries, 4))
|
self.assertEqual(post_processed[0]["boxes"].shape, (self.num_queries, 4))
|
||||||
self.assertEqual(post_processed[0]["scores"].shape, (self.num_queries,))
|
self.assertEqual(post_processed[0]["scores"].shape, (self.num_queries,))
|
||||||
|
|
||||||
@@ -248,3 +261,26 @@ class GroundingDinoProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
|||||||
inputs = processor(text=input_str, images=image_input)
|
inputs = processor(text=input_str, images=image_input)
|
||||||
|
|
||||||
self.assertListEqual(list(inputs.keys()), processor.model_input_names)
|
self.assertListEqual(list(inputs.keys()), processor.model_input_names)
|
||||||
|
|
||||||
|
def test_text_preprocessing_equivalence(self):
    """Check that list-style candidate labels tokenize to the same ids as the merged "a. b." text form."""
    processor = GroundingDinoProcessor.from_pretrained(self.tmpdirname)

    cases = [
        # (description, merged labels text, candidate labels, extra processor kwargs)
        ("single", "a cat. a remote control.", ["a cat", "a remote control"], {}),
        (
            "batched",
            ["a cat. a remote control.", "a car. a person."],
            [["a cat", "a remote control"], ["a car", "a person"]],
            # the batched calls pad the samples (`padding=True`)
            {"padding": True},
        ),
    ]

    for description, formatted_labels, labels, extra_kwargs in cases:
        inputs1 = processor(text=formatted_labels, return_tensors="pt", **extra_kwargs)
        inputs2 = processor(text=labels, return_tensors="pt", **extra_kwargs)
        # Token ids are integers, so compare them exactly. `torch.equal` also returns False
        # on a size mismatch, so the failure message below is what gets reported.
        self.assertTrue(
            torch.equal(inputs1["input_ids"], inputs2["input_ids"]),
            f"Input ids are not equal for {description} input: {inputs1['input_ids']} != {inputs2['input_ids']}",
        )
|
||||||
|
|||||||
Reference in New Issue
Block a user