Update bounding box format everywhere (#27944)

Update formats
This commit is contained in:
NielsRogge
2023-12-11 19:03:42 +01:00
committed by GitHub
parent 54d0b1c278
commit 67b1335cb9
15 changed files with 16 additions and 16 deletions

View File

@@ -146,7 +146,7 @@ As a summary, consider the following table:
| **Model** | [`~transformers.DetrForObjectDetection`] | [`~transformers.DetrForSegmentation`] | [`~transformers.DetrForSegmentation`] | | **Model** | [`~transformers.DetrForObjectDetection`] | [`~transformers.DetrForSegmentation`] | [`~transformers.DetrForSegmentation`] |
| **Example dataset** | COCO detection | COCO detection, COCO panoptic | COCO panoptic | | | **Example dataset** | COCO detection | COCO detection, COCO panoptic | COCO panoptic | |
| **Format of annotations to provide to** [`~transformers.DetrImageProcessor`] | {'image_id': `int`, 'annotations': `List[Dict]`} each Dict being a COCO object annotation | {'image_id': `int`, 'annotations': `List[Dict]`} (in case of COCO detection) or {'file_name': `str`, 'image_id': `int`, 'segments_info': `List[Dict]`} (in case of COCO panoptic) | {'file_name': `str`, 'image_id': `int`, 'segments_info': `List[Dict]`} and masks_path (path to directory containing PNG files of the masks) | | **Format of annotations to provide to** [`~transformers.DetrImageProcessor`] | {'image_id': `int`, 'annotations': `List[Dict]`} each Dict being a COCO object annotation | {'image_id': `int`, 'annotations': `List[Dict]`} (in case of COCO detection) or {'file_name': `str`, 'image_id': `int`, 'segments_info': `List[Dict]`} (in case of COCO panoptic) | {'file_name': `str`, 'image_id': `int`, 'segments_info': `List[Dict]`} and masks_path (path to directory containing PNG files of the masks) |
| **Postprocessing** (i.e. converting the output of the model to COCO API) | [`~transformers.DetrImageProcessor.post_process`] | [`~transformers.DetrImageProcessor.post_process_segmentation`] | [`~transformers.DetrImageProcessor.post_process_segmentation`], [`~transformers.DetrImageProcessor.post_process_panoptic`] | | **Postprocessing** (i.e. converting the output of the model to Pascal VOC format) | [`~transformers.DetrImageProcessor.post_process`] | [`~transformers.DetrImageProcessor.post_process_segmentation`] | [`~transformers.DetrImageProcessor.post_process_segmentation`], [`~transformers.DetrImageProcessor.post_process_panoptic`] |
| **evaluators** | `CocoEvaluator` with `iou_types="bbox"` | `CocoEvaluator` with `iou_types="bbox"` or `"segm"` | `CocoEvaluator` with `iou_tupes="bbox"` or `"segm"`, `PanopticEvaluator` | | **evaluators** | `CocoEvaluator` with `iou_types="bbox"` | `CocoEvaluator` with `iou_types="bbox"` or `"segm"` | `CocoEvaluator` with `iou_tupes="bbox"` or `"segm"`, `PanopticEvaluator` |
In short, one should prepare the data either in COCO detection or COCO panoptic format, then use In short, one should prepare the data either in COCO detection or COCO panoptic format, then use

View File

@@ -56,7 +56,7 @@ OWLv2 is, just like its predecessor [OWL-ViT](owlvit), a zero-shot text-conditio
>>> # Target image sizes (height, width) to rescale box predictions [batch_size, 2] >>> # Target image sizes (height, width) to rescale box predictions [batch_size, 2]
>>> target_sizes = torch.Tensor([image.size[::-1]]) >>> target_sizes = torch.Tensor([image.size[::-1]])
>>> # Convert outputs (bounding boxes and class logits) to COCO API >>> # Convert outputs (bounding boxes and class logits) to Pascal VOC Format (xmin, ymin, xmax, ymax)
>>> results = processor.post_process_object_detection(outputs=outputs, target_sizes=target_sizes, threshold=0.1) >>> results = processor.post_process_object_detection(outputs=outputs, target_sizes=target_sizes, threshold=0.1)
>>> i = 0 # Retrieve predictions for the first image for the corresponding text queries >>> i = 0 # Retrieve predictions for the first image for the corresponding text queries
>>> text = texts[i] >>> text = texts[i]

View File

@@ -55,7 +55,7 @@ OWL-ViT is a zero-shot text-conditioned object detection model. OWL-ViT uses [CL
>>> # Target image sizes (height, width) to rescale box predictions [batch_size, 2] >>> # Target image sizes (height, width) to rescale box predictions [batch_size, 2]
>>> target_sizes = torch.Tensor([image.size[::-1]]) >>> target_sizes = torch.Tensor([image.size[::-1]])
>>> # Convert outputs (bounding boxes and class logits) to COCO API >>> # Convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax)
>>> results = processor.post_process_object_detection(outputs=outputs, target_sizes=target_sizes, threshold=0.1) >>> results = processor.post_process_object_detection(outputs=outputs, target_sizes=target_sizes, threshold=0.1)
>>> i = 0 # Retrieve predictions for the first image for the corresponding text queries >>> i = 0 # Retrieve predictions for the first image for the corresponding text queries
>>> text = texts[i] >>> text = texts[i]

View File

@@ -512,7 +512,7 @@ Finally, load the metrics and run the evaluation.
... outputs = model(pixel_values=pixel_values, pixel_mask=pixel_mask) ... outputs = model(pixel_values=pixel_values, pixel_mask=pixel_mask)
... orig_target_sizes = torch.stack([target["orig_size"] for target in labels], dim=0) ... orig_target_sizes = torch.stack([target["orig_size"] for target in labels], dim=0)
... results = im_processor.post_process(outputs, orig_target_sizes) # convert outputs of model to COCO api ... results = im_processor.post_process(outputs, orig_target_sizes) # convert outputs of model to Pascal VOC format (xmin, ymin, xmax, ymax)
... module.add(prediction=results, reference=labels) ... module.add(prediction=results, reference=labels)
... del batch ... del batch

View File

@@ -518,7 +518,7 @@ DETR モデルをトレーニングできる「ラベル」。画像プロセッ
... outputs = model(pixel_values=pixel_values, pixel_mask=pixel_mask) ... outputs = model(pixel_values=pixel_values, pixel_mask=pixel_mask)
... orig_target_sizes = torch.stack([target["orig_size"] for target in labels], dim=0) ... orig_target_sizes = torch.stack([target["orig_size"] for target in labels], dim=0)
... results = im_processor.post_process(outputs, orig_target_sizes) # convert outputs of model to COCO api ... results = im_processor.post_process(outputs, orig_target_sizes) # convert outputs of model to Pascal VOC format (xmin, ymin, xmax, ymax)
... module.add(prediction=results, reference=labels) ... module.add(prediction=results, reference=labels)
... del batch ... del batch

View File

@@ -504,7 +504,7 @@ COCO 데이터 세트를 빌드하는 API는 데이터를 특정 형식으로
... outputs = model(pixel_values=pixel_values, pixel_mask=pixel_mask) ... outputs = model(pixel_values=pixel_values, pixel_mask=pixel_mask)
... orig_target_sizes = torch.stack([target["orig_size"] for target in labels], dim=0) ... orig_target_sizes = torch.stack([target["orig_size"] for target in labels], dim=0)
... results = im_processor.post_process(outputs, orig_target_sizes) # convert outputs of model to COCO api ... results = im_processor.post_process(outputs, orig_target_sizes) # convert outputs of model to Pascal VOC format (xmin, ymin, xmax, ymax)
... module.add(prediction=results, reference=labels) ... module.add(prediction=results, reference=labels)
... del batch ... del batch

View File

@@ -1330,8 +1330,8 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
# POSTPROCESSING METHODS - TODO: add support for other frameworks # POSTPROCESSING METHODS - TODO: add support for other frameworks
def post_process(self, outputs, target_sizes): def post_process(self, outputs, target_sizes):
""" """
Converts the output of [`ConditionalDetrForObjectDetection`] into the format expected by the COCO api. Only Converts the output of [`ConditionalDetrForObjectDetection`] into the format expected by the Pascal VOC format (xmin, ymin, xmax, ymax).
supports PyTorch. Only supports PyTorch.
Args: Args:
outputs ([`ConditionalDetrObjectDetectionOutput`]): outputs ([`ConditionalDetrObjectDetectionOutput`]):

View File

@@ -1805,7 +1805,7 @@ class ConditionalDetrForObjectDetection(ConditionalDetrPreTrainedModel):
>>> outputs = model(**inputs) >>> outputs = model(**inputs)
>>> # convert outputs (bounding boxes and class logits) to COCO API >>> # convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax)
>>> target_sizes = torch.tensor([image.size[::-1]]) >>> target_sizes = torch.tensor([image.size[::-1]])
>>> results = image_processor.post_process_object_detection(outputs, threshold=0.5, target_sizes=target_sizes)[ >>> results = image_processor.post_process_object_detection(outputs, threshold=0.5, target_sizes=target_sizes)[
... 0 ... 0

View File

@@ -1900,7 +1900,7 @@ class DeformableDetrForObjectDetection(DeformableDetrPreTrainedModel):
>>> inputs = image_processor(images=image, return_tensors="pt") >>> inputs = image_processor(images=image, return_tensors="pt")
>>> outputs = model(**inputs) >>> outputs = model(**inputs)
>>> # convert outputs (bounding boxes and class logits) to COCO API >>> # convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax)
>>> target_sizes = torch.tensor([image.size[::-1]]) >>> target_sizes = torch.tensor([image.size[::-1]])
>>> results = image_processor.post_process_object_detection(outputs, threshold=0.5, target_sizes=target_sizes)[ >>> results = image_processor.post_process_object_detection(outputs, threshold=0.5, target_sizes=target_sizes)[
... 0 ... 0

View File

@@ -1851,7 +1851,7 @@ class DetaForObjectDetection(DetaPreTrainedModel):
>>> inputs = image_processor(images=image, return_tensors="pt") >>> inputs = image_processor(images=image, return_tensors="pt")
>>> outputs = model(**inputs) >>> outputs = model(**inputs)
>>> # convert outputs (bounding boxes and class logits) to COCO API >>> # convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax)
>>> target_sizes = torch.tensor([image.size[::-1]]) >>> target_sizes = torch.tensor([image.size[::-1]])
>>> results = image_processor.post_process_object_detection(outputs, threshold=0.5, target_sizes=target_sizes)[ >>> results = image_processor.post_process_object_detection(outputs, threshold=0.5, target_sizes=target_sizes)[
... 0 ... 0

View File

@@ -1535,7 +1535,7 @@ class DetrForObjectDetection(DetrPreTrainedModel):
>>> inputs = image_processor(images=image, return_tensors="pt") >>> inputs = image_processor(images=image, return_tensors="pt")
>>> outputs = model(**inputs) >>> outputs = model(**inputs)
>>> # convert outputs (bounding boxes and class logits) to COCO API >>> # convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax)
>>> target_sizes = torch.tensor([image.size[::-1]]) >>> target_sizes = torch.tensor([image.size[::-1]])
>>> results = image_processor.post_process_object_detection(outputs, threshold=0.9, target_sizes=target_sizes)[ >>> results = image_processor.post_process_object_detection(outputs, threshold=0.9, target_sizes=target_sizes)[
... 0 ... 0

View File

@@ -1576,7 +1576,7 @@ class Owlv2ForObjectDetection(Owlv2PreTrainedModel):
>>> target_sizes = torch.Tensor([unnormalized_image.size[::-1]]) >>> target_sizes = torch.Tensor([unnormalized_image.size[::-1]])
>>> # Convert outputs (bounding boxes and class logits) to COCO API >>> # Convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax)
>>> results = processor.post_process_image_guided_detection( >>> results = processor.post_process_image_guided_detection(
... outputs=outputs, threshold=0.9, nms_threshold=0.3, target_sizes=target_sizes ... outputs=outputs, threshold=0.9, nms_threshold=0.3, target_sizes=target_sizes
... ) ... )

View File

@@ -1517,7 +1517,7 @@ class OwlViTForObjectDetection(OwlViTPreTrainedModel):
... outputs = model.image_guided_detection(**inputs) ... outputs = model.image_guided_detection(**inputs)
>>> # Target image sizes (height, width) to rescale box predictions [batch_size, 2] >>> # Target image sizes (height, width) to rescale box predictions [batch_size, 2]
>>> target_sizes = torch.Tensor([image.size[::-1]]) >>> target_sizes = torch.Tensor([image.size[::-1]])
>>> # Convert outputs (bounding boxes and class logits) to COCO API >>> # Convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax)
>>> results = processor.post_process_image_guided_detection( >>> results = processor.post_process_image_guided_detection(
... outputs=outputs, threshold=0.6, nms_threshold=0.3, target_sizes=target_sizes ... outputs=outputs, threshold=0.6, nms_threshold=0.3, target_sizes=target_sizes
... ) ... )

View File

@@ -1431,7 +1431,7 @@ class TableTransformerForObjectDetection(TableTransformerPreTrainedModel):
>>> inputs = image_processor(images=image, return_tensors="pt") >>> inputs = image_processor(images=image, return_tensors="pt")
>>> outputs = model(**inputs) >>> outputs = model(**inputs)
>>> # convert outputs (bounding boxes and class logits) to COCO API >>> # convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax)
>>> target_sizes = torch.tensor([image.size[::-1]]) >>> target_sizes = torch.tensor([image.size[::-1]])
>>> results = image_processor.post_process_object_detection(outputs, threshold=0.9, target_sizes=target_sizes)[ >>> results = image_processor.post_process_object_detection(outputs, threshold=0.9, target_sizes=target_sizes)[
... 0 ... 0

View File

@@ -756,7 +756,7 @@ class YolosForObjectDetection(YolosPreTrainedModel):
>>> inputs = image_processor(images=image, return_tensors="pt") >>> inputs = image_processor(images=image, return_tensors="pt")
>>> outputs = model(**inputs) >>> outputs = model(**inputs)
>>> # convert outputs (bounding boxes and class logits) to COCO API >>> # convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax)
>>> target_sizes = torch.tensor([image.size[::-1]]) >>> target_sizes = torch.tensor([image.size[::-1]])
>>> results = image_processor.post_process_object_detection(outputs, threshold=0.9, target_sizes=target_sizes)[ >>> results = image_processor.post_process_object_detection(outputs, threshold=0.9, target_sizes=target_sizes)[
... 0 ... 0