From 67b1335cb9b3cb39e290c0afad424816daf7e357 Mon Sep 17 00:00:00 2001 From: NielsRogge <48327001+NielsRogge@users.noreply.github.com> Date: Mon, 11 Dec 2023 19:03:42 +0100 Subject: [PATCH] Update bounding box format everywhere (#27944) Update formats --- docs/source/en/model_doc/detr.md | 2 +- docs/source/en/model_doc/owlv2.md | 2 +- docs/source/en/model_doc/owlvit.md | 2 +- docs/source/en/tasks/object_detection.md | 2 +- docs/source/ja/tasks/object_detection.md | 2 +- docs/source/ko/tasks/object_detection.md | 2 +- .../conditional_detr/image_processing_conditional_detr.py | 4 ++-- .../models/conditional_detr/modeling_conditional_detr.py | 2 +- .../models/deformable_detr/modeling_deformable_detr.py | 2 +- src/transformers/models/deta/modeling_deta.py | 2 +- src/transformers/models/detr/modeling_detr.py | 2 +- src/transformers/models/owlv2/modeling_owlv2.py | 2 +- src/transformers/models/owlvit/modeling_owlvit.py | 2 +- .../models/table_transformer/modeling_table_transformer.py | 2 +- src/transformers/models/yolos/modeling_yolos.py | 2 +- 15 files changed, 16 insertions(+), 16 deletions(-) diff --git a/docs/source/en/model_doc/detr.md b/docs/source/en/model_doc/detr.md index c36bd4380e..60937b6012 100644 --- a/docs/source/en/model_doc/detr.md +++ b/docs/source/en/model_doc/detr.md @@ -146,7 +146,7 @@ As a summary, consider the following table: | **Model** | [`~transformers.DetrForObjectDetection`] | [`~transformers.DetrForSegmentation`] | [`~transformers.DetrForSegmentation`] | | **Example dataset** | COCO detection | COCO detection, COCO panoptic | COCO panoptic | | | **Format of annotations to provide to** [`~transformers.DetrImageProcessor`] | {'image_id': `int`, 'annotations': `List[Dict]`} each Dict being a COCO object annotation | {'image_id': `int`, 'annotations': `List[Dict]`} (in case of COCO detection) or {'file_name': `str`, 'image_id': `int`, 'segments_info': `List[Dict]`} (in case of COCO panoptic) | {'file_name': `str`, 'image_id': `int`, 
'segments_info': `List[Dict]`} and masks_path (path to directory containing PNG files of the masks) | -| **Postprocessing** (i.e. converting the output of the model to COCO API) | [`~transformers.DetrImageProcessor.post_process`] | [`~transformers.DetrImageProcessor.post_process_segmentation`] | [`~transformers.DetrImageProcessor.post_process_segmentation`], [`~transformers.DetrImageProcessor.post_process_panoptic`] | +| **Postprocessing** (i.e. converting the output of the model to Pascal VOC format) | [`~transformers.DetrImageProcessor.post_process`] | [`~transformers.DetrImageProcessor.post_process_segmentation`] | [`~transformers.DetrImageProcessor.post_process_segmentation`], [`~transformers.DetrImageProcessor.post_process_panoptic`] | | **evaluators** | `CocoEvaluator` with `iou_types="bbox"` | `CocoEvaluator` with `iou_types="bbox"` or `"segm"` | `CocoEvaluator` with `iou_tupes="bbox"` or `"segm"`, `PanopticEvaluator` | In short, one should prepare the data either in COCO detection or COCO panoptic format, then use diff --git a/docs/source/en/model_doc/owlv2.md b/docs/source/en/model_doc/owlv2.md index 12000af9ed..75fab0853a 100644 --- a/docs/source/en/model_doc/owlv2.md +++ b/docs/source/en/model_doc/owlv2.md @@ -56,7 +56,7 @@ OWLv2 is, just like its predecessor [OWL-ViT](owlvit), a zero-shot text-conditio >>> # Target image sizes (height, width) to rescale box predictions [batch_size, 2] >>> target_sizes = torch.Tensor([image.size[::-1]]) ->>> # Convert outputs (bounding boxes and class logits) to COCO API +>>> # Convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax) >>> results = processor.post_process_object_detection(outputs=outputs, target_sizes=target_sizes, threshold=0.1) >>> i = 0 # Retrieve predictions for the first image for the corresponding text queries >>> text = texts[i] diff --git a/docs/source/en/model_doc/owlvit.md b/docs/source/en/model_doc/owlvit.md index 0ba26eeb37..c40d3a9e7a 100644 --- 
a/docs/source/en/model_doc/owlvit.md +++ b/docs/source/en/model_doc/owlvit.md @@ -55,7 +55,7 @@ OWL-ViT is a zero-shot text-conditioned object detection model. OWL-ViT uses [CL >>> # Target image sizes (height, width) to rescale box predictions [batch_size, 2] >>> target_sizes = torch.Tensor([image.size[::-1]]) ->>> # Convert outputs (bounding boxes and class logits) to COCO API +>>> # Convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax) >>> results = processor.post_process_object_detection(outputs=outputs, target_sizes=target_sizes, threshold=0.1) >>> i = 0 # Retrieve predictions for the first image for the corresponding text queries >>> text = texts[i] diff --git a/docs/source/en/tasks/object_detection.md b/docs/source/en/tasks/object_detection.md index 7511ee66dd..6f53655e7c 100644 --- a/docs/source/en/tasks/object_detection.md +++ b/docs/source/en/tasks/object_detection.md @@ -512,7 +512,7 @@ Finally, load the metrics and run the evaluation. ... outputs = model(pixel_values=pixel_values, pixel_mask=pixel_mask) ... orig_target_sizes = torch.stack([target["orig_size"] for target in labels], dim=0) -... results = im_processor.post_process(outputs, orig_target_sizes) # convert outputs of model to COCO api +... results = im_processor.post_process(outputs, orig_target_sizes) # convert outputs of model to Pascal VOC format (xmin, ymin, xmax, ymax) ... module.add(prediction=results, reference=labels) ... del batch diff --git a/docs/source/ja/tasks/object_detection.md b/docs/source/ja/tasks/object_detection.md index 845a259cd7..389e7bdf2f 100644 --- a/docs/source/ja/tasks/object_detection.md +++ b/docs/source/ja/tasks/object_detection.md @@ -518,7 +518,7 @@ DETR モデルをトレーニングできる「ラベル」。画像プロセッ ... outputs = model(pixel_values=pixel_values, pixel_mask=pixel_mask) ... orig_target_sizes = torch.stack([target["orig_size"] for target in labels], dim=0) -... 
results = im_processor.post_process(outputs, orig_target_sizes) # convert outputs of model to COCO api +... results = im_processor.post_process(outputs, orig_target_sizes) # convert outputs of model to Pascal VOC format (xmin, ymin, xmax, ymax) ... module.add(prediction=results, reference=labels) ... del batch diff --git a/docs/source/ko/tasks/object_detection.md b/docs/source/ko/tasks/object_detection.md index ca384d0381..0076bba6f8 100644 --- a/docs/source/ko/tasks/object_detection.md +++ b/docs/source/ko/tasks/object_detection.md @@ -504,7 +504,7 @@ COCO 데이터 세트를 빌드하는 API는 데이터를 특정 형식으로 ... outputs = model(pixel_values=pixel_values, pixel_mask=pixel_mask) ... orig_target_sizes = torch.stack([target["orig_size"] for target in labels], dim=0) -... results = im_processor.post_process(outputs, orig_target_sizes) # convert outputs of model to COCO api +... results = im_processor.post_process(outputs, orig_target_sizes) # convert outputs of model to Pascal VOC format (xmin, ymin, xmax, ymax) ... module.add(prediction=results, reference=labels) ... del batch diff --git a/src/transformers/models/conditional_detr/image_processing_conditional_detr.py b/src/transformers/models/conditional_detr/image_processing_conditional_detr.py index 3ec0696430..23e493e08b 100644 --- a/src/transformers/models/conditional_detr/image_processing_conditional_detr.py +++ b/src/transformers/models/conditional_detr/image_processing_conditional_detr.py @@ -1330,8 +1330,8 @@ class ConditionalDetrImageProcessor(BaseImageProcessor): # POSTPROCESSING METHODS - TODO: add support for other frameworks def post_process(self, outputs, target_sizes): """ - Converts the output of [`ConditionalDetrForObjectDetection`] into the format expected by the COCO api. Only - supports PyTorch. + Converts the output of [`ConditionalDetrForObjectDetection`] into Pascal VOC format (xmin, ymin, xmax, ymax). + Only supports PyTorch. 
Args: outputs ([`ConditionalDetrObjectDetectionOutput`]): diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index fd9a9d7e6f..d903abffaf 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -1805,7 +1805,7 @@ class ConditionalDetrForObjectDetection(ConditionalDetrPreTrainedModel): >>> outputs = model(**inputs) - >>> # convert outputs (bounding boxes and class logits) to COCO API + >>> # convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax) >>> target_sizes = torch.tensor([image.size[::-1]]) >>> results = image_processor.post_process_object_detection(outputs, threshold=0.5, target_sizes=target_sizes)[ ... 0 diff --git a/src/transformers/models/deformable_detr/modeling_deformable_detr.py b/src/transformers/models/deformable_detr/modeling_deformable_detr.py index 2a93fc2aa0..3767eef039 100755 --- a/src/transformers/models/deformable_detr/modeling_deformable_detr.py +++ b/src/transformers/models/deformable_detr/modeling_deformable_detr.py @@ -1900,7 +1900,7 @@ class DeformableDetrForObjectDetection(DeformableDetrPreTrainedModel): >>> inputs = image_processor(images=image, return_tensors="pt") >>> outputs = model(**inputs) - >>> # convert outputs (bounding boxes and class logits) to COCO API + >>> # convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax) >>> target_sizes = torch.tensor([image.size[::-1]]) >>> results = image_processor.post_process_object_detection(outputs, threshold=0.5, target_sizes=target_sizes)[ ... 
0 diff --git a/src/transformers/models/deta/modeling_deta.py b/src/transformers/models/deta/modeling_deta.py index 19f0250e6f..8362b49eee 100644 --- a/src/transformers/models/deta/modeling_deta.py +++ b/src/transformers/models/deta/modeling_deta.py @@ -1851,7 +1851,7 @@ class DetaForObjectDetection(DetaPreTrainedModel): >>> inputs = image_processor(images=image, return_tensors="pt") >>> outputs = model(**inputs) - >>> # convert outputs (bounding boxes and class logits) to COCO API + >>> # convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax) >>> target_sizes = torch.tensor([image.size[::-1]]) >>> results = image_processor.post_process_object_detection(outputs, threshold=0.5, target_sizes=target_sizes)[ ... 0 diff --git a/src/transformers/models/detr/modeling_detr.py b/src/transformers/models/detr/modeling_detr.py index 9b73d0e651..e0680e1874 100644 --- a/src/transformers/models/detr/modeling_detr.py +++ b/src/transformers/models/detr/modeling_detr.py @@ -1535,7 +1535,7 @@ class DetrForObjectDetection(DetrPreTrainedModel): >>> inputs = image_processor(images=image, return_tensors="pt") >>> outputs = model(**inputs) - >>> # convert outputs (bounding boxes and class logits) to COCO API + >>> # convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax) >>> target_sizes = torch.tensor([image.size[::-1]]) >>> results = image_processor.post_process_object_detection(outputs, threshold=0.9, target_sizes=target_sizes)[ ... 
0 diff --git a/src/transformers/models/owlv2/modeling_owlv2.py b/src/transformers/models/owlv2/modeling_owlv2.py index 6cc996966b..5146fbb89d 100644 --- a/src/transformers/models/owlv2/modeling_owlv2.py +++ b/src/transformers/models/owlv2/modeling_owlv2.py @@ -1576,7 +1576,7 @@ class Owlv2ForObjectDetection(Owlv2PreTrainedModel): >>> target_sizes = torch.Tensor([unnormalized_image.size[::-1]]) - >>> # Convert outputs (bounding boxes and class logits) to COCO API + >>> # Convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax) >>> results = processor.post_process_image_guided_detection( ... outputs=outputs, threshold=0.9, nms_threshold=0.3, target_sizes=target_sizes ... ) diff --git a/src/transformers/models/owlvit/modeling_owlvit.py b/src/transformers/models/owlvit/modeling_owlvit.py index 8c502c410d..b8e8a36fec 100644 --- a/src/transformers/models/owlvit/modeling_owlvit.py +++ b/src/transformers/models/owlvit/modeling_owlvit.py @@ -1517,7 +1517,7 @@ class OwlViTForObjectDetection(OwlViTPreTrainedModel): ... outputs = model.image_guided_detection(**inputs) >>> # Target image sizes (height, width) to rescale box predictions [batch_size, 2] >>> target_sizes = torch.Tensor([image.size[::-1]]) - >>> # Convert outputs (bounding boxes and class logits) to COCO API + >>> # Convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax) >>> results = processor.post_process_image_guided_detection( ... outputs=outputs, threshold=0.6, nms_threshold=0.3, target_sizes=target_sizes ... 
) diff --git a/src/transformers/models/table_transformer/modeling_table_transformer.py b/src/transformers/models/table_transformer/modeling_table_transformer.py index 92aac58e74..81afcdc9c1 100644 --- a/src/transformers/models/table_transformer/modeling_table_transformer.py +++ b/src/transformers/models/table_transformer/modeling_table_transformer.py @@ -1431,7 +1431,7 @@ class TableTransformerForObjectDetection(TableTransformerPreTrainedModel): >>> inputs = image_processor(images=image, return_tensors="pt") >>> outputs = model(**inputs) - >>> # convert outputs (bounding boxes and class logits) to COCO API + >>> # convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax) >>> target_sizes = torch.tensor([image.size[::-1]]) >>> results = image_processor.post_process_object_detection(outputs, threshold=0.9, target_sizes=target_sizes)[ ... 0 diff --git a/src/transformers/models/yolos/modeling_yolos.py b/src/transformers/models/yolos/modeling_yolos.py index b2f0ca0db5..65ffbfced4 100755 --- a/src/transformers/models/yolos/modeling_yolos.py +++ b/src/transformers/models/yolos/modeling_yolos.py @@ -756,7 +756,7 @@ class YolosForObjectDetection(YolosPreTrainedModel): >>> inputs = image_processor(images=image, return_tensors="pt") >>> outputs = model(**inputs) - >>> # convert outputs (bounding boxes and class logits) to COCO API + >>> # convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax) >>> target_sizes = torch.tensor([image.size[::-1]]) >>> results = image_processor.post_process_object_detection(outputs, threshold=0.9, target_sizes=target_sizes)[ ... 0