From 427b62ed1a4649539988225b841d158187ab4850 Mon Sep 17 00:00:00 2001 From: David Zhang <45090902+OnTheThirdDay@users.noreply.github.com> Date: Wed, 20 Nov 2024 02:49:25 +1100 Subject: [PATCH] Fix post process function called in the instance segmentation example of mask2former (#34588) * Fix post process function called in the instance segmentation example of mask2former * fix description and additional notes for post_process_instance_segmentation of maskformers * remove white space in maskformers post_process_instance_segmentation doc * change image.size[::-1] to height and width for clarity in segmentation examples --- docs/source/en/model_doc/rt_detr.md | 2 +- examples/pytorch/instance-segmentation/README.md | 2 +- .../models/mask2former/image_processing_mask2former.py | 8 +++++--- .../models/mask2former/modeling_mask2former.py | 8 ++++---- .../models/maskformer/image_processing_maskformer.py | 8 +++++--- src/transformers/models/maskformer/modeling_maskformer.py | 4 ++-- src/transformers/models/oneformer/modeling_oneformer.py | 6 +++--- src/transformers/models/seggpt/modeling_seggpt.py | 2 +- 8 files changed, 22 insertions(+), 18 deletions(-) diff --git a/docs/source/en/model_doc/rt_detr.md b/docs/source/en/model_doc/rt_detr.md index 8ad220dc4b..6a1545e123 100644 --- a/docs/source/en/model_doc/rt_detr.md +++ b/docs/source/en/model_doc/rt_detr.md @@ -57,7 +57,7 @@ Initially, an image is processed using a pre-trained convolutional neural networ >>> with torch.no_grad(): ... outputs = model(**inputs) ->>> results = image_processor.post_process_object_detection(outputs, target_sizes=torch.tensor([image.size[::-1]]), threshold=0.3) +>>> results = image_processor.post_process_object_detection(outputs, target_sizes=torch.tensor([(image.height, image.width)]), threshold=0.3) >>> for result in results: ... 
for score, label_id, box in zip(result["scores"], result["labels"], result["boxes"]): diff --git a/examples/pytorch/instance-segmentation/README.md b/examples/pytorch/instance-segmentation/README.md index 72eb5a5bef..339d759152 100644 --- a/examples/pytorch/instance-segmentation/README.md +++ b/examples/pytorch/instance-segmentation/README.md @@ -148,7 +148,7 @@ with torch.no_grad(): outputs = model(**inputs) # Post-process outputs -outputs = image_processor.post_process_instance_segmentation(outputs, target_sizes=[image.size[::-1]]) +outputs = image_processor.post_process_instance_segmentation(outputs, target_sizes=[(image.height, image.width)]) print("Mask shape: ", outputs[0]["segmentation"].shape) print("Mask values: ", outputs[0]["segmentation"].unique()) diff --git a/src/transformers/models/mask2former/image_processing_mask2former.py b/src/transformers/models/mask2former/image_processing_mask2former.py index 28ad600295..555ee6e956 100644 --- a/src/transformers/models/mask2former/image_processing_mask2former.py +++ b/src/transformers/models/mask2former/image_processing_mask2former.py @@ -1034,7 +1034,8 @@ class Mask2FormerImageProcessor(BaseImageProcessor): ) -> List[Dict]: """ Converts the output of [`Mask2FormerForUniversalSegmentationOutput`] into instance segmentation predictions. - Only supports PyTorch. + Only supports PyTorch. If instances could overlap, set either return_coco_annotation or return_binary_maps + to `True` to get the correct segmentation result. Args: outputs ([`Mask2FormerForUniversalSegmentation`]): @@ -1056,9 +1057,10 @@ class Mask2FormerImageProcessor(BaseImageProcessor): (one per detected instance). 
Returns: `List[Dict]`: A list of dictionaries, one per image, each dictionary containing two keys: - - **segmentation** -- A tensor of shape `(height, width)` where each pixel represents a `segment_id` or + - **segmentation** -- A tensor of shape `(height, width)` where each pixel represents a `segment_id`, or `List[List]` run-length encoding (RLE) of the segmentation map if return_coco_annotation is set to - `True`. Set to `None` if no mask if found above `threshold`. + `True`, or a tensor of shape `(num_instances, height, width)` if return_binary_maps is set to `True`. + Set to `None` if no mask is found above `threshold`. - **segments_info** -- A dictionary that contains additional information on each segment. - **id** -- An integer representing the `segment_id`. - **label_id** -- An integer representing the label / semantic class id corresponding to `segment_id`. diff --git a/src/transformers/models/mask2former/modeling_mask2former.py b/src/transformers/models/mask2former/modeling_mask2former.py index 4cc96b1652..e91d035754 100644 --- a/src/transformers/models/mask2former/modeling_mask2former.py +++ b/src/transformers/models/mask2former/modeling_mask2former.py @@ -2428,8 +2428,8 @@ class Mask2FormerForUniversalSegmentation(Mask2FormerPreTrainedModel): >>> masks_queries_logits = outputs.masks_queries_logits >>> # Perform post-processing to get instance segmentation map - >>> pred_instance_map = image_processor.post_process_semantic_segmentation( - ... outputs, target_sizes=[image.size[::-1]] + >>> pred_instance_map = image_processor.post_process_instance_segmentation( + ... outputs, target_sizes=[(image.height, image.width)] ... )[0] >>> print(pred_instance_map.shape) torch.Size([480, 640]) @@ -2462,7 +2462,7 @@ class Mask2FormerForUniversalSegmentation(Mask2FormerPreTrainedModel): >>> # Perform post-processing to get semantic segmentation map >>> pred_semantic_map = image_processor.post_process_semantic_segmentation( - ... 
outputs, target_sizes=[image.size[::-1]] + ... outputs, target_sizes=[(image.height, image.width)] ... )[0] >>> print(pred_semantic_map.shape) torch.Size([512, 683]) @@ -2496,7 +2496,7 @@ class Mask2FormerForUniversalSegmentation(Mask2FormerPreTrainedModel): >>> # Perform post-processing to get panoptic segmentation map >>> pred_panoptic_map = image_processor.post_process_panoptic_segmentation( - ... outputs, target_sizes=[image.size[::-1]] + ... outputs, target_sizes=[(image.height, image.width)] ... )[0]["segmentation"] >>> print(pred_panoptic_map.shape) torch.Size([338, 676]) diff --git a/src/transformers/models/maskformer/image_processing_maskformer.py b/src/transformers/models/maskformer/image_processing_maskformer.py index aeec214884..f4eb1bb56f 100644 --- a/src/transformers/models/maskformer/image_processing_maskformer.py +++ b/src/transformers/models/maskformer/image_processing_maskformer.py @@ -1080,7 +1080,8 @@ class MaskFormerImageProcessor(BaseImageProcessor): ) -> List[Dict]: """ Converts the output of [`MaskFormerForInstanceSegmentationOutput`] into instance segmentation predictions. Only - supports PyTorch. + supports PyTorch. If instances could overlap, set either return_coco_annotation or return_binary_maps + to `True` to get the correct segmentation result. Args: outputs ([`MaskFormerForInstanceSegmentation`]): @@ -1102,9 +1103,10 @@ class MaskFormerImageProcessor(BaseImageProcessor): (one per detected instance). Returns: `List[Dict]`: A list of dictionaries, one per image, each dictionary containing two keys: - - **segmentation** -- A tensor of shape `(height, width)` where each pixel represents a `segment_id` or + - **segmentation** -- A tensor of shape `(height, width)` where each pixel represents a `segment_id`, or `List[List]` run-length encoding (RLE) of the segmentation map if return_coco_annotation is set to - `True`. Set to `None` if no mask if found above `threshold`. 
+ `True`, or a tensor of shape `(num_instances, height, width)` if return_binary_maps is set to `True`. + Set to `None` if no mask is found above `threshold`. - **segments_info** -- A dictionary that contains additional information on each segment. - **id** -- An integer representing the `segment_id`. - **label_id** -- An integer representing the label / semantic class id corresponding to `segment_id`. diff --git a/src/transformers/models/maskformer/modeling_maskformer.py b/src/transformers/models/maskformer/modeling_maskformer.py index cd6ef28566..a8398ec972 100644 --- a/src/transformers/models/maskformer/modeling_maskformer.py +++ b/src/transformers/models/maskformer/modeling_maskformer.py @@ -1780,7 +1780,7 @@ class MaskFormerForInstanceSegmentation(MaskFormerPreTrainedModel): >>> # you can pass them to image_processor for postprocessing >>> predicted_semantic_map = image_processor.post_process_semantic_segmentation( - ... outputs, target_sizes=[image.size[::-1]] + ... outputs, target_sizes=[(image.height, image.width)] ... 
)[0] >>> # we refer to the demo notebooks for visualization (see "Resources" section in the MaskFormer docs) @@ -1810,7 +1810,7 @@ class MaskFormerForInstanceSegmentation(MaskFormerPreTrainedModel): >>> masks_queries_logits = outputs.masks_queries_logits >>> # you can pass them to image_processor for postprocessing - >>> result = image_processor.post_process_panoptic_segmentation(outputs, target_sizes=[image.size[::-1]])[0] + >>> result = image_processor.post_process_panoptic_segmentation(outputs, target_sizes=[(image.height, image.width)])[0] >>> # we refer to the demo notebooks for visualization (see "Resources" section in the MaskFormer docs) >>> predicted_panoptic_map = result["segmentation"] diff --git a/src/transformers/models/oneformer/modeling_oneformer.py b/src/transformers/models/oneformer/modeling_oneformer.py index aeeccb68a9..e237467c24 100644 --- a/src/transformers/models/oneformer/modeling_oneformer.py +++ b/src/transformers/models/oneformer/modeling_oneformer.py @@ -3161,7 +3161,7 @@ class OneFormerForUniversalSegmentation(OneFormerPreTrainedModel): >>> # you can pass them to processor for semantic postprocessing >>> predicted_semantic_map = processor.post_process_semantic_segmentation( - ... outputs, target_sizes=[image.size[::-1]] + ... outputs, target_sizes=[(image.height, image.width)] ... )[0] >>> f"👉 Semantic Predictions Shape: {list(predicted_semantic_map.shape)}" '👉 Semantic Predictions Shape: [512, 683]' @@ -3178,7 +3178,7 @@ class OneFormerForUniversalSegmentation(OneFormerPreTrainedModel): >>> # you can pass them to processor for instance postprocessing >>> predicted_instance_map = processor.post_process_instance_segmentation( - ... outputs, target_sizes=[image.size[::-1]] + ... outputs, target_sizes=[(image.height, image.width)] ... 
)[0]["segmentation"] >>> f"👉 Instance Predictions Shape: {list(predicted_instance_map.shape)}" '👉 Instance Predictions Shape: [512, 683]' @@ -3195,7 +3195,7 @@ class OneFormerForUniversalSegmentation(OneFormerPreTrainedModel): >>> # you can pass them to processor for panoptic postprocessing >>> predicted_panoptic_map = processor.post_process_panoptic_segmentation( - ... outputs, target_sizes=[image.size[::-1]] + ... outputs, target_sizes=[(image.height, image.width)] ... )[0]["segmentation"] >>> f"👉 Panoptic Predictions Shape: {list(predicted_panoptic_map.shape)}" '👉 Panoptic Predictions Shape: [512, 683]' diff --git a/src/transformers/models/seggpt/modeling_seggpt.py b/src/transformers/models/seggpt/modeling_seggpt.py index 174aeaad00..c0f1f24a31 100644 --- a/src/transformers/models/seggpt/modeling_seggpt.py +++ b/src/transformers/models/seggpt/modeling_seggpt.py @@ -962,7 +962,7 @@ class SegGptForImageSegmentation(SegGptPreTrainedModel): >>> inputs = image_processor(images=image_input, prompt_images=image_prompt, prompt_masks=mask_prompt, return_tensors="pt") >>> outputs = model(**inputs) - >>> result = image_processor.post_process_semantic_segmentation(outputs, target_sizes=[image_input.size[::-1]])[0] + >>> result = image_processor.post_process_semantic_segmentation(outputs, target_sizes=[(image_input.height, image_input.width)])[0] >>> print(list(result.shape)) [170, 297] ```