From 983451a13e5378908866309647afc94761fa8572 Mon Sep 17 00:00:00 2001
From: Alara Dirik <8944735+alaradirik@users.noreply.github.com>
Date: Fri, 7 Oct 2022 23:34:41 +0300
Subject: [PATCH] Improve and fix ImageSegmentationPipeline (#19367)

- Fixes the image segmentation pipeline test failures caused by changes to the postprocessing methods of supported models
- Updates the ImageSegmentationPipeline tests
- Improves docs, adds 'task' argument to optionally perform semantic, instance or panoptic segmentation
---
 .../pipelines/image_segmentation.py           | 138 +++++++++---------
 .../test_pipelines_image_segmentation.py      | 114 +++++++++------
 2 files changed, 143 insertions(+), 109 deletions(-)

diff --git a/src/transformers/pipelines/image_segmentation.py b/src/transformers/pipelines/image_segmentation.py
index a33095cfc2..690247f6e4 100644
--- a/src/transformers/pipelines/image_segmentation.py
+++ b/src/transformers/pipelines/image_segmentation.py
@@ -12,9 +12,6 @@ if is_vision_available():
     from ..image_utils import load_image
 
 if is_torch_available():
-    import torch
-    from torch import nn
-
     from ..models.auto.modeling_auto import (
         MODEL_FOR_IMAGE_SEGMENTATION_MAPPING,
         MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING,
@@ -59,13 +56,15 @@ class ImageSegmentationPipeline(Pipeline):
 
     def _sanitize_parameters(self, **kwargs):
         postprocess_kwargs = {}
+        if "task" in kwargs:
+            postprocess_kwargs["task"] = kwargs["task"]
         if "threshold" in kwargs:
             postprocess_kwargs["threshold"] = kwargs["threshold"]
-        if "mask_threshold" in kwargs:
-            postprocess_kwargs["mask_threshold"] = kwargs["mask_threshold"]
+        if "overlap_mask_area_threshold" in kwargs:
+            postprocess_kwargs["overlap_mask_area_threshold"] = kwargs["overlap_mask_area_threshold"]
         return {}, {}, postprocess_kwargs
 
-    def __call__(self, *args, **kwargs) -> Union[Predictions, List[Prediction]]:
+    def __call__(self, images, **kwargs) -> Union[Predictions, List[Prediction]]:
         """
         Perform segmentation (detect masks & classes) in the image(s) passed as inputs.
 
@@ -79,30 +78,34 @@ class ImageSegmentationPipeline(Pipeline):
 
                 The pipeline accepts either a single image or a batch of images. Images in a batch must all be in the
                 same format: all as HTTP(S) links, all as local paths, or all as PIL images.
+            task (`str`, defaults to `semantic`):
+                Segmentation task to be performed; one of `semantic`, `instance` or `panoptic`, depending on model
+                capabilities.
             threshold (`float`, *optional*, defaults to 0.9):
-                The probability necessary to make a prediction.
-            mask_threshold (`float`, *optional*, defaults to 0.5):
-                Threshold to use when turning the predicted masks into binary values.
+                Probability threshold to filter out predicted masks.
+            overlap_mask_area_threshold (`float`, *optional*, defaults to 0.5):
+                Mask overlap threshold to eliminate small, disconnected segments.
 
         Return:
             A dictionary or a list of dictionaries containing the result. If the input is a single image, will return a
             list of dictionaries, if the input is a list of several images, will return a list of list of dictionaries
             corresponding to each image.
 
-            The dictionaries contain the following keys:
+            The dictionaries contain the mask, label and score (where applicable) of each detected object, under
+            the following keys:
 
             - **label** (`str`) -- The class label identified by the model.
-            - **mask** (`PIL.Image`) -- Pil Image with size (heigth, width) of the original image. Pixel values in the
-              image are in the range 0-255. 0 means the pixel is *not* part of the *label*, 255 means it definitely is.
+            - **mask** (`PIL.Image`) -- A binary mask of the detected object as a PIL Image of shape (width, height) of
+              the original image. Returns a mask filled with zeros if no object is found.
             - **score** (*optional* `float`) -- Optionally, when the model is capable of estimating a confidence of the
               "object" described by the label and the mask.
         """
 
-        return super().__call__(*args, **kwargs)
+        return super().__call__(images, **kwargs)
 
     def preprocess(self, image):
         image = load_image(image)
-        target_size = torch.IntTensor([[image.height, image.width]])
+        target_size = [(image.height, image.width)]
         inputs = self.feature_extractor(images=[image], return_tensors="pt")
         inputs["target_size"] = target_size
         return inputs
@@ -113,66 +116,87 @@ class ImageSegmentationPipeline(Pipeline):
         model_outputs["target_size"] = target_size
         return model_outputs
 
-    def postprocess(self, model_outputs, raw_image=False, threshold=0.9, mask_threshold=0.5):
-        if hasattr(self.feature_extractor, "post_process_panoptic_segmentation"):
+    def postprocess(self, model_outputs, task="semantic", threshold=0.9, overlap_mask_area_threshold=0.5):
+        """
+        Turn raw model outputs into a list of dictionaries with `score`, `label` and `mask` keys.
+
+        Args:
+            model_outputs:
+                Outputs of `_forward`, including the `target_size` entry stored there.
+            task (`str`, defaults to `semantic`):
+                Which post-processing to run: `semantic`, `instance` or `panoptic`.
+            threshold (`float`, defaults to 0.9):
+                Forwarded to the instance/panoptic post-processing method of the feature extractor.
+            overlap_mask_area_threshold (`float`, defaults to 0.5):
+                Forwarded to the instance/panoptic post-processing method of the feature extractor.
+
+        Returns:
+            A list of dictionaries. When instance/panoptic post-processing finds no segment, the list holds a
+            single all-zero mask with `label=None` and `score=0.0`.
+
+        Raises:
+            ValueError: If the feature extractor does not expose the post-processing method for `task`.
+        """
+        if task == "instance" and hasattr(self.feature_extractor, "post_process_instance_segmentation"):
-            outputs = self.feature_extractor.post_process_panoptic_segmentation(
+            outputs = self.feature_extractor.post_process_instance_segmentation(
-                model_outputs, object_mask_threshold=threshold
+                model_outputs,
+                threshold=threshold,
+                overlap_mask_area_threshold=overlap_mask_area_threshold,
+                target_sizes=model_outputs["target_size"],
             )[0]
+
             annotation = []
             segmentation = outputs["segmentation"]
-            for segment in outputs["segments"]:
-                mask = (segmentation == segment["id"]) * 255
-                mask = Image.fromarray(mask.numpy().astype(np.uint8), mode="L")
-                label = self.model.config.id2label[segment["label_id"]]
-                annotation.append({"mask": mask, "label": label, "score": None})
-        elif hasattr(self.feature_extractor, "post_process_segmentation"):
-            # Panoptic
-            raw_annotations = self.feature_extractor.post_process_segmentation(
-                model_outputs, model_outputs["target_size"], threshold=threshold, mask_threshold=0.5
-            )
-            raw_annotation = raw_annotations[0]
-            raw_annotation["masks"] *= 255  # [0,1] -> [0,255] black and white pixels
-            raw_annotation["scores"] = raw_annotation["scores"].tolist()
-            raw_annotation["labels"] = [self.model.config.id2label[label.item()] for label in raw_annotation["labels"]]
-            raw_annotation["masks"] = [
-                Image.fromarray(mask.numpy().astype(np.uint8), mode="L") for mask in raw_annotation["masks"]
-            ]
-            # {"scores": [...], ...} --> [{"score":x, ...}, ...]
-            keys = ["score", "label", "mask"]
-            annotation = [
-                dict(zip(keys, vals))
-                for vals in zip(raw_annotation["scores"], raw_annotation["labels"], raw_annotation["masks"])
-            ]
-        else:
-            # Default logits
-            logits = model_outputs.logits
-            logits = logits.softmax(dim=1)
-            if len(logits.shape) != 4:
-                raise ValueError(f"Logits don't have expected dimensions, expected [1, N, H, W], got {logits.shape}")
-            batch_size, num_labels, height, width = logits.shape
-            expected_num_labels = len(self.model.config.id2label)
-            if num_labels != expected_num_labels:
-                raise ValueError(
-                    f"Logits don't have expected dimensions, expected [1, {num_labels}, H, W], got {logits.shape}"
-                )
-            size = model_outputs["target_size"].squeeze(0).tolist()
-            logits_reshaped = nn.functional.interpolate(logits, size=size, mode="bilinear", align_corners=False)
-            classes = logits_reshaped.argmax(dim=1)[0]
+
+            if len(outputs["segments_info"]) == 0:
+                mask = Image.fromarray(np.zeros(segmentation.shape).astype(np.uint8), mode="L")
+                annotation.append({"mask": mask, "label": None, "score": 0.0})
+            else:
+                for segment in outputs["segments_info"]:
+                    mask = (segmentation == segment["id"]) * 255
+                    mask = Image.fromarray(mask.numpy().astype(np.uint8), mode="L")
+                    label = self.model.config.id2label[segment["label_id"]]
+                    score = segment["score"]
+                    annotation.append({"mask": mask, "label": label, "score": score})
+
+        elif task == "panoptic" and hasattr(self.feature_extractor, "post_process_panoptic_segmentation"):
+            outputs = self.feature_extractor.post_process_panoptic_segmentation(
+                model_outputs,
+                threshold=threshold,
+                overlap_mask_area_threshold=overlap_mask_area_threshold,
+                target_sizes=model_outputs["target_size"],
+            )[0]
+
             annotation = []
+            segmentation = outputs["segmentation"]
 
-            for label_id in range(num_labels):
-                label = self.model.config.id2label[label_id]
-                mask = classes == label_id
-                mask_sum = mask.sum()
+            if len(outputs["segments_info"]) == 0:
+                mask = Image.fromarray(np.zeros(segmentation.shape).astype(np.uint8), mode="L")
+                annotation.append({"mask": mask, "label": None, "score": 0.0})
+            else:
+                for segment in outputs["segments_info"]:
+                    mask = (segmentation == segment["id"]) * 255
+                    mask = Image.fromarray(mask.numpy().astype(np.uint8), mode="L")
+                    label = self.model.config.id2label[segment["label_id"]]
+                    score = segment["score"]
+                    annotation.append({"score": score, "label": label, "mask": mask})
 
-                # Remove empty masks.
-                if mask_sum == 0:
-                    continue
-                mask = Image.fromarray((mask * 255).numpy().astype(np.uint8), mode="L")
-                # Semantic segmentation does not output a global score for the mask
-                # so we don't attempt to compute one.
-                # XXX: We could send a mask with values between 0 and 255 instead
-                # of a pure mask to enable users to get the probabilities that
-                # are really outputted by the logits.
+        elif task == "semantic" and hasattr(self.feature_extractor, "post_process_semantic_segmentation"):
+            outputs = self.feature_extractor.post_process_semantic_segmentation(
+                model_outputs, target_sizes=model_outputs["target_size"]
+            )[0]
+
+            annotation = []
+            segmentation = outputs.numpy()
+            labels = np.unique(segmentation)
+
+            for label in labels:
+                mask = (segmentation == label) * 255
+                # NOTE(review): unlike the branches above, `mask` is not cast to `np.uint8` before
+                # `Image.fromarray(..., mode="L")` - confirm the resulting image is the intended binary mask.
+                mask = Image.fromarray(mask, mode="L")
+                label = self.model.config.id2label[label]
                 annotation.append({"score": None, "label": label, "mask": mask})
+        else:
+            raise ValueError(f"task {task} is not supported for model {self.model}")
         return annotation
diff --git a/tests/pipelines/test_pipelines_image_segmentation.py b/tests/pipelines/test_pipelines_image_segmentation.py
index 3841bc1ab7..65656939d0 100644
--- a/tests/pipelines/test_pipelines_image_segmentation.py
+++ b/tests/pipelines/test_pipelines_image_segmentation.py
@@ -74,9 +74,6 @@ class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCa
     }
 
     def get_test_pipeline(self, model, tokenizer, feature_extractor):
-        # Fix me Alara
-        if model.__class__.__name__ in ["DetrForSegmentation", "MaskFormerForInstanceSegmentation"]:
-            return None, None
         image_segmenter = ImageSegmentationPipeline(model=model, feature_extractor=feature_extractor)
         return image_segmenter, [
             "./tests/fixtures/tests_samples/COCO/000000039769.png",
@@ -150,7 +147,7 @@ class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCa
         pass
 
     @require_torch
-    @unittest.skip("Fix me Alara!")
+    @unittest.skip("No weights found for hf-internal-testing/tiny-detr-mobilenetsv3-panoptic")
     def test_small_model_pt(self):
         model_id = "hf-internal-testing/tiny-detr-mobilenetsv3-panoptic"
 
@@ -158,9 +155,15 @@ class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCa
         feature_extractor = AutoFeatureExtractor.from_pretrained(model_id)
         image_segmenter = ImageSegmentationPipeline(model=model, feature_extractor=feature_extractor)
 
-        outputs = image_segmenter("http://images.cocodataset.org/val2017/000000039769.jpg", threshold=0.0)
+        outputs = image_segmenter(
+            "http://images.cocodataset.org/val2017/000000039769.jpg",
+            task="panoptic",
+            threshold=0.0,
+            overlap_mask_area_threshold=0.0,
+        )
+
+        # Shortening by hashing
         for o in outputs:
-            # shortening by hashing
             o["mask"] = hashimage(o["mask"])
 
         self.assertEqual(
@@ -235,12 +238,12 @@ class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCa
                 {
                     "score": None,
                     "label": "LABEL_0",
-                    "mask": "6225140faf502d272af076222776d7e4",
+                    "mask": "775518a7ed09eea888752176c6ba8f38",
                 },
                 {
                     "score": None,
                     "label": "LABEL_1",
-                    "mask": "8297c9f8eb43ddd3f32a6dae21e015a1",
+                    "mask": "a12da23a46848128af68c63aa8ba7a02",
                 },
             ],
         )
@@ -249,22 +252,28 @@ class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCa
     @slow
     def test_integration_torch_image_segmentation(self):
         model_id = "facebook/detr-resnet-50-panoptic"
-
         image_segmenter = pipeline("image-segmentation", model=model_id)
 
-        outputs = image_segmenter("http://images.cocodataset.org/val2017/000000039769.jpg")
+        outputs = image_segmenter(
+            "http://images.cocodataset.org/val2017/000000039769.jpg",
+            task="panoptic",
+            threshold=0,
+            overlap_mask_area_threshold=0.0,
+        )
+
+        # Shortening by hashing
         for o in outputs:
             o["mask"] = hashimage(o["mask"])
 
         self.assertEqual(
             nested_simplify(outputs, decimals=4),
             [
-                {"score": 0.9094, "label": "blanket", "mask": "6500201749480f87154fd967783b2b97"},
-                {"score": 0.9941, "label": "cat", "mask": "f3a7f80220788acc0245ebc084df6afc"},
-                {"score": 0.9987, "label": "remote", "mask": "7703408f54da1d0ebda47841da875e48"},
-                {"score": 0.9995, "label": "remote", "mask": "bd726918f10fed3efaef0091e11f923b"},
-                {"score": 0.9722, "label": "couch", "mask": "226d6dcb98bebc3fbc208abdc0c83196"},
-                {"score": 0.9994, "label": "cat", "mask": "fa5d8d5c329546ba5339f3095641ef56"},
+                {"score": 0.9094, "label": "blanket", "mask": "dcff19a97abd8bd555e21186ae7c066a"},
+                {"score": 0.9941, "label": "cat", "mask": "9c0af87bd00f9d3a4e0c8888e34e70e2"},
+                {"score": 0.9987, "label": "remote", "mask": "c7870600d6c02a1f6d96470fc7220e8e"},
+                {"score": 0.9995, "label": "remote", "mask": "ef899a25fd44ec056c653f0ca2954fdd"},
+                {"score": 0.9722, "label": "couch", "mask": "37b8446ac578a17108aa2b7fccc33114"},
+                {"score": 0.9994, "label": "cat", "mask": "6a09d3655efd8a388ab4511e4cbbb797"},
             ],
         )
 
@@ -273,8 +282,12 @@ class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCa
                 "http://images.cocodataset.org/val2017/000000039769.jpg",
                 "http://images.cocodataset.org/val2017/000000039769.jpg",
             ],
+            task="panoptic",
             threshold=0.0,
+            overlap_mask_area_threshold=0.0,
         )
+
+        # Shortening by hashing
         for output in outputs:
             for o in output:
                 o["mask"] = hashimage(o["mask"])
@@ -283,20 +296,20 @@ class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCa
             nested_simplify(outputs, decimals=4),
             [
                 [
-                    {"score": 0.9094, "label": "blanket", "mask": "6500201749480f87154fd967783b2b97"},
-                    {"score": 0.9941, "label": "cat", "mask": "f3a7f80220788acc0245ebc084df6afc"},
-                    {"score": 0.9987, "label": "remote", "mask": "7703408f54da1d0ebda47841da875e48"},
-                    {"score": 0.9995, "label": "remote", "mask": "bd726918f10fed3efaef0091e11f923b"},
-                    {"score": 0.9722, "label": "couch", "mask": "226d6dcb98bebc3fbc208abdc0c83196"},
-                    {"score": 0.9994, "label": "cat", "mask": "fa5d8d5c329546ba5339f3095641ef56"},
+                    {"score": 0.9094, "label": "blanket", "mask": "dcff19a97abd8bd555e21186ae7c066a"},
+                    {"score": 0.9941, "label": "cat", "mask": "9c0af87bd00f9d3a4e0c8888e34e70e2"},
+                    {"score": 0.9987, "label": "remote", "mask": "c7870600d6c02a1f6d96470fc7220e8e"},
+                    {"score": 0.9995, "label": "remote", "mask": "ef899a25fd44ec056c653f0ca2954fdd"},
+                    {"score": 0.9722, "label": "couch", "mask": "37b8446ac578a17108aa2b7fccc33114"},
+                    {"score": 0.9994, "label": "cat", "mask": "6a09d3655efd8a388ab4511e4cbbb797"},
                 ],
                 [
-                    {"score": 0.9094, "label": "blanket", "mask": "6500201749480f87154fd967783b2b97"},
-                    {"score": 0.9941, "label": "cat", "mask": "f3a7f80220788acc0245ebc084df6afc"},
-                    {"score": 0.9987, "label": "remote", "mask": "7703408f54da1d0ebda47841da875e48"},
-                    {"score": 0.9995, "label": "remote", "mask": "bd726918f10fed3efaef0091e11f923b"},
-                    {"score": 0.9722, "label": "couch", "mask": "226d6dcb98bebc3fbc208abdc0c83196"},
-                    {"score": 0.9994, "label": "cat", "mask": "fa5d8d5c329546ba5339f3095641ef56"},
+                    {"score": 0.9094, "label": "blanket", "mask": "dcff19a97abd8bd555e21186ae7c066a"},
+                    {"score": 0.9941, "label": "cat", "mask": "9c0af87bd00f9d3a4e0c8888e34e70e2"},
+                    {"score": 0.9987, "label": "remote", "mask": "c7870600d6c02a1f6d96470fc7220e8e"},
+                    {"score": 0.9995, "label": "remote", "mask": "ef899a25fd44ec056c653f0ca2954fdd"},
+                    {"score": 0.9722, "label": "couch", "mask": "37b8446ac578a17108aa2b7fccc33114"},
+                    {"score": 0.9994, "label": "cat", "mask": "6a09d3655efd8a388ab4511e4cbbb797"},
                 ],
             ],
         )
@@ -304,12 +317,27 @@ class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCa
     @require_torch
     @slow
     def test_threshold(self):
-        threshold = 0.999
         model_id = "facebook/detr-resnet-50-panoptic"
-
         image_segmenter = pipeline("image-segmentation", model=model_id)
 
-        outputs = image_segmenter("http://images.cocodataset.org/val2017/000000039769.jpg", threshold=threshold)
+        outputs = image_segmenter(
+            "http://images.cocodataset.org/val2017/000000039769.jpg", task="panoptic", threshold=0.999
+        )
+        # Shortening by hashing
+        for o in outputs:
+            o["mask"] = hashimage(o["mask"])
+
+        self.assertEqual(
+            nested_simplify(outputs, decimals=4),
+            [
+                {"score": 0.9995, "label": "remote", "mask": "d02404f5789f075e3b3174adbc3fd5b8"},
+                {"score": 0.9994, "label": "cat", "mask": "eaa115b40c96d3a6f4fe498963a7e470"},
+            ],
+        )
+
+        outputs = image_segmenter(
+            "http://images.cocodataset.org/val2017/000000039769.jpg", task="panoptic", threshold=0.5
+        )
 
         for o in outputs:
             o["mask"] = hashimage(o["mask"])
@@ -317,8 +345,11 @@ class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCa
         self.assertEqual(
             nested_simplify(outputs, decimals=4),
             [
-                {"score": 0.9995, "label": "remote", "mask": "bd726918f10fed3efaef0091e11f923b"},
-                {"score": 0.9994, "label": "cat", "mask": "fa5d8d5c329546ba5339f3095641ef56"},
+                {"score": 0.9941, "label": "cat", "mask": "9c0af87bd00f9d3a4e0c8888e34e70e2"},
+                {"score": 0.9987, "label": "remote", "mask": "c7870600d6c02a1f6d96470fc7220e8e"},
+                {"score": 0.9995, "label": "remote", "mask": "ef899a25fd44ec056c653f0ca2954fdd"},
+                {"score": 0.9722, "label": "couch", "mask": "37b8446ac578a17108aa2b7fccc33114"},
+                {"score": 0.9994, "label": "cat", "mask": "6a09d3655efd8a388ab4511e4cbbb797"},
             ],
         )
 
@@ -335,20 +366,21 @@ class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCa
 
         image = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
         file = image[0]["file"]
-        outputs = image_segmenter(file, threshold=threshold)
+        outputs = image_segmenter(file, task="panoptic", threshold=threshold)
 
+        # Shortening by hashing
         for o in outputs:
             o["mask"] = hashimage(o["mask"])
 
         self.assertEqual(
             nested_simplify(outputs, decimals=4),
             [
-                {"mask": "20d1b9480d1dc1501dbdcfdff483e370", "label": "wall", "score": None},
-                {"mask": "0f902fbc66a0ff711ea455b0e4943adf", "label": "house", "score": None},
-                {"mask": "4537bdc07d47d84b3f8634b7ada37bd4", "label": "grass", "score": None},
-                {"mask": "b7ac77dfae44a904b479a0926a2acaf7", "label": "tree", "score": None},
-                {"mask": "e9bedd56bd40650fb263ce03eb621079", "label": "plant", "score": None},
-                {"mask": "37a609f8c9c1b8db91fbff269f428b20", "label": "road, route", "score": None},
-                {"mask": "0d8cdfd63bae8bf6e4344d460a2fa711", "label": "sky", "score": None},
+                {"score": 0.9974, "label": "wall", "mask": "a547b7c062917f4f3e36501827ad3cd6"},
+                {"score": 0.949, "label": "house", "mask": "0da9b7b38feac47bd2528a63e5ea7b19"},
+                {"score": 0.9995, "label": "grass", "mask": "1d07ea0a263dcf38ca8ae1a15fdceda1"},
+                {"score": 0.9976, "label": "tree", "mask": "6cdc97c7daf1dc596fa181f461ddd2ba"},
+                {"score": 0.8239, "label": "plant", "mask": "1ab4ce378f6ceff57d428055cfbd742f"},
+                {"score": 0.9942, "label": "road, route", "mask": "39c5d17be53b2d1b0f46aad8ebb15813"},
+                {"score": 1.0, "label": "sky", "mask": "a3756324a692981510c39b1a59510a36"},
             ],
         )