From 5fd5990dce2c6c147c7a5424f0767e3eb2279986 Mon Sep 17 00:00:00 2001
From: Nicolas Patry <patry.nicolas@protonmail.com>
Date: Wed, 26 Oct 2022 10:44:36 +0200
Subject: [PATCH] Factored out some code in the `image-segmentation` pipeline.
 (#19727)

* Factored out some code in the image-segmentation pipeline

Re-enable `small_model_pt`.

Re-enable `small_model_pt`.

Enabling the current test with the current values.

Debugging the values on the CI.

More logs ? Printing doesn't work ?

Using the CI values instead. Seems to be a Pillow sensitivity.

Added a test showcasing that models not supporting some tasks get a
clear error.

Factored out code.

Further factor out.

Fixup.

Bad rebase.

Put `panoptic` before `instance` as it should be a superset.

* Fixing tests.

* Adding subtasks tests

+ Fixes `instance` segmentation which was broken due to default and
non kwargs arguments.

* Fix bad replace.
---
 .../models/detr/feature_extraction_detr.py    |  27 ++---
 .../pipelines/image_segmentation.py           |  71 ++++--------
 .../test_pipelines_image_segmentation.py      | 107 ++++++++++++++----
 3 files changed, 121 insertions(+), 84 deletions(-)

diff --git a/src/transformers/models/detr/feature_extraction_detr.py b/src/transformers/models/detr/feature_extraction_detr.py
index 7d6b05a7dc..f13b3728f8 100644
--- a/src/transformers/models/detr/feature_extraction_detr.py
+++ b/src/transformers/models/detr/feature_extraction_detr.py
@@ -1275,12 +1275,13 @@ class DetrFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin):
             # Get segmentation map and segment information of batch item
             target_size = target_sizes[i] if target_sizes is not None else None
             segmentation, segments = compute_segments(
-                mask_probs_item,
-                pred_scores_item,
-                pred_labels_item,
-                mask_threshold,
-                overlap_mask_area_threshold,
-                target_size,
+                mask_probs=mask_probs_item,
+                pred_scores=pred_scores_item,
+                pred_labels=pred_labels_item,
+                mask_threshold=mask_threshold,
+                overlap_mask_area_threshold=overlap_mask_area_threshold,
+                label_ids_to_fuse=[],
+                target_size=target_size,
             )
 
             # Return segmentation map in run-length encoding (RLE) format
@@ -1366,13 +1367,13 @@ class DetrFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin):
             # Get segmentation map and segment information of batch item
             target_size = target_sizes[i] if target_sizes is not None else None
             segmentation, segments = compute_segments(
-                mask_probs_item,
-                pred_scores_item,
-                pred_labels_item,
-                mask_threshold,
-                overlap_mask_area_threshold,
-                label_ids_to_fuse,
-                target_size,
+                mask_probs=mask_probs_item,
+                pred_scores=pred_scores_item,
+                pred_labels=pred_labels_item,
+                mask_threshold=mask_threshold,
+                overlap_mask_area_threshold=overlap_mask_area_threshold,
+                label_ids_to_fuse=label_ids_to_fuse,
+                target_size=target_size,
             )
 
             results.append({"segmentation": segmentation, "segments_info": segments})
diff --git a/src/transformers/pipelines/image_segmentation.py b/src/transformers/pipelines/image_segmentation.py
index 877c42a883..babd27a540 100644
--- a/src/transformers/pipelines/image_segmentation.py
+++ b/src/transformers/pipelines/image_segmentation.py
@@ -56,14 +56,15 @@ class ImageSegmentationPipeline(Pipeline):
 
     def _sanitize_parameters(self, **kwargs):
         postprocess_kwargs = {}
-        if "task" in kwargs:
-            postprocess_kwargs["task"] = kwargs["task"]
+        if "subtask" in kwargs:
+            postprocess_kwargs["subtask"] = kwargs["subtask"]
         if "threshold" in kwargs:
             postprocess_kwargs["threshold"] = kwargs["threshold"]
         if "mask_threshold" in kwargs:
             postprocess_kwargs["mask_threshold"] = kwargs["mask_threshold"]
         if "overlap_mask_area_threshold" in kwargs:
             postprocess_kwargs["overlap_mask_area_threshold"] = kwargs["overlap_mask_area_threshold"]
+
         return {}, {}, postprocess_kwargs
 
     def __call__(self, images, **kwargs) -> Union[Predictions, List[Prediction]]:
@@ -80,9 +81,10 @@ class ImageSegmentationPipeline(Pipeline):
 
                 The pipeline accepts either a single image or a batch of images. Images in a batch must all be in the
                 same format: all as HTTP(S) links, all as local paths, or all as PIL images.
-            subtask (`str`, defaults to `panoptic`):
+            subtask (`str`, *optional*):
                 Segmentation task to be performed, choose [`semantic`, `instance` and `panoptic`] depending on model
-                capabilities.
+                capabilities. If not set, the pipeline will attempt to resolve in the following order:
+                  `panoptic`, `instance`, `semantic`.
             threshold (`float`, *optional*, defaults to 0.9):
                 Probability threshold to filter out predicted masks.
             mask_threshold (`float`, *optional*, defaults to 0.5):
@@ -104,7 +106,6 @@ class ImageSegmentationPipeline(Pipeline):
             - **score** (*optional* `float`) -- Optionally, when the model is capable of estimating a confidence of the
               "object" described by the label and the mask.
         """
-
         return super().__call__(images, **kwargs)
 
     def preprocess(self, image):
@@ -123,10 +124,15 @@ class ImageSegmentationPipeline(Pipeline):
     def postprocess(
         self, model_outputs, subtask=None, threshold=0.9, mask_threshold=0.5, overlap_mask_area_threshold=0.5
     ):
-        if (subtask == "panoptic" or subtask is None) and hasattr(
-            self.feature_extractor, "post_process_panoptic_segmentation"
-        ):
-            outputs = self.feature_extractor.post_process_panoptic_segmentation(
+
+        fn = None
+        if subtask in {"panoptic", None} and hasattr(self.feature_extractor, "post_process_panoptic_segmentation"):
+            fn = self.feature_extractor.post_process_panoptic_segmentation
+        elif subtask in {"instance", None} and hasattr(self.feature_extractor, "post_process_instance_segmentation"):
+            fn = self.feature_extractor.post_process_instance_segmentation
+
+        if fn is not None:
+            outputs = fn(
                 model_outputs,
                 threshold=threshold,
                 mask_threshold=mask_threshold,
@@ -137,45 +143,14 @@ class ImageSegmentationPipeline(Pipeline):
             annotation = []
             segmentation = outputs["segmentation"]
 
-            if len(outputs["segments_info"]) == 0:
-                mask = Image.fromarray(np.zeros(segmentation.shape).astype(np.uint8), mode="L")
-                annotation.append({"mask": mask, "label": "NULL", "score": 0.0})
-            else:
-                for segment in outputs["segments_info"]:
-                    mask = (segmentation == segment["id"]) * 255
-                    mask = Image.fromarray(mask.numpy().astype(np.uint8), mode="L")
-                    label = self.model.config.id2label[segment["label_id"]]
-                    score = segment["score"]
-                    annotation.append({"score": score, "label": label, "mask": mask})
+            for segment in outputs["segments_info"]:
+                mask = (segmentation == segment["id"]) * 255
+                mask = Image.fromarray(mask.numpy().astype(np.uint8), mode="L")
+                label = self.model.config.id2label[segment["label_id"]]
+                score = segment["score"]
+                annotation.append({"score": score, "label": label, "mask": mask})
 
-        elif (subtask == "instance" or subtask is None) and hasattr(
-            self.feature_extractor, "post_process_instance_segmentation"
-        ):
-            outputs = self.feature_extractor.post_process_instance_segmentation(
-                model_outputs,
-                threshold=threshold,
-                mask_threshold=mask_threshold,
-                overlap_mask_area_threshold=overlap_mask_area_threshold,
-                target_sizes=model_outputs["target_size"],
-            )[0]
-
-            annotation = []
-            segmentation = outputs["segmentation"]
-
-            if len(outputs["segments_info"]) == 0:
-                mask = Image.fromarray(np.zeros(segmentation.shape).astype(np.uint8), mode="L")
-                annotation.append({"mask": mask, "label": "NULL", "score": 0.0})
-            else:
-                for segment in outputs["segments_info"]:
-                    mask = (segmentation == segment["id"]) * 255
-                    mask = Image.fromarray(mask.numpy().astype(np.uint8), mode="L")
-                    label = self.model.config.id2label[segment["label_id"]]
-                    score = segment["score"]
-                    annotation.append({"mask": mask, "label": label, "score": score})
-
-        elif (subtask == "semantic" or subtask is None) and hasattr(
-            self.feature_extractor, "post_process_semantic_segmentation"
-        ):
+        elif subtask in {"semantic", None} and hasattr(self.feature_extractor, "post_process_semantic_segmentation"):
             outputs = self.feature_extractor.post_process_semantic_segmentation(
                 model_outputs, target_sizes=model_outputs["target_size"]
             )[0]
@@ -190,5 +165,5 @@ class ImageSegmentationPipeline(Pipeline):
                 label = self.model.config.id2label[label]
                 annotation.append({"score": None, "label": label, "mask": mask})
         else:
-            raise ValueError(f"Task {subtask} is not supported for model {self.model}.s")
+            raise ValueError(f"Subtask {subtask} is not supported for model {type(self.model)}")
         return annotation
diff --git a/tests/pipelines/test_pipelines_image_segmentation.py b/tests/pipelines/test_pipelines_image_segmentation.py
index 92ae2e942b..9c73d6c9b4 100644
--- a/tests/pipelines/test_pipelines_image_segmentation.py
+++ b/tests/pipelines/test_pipelines_image_segmentation.py
@@ -89,8 +89,8 @@ class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCa
         )
         self.assertIsInstance(outputs, list)
         n = len(outputs)
-        if isinstance(image_segmenter.model, (MaskFormerForInstanceSegmentation)):
-            # Instance segmentation (maskformer) have a slot for null class
+        if isinstance(image_segmenter.model, (MaskFormerForInstanceSegmentation, DetrForSegmentation)):
+            # Instance segmentation (maskformer, and detr) have a slot for null class
             # and can output nothing even with a low threshold
             self.assertGreaterEqual(n, 0)
         else:
@@ -153,26 +153,53 @@ class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCa
     def test_small_model_tf(self):
         pass
 
+    @require_torch
+    def test_small_model_pt_no_panoptic(self):
+        model_id = "hf-internal-testing/tiny-random-mobilevit"
+        # The default task is `image-classification`, so we need to override it
+        pipe = pipeline(task="image-segmentation", model=model_id)
+
+        # This model supports neither `instance` nor `panoptic`
+        # We should error out
+        with self.assertRaises(ValueError) as e:
+            pipe("http://images.cocodataset.org/val2017/000000039769.jpg", subtask="panoptic")
+        self.assertEqual(
+            str(e.exception),
+            "Subtask panoptic is not supported for model <class"
+            " 'transformers.models.mobilevit.modeling_mobilevit.MobileViTForSemanticSegmentation'>",
+        )
+        with self.assertRaises(ValueError) as e:
+            pipe("http://images.cocodataset.org/val2017/000000039769.jpg", subtask="instance")
+        self.assertEqual(
+            str(e.exception),
+            "Subtask instance is not supported for model <class"
+            " 'transformers.models.mobilevit.modeling_mobilevit.MobileViTForSemanticSegmentation'>",
+        )
+
     @require_torch
     def test_small_model_pt(self):
         model_id = "hf-internal-testing/tiny-detr-mobilenetsv3-panoptic"
 
         model = AutoModelForImageSegmentation.from_pretrained(model_id)
         feature_extractor = AutoFeatureExtractor.from_pretrained(model_id)
-        image_segmenter = ImageSegmentationPipeline(model=model, feature_extractor=feature_extractor)
-
-        outputs = image_segmenter(
-            "http://images.cocodataset.org/val2017/000000039769.jpg",
+        image_segmenter = ImageSegmentationPipeline(
+            model=model,
+            feature_extractor=feature_extractor,
             subtask="panoptic",
             threshold=0.0,
             mask_threshold=0.0,
             overlap_mask_area_threshold=0.0,
         )
 
+        outputs = image_segmenter(
+            "http://images.cocodataset.org/val2017/000000039769.jpg",
+        )
+
         # Shortening by hashing
         for o in outputs:
             o["mask"] = mask_to_test_readable(o["mask"])
 
+        # This is extremely brittle, and those values are made specific for the CI.
         self.assertEqual(
             nested_simplify(outputs, decimals=4),
             [
@@ -189,9 +216,6 @@ class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCa
                 "http://images.cocodataset.org/val2017/000000039769.jpg",
                 "http://images.cocodataset.org/val2017/000000039769.jpg",
             ],
-            threshold=0.0,
-            mask_threshold=0.0,
-            overlap_mask_area_threshold=0.0,
         )
         for output in outputs:
             for o in output:
@@ -217,6 +241,48 @@ class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCa
             ],
         )
 
+        output = image_segmenter("http://images.cocodataset.org/val2017/000000039769.jpg", subtask="instance")
+        for o in output:
+            o["mask"] = mask_to_test_readable(o["mask"])
+        self.assertEqual(
+            nested_simplify(output, decimals=4),
+            [
+                {
+                    "score": 0.004,
+                    "label": "LABEL_215",
+                    "mask": {"hash": "a01498ca7c", "shape": (480, 640), "white_pixels": 307200},
+                },
+            ],
+        )
+
+        # This must be surprising to the reader.
+        # The `panoptic` returns only LABEL_215, and this returns 3 labels.
+        #
+        output = image_segmenter("http://images.cocodataset.org/val2017/000000039769.jpg", subtask="semantic")
+        for o in output:
+            o["mask"] = mask_to_test_readable(o["mask"])
+        self.maxDiff = None
+        self.assertEqual(
+            nested_simplify(output, decimals=4),
+            [
+                {
+                    "label": "LABEL_88",
+                    "mask": {"hash": "7f0bf661a4", "shape": (480, 640), "white_pixels": 3},
+                    "score": None,
+                },
+                {
+                    "label": "LABEL_101",
+                    "mask": {"hash": "10ab738dc9", "shape": (480, 640), "white_pixels": 8948},
+                    "score": None,
+                },
+                {
+                    "label": "LABEL_215",
+                    "mask": {"hash": "b431e0946c", "shape": (480, 640), "white_pixels": 298249},
+                    "score": None,
+                },
+            ],
+        )
+
     @require_torch
     def test_small_model_pt_semantic(self):
         model_id = "hf-internal-testing/tiny-random-beit-pipeline"
@@ -246,13 +312,15 @@ class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCa
     @slow
     def test_integration_torch_image_segmentation(self):
         model_id = "facebook/detr-resnet-50-panoptic"
-        image_segmenter = pipeline("image-segmentation", model=model_id)
+        image_segmenter = pipeline(
+            "image-segmentation",
+            model=model_id,
+            threshold=0.0,
+            overlap_mask_area_threshold=0.0,
+        )
 
         outputs = image_segmenter(
             "http://images.cocodataset.org/val2017/000000039769.jpg",
-            subtask="panoptic",
-            threshold=0,
-            overlap_mask_area_threshold=0.0,
         )
 
         # Shortening by hashing
@@ -300,9 +368,6 @@ class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCa
                 "http://images.cocodataset.org/val2017/000000039769.jpg",
                 "http://images.cocodataset.org/val2017/000000039769.jpg",
             ],
-            subtask="panoptic",
-            threshold=0.0,
-            overlap_mask_area_threshold=0.0,
         )
 
         # Shortening by hashing
@@ -386,9 +451,7 @@ class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCa
         model_id = "facebook/detr-resnet-50-panoptic"
         image_segmenter = pipeline("image-segmentation", model=model_id)
 
-        outputs = image_segmenter(
-            "http://images.cocodataset.org/val2017/000000039769.jpg", subtask="panoptic", threshold=0.999
-        )
+        outputs = image_segmenter("http://images.cocodataset.org/val2017/000000039769.jpg", threshold=0.999)
         # Shortening by hashing
         for o in outputs:
             o["mask"] = mask_to_test_readable(o["mask"])
@@ -409,9 +472,7 @@ class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCa
             ],
         )
 
-        outputs = image_segmenter(
-            "http://images.cocodataset.org/val2017/000000039769.jpg", subtask="panoptic", threshold=0.5
-        )
+        outputs = image_segmenter("http://images.cocodataset.org/val2017/000000039769.jpg", threshold=0.5)
 
         for o in outputs:
             o["mask"] = mask_to_test_readable(o["mask"])
@@ -460,7 +521,7 @@ class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCa
 
         image = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
         file = image[0]["file"]
-        outputs = image_segmenter(file, subtask="panoptic", threshold=threshold)
+        outputs = image_segmenter(file, threshold=threshold)
 
         # Shortening by hashing
         for o in outputs: