From f4e4ad34ccee6f011be1b21c28e78d4816601059 Mon Sep 17 00:00:00 2001
From: Nicolas Patry <patry.nicolas@protonmail.com>
Date: Wed, 9 Mar 2022 10:19:05 +0100
Subject: [PATCH] Add `ForInstanceSegmentation` models to `image-segmentation`
 pipelines (#15937)

* Adding ForInstanceSegmentation to pipelines.

* Last fix `category_id` renamed to `label_id`.

* Can't be none no more.

* No `is_thing_map` anymore.
---
 .../pipelines/image_segmentation.py           | 15 ++++++++----
 .../test_pipelines_image_segmentation.py      | 23 ++++++++++++-------
 2 files changed, 25 insertions(+), 13 deletions(-)

diff --git a/src/transformers/pipelines/image_segmentation.py b/src/transformers/pipelines/image_segmentation.py
index 2f4e6e09ab..923a99ae9c 100644
--- a/src/transformers/pipelines/image_segmentation.py
+++ b/src/transformers/pipelines/image_segmentation.py
@@ -18,6 +18,7 @@ if is_torch_available():
 
     from ..models.auto.modeling_auto import (
         MODEL_FOR_IMAGE_SEGMENTATION_MAPPING,
+        MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING,
         MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING,
     )
 
@@ -32,10 +33,10 @@ Predictions = List[Prediction]
 @add_end_docstrings(PIPELINE_INIT_ARGS)
 class ImageSegmentationPipeline(Pipeline):
     """
-    Image segmentation pipeline using any `AutoModelForImageSegmentation`. This pipeline predicts masks of objects and
+    Image segmentation pipeline using any `AutoModelForXXXSegmentation`. This pipeline predicts masks of objects and
     their classes.
 
-    This image segmntation pipeline can currently be loaded from [`pipeline`] using the following task identifier:
+    This image segmentation pipeline can currently be loaded from [`pipeline`] using the following task identifier:
     `"image-segmentation"`.
 
     See the list of available models on
@@ -50,7 +51,11 @@ class ImageSegmentationPipeline(Pipeline):
 
         requires_backends(self, "vision")
         self.check_model_type(
-            dict(MODEL_FOR_IMAGE_SEGMENTATION_MAPPING.items() + MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING.items())
+            dict(
+                MODEL_FOR_IMAGE_SEGMENTATION_MAPPING.items()
+                + MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING.items()
+                + MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING.items()
+            )
         )
 
     def _sanitize_parameters(self, **kwargs):
@@ -112,14 +117,14 @@ class ImageSegmentationPipeline(Pipeline):
     def postprocess(self, model_outputs, raw_image=False, threshold=0.9, mask_threshold=0.5):
         if hasattr(self.feature_extractor, "post_process_panoptic_segmentation"):
             outputs = self.feature_extractor.post_process_panoptic_segmentation(
-                model_outputs, is_thing_map=self.model.config.id2label
+                model_outputs, object_mask_threshold=threshold
             )[0]
             annotation = []
             segmentation = outputs["segmentation"]
             for segment in outputs["segments"]:
                 mask = (segmentation == segment["id"]) * 255
                 mask = Image.fromarray(mask.numpy().astype(np.uint8), mode="L")
-                label = self.model.config.id2label[segment["category_id"]]
+                label = self.model.config.id2label[segment["label_id"]]
                 annotation.append({"mask": mask, "label": label, "score": None})
         elif hasattr(self.feature_extractor, "post_process_segmentation"):
             # Panoptic
diff --git a/tests/pipelines/test_pipelines_image_segmentation.py b/tests/pipelines/test_pipelines_image_segmentation.py
index ffc3ff8821..fe3ff1ee88 100644
--- a/tests/pipelines/test_pipelines_image_segmentation.py
+++ b/tests/pipelines/test_pipelines_image_segmentation.py
@@ -20,11 +20,14 @@ from datasets import load_dataset
 
 from transformers import (
     MODEL_FOR_IMAGE_SEGMENTATION_MAPPING,
+    MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING,
     MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING,
     AutoFeatureExtractor,
     AutoModelForImageSegmentation,
+    AutoModelForInstanceSegmentation,
     DetrForSegmentation,
     ImageSegmentationPipeline,
+    MaskFormerForInstanceSegmentation,
     is_vision_available,
     pipeline,
 )
@@ -67,6 +70,7 @@ class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCa
             list(MODEL_FOR_IMAGE_SEGMENTATION_MAPPING.items()) if MODEL_FOR_IMAGE_SEGMENTATION_MAPPING else []
         )
         + (MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING.items() if MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING else [])
+        + (MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING.items() if MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING else [])
     }
 
     def get_test_pipeline(self, model, tokenizer, feature_extractor):
@@ -80,7 +84,12 @@ class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCa
         outputs = image_segmenter("./tests/fixtures/tests_samples/COCO/000000039769.png", threshold=0.0)
         self.assertIsInstance(outputs, list)
         n = len(outputs)
-        self.assertGreater(n, 1)
+        if isinstance(image_segmenter.model, (MaskFormerForInstanceSegmentation)):
+            # Instance segmentation (maskformer) have a slot for null class
+            # and can output nothing even with a low threshold
+            self.assertGreaterEqual(n, 0)
+        else:
+            self.assertGreaterEqual(n, 1)
         # XXX: PIL.Image implements __eq__ which bypasses ANY, so we inverse the comparison
         # to make it work
         self.assertEqual([{"score": ANY(float, type(None)), "label": ANY(str), "mask": ANY(Image.Image)}] * n, outputs)
@@ -119,7 +128,6 @@ class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCa
         ]
         outputs = image_segmenter(batch, threshold=0.0, batch_size=batch_size)
         self.assertEqual(len(batch), len(outputs))
-        self.assertEqual({"score": ANY(float, type(None)), "label": ANY(str), "mask": ANY(Image.Image)}, outputs[0][0])
         self.assertEqual(len(outputs[0]), n)
         self.assertEqual(
             [
@@ -313,18 +321,17 @@ class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCa
     @require_torch
     @slow
     def test_maskformer(self):
-        threshold = 0.999
+        threshold = 0.8
         model_id = "facebook/maskformer-swin-base-ade"
 
-        from transformers import MaskFormerFeatureExtractor, MaskFormerForInstanceSegmentation
-
-        model = MaskFormerForInstanceSegmentation.from_pretrained(model_id)
-        feature_extractor = MaskFormerFeatureExtractor.from_pretrained(model_id)
+        model = AutoModelForInstanceSegmentation.from_pretrained(model_id)
+        feature_extractor = AutoFeatureExtractor.from_pretrained(model_id)
 
         image_segmenter = pipeline("image-segmentation", model=model, feature_extractor=feature_extractor)
 
         image = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
-        outputs = image_segmenter(image[0]["file"], threshold=threshold)
+        file = image[0]["file"]
+        outputs = image_segmenter(file, threshold=threshold)
 
         for o in outputs:
             o["mask"] = hashimage(o["mask"])