Enable image-segmentation on AutoModelForSemanticSegmentation (#15647)

* Enabling Beit and SegFormer for `image-segmentation`. * Fixing the score. * Fix import ? * Missing in type hint. * Multiple test fixes: - Add `raw_image` support. It should be the default IMHO since in the Python world it doesn't make any sense to base64 encode the image (Sorry @mishig, didn't catch that in my review). I really think we should consider breaking BC here. - Add support for Segformer tiny test (needed `SegformerModelTester.get_config` to enable TinyConfig @NielsRogge) - Add the check that `batch_size` works correctly on that pipeline. Uncovered that it doesn't for Detr, which IMO is OK since images after `feature_extractor` don't have the same size. Comment should explain. * Type hint as a string. * Make fixup + update black. * torch+vision protections. * Don't use torchvision, use F.interpolate instead (no new dep). * Last fixes for Segformer. * Update test to reflect new image (which was broken) * Update tests. * Major BC modification: - Removed the compressed PNG string; that's a job for users, `transformers` stays in Python land. - Removed the `score` for semantic segmentation. It hardly has any meaning on its own in this context. - Don't include the grayscale with logits for now (which could enable users to get a sense of confidence). Might be done later. - Don't include the surface of the mask (could be used for sorting by users, to filter out small masks). It's already calculable, and it's easier to add later than to add now and break later if we need to. * `make fixup`. * Small changes. * Rebase + doc fixup.
2022-02-23 17:20:26 +01:00
parent 1b23979736
commit 9e71d46455
7 changed files with 176 additions and 83 deletions
--- a/tests/test_modeling_segformer.py
+++ b/tests/test_modeling_segformer.py
@@ -101,7 +101,11 @@ class SegformerModelTester:
        if self.use_labels:
            labels = ids_tensor([self.batch_size, self.image_size, self.image_size], self.num_labels)

-        config = SegformerConfig(
+        config = self.get_config()
+        return config, pixel_values, labels
+
+    def get_config(self):
+        return SegformerConfig(
            image_size=self.image_size,
            num_channels=self.num_channels,
            num_encoder_blocks=self.num_encoder_blocks,
@@ -114,8 +118,6 @@ class SegformerModelTester:
            initializer_range=self.initializer_range,
        )

-        return config, pixel_values, labels
-
    def create_and_check_model(self, config, pixel_values, labels):
        model = SegformerModel(config=config)
        model.to(torch_device)
--- a/tests/test_pipelines_common.py
+++ b/tests/test_pipelines_common.py
@@ -126,14 +126,14 @@ def get_tiny_feature_extractor_from_checkpoint(checkpoint, tiny_config, feature_


 class ANY:
-    def __init__(self, _type):
-        self._type = _type
+    def __init__(self, *_types):
+        self._types = _types

    def __eq__(self, other):
-        return isinstance(other, self._type)
+        return isinstance(other, self._types)

    def __repr__(self):
-        return f"ANY({self._type.__name__})"
+        return f"ANY({', '.join(_type.__name__ for _type in self._types)})"


 class PipelineTestCaseMeta(type):
--- a/tests/test_pipelines_image_segmentation.py
+++ b/tests/test_pipelines_image_segmentation.py
@@ -15,10 +15,14 @@
 import hashlib
 import unittest

+import datasets
+
 from transformers import (
    MODEL_FOR_IMAGE_SEGMENTATION_MAPPING,
+    MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING,
    AutoFeatureExtractor,
    AutoModelForImageSegmentation,
+    DetrForSegmentation,
    ImageSegmentationPipeline,
    is_vision_available,
    pipeline,
@@ -46,12 +50,23 @@ else:
            pass


+def hashimage(image: Image) -> str:
+    m = hashlib.md5(image.tobytes())
+    return m.hexdigest()
+
+
@require_vision
@require_timm
@require_torch
@is_pipeline_test
 class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):
-    model_mapping = MODEL_FOR_IMAGE_SEGMENTATION_MAPPING
+    model_mapping = {
+        k: v
+        for k, v in (
+            list(MODEL_FOR_IMAGE_SEGMENTATION_MAPPING.items()) if MODEL_FOR_IMAGE_SEGMENTATION_MAPPING else []
+        )
+        + (MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING.items() if MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING else [])
+    }

    def get_test_pipeline(self, model, tokenizer, feature_extractor):
        image_segmenter = ImageSegmentationPipeline(model=model, feature_extractor=feature_extractor)
@@ -62,34 +77,59 @@ class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCa

    def run_pipeline_test(self, image_segmenter, examples):
        outputs = image_segmenter("./tests/fixtures/tests_samples/COCO/000000039769.png", threshold=0.0)
-        self.assertEqual(outputs, [{"score": ANY(float), "label": ANY(str), "mask": ANY(str)}] * 12)
-
-        import datasets
+        self.assertIsInstance(outputs, list)
+        n = len(outputs)
+        self.assertGreater(n, 1)
+        # XXX: PIL.Image implements __eq__ which bypasses ANY, so we inverse the comparison
+        # to make it work
+        self.assertEqual([{"score": ANY(float, type(None)), "label": ANY(str), "mask": ANY(Image.Image)}] * n, outputs)

        dataset = datasets.load_dataset("hf-internal-testing/fixtures_image_utils", "image", split="test")

-        batch = [
-            Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png"),
-            "http://images.cocodataset.org/val2017/000000039769.jpg",
-            # RGBA
-            dataset[0]["file"],
-            # LA
-            dataset[1]["file"],
-            # L
-            dataset[2]["file"],
-        ]
-        outputs = image_segmenter(batch, threshold=0.0)
+        # RGBA
+        outputs = image_segmenter(dataset[0]["file"])
+        m = len(outputs)
+        self.assertEqual([{"score": ANY(float, type(None)), "label": ANY(str), "mask": ANY(Image.Image)}] * m, outputs)
+        # LA
+        outputs = image_segmenter(dataset[1]["file"])
+        m = len(outputs)
+        self.assertEqual([{"score": ANY(float, type(None)), "label": ANY(str), "mask": ANY(Image.Image)}] * m, outputs)
+        # L
+        outputs = image_segmenter(dataset[2]["file"])
+        m = len(outputs)
+        self.assertEqual([{"score": ANY(float, type(None)), "label": ANY(str), "mask": ANY(Image.Image)}] * m, outputs)

+        if isinstance(image_segmenter.model, DetrForSegmentation):
+            # We need to test batch_size with images with the same size.
+            # Detr doesn't normalize the size of the images, meaning we can have
+            # 800x800 or 800x1200, meaning we cannot batch simply.
+            # We simply bail on this
+            batch_size = 1
+        else:
+            batch_size = 2
+
+        # 5 times the same image so the output shape is predictable
+        batch = [
+            "./tests/fixtures/tests_samples/COCO/000000039769.png",
+            "./tests/fixtures/tests_samples/COCO/000000039769.png",
+            "./tests/fixtures/tests_samples/COCO/000000039769.png",
+            "./tests/fixtures/tests_samples/COCO/000000039769.png",
+            "./tests/fixtures/tests_samples/COCO/000000039769.png",
+        ]
+        outputs = image_segmenter(batch, threshold=0.0, batch_size=batch_size)
        self.assertEqual(len(batch), len(outputs))
+        self.assertEqual({"score": ANY(float, type(None)), "label": ANY(str), "mask": ANY(Image.Image)}, outputs[0][0])
+        self.assertEqual(len(outputs[0]), n)
        self.assertEqual(
-            outputs,
            [
-                [{"score": ANY(float), "label": ANY(str), "mask": ANY(str)}] * 12,
-                [{"score": ANY(float), "label": ANY(str), "mask": ANY(str)}] * 12,
-                [{"score": ANY(float), "label": ANY(str), "mask": ANY(str)}] * 12,
-                [{"score": ANY(float), "label": ANY(str), "mask": ANY(str)}] * 12,
-                [{"score": ANY(float), "label": ANY(str), "mask": ANY(str)}] * 12,
+                [{"score": ANY(float, type(None)), "label": ANY(str), "mask": ANY(Image.Image)}] * n,
+                [{"score": ANY(float, type(None)), "label": ANY(str), "mask": ANY(Image.Image)}] * n,
+                [{"score": ANY(float, type(None)), "label": ANY(str), "mask": ANY(Image.Image)}] * n,
+                [{"score": ANY(float, type(None)), "label": ANY(str), "mask": ANY(Image.Image)}] * n,
+                [{"score": ANY(float, type(None)), "label": ANY(str), "mask": ANY(Image.Image)}] * n,
            ],
+            outputs,
+            f"Expected [{n}, {n}, {n}, {n}, {n}], got {[len(item) for item in outputs]}",
        )

    @require_tf
@@ -108,7 +148,7 @@ class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCa
        outputs = image_segmenter("http://images.cocodataset.org/val2017/000000039769.jpg", threshold=0.0)
        for o in outputs:
            # shortening by hashing
-            o["mask"] = hashlib.sha1(o["mask"].encode("UTF-8")).hexdigest()
+            o["mask"] = hashimage(o["mask"])

        self.assertEqual(
            nested_simplify(outputs, decimals=4),
@@ -116,12 +156,12 @@ class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCa
                {
                    "score": 0.004,
                    "label": "LABEL_0",
-                    "mask": "4276f7db4ca2983b2666f7e0c102d8186aed20be",
+                    "mask": "34eecd16bbfb0f476083ef947d81bf66",
                },
                {
                    "score": 0.004,
                    "label": "LABEL_0",
-                    "mask": "4276f7db4ca2983b2666f7e0c102d8186aed20be",
+                    "mask": "34eecd16bbfb0f476083ef947d81bf66",
                },
            ],
        )
@@ -135,7 +175,7 @@ class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCa
        )
        for output in outputs:
            for o in output:
-                o["mask"] = hashlib.sha1(o["mask"].encode("UTF-8")).hexdigest()
+                o["mask"] = hashimage(o["mask"])

        self.assertEqual(
            nested_simplify(outputs, decimals=4),
@@ -144,29 +184,54 @@ class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCa
                    {
                        "score": 0.004,
                        "label": "LABEL_0",
-                        "mask": "4276f7db4ca2983b2666f7e0c102d8186aed20be",
+                        "mask": "34eecd16bbfb0f476083ef947d81bf66",
                    },
                    {
                        "score": 0.004,
                        "label": "LABEL_0",
-                        "mask": "4276f7db4ca2983b2666f7e0c102d8186aed20be",
+                        "mask": "34eecd16bbfb0f476083ef947d81bf66",
                    },
                ],
                [
                    {
                        "score": 0.004,
                        "label": "LABEL_0",
-                        "mask": "4276f7db4ca2983b2666f7e0c102d8186aed20be",
+                        "mask": "34eecd16bbfb0f476083ef947d81bf66",
                    },
                    {
                        "score": 0.004,
                        "label": "LABEL_0",
-                        "mask": "4276f7db4ca2983b2666f7e0c102d8186aed20be",
+                        "mask": "34eecd16bbfb0f476083ef947d81bf66",
                    },
                ],
            ],
        )

+    @require_torch
+    def test_small_model_pt_semantic(self):
+        model_id = "hf-internal-testing/tiny-random-beit-pipeline"
+        image_segmenter = pipeline(model=model_id)
+        outputs = image_segmenter("http://images.cocodataset.org/val2017/000000039769.jpg")
+        for o in outputs:
+            # shortening by hashing
+            o["mask"] = hashimage(o["mask"])
+
+        self.assertEqual(
+            nested_simplify(outputs, decimals=4),
+            [
+                {
+                    "score": None,
+                    "label": "LABEL_0",
+                    "mask": "01245d8ad25d03f09493ca97965788ae",
+                },
+                {
+                    "score": None,
+                    "label": "LABEL_1",
+                    "mask": "f741516de8d5196a2c830739b9ac1c8c",
+                },
+            ],
+        )
+
    @require_torch
    @slow
    def test_integration_torch_image_segmentation(self):
@@ -176,7 +241,7 @@ class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCa

        outputs = image_segmenter("http://images.cocodataset.org/val2017/000000039769.jpg")
        for o in outputs:
-            o["mask"] = hashlib.sha1(o["mask"].encode("UTF-8")).hexdigest()
+            o["mask"] = hashimage(o["mask"])

        self.assertEqual(
            nested_simplify(outputs, decimals=4),
@@ -234,7 +299,7 @@ class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCa
        outputs = image_segmenter("http://images.cocodataset.org/val2017/000000039769.jpg", threshold=threshold)

        for o in outputs:
-            o["mask"] = hashlib.sha1(o["mask"].encode("UTF-8")).hexdigest()
+            o["mask"] = hashimage(o["mask"])

        self.assertEqual(
            nested_simplify(outputs, decimals=4),