From 983451a13e5378908866309647afc94761fa8572 Mon Sep 17 00:00:00 2001 From: Alara Dirik <8944735+alaradirik@users.noreply.github.com> Date: Fri, 7 Oct 2022 23:34:41 +0300 Subject: [PATCH] Improve and fix ImageSegmentationPipeline (#19367) - Fixes the image segmentation pipeline test failures caused by changes to the postprocessing methods of supported models - Updates the ImageSegmentationPipeline tests - Improves docs, adds 'task' argument to optionally perform semantic, instance or panoptic segmentation --- .../pipelines/image_segmentation.py | 138 +++++++++--------- .../test_pipelines_image_segmentation.py | 114 +++++++++------ 2 files changed, 143 insertions(+), 109 deletions(-) diff --git a/src/transformers/pipelines/image_segmentation.py b/src/transformers/pipelines/image_segmentation.py index a33095cfc2..690247f6e4 100644 --- a/src/transformers/pipelines/image_segmentation.py +++ b/src/transformers/pipelines/image_segmentation.py @@ -12,9 +12,6 @@ if is_vision_available(): from ..image_utils import load_image if is_torch_available(): - import torch - from torch import nn - from ..models.auto.modeling_auto import ( MODEL_FOR_IMAGE_SEGMENTATION_MAPPING, MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING, @@ -59,13 +56,15 @@ class ImageSegmentationPipeline(Pipeline): def _sanitize_parameters(self, **kwargs): postprocess_kwargs = {} + if "task" in kwargs: + postprocess_kwargs["task"] = kwargs["task"] if "threshold" in kwargs: postprocess_kwargs["threshold"] = kwargs["threshold"] - if "mask_threshold" in kwargs: - postprocess_kwargs["mask_threshold"] = kwargs["mask_threshold"] + if "overlap_mask_area_threshold" in kwargs: + postprocess_kwargs["overlap_mask_area_threshold"] = kwargs["overlap_mask_area_threshold"] return {}, {}, postprocess_kwargs - def __call__(self, *args, **kwargs) -> Union[Predictions, List[Prediction]]: + def __call__(self, images, **kwargs) -> Union[Predictions, List[Prediction]]: """ Perform segmentation (detect masks & classes) in the image(s) passed as inputs. @@ -79,30 +78,34 @@ class ImageSegmentationPipeline(Pipeline): The pipeline accepts either a single image or a batch of images. Images in a batch must all be in the same format: all as HTTP(S) links, all as local paths, or all as PIL images. + task (`str`, defaults to `semantic`): + Segmentation task to be performed, choose [`semantic`, `instance` and `panoptic`] depending on model + capabilities. threshold (`float`, *optional*, defaults to 0.9): - The probability necessary to make a prediction. - mask_threshold (`float`, *optional*, defaults to 0.5): - Threshold to use when turning the predicted masks into binary values. + Probability threshold to filter out predicted masks. + overlap_mask_area_threshold (`float`, *optional*, defaults to 0.5): + Mask overlap threshold to eliminate small, disconnected segments. Return: A dictionary or a list of dictionaries containing the result. If the input is a single image, will return a list of dictionaries, if the input is a list of several images, will return a list of list of dictionaries corresponding to each image. - The dictionaries contain the following keys: + The dictionaries contain the mask, label and score (where applicable) of each detected object and contains + the following keys: - **label** (`str`) -- The class label identified by the model. - - **mask** (`PIL.Image`) -- Pil Image with size (heigth, width) of the original image. Pixel values in the - image are in the range 0-255. 0 means the pixel is *not* part of the *label*, 255 means it definitely is. + - **mask** (`PIL.Image`) -- A binary mask of the detected object as a Pil Image of shape (width, height) of + the original image. Returns a mask filled with zeros if no object is found. - **score** (*optional* `float`) -- Optionally, when the model is capable of estimating a confidence of the "object" described by the label and the mask. """ - return super().__call__(*args, **kwargs) + return super().__call__(images, **kwargs) def preprocess(self, image): image = load_image(image) - target_size = torch.IntTensor([[image.height, image.width]]) + target_size = [(image.height, image.width)] inputs = self.feature_extractor(images=[image], return_tensors="pt") inputs["target_size"] = target_size return inputs @@ -113,66 +116,65 @@ class ImageSegmentationPipeline(Pipeline): model_outputs["target_size"] = target_size return model_outputs - def postprocess(self, model_outputs, raw_image=False, threshold=0.9, mask_threshold=0.5): - if hasattr(self.feature_extractor, "post_process_panoptic_segmentation"): + def postprocess(self, model_outputs, task="semantic", threshold=0.9, overlap_mask_area_threshold=0.5): + if task == "instance" and hasattr(self.feature_extractor, "post_process_instance_segmentation"): outputs = self.feature_extractor.post_process_panoptic_segmentation( - model_outputs, object_mask_threshold=threshold + model_outputs, + threshold=threshold, + overlap_mask_area_threshold=overlap_mask_area_threshold, + target_sizes=model_outputs["target_size"], )[0] + annotation = [] segmentation = outputs["segmentation"] - for segment in outputs["segments"]: - mask = (segmentation == segment["id"]) * 255 - mask = Image.fromarray(mask.numpy().astype(np.uint8), mode="L") - label = self.model.config.id2label[segment["label_id"]] - annotation.append({"mask": mask, "label": label, "score": None}) - elif hasattr(self.feature_extractor, "post_process_segmentation"): - # Panoptic - raw_annotations = self.feature_extractor.post_process_segmentation( - model_outputs, model_outputs["target_size"], threshold=threshold, mask_threshold=0.5 - ) - raw_annotation = raw_annotations[0] - raw_annotation["masks"] *= 255 # [0,1] -> [0,255] black and white pixels - raw_annotation["scores"] = raw_annotation["scores"].tolist() - raw_annotation["labels"] = [self.model.config.id2label[label.item()] for label in raw_annotation["labels"]] - raw_annotation["masks"] = [ - Image.fromarray(mask.numpy().astype(np.uint8), mode="L") for mask in raw_annotation["masks"] - ] - # {"scores": [...], ...} --> [{"score":x, ...}, ...] - keys = ["score", "label", "mask"] - annotation = [ - dict(zip(keys, vals)) - for vals in zip(raw_annotation["scores"], raw_annotation["labels"], raw_annotation["masks"]) - ] - else: - # Default logits - logits = model_outputs.logits - logits = logits.softmax(dim=1) - if len(logits.shape) != 4: - raise ValueError(f"Logits don't have expected dimensions, expected [1, N, H, W], got {logits.shape}") - batch_size, num_labels, height, width = logits.shape - expected_num_labels = len(self.model.config.id2label) - if num_labels != expected_num_labels: - raise ValueError( - f"Logits don't have expected dimensions, expected [1, {num_labels}, H, W], got {logits.shape}" - ) - size = model_outputs["target_size"].squeeze(0).tolist() - logits_reshaped = nn.functional.interpolate(logits, size=size, mode="bilinear", align_corners=False) - classes = logits_reshaped.argmax(dim=1)[0] + + if len(outputs["segments_info"]) == 0: + mask = Image.fromarray(np.zeros(segmentation.shape).astype(np.uint8), mode="L") + annotation.append({"mask": mask, "label": None, "score": 0.0}) + else: + for segment in outputs["segments_info"]: + mask = (segmentation == segment["id"]) * 255 + mask = Image.fromarray(mask.numpy().astype(np.uint8), mode="L") + label = self.model.config.id2label[segment["label_id"]] + score = segment["score"] + annotation.append({"mask": mask, "label": label, "score": score}) + + elif task == "panoptic" and hasattr(self.feature_extractor, "post_process_panoptic_segmentation"): + outputs = self.feature_extractor.post_process_panoptic_segmentation( + model_outputs, + threshold=threshold, + overlap_mask_area_threshold=overlap_mask_area_threshold, + target_sizes=model_outputs["target_size"], + )[0] + annotation = [] + segmentation = outputs["segmentation"] - for label_id in range(num_labels): - label = self.model.config.id2label[label_id] - mask = classes == label_id - mask_sum = mask.sum() + if len(outputs["segments_info"]) == 0: + mask = Image.fromarray(np.zeros(segmentation.shape).astype(np.uint8), mode="L") + annotation.append({"mask": mask, "label": None, "score": 0.0}) + else: + for segment in outputs["segments_info"]: + mask = (segmentation == segment["id"]) * 255 + mask = Image.fromarray(mask.numpy().astype(np.uint8), mode="L") + label = self.model.config.id2label[segment["label_id"]] + score = segment["score"] + annotation.append({"score": score, "label": label, "mask": mask}) - # Remove empty masks. - if mask_sum == 0: - continue - mask = Image.fromarray((mask * 255).numpy().astype(np.uint8), mode="L") - # Semantic segmentation does not output a global score for the mask - # so we don't attempt to compute one. - # XXX: We could send a mask with values between 0 and 255 instead - # of a pure mask to enable users to get the probabilities that - # are really outputted by the logits. + elif task == "semantic" and hasattr(self.feature_extractor, "post_process_semantic_segmentation"): + outputs = self.feature_extractor.post_process_semantic_segmentation( + model_outputs, target_sizes=model_outputs["target_size"] + )[0] + + annotation = [] + segmentation = outputs.numpy() + labels = np.unique(segmentation) + + for label in labels: + mask = (segmentation == label) * 255 + mask = Image.fromarray(mask, mode="L") + label = self.model.config.id2label[label] annotation.append({"score": None, "label": label, "mask": mask}) + else: + raise ValueError(f"task {task} is not supported for model {self.model}") return annotation diff --git a/tests/pipelines/test_pipelines_image_segmentation.py b/tests/pipelines/test_pipelines_image_segmentation.py index 3841bc1ab7..65656939d0 100644 --- a/tests/pipelines/test_pipelines_image_segmentation.py +++ b/tests/pipelines/test_pipelines_image_segmentation.py @@ -74,9 +74,6 @@ class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCa } def get_test_pipeline(self, model, tokenizer, feature_extractor): - # Fix me Alara - if model.__class__.__name__ in ["DetrForSegmentation", "MaskFormerForInstanceSegmentation"]: - return None, None image_segmenter = ImageSegmentationPipeline(model=model, feature_extractor=feature_extractor) return image_segmenter, [ "./tests/fixtures/tests_samples/COCO/000000039769.png", @@ -150,7 +147,7 @@ class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCa pass @require_torch - @unittest.skip("Fix me Alara!") + @unittest.skip("No weights found for hf-internal-testing/tiny-detr-mobilenetsv3-panoptic") def test_small_model_pt(self): model_id = "hf-internal-testing/tiny-detr-mobilenetsv3-panoptic" @@ -158,9 +155,15 @@ class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCa feature_extractor = AutoFeatureExtractor.from_pretrained(model_id) image_segmenter = ImageSegmentationPipeline(model=model, feature_extractor=feature_extractor) - outputs = image_segmenter("http://images.cocodataset.org/val2017/000000039769.jpg", threshold=0.0) + outputs = image_segmenter( + "http://images.cocodataset.org/val2017/000000039769.jpg", + task="panoptic", + threshold=0.0, + overlap_mask_area_threshold=0.0, + ) + + # Shortening by hashing for o in outputs: - # shortening by hashing o["mask"] = hashimage(o["mask"]) self.assertEqual( @@ -235,12 +238,12 @@ class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCa { "score": None, "label": "LABEL_0", - "mask": "6225140faf502d272af076222776d7e4", + "mask": "775518a7ed09eea888752176c6ba8f38", }, { "score": None, "label": "LABEL_1", - "mask": "8297c9f8eb43ddd3f32a6dae21e015a1", + "mask": "a12da23a46848128af68c63aa8ba7a02", }, ], ) @@ -249,22 +252,28 @@ class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCa @slow def test_integration_torch_image_segmentation(self): model_id = "facebook/detr-resnet-50-panoptic" - image_segmenter = pipeline("image-segmentation", model=model_id) - outputs = image_segmenter("http://images.cocodataset.org/val2017/000000039769.jpg") + outputs = image_segmenter( + "http://images.cocodataset.org/val2017/000000039769.jpg", + task="panoptic", + threshold=0, + overlap_mask_area_threshold=0.0, + ) + + # Shortening by hashing for o in outputs: o["mask"] = hashimage(o["mask"]) self.assertEqual( nested_simplify(outputs, decimals=4), [ - {"score": 0.9094, "label": "blanket", "mask": "6500201749480f87154fd967783b2b97"}, - {"score": 0.9941, "label": "cat", "mask": "f3a7f80220788acc0245ebc084df6afc"}, - {"score": 0.9987, "label": "remote", "mask": "7703408f54da1d0ebda47841da875e48"}, - {"score": 0.9995, "label": "remote", "mask": "bd726918f10fed3efaef0091e11f923b"}, - {"score": 0.9722, "label": "couch", "mask": "226d6dcb98bebc3fbc208abdc0c83196"}, - {"score": 0.9994, "label": "cat", "mask": "fa5d8d5c329546ba5339f3095641ef56"}, + {"score": 0.9094, "label": "blanket", "mask": "dcff19a97abd8bd555e21186ae7c066a"}, + {"score": 0.9941, "label": "cat", "mask": "9c0af87bd00f9d3a4e0c8888e34e70e2"}, + {"score": 0.9987, "label": "remote", "mask": "c7870600d6c02a1f6d96470fc7220e8e"}, + {"score": 0.9995, "label": "remote", "mask": "ef899a25fd44ec056c653f0ca2954fdd"}, + {"score": 0.9722, "label": "couch", "mask": "37b8446ac578a17108aa2b7fccc33114"}, + {"score": 0.9994, "label": "cat", "mask": "6a09d3655efd8a388ab4511e4cbbb797"}, ], ) @@ -273,8 +282,12 @@ class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCa "http://images.cocodataset.org/val2017/000000039769.jpg", "http://images.cocodataset.org/val2017/000000039769.jpg", ], + task="panoptic", threshold=0.0, + overlap_mask_area_threshold=0.0, ) + + # Shortening by hashing for output in outputs: for o in output: o["mask"] = hashimage(o["mask"]) @@ -283,20 +296,20 @@ class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCa nested_simplify(outputs, decimals=4), [ [ - {"score": 0.9094, "label": "blanket", "mask": "6500201749480f87154fd967783b2b97"}, - {"score": 0.9941, "label": "cat", "mask": "f3a7f80220788acc0245ebc084df6afc"}, - {"score": 0.9987, "label": "remote", "mask": "7703408f54da1d0ebda47841da875e48"}, - {"score": 0.9995, "label": "remote", "mask": "bd726918f10fed3efaef0091e11f923b"}, - {"score": 0.9722, "label": "couch", "mask": "226d6dcb98bebc3fbc208abdc0c83196"}, - {"score": 0.9994, "label": "cat", "mask": "fa5d8d5c329546ba5339f3095641ef56"}, + {"score": 0.9094, "label": "blanket", "mask": "dcff19a97abd8bd555e21186ae7c066a"}, + {"score": 0.9941, "label": "cat", "mask": "9c0af87bd00f9d3a4e0c8888e34e70e2"}, + {"score": 0.9987, "label": "remote", "mask": "c7870600d6c02a1f6d96470fc7220e8e"}, + {"score": 0.9995, "label": "remote", "mask": "ef899a25fd44ec056c653f0ca2954fdd"}, + {"score": 0.9722, "label": "couch", "mask": "37b8446ac578a17108aa2b7fccc33114"}, + {"score": 0.9994, "label": "cat", "mask": "6a09d3655efd8a388ab4511e4cbbb797"}, ], [ - {"score": 0.9094, "label": "blanket", "mask": "6500201749480f87154fd967783b2b97"}, - {"score": 0.9941, "label": "cat", "mask": "f3a7f80220788acc0245ebc084df6afc"}, - {"score": 0.9987, "label": "remote", "mask": "7703408f54da1d0ebda47841da875e48"}, - {"score": 0.9995, "label": "remote", "mask": "bd726918f10fed3efaef0091e11f923b"}, - {"score": 0.9722, "label": "couch", "mask": "226d6dcb98bebc3fbc208abdc0c83196"}, - {"score": 0.9994, "label": "cat", "mask": "fa5d8d5c329546ba5339f3095641ef56"}, + {"score": 0.9094, "label": "blanket", "mask": "dcff19a97abd8bd555e21186ae7c066a"}, + {"score": 0.9941, "label": "cat", "mask": "9c0af87bd00f9d3a4e0c8888e34e70e2"}, + {"score": 0.9987, "label": "remote", "mask": "c7870600d6c02a1f6d96470fc7220e8e"}, + {"score": 0.9995, "label": "remote", "mask": "ef899a25fd44ec056c653f0ca2954fdd"}, + {"score": 0.9722, "label": "couch", "mask": "37b8446ac578a17108aa2b7fccc33114"}, + {"score": 0.9994, "label": "cat", "mask": "6a09d3655efd8a388ab4511e4cbbb797"}, ], ], ) @@ -304,12 +317,27 @@ class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCa @require_torch @slow def test_threshold(self): - threshold = 0.999 model_id = "facebook/detr-resnet-50-panoptic" - image_segmenter = pipeline("image-segmentation", model=model_id) - outputs = image_segmenter("http://images.cocodataset.org/val2017/000000039769.jpg", threshold=threshold) + outputs = image_segmenter( + "http://images.cocodataset.org/val2017/000000039769.jpg", task="panoptic", threshold=0.999 + ) + # Shortening by hashing + for o in outputs: + o["mask"] = hashimage(o["mask"]) + + self.assertEqual( + nested_simplify(outputs, decimals=4), + [ + {"score": 0.9995, "label": "remote", "mask": "d02404f5789f075e3b3174adbc3fd5b8"}, + {"score": 0.9994, "label": "cat", "mask": "eaa115b40c96d3a6f4fe498963a7e470"}, + ], + ) + + outputs = image_segmenter( + "http://images.cocodataset.org/val2017/000000039769.jpg", task="panoptic", threshold=0.5 + ) for o in outputs: o["mask"] = hashimage(o["mask"]) @@ -317,8 +345,11 @@ class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCa self.assertEqual( nested_simplify(outputs, decimals=4), [ - {"score": 0.9995, "label": "remote", "mask": "bd726918f10fed3efaef0091e11f923b"}, - {"score": 0.9994, "label": "cat", "mask": "fa5d8d5c329546ba5339f3095641ef56"}, + {"score": 0.9941, "label": "cat", "mask": "9c0af87bd00f9d3a4e0c8888e34e70e2"}, + {"score": 0.9987, "label": "remote", "mask": "c7870600d6c02a1f6d96470fc7220e8e"}, + {"score": 0.9995, "label": "remote", "mask": "ef899a25fd44ec056c653f0ca2954fdd"}, + {"score": 0.9722, "label": "couch", "mask": "37b8446ac578a17108aa2b7fccc33114"}, + {"score": 0.9994, "label": "cat", "mask": "6a09d3655efd8a388ab4511e4cbbb797"}, ], ) @@ -335,20 +366,21 @@ class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCa image = load_dataset("hf-internal-testing/fixtures_ade20k", split="test") file = image[0]["file"] - outputs = image_segmenter(file, threshold=threshold) + outputs = image_segmenter(file, task="panoptic", threshold=threshold) + # Shortening by hashing for o in outputs: o["mask"] = hashimage(o["mask"]) self.assertEqual( nested_simplify(outputs, decimals=4), [ - {"mask": "20d1b9480d1dc1501dbdcfdff483e370", "label": "wall", "score": None}, - {"mask": "0f902fbc66a0ff711ea455b0e4943adf", "label": "house", "score": None}, - {"mask": "4537bdc07d47d84b3f8634b7ada37bd4", "label": "grass", "score": None}, - {"mask": "b7ac77dfae44a904b479a0926a2acaf7", "label": "tree", "score": None}, - {"mask": "e9bedd56bd40650fb263ce03eb621079", "label": "plant", "score": None}, - {"mask": "37a609f8c9c1b8db91fbff269f428b20", "label": "road, route", "score": None}, - {"mask": "0d8cdfd63bae8bf6e4344d460a2fa711", "label": "sky", "score": None}, + {"score": 0.9974, "label": "wall", "mask": "a547b7c062917f4f3e36501827ad3cd6"}, + {"score": 0.949, "label": "house", "mask": "0da9b7b38feac47bd2528a63e5ea7b19"}, + {"score": 0.9995, "label": "grass", "mask": "1d07ea0a263dcf38ca8ae1a15fdceda1"}, + {"score": 0.9976, "label": "tree", "mask": "6cdc97c7daf1dc596fa181f461ddd2ba"}, + {"score": 0.8239, "label": "plant", "mask": "1ab4ce378f6ceff57d428055cfbd742f"}, + {"score": 0.9942, "label": "road, route", "mask": "39c5d17be53b2d1b0f46aad8ebb15813"}, + {"score": 1.0, "label": "sky", "mask": "a3756324a692981510c39b1a59510a36"}, ], )