🚨🚨🚨 [eomt] make EoMT compatible with pipeline (#39122)

* Make EoMT compatible with pipeline
* Implicit patch offsets
* remove patch offsets from arg
* Modify tests
* Update example
* fix proc testcase
* Add few more args
* add pipeline test suite
* fix
* docstring fixes
* add pipeline test
* changes w.r.t review
* 🙈 MB
* should fix device mismatch
* debug
* Fixes device mismatch
* use decorator
* we can split mlp
* expected values update

---------

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>
2025-07-02 16:55:26 +05:30
parent 4d5822e65d
commit b61023a1b7
7 changed files with 113 additions and 92 deletions
--- a/tests/models/eomt/test_image_processing_eomt.py
+++ b/tests/models/eomt/test_image_processing_eomt.py
@@ -84,10 +84,11 @@ class EomtImageProcessingTester:
            "num_labels": self.num_labels,
        }

-    def prepare_fake_eomt_outputs(self, batch_size):
+    def prepare_fake_eomt_outputs(self, batch_size, patch_offsets=None):
        return EomtForUniversalSegmentationOutput(
            masks_queries_logits=torch.randn((batch_size, self.num_queries, self.height, self.width)),
            class_queries_logits=torch.randn((batch_size, self.num_queries, self.num_classes + 1)),
+            patch_offsets=patch_offsets,
        )

    def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False):
@@ -263,13 +264,13 @@ class EomtImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
        image = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)

        inputs = processor(images=image, do_split_image=True, return_tensors="pt")
-        patch_offsets = inputs.pop("patch_offsets")
+        patch_offsets = inputs["patch_offsets"]

-        original_sizes = [image.size[::-1]]
+        target_sizes = [image.size[::-1]]

        # For semantic segmentation, the BS of output is 2 coz, two patches are created for the image.
-        outputs = self.image_processor_tester.prepare_fake_eomt_outputs(inputs["pixel_values"].shape[0])
-        segmentation = processor.post_process_semantic_segmentation(outputs, patch_offsets, original_sizes)
+        outputs = self.image_processor_tester.prepare_fake_eomt_outputs(inputs["pixel_values"].shape[0], patch_offsets)
+        segmentation = processor.post_process_semantic_segmentation(outputs, target_sizes)

        self.assertEqual(segmentation[0].shape, (image.height, image.width))

--- a/tests/models/eomt/test_modeling_eomt.py
+++ b/tests/models/eomt/test_modeling_eomt.py
@@ -17,12 +17,13 @@ import unittest

 import requests

-from transformers import AutoImageProcessor, EomtConfig, EomtForUniversalSegmentation
+from transformers import AutoImageProcessor, EomtConfig, EomtForUniversalSegmentation, pipeline
 from transformers.testing_utils import require_torch, require_torch_accelerator, require_torch_fp16, slow, torch_device
 from transformers.utils import is_torch_available, is_vision_available

 from ...test_configuration_common import ConfigTester
 from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor
+from ...test_pipeline_mixin import PipelineTesterMixin


 if is_torch_available():
@@ -100,8 +101,9 @@ class EomtForUniversalSegmentationTester:


@require_torch
-class EomtForUniversalSegmentationTest(ModelTesterMixin, unittest.TestCase):
+class EomtForUniversalSegmentationTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
    all_model_classes = (EomtForUniversalSegmentation,) if is_torch_available() else ()
+    pipeline_model_mapping = {"image-segmentation": EomtForUniversalSegmentation} if is_torch_available() else {}
    is_encoder_decoder = False
    test_pruning = False
    test_head_masking = False
@@ -340,7 +342,6 @@ class EomtForUniversalSegmentationIntegrationTest(unittest.TestCase):
        image = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)

        inputs = processor(images=image, return_tensors="pt").to(model.device)
-        patch_offsets = inputs.pop("patch_offsets", None)

        with torch.inference_mode():
            outputs = model(**inputs)
@@ -348,11 +349,9 @@ class EomtForUniversalSegmentationIntegrationTest(unittest.TestCase):
        self.assertTrue(outputs.class_queries_logits.shape == (2, 100, 151))
        self.assertTrue(outputs.masks_queries_logits.shape == (2, 100, 128, 128))

-        preds = processor.post_process_semantic_segmentation(
-            outputs, original_image_sizes=[(image.size[1], image.size[0])], patch_offsets=patch_offsets
-        )
+        preds = processor.post_process_semantic_segmentation(outputs, target_sizes=[(image.size[1], image.size[0])])[0]

-        self.assertTrue(preds.shape[1:] == (image.size[1], image.size[0]))
+        self.assertTrue(preds.shape == (image.size[1], image.size[0]))

        # fmt: off
        EXPECTED_SLICE = torch.tensor([
@@ -369,7 +368,7 @@ class EomtForUniversalSegmentationIntegrationTest(unittest.TestCase):
        ], device=model.device)
        # fmt: on

-        output_slice = preds[0, :10, :10]
+        output_slice = preds[:10, :10]
        torch.testing.assert_close(output_slice, EXPECTED_SLICE, rtol=1e-2, atol=1e-2)

    @slow
@@ -387,9 +386,7 @@ class EomtForUniversalSegmentationIntegrationTest(unittest.TestCase):
        self.assertTrue(outputs.class_queries_logits.shape == (1, 200, 134))
        self.assertTrue(outputs.masks_queries_logits.shape == (1, 200, 160, 160))

-        preds = processor.post_process_panoptic_segmentation(
-            outputs, original_image_sizes=[(image.size[1], image.size[0])]
-        )[0]
+        preds = processor.post_process_panoptic_segmentation(outputs, target_sizes=[(image.size[1], image.size[0])])[0]
        segmentation, segments_info = preds["segmentation"], preds["segments_info"]

        # fmt: off
@@ -438,9 +435,7 @@ class EomtForUniversalSegmentationIntegrationTest(unittest.TestCase):
        self.assertTrue(outputs.class_queries_logits.shape == (1, 200, 81))
        self.assertTrue(outputs.masks_queries_logits.shape == (1, 200, 160, 160))

-        preds = processor.post_process_instance_segmentation(
-            outputs, original_image_sizes=[(image.size[1], image.size[0])]
-        )[0]
+        preds = processor.post_process_instance_segmentation(outputs, target_sizes=[(image.size[1], image.size[0])])[0]
        segmentation, segments_info = preds["segmentation"], preds["segments_info"]

        # fmt: off
@@ -473,3 +468,15 @@ class EomtForUniversalSegmentationIntegrationTest(unittest.TestCase):
            self.assertEqual(actual["id"], expected["id"])
            self.assertEqual(actual["label_id"], expected["label_id"])
            self.assertAlmostEqual(actual["score"], expected["score"], delta=1e-3)
+
+    @slow
+    def test_segmentation_pipeline(self):
+        image = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
+
+        pipe = pipeline(model=self.model_id, subtask="panoptic", device=torch_device)
+        output = pipe(image)
+
+        EXPECTED_OUTPUT_LABELS = ["cat", "cat", "couch", "remote", "remote"]
+
+        output_labels = [segment["label"] for segment in output]
+        self.assertEqual(output_labels, EXPECTED_OUTPUT_LABELS)