From 7f99861218babf897c7d0d6051b43d65962671c0 Mon Sep 17 00:00:00 2001
From: NielsRogge <48327001+NielsRogge@users.noreply.github.com>
Date: Fri, 16 Dec 2022 14:22:46 +0100
Subject: [PATCH] Add Universal Segmentation class + mapping (#20766)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Add mapping

* Add mapping to pipeline

* Apply suggestions

* Fix feature extractor tests

* Use ForInstance, add model to universal mapping

* More fixes

* Remove model from deprecated objectsé

Co-authored-by: Niels Rogge <nielsrogge@Nielss-MacBook-Pro.local>
---
 docs/source/en/model_doc/auto.mdx             |  4 ++++
 src/transformers/__init__.py                  |  4 ++++
 src/transformers/models/auto/__init__.py      |  4 ++++
 src/transformers/models/auto/modeling_auto.py | 21 +++++++++++++++++++
 .../pipelines/image_segmentation.py           |  2 ++
 src/transformers/utils/dummy_pt_objects.py    | 10 +++++++++
 utils/check_repo.py                           |  1 -
 7 files changed, 45 insertions(+), 1 deletion(-)

diff --git a/docs/source/en/model_doc/auto.mdx b/docs/source/en/model_doc/auto.mdx
index 7957f453a2..b39920151d 100644
--- a/docs/source/en/model_doc/auto.mdx
+++ b/docs/source/en/model_doc/auto.mdx
@@ -254,6 +254,10 @@ The following auto classes are available for the following computer vision tasks
 
 [[autodoc]] AutoModelForInstanceSegmentation
 
+### AutoModelForUniversalSegmentation
+
+[[autodoc]] AutoModelForUniversalSegmentation
+
 ### AutoModelForZeroShotObjectDetection
 
 [[autodoc]] AutoModelForZeroShotObjectDetection
diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index 13f7006342..f51cb5caf7 100644
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -943,6 +943,7 @@ else:
             "MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING",
             "MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING",
             "MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING",
+            "MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING",
             "MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING",
             "MODEL_FOR_VISION_2_SEQ_MAPPING",
             "MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING",
@@ -974,6 +975,7 @@ else:
             "AutoModelForSpeechSeq2Seq",
             "AutoModelForTableQuestionAnswering",
             "AutoModelForTokenClassification",
+            "AutoModelForUniversalSegmentation",
             "AutoModelForVideoClassification",
             "AutoModelForVision2Seq",
             "AutoModelForVisualQuestionAnswering",
@@ -4113,6 +4115,7 @@ if TYPE_CHECKING:
             MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING,
             MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING,
             MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
+            MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING,
             MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING,
             MODEL_FOR_VISION_2_SEQ_MAPPING,
             MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING,
@@ -4144,6 +4147,7 @@ if TYPE_CHECKING:
             AutoModelForSpeechSeq2Seq,
             AutoModelForTableQuestionAnswering,
             AutoModelForTokenClassification,
+            AutoModelForUniversalSegmentation,
             AutoModelForVideoClassification,
             AutoModelForVision2Seq,
             AutoModelForVisualQuestionAnswering,
diff --git a/src/transformers/models/auto/__init__.py b/src/transformers/models/auto/__init__.py
index a6ee30366b..da8ceb8e7e 100644
--- a/src/transformers/models/auto/__init__.py
+++ b/src/transformers/models/auto/__init__.py
@@ -67,6 +67,7 @@ else:
         "MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING",
         "MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING",
         "MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING",
+        "MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING",
         "MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING",
         "MODEL_FOR_VISION_2_SEQ_MAPPING",
         "MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING",
@@ -97,6 +98,7 @@ else:
         "AutoModelForSpeechSeq2Seq",
         "AutoModelForTableQuestionAnswering",
         "AutoModelForTokenClassification",
+        "AutoModelForUniversalSegmentation",
         "AutoModelForVideoClassification",
         "AutoModelForVision2Seq",
         "AutoModelForVisualQuestionAnswering",
@@ -222,6 +224,7 @@ if TYPE_CHECKING:
             MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING,
             MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING,
             MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
+            MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING,
             MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING,
             MODEL_FOR_VISION_2_SEQ_MAPPING,
             MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING,
@@ -253,6 +256,7 @@ if TYPE_CHECKING:
             AutoModelForSpeechSeq2Seq,
             AutoModelForTableQuestionAnswering,
             AutoModelForTokenClassification,
+            AutoModelForUniversalSegmentation,
             AutoModelForVideoClassification,
             AutoModelForVision2Seq,
             AutoModelForVisualQuestionAnswering,
diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py
index b3f751c688..7fef8a21a0 100644
--- a/src/transformers/models/auto/modeling_auto.py
+++ b/src/transformers/models/auto/modeling_auto.py
@@ -434,6 +434,15 @@ MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES = OrderedDict(
 MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING_NAMES = OrderedDict(
     [
         # Model for Instance Segmentation mapping
+        # MaskFormerForInstanceSegmentation can be removed from this mapping in v5
+        ("maskformer", "MaskFormerForInstanceSegmentation"),
+    ]
+)
+
+MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING_NAMES = OrderedDict(
+    [
+        # Model for Universal Segmentation mapping
+        ("detr", "DetrForSegmentation"),
         ("maskformer", "MaskFormerForInstanceSegmentation"),
     ]
 )
@@ -892,6 +901,9 @@ MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING = _LazyAutoMapping(
 MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING = _LazyAutoMapping(
     CONFIG_MAPPING_NAMES, MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING_NAMES
 )
+MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING = _LazyAutoMapping(
+    CONFIG_MAPPING_NAMES, MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING_NAMES
+)
 MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING = _LazyAutoMapping(
     CONFIG_MAPPING_NAMES, MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING_NAMES
 )
@@ -1083,6 +1095,15 @@ AutoModelForSemanticSegmentation = auto_class_update(
 )
 
 
+class AutoModelForUniversalSegmentation(_BaseAutoModelClass):
+    _model_mapping = MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING
+
+
+AutoModelForUniversalSegmentation = auto_class_update(
+    AutoModelForUniversalSegmentation, head_doc="universal image segmentation"
+)
+
+
 class AutoModelForInstanceSegmentation(_BaseAutoModelClass):
     _model_mapping = MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING
 
diff --git a/src/transformers/pipelines/image_segmentation.py b/src/transformers/pipelines/image_segmentation.py
index 55b2217ccd..9fdb0dc331 100644
--- a/src/transformers/pipelines/image_segmentation.py
+++ b/src/transformers/pipelines/image_segmentation.py
@@ -16,6 +16,7 @@ if is_torch_available():
         MODEL_FOR_IMAGE_SEGMENTATION_MAPPING,
         MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING,
         MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING,
+        MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING,
     )
 
 
@@ -75,6 +76,7 @@ class ImageSegmentationPipeline(Pipeline):
                 MODEL_FOR_IMAGE_SEGMENTATION_MAPPING.items()
                 + MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING.items()
                 + MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING.items()
+                + MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING.items()
             )
         )
 
diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py
index 7ca088f551..e65176d09b 100644
--- a/src/transformers/utils/dummy_pt_objects.py
+++ b/src/transformers/utils/dummy_pt_objects.py
@@ -446,6 +446,9 @@ MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING = None
 MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING = None
 
 
+MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING = None
+
+
 MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING = None
 
 
@@ -639,6 +642,13 @@ class AutoModelForTokenClassification(metaclass=DummyObject):
         requires_backends(self, ["torch"])
 
 
+class AutoModelForUniversalSegmentation(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
 class AutoModelForVideoClassification(metaclass=DummyObject):
     _backends = ["torch"]
 
diff --git a/utils/check_repo.py b/utils/check_repo.py
index a7eb713efb..da0d6df8b0 100644
--- a/utils/check_repo.py
+++ b/utils/check_repo.py
@@ -239,7 +239,6 @@ IGNORE_NON_AUTO_CONFIGURED = PRIVATE_MODELS.copy() + [
     "VisualBertForMultipleChoice",
     "TFWav2Vec2ForCTC",
     "TFHubertForCTC",
-    "MaskFormerForInstanceSegmentation",
     "XCLIPVisionModel",
     "XCLIPTextModel",
 ]