From 7f99861218babf897c7d0d6051b43d65962671c0 Mon Sep 17 00:00:00 2001 From: NielsRogge <48327001+NielsRogge@users.noreply.github.com> Date: Fri, 16 Dec 2022 14:22:46 +0100 Subject: [PATCH] Add Universal Segmentation class + mapping (#20766) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Add mapping * Add mapping to pipeline * Apply suggestions * Fix feature extractor tests * Use ForInstance, add model to universal mapping * More fixes * Remove model from deprecated objectsé Co-authored-by: Niels Rogge --- docs/source/en/model_doc/auto.mdx | 4 ++++ src/transformers/__init__.py | 4 ++++ src/transformers/models/auto/__init__.py | 4 ++++ src/transformers/models/auto/modeling_auto.py | 21 +++++++++++++++++++ .../pipelines/image_segmentation.py | 2 ++ src/transformers/utils/dummy_pt_objects.py | 10 +++++++++ utils/check_repo.py | 1 - 7 files changed, 45 insertions(+), 1 deletion(-) diff --git a/docs/source/en/model_doc/auto.mdx b/docs/source/en/model_doc/auto.mdx index 7957f453a2..b39920151d 100644 --- a/docs/source/en/model_doc/auto.mdx +++ b/docs/source/en/model_doc/auto.mdx @@ -254,6 +254,10 @@ The following auto classes are available for the following computer vision tasks [[autodoc]] AutoModelForInstanceSegmentation +### AutoModelForUniversalSegmentation + +[[autodoc]] AutoModelForUniversalSegmentation + ### AutoModelForZeroShotObjectDetection [[autodoc]] AutoModelForZeroShotObjectDetection diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 13f7006342..f51cb5caf7 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -943,6 +943,7 @@ else: "MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING", "MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING", "MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING", + "MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING", "MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING", "MODEL_FOR_VISION_2_SEQ_MAPPING", "MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING", @@ -974,6 +975,7 @@ else: "AutoModelForSpeechSeq2Seq", "AutoModelForTableQuestionAnswering", "AutoModelForTokenClassification", + "AutoModelForUniversalSegmentation", "AutoModelForVideoClassification", "AutoModelForVision2Seq", "AutoModelForVisualQuestionAnswering", @@ -4113,6 +4115,7 @@ if TYPE_CHECKING: MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING, MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING, MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING, + MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING, MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING, MODEL_FOR_VISION_2_SEQ_MAPPING, MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING, @@ -4144,6 +4147,7 @@ if TYPE_CHECKING: AutoModelForSpeechSeq2Seq, AutoModelForTableQuestionAnswering, AutoModelForTokenClassification, + AutoModelForUniversalSegmentation, AutoModelForVideoClassification, AutoModelForVision2Seq, AutoModelForVisualQuestionAnswering, diff --git a/src/transformers/models/auto/__init__.py b/src/transformers/models/auto/__init__.py index a6ee30366b..da8ceb8e7e 100644 --- a/src/transformers/models/auto/__init__.py +++ b/src/transformers/models/auto/__init__.py @@ -67,6 +67,7 @@ else: "MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING", "MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING", "MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING", + "MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING", "MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING", "MODEL_FOR_VISION_2_SEQ_MAPPING", "MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING", @@ -97,6 +98,7 @@ else: "AutoModelForSpeechSeq2Seq", "AutoModelForTableQuestionAnswering", "AutoModelForTokenClassification", + "AutoModelForUniversalSegmentation", "AutoModelForVideoClassification", "AutoModelForVision2Seq", "AutoModelForVisualQuestionAnswering", @@ -222,6 +224,7 @@ if TYPE_CHECKING: MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING, MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING, MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING, + MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING, MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING, MODEL_FOR_VISION_2_SEQ_MAPPING, MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING, @@ -253,6 +256,7 @@ if TYPE_CHECKING: AutoModelForSpeechSeq2Seq, AutoModelForTableQuestionAnswering, AutoModelForTokenClassification, + AutoModelForUniversalSegmentation, AutoModelForVideoClassification, AutoModelForVision2Seq, AutoModelForVisualQuestionAnswering, diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index b3f751c688..7fef8a21a0 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -434,6 +434,15 @@ MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES = OrderedDict( MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING_NAMES = OrderedDict( [ # Model for Instance Segmentation mapping + # MaskFormerForInstanceSegmentation can be removed from this mapping in v5 + ("maskformer", "MaskFormerForInstanceSegmentation"), + ] +) + +MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING_NAMES = OrderedDict( + [ + # Model for Universal Segmentation mapping + ("detr", "DetrForSegmentation"), ("maskformer", "MaskFormerForInstanceSegmentation"), ] ) @@ -892,6 +901,9 @@ MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING = _LazyAutoMapping( MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING = _LazyAutoMapping( CONFIG_MAPPING_NAMES, MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING_NAMES ) +MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING = _LazyAutoMapping( + CONFIG_MAPPING_NAMES, MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING_NAMES +) MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING = _LazyAutoMapping( CONFIG_MAPPING_NAMES, MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING_NAMES ) @@ -1083,6 +1095,15 @@ AutoModelForSemanticSegmentation = auto_class_update( ) +class AutoModelForUniversalSegmentation(_BaseAutoModelClass): + _model_mapping = MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING + + +AutoModelForUniversalSegmentation = auto_class_update( + AutoModelForUniversalSegmentation, head_doc="universal image segmentation" +) + + class AutoModelForInstanceSegmentation(_BaseAutoModelClass): _model_mapping = MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING diff --git a/src/transformers/pipelines/image_segmentation.py b/src/transformers/pipelines/image_segmentation.py index 55b2217ccd..9fdb0dc331 100644 --- a/src/transformers/pipelines/image_segmentation.py +++ b/src/transformers/pipelines/image_segmentation.py @@ -16,6 +16,7 @@ if is_torch_available(): MODEL_FOR_IMAGE_SEGMENTATION_MAPPING, MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING, MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING, + MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING, ) @@ -75,6 +76,7 @@ class ImageSegmentationPipeline(Pipeline): MODEL_FOR_IMAGE_SEGMENTATION_MAPPING.items() + MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING.items() + MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING.items() + + MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING.items() ) ) diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 7ca088f551..e65176d09b 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -446,6 +446,9 @@ MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING = None MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING = None +MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING = None + + MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING = None @@ -639,6 +642,13 @@ class AutoModelForTokenClassification(metaclass=DummyObject): requires_backends(self, ["torch"]) +class AutoModelForUniversalSegmentation(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + class AutoModelForVideoClassification(metaclass=DummyObject): _backends = ["torch"] diff --git a/utils/check_repo.py b/utils/check_repo.py index a7eb713efb..da0d6df8b0 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -239,7 +239,6 @@ IGNORE_NON_AUTO_CONFIGURED = PRIVATE_MODELS.copy() + [ "VisualBertForMultipleChoice", "TFWav2Vec2ForCTC", "TFHubertForCTC", - "MaskFormerForInstanceSegmentation", "XCLIPVisionModel", "XCLIPTextModel", ]