Add Universal Segmentation class + mapping (#20766)
* Add mapping
* Add mapping to pipeline
* Apply suggestions
* Fix feature extractor tests
* Use ForInstance, add model to universal mapping
* More fixes
* Remove model from deprecated objects

Co-authored-by: Niels Rogge <nielsrogge@Nielss-MacBook-Pro.local>
This commit is contained in:
@@ -254,6 +254,10 @@ The following auto classes are available for the following computer vision tasks
|
||||
|
||||
[[autodoc]] AutoModelForInstanceSegmentation
|
||||
|
||||
### AutoModelForUniversalSegmentation
|
||||
|
||||
[[autodoc]] AutoModelForUniversalSegmentation
|
||||
|
||||
### AutoModelForZeroShotObjectDetection
|
||||
|
||||
[[autodoc]] AutoModelForZeroShotObjectDetection
|
||||
|
||||
@@ -943,6 +943,7 @@ else:
|
||||
"MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING",
|
||||
"MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING",
|
||||
"MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING",
|
||||
"MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING",
|
||||
"MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING",
|
||||
"MODEL_FOR_VISION_2_SEQ_MAPPING",
|
||||
"MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING",
|
||||
@@ -974,6 +975,7 @@ else:
|
||||
"AutoModelForSpeechSeq2Seq",
|
||||
"AutoModelForTableQuestionAnswering",
|
||||
"AutoModelForTokenClassification",
|
||||
"AutoModelForUniversalSegmentation",
|
||||
"AutoModelForVideoClassification",
|
||||
"AutoModelForVision2Seq",
|
||||
"AutoModelForVisualQuestionAnswering",
|
||||
@@ -4113,6 +4115,7 @@ if TYPE_CHECKING:
|
||||
MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING,
|
||||
MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING,
|
||||
MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
|
||||
MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING,
|
||||
MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING,
|
||||
MODEL_FOR_VISION_2_SEQ_MAPPING,
|
||||
MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING,
|
||||
@@ -4144,6 +4147,7 @@ if TYPE_CHECKING:
|
||||
AutoModelForSpeechSeq2Seq,
|
||||
AutoModelForTableQuestionAnswering,
|
||||
AutoModelForTokenClassification,
|
||||
AutoModelForUniversalSegmentation,
|
||||
AutoModelForVideoClassification,
|
||||
AutoModelForVision2Seq,
|
||||
AutoModelForVisualQuestionAnswering,
|
||||
|
||||
@@ -67,6 +67,7 @@ else:
|
||||
"MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING",
|
||||
"MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING",
|
||||
"MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING",
|
||||
"MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING",
|
||||
"MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING",
|
||||
"MODEL_FOR_VISION_2_SEQ_MAPPING",
|
||||
"MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING",
|
||||
@@ -97,6 +98,7 @@ else:
|
||||
"AutoModelForSpeechSeq2Seq",
|
||||
"AutoModelForTableQuestionAnswering",
|
||||
"AutoModelForTokenClassification",
|
||||
"AutoModelForUniversalSegmentation",
|
||||
"AutoModelForVideoClassification",
|
||||
"AutoModelForVision2Seq",
|
||||
"AutoModelForVisualQuestionAnswering",
|
||||
@@ -222,6 +224,7 @@ if TYPE_CHECKING:
|
||||
MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING,
|
||||
MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING,
|
||||
MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
|
||||
MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING,
|
||||
MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING,
|
||||
MODEL_FOR_VISION_2_SEQ_MAPPING,
|
||||
MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING,
|
||||
@@ -253,6 +256,7 @@ if TYPE_CHECKING:
|
||||
AutoModelForSpeechSeq2Seq,
|
||||
AutoModelForTableQuestionAnswering,
|
||||
AutoModelForTokenClassification,
|
||||
AutoModelForUniversalSegmentation,
|
||||
AutoModelForVideoClassification,
|
||||
AutoModelForVision2Seq,
|
||||
AutoModelForVisualQuestionAnswering,
|
||||
|
||||
@@ -434,6 +434,15 @@ MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES = OrderedDict(
|
||||
MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING_NAMES = OrderedDict(
|
||||
[
|
||||
# Model for Instance Segmentation mapping
|
||||
# MaskFormerForInstanceSegmentation can be removed from this mapping in v5
|
||||
("maskformer", "MaskFormerForInstanceSegmentation"),
|
||||
]
|
||||
)
|
||||
|
||||
MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING_NAMES = OrderedDict(
|
||||
[
|
||||
# Model for Universal Segmentation mapping
|
||||
("detr", "DetrForSegmentation"),
|
||||
("maskformer", "MaskFormerForInstanceSegmentation"),
|
||||
]
|
||||
)
|
||||
@@ -892,6 +901,9 @@ MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING = _LazyAutoMapping(
|
||||
MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING = _LazyAutoMapping(
|
||||
CONFIG_MAPPING_NAMES, MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING_NAMES
|
||||
)
|
||||
MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING = _LazyAutoMapping(
|
||||
CONFIG_MAPPING_NAMES, MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING_NAMES
|
||||
)
|
||||
MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING = _LazyAutoMapping(
|
||||
CONFIG_MAPPING_NAMES, MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING_NAMES
|
||||
)
|
||||
@@ -1083,6 +1095,15 @@ AutoModelForSemanticSegmentation = auto_class_update(
|
||||
)
|
||||
|
||||
|
||||
class AutoModelForUniversalSegmentation(_BaseAutoModelClass):
|
||||
_model_mapping = MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING
|
||||
|
||||
|
||||
AutoModelForUniversalSegmentation = auto_class_update(
|
||||
AutoModelForUniversalSegmentation, head_doc="universal image segmentation"
|
||||
)
|
||||
|
||||
|
||||
class AutoModelForInstanceSegmentation(_BaseAutoModelClass):
|
||||
_model_mapping = MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING
|
||||
|
||||
|
||||
@@ -16,6 +16,7 @@ if is_torch_available():
|
||||
MODEL_FOR_IMAGE_SEGMENTATION_MAPPING,
|
||||
MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING,
|
||||
MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING,
|
||||
MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING,
|
||||
)
|
||||
|
||||
|
||||
@@ -75,6 +76,7 @@ class ImageSegmentationPipeline(Pipeline):
|
||||
MODEL_FOR_IMAGE_SEGMENTATION_MAPPING.items()
|
||||
+ MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING.items()
|
||||
+ MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING.items()
|
||||
+ MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING.items()
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
@@ -446,6 +446,9 @@ MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING = None
|
||||
MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING = None
|
||||
|
||||
|
||||
MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING = None
|
||||
|
||||
|
||||
MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING = None
|
||||
|
||||
|
||||
@@ -639,6 +642,13 @@ class AutoModelForTokenClassification(metaclass=DummyObject):
|
||||
requires_backends(self, ["torch"])
|
||||
|
||||
|
||||
class AutoModelForUniversalSegmentation(metaclass=DummyObject):
|
||||
_backends = ["torch"]
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
requires_backends(self, ["torch"])
|
||||
|
||||
|
||||
class AutoModelForVideoClassification(metaclass=DummyObject):
|
||||
_backends = ["torch"]
|
||||
|
||||
|
||||
@@ -239,7 +239,6 @@ IGNORE_NON_AUTO_CONFIGURED = PRIVATE_MODELS.copy() + [
|
||||
"VisualBertForMultipleChoice",
|
||||
"TFWav2Vec2ForCTC",
|
||||
"TFHubertForCTC",
|
||||
"MaskFormerForInstanceSegmentation",
|
||||
"XCLIPVisionModel",
|
||||
"XCLIPTextModel",
|
||||
]
|
||||
|
||||
Reference in New Issue
Block a user