Add Universal Segmentation class + mapping (#20766)
* Add mapping * Add mapping to pipeline * Apply suggestions * Fix feature extractor tests * Use ForInstance, add model to universal mapping * More fixes * Remove model from deprecated objects Co-authored-by: Niels Rogge <nielsrogge@Nielss-MacBook-Pro.local>
This commit is contained in:
@@ -254,6 +254,10 @@ The following auto classes are available for the following computer vision tasks
|
|||||||
|
|
||||||
[[autodoc]] AutoModelForInstanceSegmentation
|
[[autodoc]] AutoModelForInstanceSegmentation
|
||||||
|
|
||||||
|
### AutoModelForUniversalSegmentation
|
||||||
|
|
||||||
|
[[autodoc]] AutoModelForUniversalSegmentation
|
||||||
|
|
||||||
### AutoModelForZeroShotObjectDetection
|
### AutoModelForZeroShotObjectDetection
|
||||||
|
|
||||||
[[autodoc]] AutoModelForZeroShotObjectDetection
|
[[autodoc]] AutoModelForZeroShotObjectDetection
|
||||||
|
|||||||
@@ -943,6 +943,7 @@ else:
|
|||||||
"MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING",
|
"MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING",
|
||||||
"MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING",
|
"MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING",
|
||||||
"MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING",
|
"MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING",
|
||||||
|
"MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING",
|
||||||
"MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING",
|
"MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING",
|
||||||
"MODEL_FOR_VISION_2_SEQ_MAPPING",
|
"MODEL_FOR_VISION_2_SEQ_MAPPING",
|
||||||
"MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING",
|
"MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING",
|
||||||
@@ -974,6 +975,7 @@ else:
|
|||||||
"AutoModelForSpeechSeq2Seq",
|
"AutoModelForSpeechSeq2Seq",
|
||||||
"AutoModelForTableQuestionAnswering",
|
"AutoModelForTableQuestionAnswering",
|
||||||
"AutoModelForTokenClassification",
|
"AutoModelForTokenClassification",
|
||||||
|
"AutoModelForUniversalSegmentation",
|
||||||
"AutoModelForVideoClassification",
|
"AutoModelForVideoClassification",
|
||||||
"AutoModelForVision2Seq",
|
"AutoModelForVision2Seq",
|
||||||
"AutoModelForVisualQuestionAnswering",
|
"AutoModelForVisualQuestionAnswering",
|
||||||
@@ -4113,6 +4115,7 @@ if TYPE_CHECKING:
|
|||||||
MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING,
|
MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING,
|
||||||
MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING,
|
MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING,
|
||||||
MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
|
MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
|
||||||
|
MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING,
|
||||||
MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING,
|
MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING,
|
||||||
MODEL_FOR_VISION_2_SEQ_MAPPING,
|
MODEL_FOR_VISION_2_SEQ_MAPPING,
|
||||||
MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING,
|
MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING,
|
||||||
@@ -4144,6 +4147,7 @@ if TYPE_CHECKING:
|
|||||||
AutoModelForSpeechSeq2Seq,
|
AutoModelForSpeechSeq2Seq,
|
||||||
AutoModelForTableQuestionAnswering,
|
AutoModelForTableQuestionAnswering,
|
||||||
AutoModelForTokenClassification,
|
AutoModelForTokenClassification,
|
||||||
|
AutoModelForUniversalSegmentation,
|
||||||
AutoModelForVideoClassification,
|
AutoModelForVideoClassification,
|
||||||
AutoModelForVision2Seq,
|
AutoModelForVision2Seq,
|
||||||
AutoModelForVisualQuestionAnswering,
|
AutoModelForVisualQuestionAnswering,
|
||||||
|
|||||||
@@ -67,6 +67,7 @@ else:
|
|||||||
"MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING",
|
"MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING",
|
||||||
"MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING",
|
"MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING",
|
||||||
"MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING",
|
"MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING",
|
||||||
|
"MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING",
|
||||||
"MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING",
|
"MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING",
|
||||||
"MODEL_FOR_VISION_2_SEQ_MAPPING",
|
"MODEL_FOR_VISION_2_SEQ_MAPPING",
|
||||||
"MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING",
|
"MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING",
|
||||||
@@ -97,6 +98,7 @@ else:
|
|||||||
"AutoModelForSpeechSeq2Seq",
|
"AutoModelForSpeechSeq2Seq",
|
||||||
"AutoModelForTableQuestionAnswering",
|
"AutoModelForTableQuestionAnswering",
|
||||||
"AutoModelForTokenClassification",
|
"AutoModelForTokenClassification",
|
||||||
|
"AutoModelForUniversalSegmentation",
|
||||||
"AutoModelForVideoClassification",
|
"AutoModelForVideoClassification",
|
||||||
"AutoModelForVision2Seq",
|
"AutoModelForVision2Seq",
|
||||||
"AutoModelForVisualQuestionAnswering",
|
"AutoModelForVisualQuestionAnswering",
|
||||||
@@ -222,6 +224,7 @@ if TYPE_CHECKING:
|
|||||||
MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING,
|
MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING,
|
||||||
MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING,
|
MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING,
|
||||||
MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
|
MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
|
||||||
|
MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING,
|
||||||
MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING,
|
MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING,
|
||||||
MODEL_FOR_VISION_2_SEQ_MAPPING,
|
MODEL_FOR_VISION_2_SEQ_MAPPING,
|
||||||
MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING,
|
MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING,
|
||||||
@@ -253,6 +256,7 @@ if TYPE_CHECKING:
|
|||||||
AutoModelForSpeechSeq2Seq,
|
AutoModelForSpeechSeq2Seq,
|
||||||
AutoModelForTableQuestionAnswering,
|
AutoModelForTableQuestionAnswering,
|
||||||
AutoModelForTokenClassification,
|
AutoModelForTokenClassification,
|
||||||
|
AutoModelForUniversalSegmentation,
|
||||||
AutoModelForVideoClassification,
|
AutoModelForVideoClassification,
|
||||||
AutoModelForVision2Seq,
|
AutoModelForVision2Seq,
|
||||||
AutoModelForVisualQuestionAnswering,
|
AutoModelForVisualQuestionAnswering,
|
||||||
|
|||||||
@@ -434,6 +434,15 @@ MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES = OrderedDict(
|
|||||||
MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING_NAMES = OrderedDict(
|
MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING_NAMES = OrderedDict(
|
||||||
[
|
[
|
||||||
# Model for Instance Segmentation mapping
|
# Model for Instance Segmentation mapping
|
||||||
|
# MaskFormerForInstanceSegmentation can be removed from this mapping in v5
|
||||||
|
("maskformer", "MaskFormerForInstanceSegmentation"),
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
|
MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING_NAMES = OrderedDict(
|
||||||
|
[
|
||||||
|
# Model for Universal Segmentation mapping
|
||||||
|
("detr", "DetrForSegmentation"),
|
||||||
("maskformer", "MaskFormerForInstanceSegmentation"),
|
("maskformer", "MaskFormerForInstanceSegmentation"),
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
@@ -892,6 +901,9 @@ MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING = _LazyAutoMapping(
|
|||||||
MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING = _LazyAutoMapping(
|
MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING = _LazyAutoMapping(
|
||||||
CONFIG_MAPPING_NAMES, MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING_NAMES
|
CONFIG_MAPPING_NAMES, MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING_NAMES
|
||||||
)
|
)
|
||||||
|
MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING = _LazyAutoMapping(
|
||||||
|
CONFIG_MAPPING_NAMES, MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING_NAMES
|
||||||
|
)
|
||||||
MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING = _LazyAutoMapping(
|
MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING = _LazyAutoMapping(
|
||||||
CONFIG_MAPPING_NAMES, MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING_NAMES
|
CONFIG_MAPPING_NAMES, MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING_NAMES
|
||||||
)
|
)
|
||||||
@@ -1083,6 +1095,15 @@ AutoModelForSemanticSegmentation = auto_class_update(
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class AutoModelForUniversalSegmentation(_BaseAutoModelClass):
|
||||||
|
_model_mapping = MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING
|
||||||
|
|
||||||
|
|
||||||
|
AutoModelForUniversalSegmentation = auto_class_update(
|
||||||
|
AutoModelForUniversalSegmentation, head_doc="universal image segmentation"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class AutoModelForInstanceSegmentation(_BaseAutoModelClass):
|
class AutoModelForInstanceSegmentation(_BaseAutoModelClass):
|
||||||
_model_mapping = MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING
|
_model_mapping = MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING
|
||||||
|
|
||||||
|
|||||||
@@ -16,6 +16,7 @@ if is_torch_available():
|
|||||||
MODEL_FOR_IMAGE_SEGMENTATION_MAPPING,
|
MODEL_FOR_IMAGE_SEGMENTATION_MAPPING,
|
||||||
MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING,
|
MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING,
|
||||||
MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING,
|
MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING,
|
||||||
|
MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@@ -75,6 +76,7 @@ class ImageSegmentationPipeline(Pipeline):
|
|||||||
MODEL_FOR_IMAGE_SEGMENTATION_MAPPING.items()
|
MODEL_FOR_IMAGE_SEGMENTATION_MAPPING.items()
|
||||||
+ MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING.items()
|
+ MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING.items()
|
||||||
+ MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING.items()
|
+ MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING.items()
|
||||||
|
+ MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING.items()
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -446,6 +446,9 @@ MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING = None
|
|||||||
MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING = None
|
MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING = None
|
||||||
|
|
||||||
|
|
||||||
|
MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING = None
|
||||||
|
|
||||||
|
|
||||||
MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING = None
|
MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING = None
|
||||||
|
|
||||||
|
|
||||||
@@ -639,6 +642,13 @@ class AutoModelForTokenClassification(metaclass=DummyObject):
|
|||||||
requires_backends(self, ["torch"])
|
requires_backends(self, ["torch"])
|
||||||
|
|
||||||
|
|
||||||
|
class AutoModelForUniversalSegmentation(metaclass=DummyObject):
|
||||||
|
_backends = ["torch"]
|
||||||
|
|
||||||
|
def __init__(self, *args, **kwargs):
|
||||||
|
requires_backends(self, ["torch"])
|
||||||
|
|
||||||
|
|
||||||
class AutoModelForVideoClassification(metaclass=DummyObject):
|
class AutoModelForVideoClassification(metaclass=DummyObject):
|
||||||
_backends = ["torch"]
|
_backends = ["torch"]
|
||||||
|
|
||||||
|
|||||||
@@ -239,7 +239,6 @@ IGNORE_NON_AUTO_CONFIGURED = PRIVATE_MODELS.copy() + [
|
|||||||
"VisualBertForMultipleChoice",
|
"VisualBertForMultipleChoice",
|
||||||
"TFWav2Vec2ForCTC",
|
"TFWav2Vec2ForCTC",
|
||||||
"TFHubertForCTC",
|
"TFHubertForCTC",
|
||||||
"MaskFormerForInstanceSegmentation",
|
|
||||||
"XCLIPVisionModel",
|
"XCLIPVisionModel",
|
||||||
"XCLIPTextModel",
|
"XCLIPTextModel",
|
||||||
]
|
]
|
||||||
|
|||||||
Reference in New Issue
Block a user