diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 88c67226bc..f427c4be7b 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -1460,6 +1460,7 @@ else: "MODEL_FOR_DEPTH_ESTIMATION_MAPPING", "MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING", "MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING", + "MODEL_FOR_IMAGE_MAPPING", "MODEL_FOR_IMAGE_SEGMENTATION_MAPPING", "MODEL_FOR_IMAGE_TO_IMAGE_MAPPING", "MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING", @@ -6203,6 +6204,7 @@ if TYPE_CHECKING: MODEL_FOR_DEPTH_ESTIMATION_MAPPING, MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING, MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING, + MODEL_FOR_IMAGE_MAPPING, MODEL_FOR_IMAGE_SEGMENTATION_MAPPING, MODEL_FOR_IMAGE_TO_IMAGE_MAPPING, MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING, diff --git a/src/transformers/models/auto/__init__.py b/src/transformers/models/auto/__init__.py index 153f7f10de..3db995a9c7 100644 --- a/src/transformers/models/auto/__init__.py +++ b/src/transformers/models/auto/__init__.py @@ -49,6 +49,7 @@ else: "MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING", "MODEL_FOR_DEPTH_ESTIMATION_MAPPING", "MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING", + "MODEL_FOR_IMAGE_MAPPING", "MODEL_FOR_IMAGE_SEGMENTATION_MAPPING", "MODEL_FOR_IMAGE_TO_IMAGE_MAPPING", "MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING", @@ -233,6 +234,7 @@ if TYPE_CHECKING: MODEL_FOR_DEPTH_ESTIMATION_MAPPING, MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING, MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING, + MODEL_FOR_IMAGE_MAPPING, MODEL_FOR_IMAGE_SEGMENTATION_MAPPING, MODEL_FOR_IMAGE_TO_IMAGE_MAPPING, MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING, diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 1fc959119d..50534c58e8 100755 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -29,7 +29,6 @@ from .configuration_auto import CONFIG_MAPPING_NAMES logger = logging.get_logger(__name__) - MODEL_MAPPING_NAMES = OrderedDict( [ # Base model mapping @@ -478,6 +477,58 @@ MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = OrderedDict( ] ) +MODEL_FOR_IMAGE_MAPPING_NAMES = OrderedDict( + [ + # Model for Image mapping + ("beit", "BeitModel"), + ("bit", "BitModel"), + ("conditional_detr", "ConditionalDetrModel"), + ("convnext", "ConvNextModel"), + ("convnextv2", "ConvNextV2Model"), + ("data2vec-vision", "Data2VecVisionModel"), + ("deformable_detr", "DeformableDetrModel"), + ("deit", "DeiTModel"), + ("deta", "DetaModel"), + ("detr", "DetrModel"), + ("dinat", "DinatModel"), + ("dinov2", "Dinov2Model"), + ("dpt", "DPTModel"), + ("efficientformer", "EfficientFormerModel"), + ("efficientnet", "EfficientNetModel"), + ("focalnet", "FocalNetModel"), + ("glpn", "GLPNModel"), + ("imagegpt", "ImageGPTModel"), + ("levit", "LevitModel"), + ("mobilenet_v1", "MobileNetV1Model"), + ("mobilenet_v2", "MobileNetV2Model"), + ("mobilevit", "MobileViTModel"), + ("mobilevitv2", "MobileViTV2Model"), + ("nat", "NatModel"), + ("poolformer", "PoolFormerModel"), + ("pvt", "PvtModel"), + ("regnet", "RegNetModel"), + ("resnet", "ResNetModel"), + ("segformer", "SegformerModel"), + ("siglip_vision_model", "SiglipVisionModel"), + ("swiftformer", "SwiftFormerModel"), + ("swin", "SwinModel"), + ("swin2sr", "Swin2SRModel"), + ("swinv2", "Swinv2Model"), + ("table-transformer", "TableTransformerModel"), + ("timesformer", "TimesformerModel"), + ("timm_backbone", "TimmBackbone"), + ("van", "VanModel"), + ("videomae", "VideoMAEModel"), + ("vit", "ViTModel"), + ("vit_hybrid", "ViTHybridModel"), + ("vit_mae", "ViTMAEModel"), + ("vit_msn", "ViTMSNModel"), + ("vitdet", "VitDetModel"), + ("vivit", "VivitModel"), + ("yolos", "YolosModel"), + ] +) + MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING_NAMES = OrderedDict( [ ("deit", "DeiTForMaskedImageModeling"), @@ -1243,6 +1294,7 @@ MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING = _LazyAutoMapping( CONFIG_MAPPING_NAMES, MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES ) MODEL_FOR_MASKED_LM_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FOR_MASKED_LM_MAPPING_NAMES) +MODEL_FOR_IMAGE_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FOR_IMAGE_MAPPING_NAMES) MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING = _LazyAutoMapping( CONFIG_MAPPING_NAMES, MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING_NAMES ) diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index a2436dadc1..1b70db000c 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -63,7 +63,10 @@ from .integrations.deepspeed import deepspeed_init, deepspeed_load_checkpoint, i from .integrations.tpu import tpu_spmd_dataloader from .modelcard import TrainingSummary from .modeling_utils import PreTrainedModel, load_sharded_checkpoint, unwrap_model -from .models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES, MODEL_MAPPING_NAMES +from .models.auto.modeling_auto import ( + MODEL_FOR_CAUSAL_LM_MAPPING_NAMES, + MODEL_MAPPING_NAMES, +) from .optimization import Adafactor, get_scheduler from .pytorch_utils import ALL_LAYERNORM_LAYERS, is_torch_greater_or_equal_than_1_13 from .tokenization_utils_base import PreTrainedTokenizerBase diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index de22b2d36f..dd2e50c67d 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -598,6 +598,9 @@ MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING = None MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING = None +MODEL_FOR_IMAGE_MAPPING = None + + MODEL_FOR_IMAGE_SEGMENTATION_MAPPING = None diff --git a/src/transformers/utils/fx.py b/src/transformers/utils/fx.py index 9f5c36a18a..be726b8541 100755 --- a/src/transformers/utils/fx.py +++ b/src/transformers/utils/fx.py @@ -39,6 +39,7 @@ from ..models.auto.modeling_auto import ( MODEL_FOR_CTC_MAPPING_NAMES, MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES, MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES, + MODEL_FOR_IMAGE_MAPPING_NAMES, MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING_NAMES, MODEL_FOR_MASKED_LM_MAPPING_NAMES, MODEL_FOR_MULTIPLE_CHOICE_MAPPING_NAMES, @@ -95,6 +96,7 @@ def _generate_supported_model_class_names( "audio-classification": MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES, "semantic-segmentation": MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES, "backbone": MODEL_FOR_BACKBONE_MAPPING_NAMES, + "image-feature-extraction": MODEL_FOR_IMAGE_MAPPING_NAMES, } if supported_tasks is None: diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index 32f6abcbe3..a2a16a1400 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -700,7 +700,10 @@ class ModelTesterMixin: for model_class in self.all_model_classes: if ( model_class.__name__ - in [*get_values(MODEL_MAPPING_NAMES), *get_values(MODEL_FOR_BACKBONE_MAPPING_NAMES)] + in [ + *get_values(MODEL_MAPPING_NAMES), + *get_values(MODEL_FOR_BACKBONE_MAPPING_NAMES), + ] or not model_class.supports_gradient_checkpointing ): continue diff --git a/utils/check_repo.py b/utils/check_repo.py index aa448f32e6..ca25d7d9e3 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -732,6 +732,8 @@ def check_all_auto_object_names_being_defined(): # module, if it's a private model defined in this file. if name.endswith("MODEL_MAPPING_NAMES") and is_a_private_model(class_name): continue + if name.endswith("MODEL_FOR_IMAGE_MAPPING_NAMES") and is_a_private_model(class_name): + continue failures.append( f"`{class_name}` appears in the mapping `{name}` but it is not defined in the library." ) diff --git a/utils/update_metadata.py b/utils/update_metadata.py old mode 100644 new mode 100755 index 2104d53b6e..0762c4c2aa --- a/utils/update_metadata.py +++ b/utils/update_metadata.py @@ -62,6 +62,7 @@ _re_pt_models = re.compile(r"(.*)(?:Model|Encoder|Decoder|ForConditionalGenerati PIPELINE_TAGS_AND_AUTO_MODELS = [ ("pretraining", "MODEL_FOR_PRETRAINING_MAPPING_NAMES", "AutoModelForPreTraining"), ("feature-extraction", "MODEL_MAPPING_NAMES", "AutoModel"), + ("image-feature-extraction", "MODEL_FOR_IMAGE_MAPPING_NAMES", "AutoModel"), ("audio-classification", "MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES", "AutoModelForAudioClassification"), ("text-generation", "MODEL_FOR_CAUSAL_LM_MAPPING_NAMES", "AutoModelForCausalLM"), ("automatic-speech-recognition", "MODEL_FOR_CTC_MAPPING_NAMES", "AutoModelForCTC"),