diff --git a/src/transformers/models/auto/auto_factory.py b/src/transformers/models/auto/auto_factory.py index 92dbb006f6..216e4f1d90 100644 --- a/src/transformers/models/auto/auto_factory.py +++ b/src/transformers/models/auto/auto_factory.py @@ -602,10 +602,6 @@ class _BaseAutoBackboneClass(_BaseAutoModelClass): config = kwargs.pop("config", TimmBackboneConfig()) - use_timm = kwargs.pop("use_timm_backbone", True) - if not use_timm: - raise ValueError("`use_timm_backbone` must be `True` for timm backbones") - if kwargs.get("out_features", None) is not None: raise ValueError("Cannot specify `out_features` for timm backbones") @@ -627,7 +623,8 @@ class _BaseAutoBackboneClass(_BaseAutoModelClass): @classmethod def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): - if kwargs.get("use_timm_backbone", False): + use_timm_backbone = kwargs.pop("use_timm_backbone", False) + if use_timm_backbone: return cls._load_timm_backbone_from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) return super().from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) diff --git a/src/transformers/models/conditional_detr/configuration_conditional_detr.py b/src/transformers/models/conditional_detr/configuration_conditional_detr.py index 5f9e49db6c..a5cc3d5303 100644 --- a/src/transformers/models/conditional_detr/configuration_conditional_detr.py +++ b/src/transformers/models/conditional_detr/configuration_conditional_detr.py @@ -93,11 +93,11 @@ class ConditionalDetrConfig(PretrainedConfig): position_embedding_type (`str`, *optional*, defaults to `"sine"`): Type of position embeddings to be used on top of the image features. One of `"sine"` or `"learned"`. backbone (`str`, *optional*, defaults to `"resnet50"`): - Name of convolutional backbone to use in case `use_timm_backbone` = `True`. Supports any convolutional - backbone from the timm package. 
For a list of all available models, see [this - page](https://rwightman.github.io/pytorch-image-models/#load-a-pretrained-model). + Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this + will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone` + is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights. use_pretrained_backbone (`bool`, *optional*, defaults to `True`): - Whether to use pretrained weights for the backbone. Only supported when `use_timm_backbone` = `True`. + Whether to use pretrained weights for the backbone. dilation (`bool`, *optional*, defaults to `False`): Whether to replace stride with dilation in the last convolutional block (DC5). Only supported when `use_timm_backbone` = `True`. @@ -180,6 +180,14 @@ class ConditionalDetrConfig(PretrainedConfig): focal_alpha=0.25, **kwargs, ): + if not use_timm_backbone and use_pretrained_backbone: + raise ValueError( + "Loading pretrained backbone weights from the transformers library is not supported yet. 
`use_timm_backbone` must be set to `True` when `use_pretrained_backbone=True`" + ) + + if backbone_config is not None and backbone is not None: + raise ValueError("You can't specify both `backbone` and `backbone_config`.") + if backbone_config is not None and use_timm_backbone: raise ValueError("You can't specify both `backbone_config` and `use_timm_backbone`.") diff --git a/src/transformers/models/deformable_detr/configuration_deformable_detr.py b/src/transformers/models/deformable_detr/configuration_deformable_detr.py index a6161061d9..e9a4cde2df 100644 --- a/src/transformers/models/deformable_detr/configuration_deformable_detr.py +++ b/src/transformers/models/deformable_detr/configuration_deformable_detr.py @@ -85,11 +85,11 @@ class DeformableDetrConfig(PretrainedConfig): position_embedding_type (`str`, *optional*, defaults to `"sine"`): Type of position embeddings to be used on top of the image features. One of `"sine"` or `"learned"`. backbone (`str`, *optional*, defaults to `"resnet50"`): - Name of convolutional backbone to use in case `use_timm_backbone` = `True`. Supports any convolutional - backbone from the timm package. For a list of all available models, see [this - page](https://rwightman.github.io/pytorch-image-models/#load-a-pretrained-model). + Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this + will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone` + is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights. use_pretrained_backbone (`bool`, *optional*, defaults to `True`): - Whether to use pretrained weights for the backbone. Only supported when `use_timm_backbone` = `True`. + Whether to use pretrained weights for the backbone. dilation (`bool`, *optional*, defaults to `False`): Whether to replace stride with dilation in the last convolutional block (DC5). 
Only supported when `use_timm_backbone` = `True`. @@ -196,6 +196,14 @@ class DeformableDetrConfig(PretrainedConfig): disable_custom_kernels=False, **kwargs, ): + if not use_timm_backbone and use_pretrained_backbone: + raise ValueError( + "Loading pretrained backbone weights from the transformers library is not supported yet. `use_timm_backbone` must be set to `True` when `use_pretrained_backbone=True`" + ) + + if backbone_config is not None and backbone is not None: + raise ValueError("You can't specify both `backbone` and `backbone_config`.") + if backbone_config is not None and use_timm_backbone: raise ValueError("You can't specify both `backbone_config` and `use_timm_backbone`.") diff --git a/src/transformers/models/deta/configuration_deta.py b/src/transformers/models/deta/configuration_deta.py index 8a89a6ddc0..1ade9465a9 100644 --- a/src/transformers/models/deta/configuration_deta.py +++ b/src/transformers/models/deta/configuration_deta.py @@ -40,6 +40,12 @@ class DetaConfig(PretrainedConfig): Args: backbone_config (`PretrainedConfig` or `dict`, *optional*, defaults to `ResNetConfig()`): The configuration of the backbone model. + backbone (`str`, *optional*): + Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this + will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone` + is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights. + use_pretrained_backbone (`bool`, *optional*, defaults to `False`): + Whether to use pretrained weights for the backbone. num_queries (`int`, *optional*, defaults to 900): Number of object queries, i.e. detection slots. This is the maximal number of objects [`DetaModel`] can detect in a single image. In case `two_stage` is set to `True`, we use `two_stage_num_proposals` instead.
@@ -138,6 +144,8 @@ class DetaConfig(PretrainedConfig): def __init__( self, backbone_config=None, + backbone=None, + use_pretrained_backbone=False, num_queries=900, max_position_embeddings=2048, encoder_layers=6, @@ -177,7 +185,13 @@ class DetaConfig(PretrainedConfig): focal_alpha=0.25, **kwargs, ): - if backbone_config is None: + if use_pretrained_backbone: + raise ValueError("Pretrained backbones are not supported yet.") + + if backbone_config is not None and backbone is not None: + raise ValueError("You can't specify both `backbone` and `backbone_config`.") + + if backbone_config is None and backbone is None: logger.info("`backbone_config` is `None`. Initializing the config with the default `ResNet` backbone.") backbone_config = CONFIG_MAPPING["resnet"](out_features=["stage2", "stage3", "stage4"]) else: @@ -187,6 +201,8 @@ class DetaConfig(PretrainedConfig): backbone_config = config_class.from_dict(backbone_config) self.backbone_config = backbone_config + self.backbone = backbone + self.use_pretrained_backbone = use_pretrained_backbone self.num_queries = num_queries self.max_position_embeddings = max_position_embeddings self.d_model = d_model diff --git a/src/transformers/models/detr/configuration_detr.py b/src/transformers/models/detr/configuration_detr.py index fadd9ce087..acaf0dfe1e 100644 --- a/src/transformers/models/detr/configuration_detr.py +++ b/src/transformers/models/detr/configuration_detr.py @@ -93,11 +93,11 @@ class DetrConfig(PretrainedConfig): position_embedding_type (`str`, *optional*, defaults to `"sine"`): Type of position embeddings to be used on top of the image features. One of `"sine"` or `"learned"`. backbone (`str`, *optional*, defaults to `"resnet50"`): - Name of convolutional backbone to use in case `use_timm_backbone` = `True`. Supports any convolutional - backbone from the timm package. For a list of all available models, see [this - page](https://rwightman.github.io/pytorch-image-models/#load-a-pretrained-model). 
- use_pretrained_backbone (`bool`, *optional*, defaults to `True`): - Whether to use pretrained weights for the backbone. Only supported when `use_timm_backbone` = `True`. + Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this + will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone` + is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights. + use_pretrained_backbone (`bool`, *optional*, defaults to `True`): + Whether to use pretrained weights for the backbone. dilation (`bool`, *optional*, defaults to `False`): Whether to replace stride with dilation in the last convolutional block (DC5). Only supported when `use_timm_backbone` = `True`. @@ -177,6 +177,14 @@ class DetrConfig(PretrainedConfig): eos_coefficient=0.1, **kwargs, ): + if not use_timm_backbone and use_pretrained_backbone: + raise ValueError( + "Loading pretrained backbone weights from the transformers library is not supported yet. `use_timm_backbone` must be set to `True` when `use_pretrained_backbone=True`" + ) + + if backbone_config is not None and backbone is not None: + raise ValueError("You can't specify both `backbone` and `backbone_config`.") + if backbone_config is not None and use_timm_backbone: raise ValueError("You can't specify both `backbone_config` and `use_timm_backbone`.") diff --git a/src/transformers/models/dpt/configuration_dpt.py b/src/transformers/models/dpt/configuration_dpt.py index e668bb7f02..0b6366659b 100644 --- a/src/transformers/models/dpt/configuration_dpt.py +++ b/src/transformers/models/dpt/configuration_dpt.py @@ -111,6 +111,12 @@ class DPTConfig(PretrainedConfig): backbone_config (`Union[Dict[str, Any], PretrainedConfig]`, *optional*): The configuration of the backbone model. Only used in case `is_hybrid` is `True` or in case you want to leverage the [`AutoBackbone`] API.
+ backbone (`str`, *optional*): + Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this + will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone` + is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights. + use_pretrained_backbone (`bool`, *optional*, defaults to `False`): + Whether to use pretrained weights for the backbone. Example: @@ -161,6 +167,8 @@ class DPTConfig(PretrainedConfig): backbone_featmap_shape=[1, 1024, 24, 24], neck_ignore_stages=[0, 1], backbone_config=None, + backbone=None, + use_pretrained_backbone=False, **kwargs, ): super().__init__(**kwargs) @@ -168,9 +176,15 @@ class DPTConfig(PretrainedConfig): self.hidden_size = hidden_size self.is_hybrid = is_hybrid + if use_pretrained_backbone: + raise ValueError("Pretrained backbones are not supported yet.") + + if backbone_config is not None and backbone is not None: + raise ValueError("You can't specify both `backbone` and `backbone_config`.") + use_autobackbone = False if self.is_hybrid: - if backbone_config is None: + if backbone_config is None and backbone is None: logger.info("Initializing the config with a `BiT` backbone.") backbone_config = { "global_padding": "same", @@ -213,6 +227,8 @@ class DPTConfig(PretrainedConfig): self.backbone_featmap_shape = None self.neck_ignore_stages = [] + self.backbone = backbone + self.use_pretrained_backbone = use_pretrained_backbone self.num_hidden_layers = None if use_autobackbone else num_hidden_layers self.num_attention_heads = None if use_autobackbone else num_attention_heads self.intermediate_size = None if use_autobackbone else intermediate_size diff --git a/src/transformers/models/mask2former/configuration_mask2former.py b/src/transformers/models/mask2former/configuration_mask2former.py index a7ca3dbc50..7202e551a0 100644 --- a/src/transformers/models/mask2former/configuration_mask2former.py +++ 
b/src/transformers/models/mask2former/configuration_mask2former.py @@ -47,6 +47,12 @@ class Mask2FormerConfig(PretrainedConfig): backbone_config (`PretrainedConfig` or `dict`, *optional*, defaults to `SwinConfig()`): The configuration of the backbone model. If unset, the configuration corresponding to `swin-base-patch4-window12-384` will be used. + backbone (`str`, *optional*): + Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this + will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone` + is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights. + use_pretrained_backbone (`bool`, *optional*, defaults to `False`): + Whether to use pretrained weights for the backbone. feature_size (`int`, *optional*, defaults to 256): The features (channels) of the resulting feature maps. mask_feature_size (`int`, *optional*, defaults to 256): @@ -154,9 +160,17 @@ class Mask2FormerConfig(PretrainedConfig): use_auxiliary_loss: bool = True, feature_strides: List[int] = [4, 8, 16, 32], output_auxiliary_logits: bool = None, + backbone=None, + use_pretrained_backbone=False, **kwargs, ): - if backbone_config is None: + if use_pretrained_backbone: + raise ValueError("Pretrained backbones are not supported yet.") + + if backbone_config is not None and backbone is not None: + raise ValueError("You can't specify both `backbone` and `backbone_config`.") + + if backbone_config is None and backbone is None: logger.info("`backbone_config` is `None`.
Initializing the config with the default `Swin` backbone.") backbone_config = CONFIG_MAPPING["swin"]( image_size=224, @@ -177,7 +191,7 @@ class Mask2FormerConfig(PretrainedConfig): backbone_config = config_class.from_dict(backbone_config) # verify that the backbone is supported - if backbone_config.model_type not in self.backbones_supported: + if backbone_config is not None and backbone_config.model_type not in self.backbones_supported: logger.warning_once( f"Backbone {backbone_config.model_type} is not a supported model and may not be compatible with Mask2Former. " f"Supported model types: {','.join(self.backbones_supported)}" @@ -212,6 +226,8 @@ class Mask2FormerConfig(PretrainedConfig): self.feature_strides = feature_strides self.output_auxiliary_logits = output_auxiliary_logits self.num_hidden_layers = decoder_layers + self.backbone = backbone + self.use_pretrained_backbone = use_pretrained_backbone super().__init__(**kwargs) diff --git a/src/transformers/models/maskformer/configuration_maskformer.py b/src/transformers/models/maskformer/configuration_maskformer.py index f3d83a0bbf..3d2814dbfd 100644 --- a/src/transformers/models/maskformer/configuration_maskformer.py +++ b/src/transformers/models/maskformer/configuration_maskformer.py @@ -57,6 +57,12 @@ class MaskFormerConfig(PretrainedConfig): backbone_config (`Dict`, *optional*): The configuration passed to the backbone, if unset, the configuration corresponding to `swin-base-patch4-window12-384` will be used. + backbone (`str`, *optional*): + Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this + will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone` + is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights. + use_pretrained_backbone (`bool`, *optional*, defaults to `False`): + Whether to use pretrained weights for the backbone.
decoder_config (`Dict`, *optional*): The configuration passed to the transformer decoder model, if unset the base config for `detr-resnet-50` will be used. @@ -114,9 +120,17 @@ class MaskFormerConfig(PretrainedConfig): cross_entropy_weight: float = 1.0, mask_weight: float = 20.0, output_auxiliary_logits: Optional[bool] = None, + backbone: Optional[str] = None, + use_pretrained_backbone: bool = False, **kwargs, ): - if backbone_config is None: + if use_pretrained_backbone: + raise ValueError("Pretrained backbones are not supported yet.") + + if backbone_config is not None and backbone is not None: + raise ValueError("You can't specify both `backbone` and `backbone_config`.") + + if backbone_config is None and backbone is None: # fall back to https://huggingface.co/microsoft/swin-base-patch4-window12-384-in22k backbone_config = SwinConfig( image_size=384, @@ -136,7 +150,7 @@ class MaskFormerConfig(PretrainedConfig): backbone_config = config_class.from_dict(backbone_config) # verify that the backbone is supported - if backbone_config.model_type not in self.backbones_supported: + if backbone_config is not None and backbone_config.model_type not in self.backbones_supported: logger.warning_once( f"Backbone {backbone_config.model_type} is not a supported model and may not be compatible with MaskFormer. 
" f"Supported model types: {','.join(self.backbones_supported)}" @@ -177,6 +191,8 @@ class MaskFormerConfig(PretrainedConfig): self.num_attention_heads = self.decoder_config.encoder_attention_heads self.num_hidden_layers = self.decoder_config.num_hidden_layers + self.backbone = backbone + self.use_pretrained_backbone = use_pretrained_backbone super().__init__(**kwargs) @classmethod diff --git a/src/transformers/models/oneformer/configuration_oneformer.py b/src/transformers/models/oneformer/configuration_oneformer.py index 672f1dcba8..6cf54947de 100644 --- a/src/transformers/models/oneformer/configuration_oneformer.py +++ b/src/transformers/models/oneformer/configuration_oneformer.py @@ -44,6 +44,12 @@ class OneFormerConfig(PretrainedConfig): Args: backbone_config (`PretrainedConfig`, *optional*, defaults to `SwinConfig`): The configuration of the backbone model. + backbone (`str`, *optional*): + Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this + will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone` + is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights. + use_pretrained_backbone (`bool`, *optional*, defaults to `False`): + Whether to use pretrained weights for the backbone. ignore_value (`int`, *optional*, defaults to 255): Values to be ignored in GT label while calculating loss. 
num_queries (`int`, *optional*, defaults to 150): @@ -144,6 +150,8 @@ class OneFormerConfig(PretrainedConfig): def __init__( self, backbone_config: Optional[Dict] = None, + backbone: Optional[str] = None, + use_pretrained_backbone: bool = False, ignore_value: int = 255, num_queries: int = 150, no_object_weight: int = 0.1, @@ -186,7 +194,13 @@ class OneFormerConfig(PretrainedConfig): common_stride: int = 4, **kwargs, ): - if backbone_config is None: + if use_pretrained_backbone: + raise ValueError("Pretrained backbones are not supported yet.") + + if backbone_config is not None and backbone is not None: + raise ValueError("You can't specify both `backbone` and `backbone_config`.") + + if backbone_config is None and backbone is None: logger.info("`backbone_config` is unset. Initializing the config with the default `Swin` backbone.") backbone_config = CONFIG_MAPPING["swin"]( image_size=224, @@ -206,7 +220,8 @@ class OneFormerConfig(PretrainedConfig): backbone_config = config_class.from_dict(backbone_config) self.backbone_config = backbone_config - + self.backbone = backbone + self.use_pretrained_backbone = use_pretrained_backbone self.ignore_value = ignore_value self.num_queries = num_queries self.no_object_weight = no_object_weight diff --git a/src/transformers/models/table_transformer/configuration_table_transformer.py b/src/transformers/models/table_transformer/configuration_table_transformer.py index d79734b383..5a97ce05b3 100644 --- a/src/transformers/models/table_transformer/configuration_table_transformer.py +++ b/src/transformers/models/table_transformer/configuration_table_transformer.py @@ -92,12 +92,12 @@ class TableTransformerConfig(PretrainedConfig): Whether auxiliary decoding losses (loss at each decoder layer) are to be used. position_embedding_type (`str`, *optional*, defaults to `"sine"`): Type of position embeddings to be used on top of the image features. One of `"sine"` or `"learned"`. 
- backbone (`str`, *optional*, defaults to `"resnet50"`): - Name of convolutional backbone to use in case `use_timm_backbone` = `True`. Supports any convolutional - backbone from the timm package. For a list of all available models, see [this - page](https://rwightman.github.io/pytorch-image-models/#load-a-pretrained-model). - use_pretrained_backbone (`bool`, *optional*, defaults to `True`): - Whether to use pretrained weights for the backbone. Only supported when `use_timm_backbone` = `True`. + backbone (`str`, *optional*): + Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this + will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone` + is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights. + use_pretrained_backbone (`bool`, *optional*, defaults to `True`): + Whether to use pretrained weights for the backbone. dilation (`bool`, *optional*, defaults to `False`): Whether to replace stride with dilation in the last convolutional block (DC5). Only supported when `use_timm_backbone` = `True`. @@ -178,6 +178,14 @@ class TableTransformerConfig(PretrainedConfig): eos_coefficient=0.1, **kwargs, ): + if not use_timm_backbone and use_pretrained_backbone: + raise ValueError( + "Loading pretrained backbone weights from the transformers library is not supported yet.
`use_timm_backbone` must be set to `True` when `use_pretrained_backbone=True`" + ) + + if backbone_config is not None and backbone is not None: + raise ValueError("You can't specify both `backbone` and `backbone_config`.") + if backbone_config is not None and use_timm_backbone: raise ValueError("You can't specify both `backbone_config` and `use_timm_backbone`.") diff --git a/src/transformers/models/tvp/configuration_tvp.py b/src/transformers/models/tvp/configuration_tvp.py index dfb0a5f998..954ee4e90c 100644 --- a/src/transformers/models/tvp/configuration_tvp.py +++ b/src/transformers/models/tvp/configuration_tvp.py @@ -43,6 +43,12 @@ class TvpConfig(PretrainedConfig): Args: backbone_config (`PretrainedConfig` or `dict`, *optional*): The configuration of the backbone model. + backbone (`str`, *optional*): + Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this + will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone` + is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights. + use_pretrained_backbone (`bool`, *optional*, defaults to `False`): + Whether to use pretrained weights for the backbone. distance_loss_weight (`float`, *optional*, defaults to 1.0): The weight of distance loss. 
duration_loss_weight (`float`, *optional*, defaults to 0.1): @@ -95,6 +101,8 @@ class TvpConfig(PretrainedConfig): def __init__( self, backbone_config=None, + backbone=None, + use_pretrained_backbone=False, distance_loss_weight=1.0, duration_loss_weight=0.1, visual_prompter_type="framepad", @@ -118,8 +126,13 @@ class TvpConfig(PretrainedConfig): **kwargs, ): super().__init__(**kwargs) + if use_pretrained_backbone: + raise ValueError("Pretrained backbones are not supported yet.") - if backbone_config is None: + if backbone_config is not None and backbone is not None: + raise ValueError("You can't specify both `backbone` and `backbone_config`.") + + if backbone_config is None and backbone is None: logger.info("`backbone_config` is `None`. Initializing the config with the default `ResNet` backbone.") backbone_config = CONFIG_MAPPING["resnet"](out_features=["stage4"]) elif isinstance(backbone_config, dict): @@ -128,6 +141,8 @@ class TvpConfig(PretrainedConfig): backbone_config = config_class.from_dict(backbone_config) self.backbone_config = backbone_config + self.backbone = backbone + self.use_pretrained_backbone = use_pretrained_backbone self.distance_loss_weight = distance_loss_weight self.duration_loss_weight = duration_loss_weight self.visual_prompter_type = visual_prompter_type diff --git a/src/transformers/models/upernet/configuration_upernet.py b/src/transformers/models/upernet/configuration_upernet.py index ba4afad10f..c4e6f8168f 100644 --- a/src/transformers/models/upernet/configuration_upernet.py +++ b/src/transformers/models/upernet/configuration_upernet.py @@ -36,6 +36,12 @@ class UperNetConfig(PretrainedConfig): Args: backbone_config (`PretrainedConfig` or `dict`, *optional*, defaults to `ResNetConfig()`): The configuration of the backbone model. + backbone (`str`, *optional*): + Name of backbone to use when `backbone_config` is `None`. 
If `use_pretrained_backbone` is `True`, this + will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone` + is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights. + use_pretrained_backbone (`bool`, *optional*, defaults to `False`): + Whether to use pretrained weights for the backbone. hidden_size (`int`, *optional*, defaults to 512): The number of hidden units in the convolutional layers. initializer_range (`float`, *optional*, defaults to 0.02): @@ -75,6 +81,8 @@ class UperNetConfig(PretrainedConfig): def __init__( self, backbone_config=None, + backbone=None, + use_pretrained_backbone=False, hidden_size=512, initializer_range=0.02, pool_scales=[1, 2, 3, 6], @@ -88,8 +96,13 @@ class UperNetConfig(PretrainedConfig): **kwargs, ): super().__init__(**kwargs) + if use_pretrained_backbone: + raise ValueError("Pretrained backbones are not supported yet.") - if backbone_config is None: + if backbone_config is not None and backbone is not None: + raise ValueError("You can't specify both `backbone` and `backbone_config`.") + + if backbone_config is None and backbone is None: logger.info("`backbone_config` is `None`.
Initializing the config with the default `ResNet` backbone.") backbone_config = CONFIG_MAPPING["resnet"](out_features=["stage1", "stage2", "stage3", "stage4"]) elif isinstance(backbone_config, dict): @@ -98,6 +111,8 @@ class UperNetConfig(PretrainedConfig): backbone_config = config_class.from_dict(backbone_config) self.backbone_config = backbone_config + self.backbone = backbone + self.use_pretrained_backbone = use_pretrained_backbone self.hidden_size = hidden_size self.initializer_range = initializer_range self.pool_scales = pool_scales diff --git a/src/transformers/models/vit_hybrid/configuration_vit_hybrid.py b/src/transformers/models/vit_hybrid/configuration_vit_hybrid.py index 0b8a0da75f..b0a37617dc 100644 --- a/src/transformers/models/vit_hybrid/configuration_vit_hybrid.py +++ b/src/transformers/models/vit_hybrid/configuration_vit_hybrid.py @@ -42,6 +42,12 @@ class ViTHybridConfig(PretrainedConfig): Args: backbone_config (`Union[Dict[str, Any], PretrainedConfig]`, *optional*): The configuration of the backbone in a dictionary or the config object of the backbone. + backbone (`str`, *optional*): + Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this + will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone` + is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights. + use_pretrained_backbone (`bool`, *optional*, defaults to `False`): + Whether to use pretrained weights for the backbone. hidden_size (`int`, *optional*, defaults to 768): Dimensionality of the encoder layers and the pooler layer. 
num_hidden_layers (`int`, *optional*, defaults to 12): @@ -92,6 +98,8 @@ class ViTHybridConfig(PretrainedConfig): def __init__( self, backbone_config=None, + backbone=None, + use_pretrained_backbone=False, hidden_size=768, num_hidden_layers=12, num_attention_heads=12, @@ -109,8 +117,13 @@ class ViTHybridConfig(PretrainedConfig): **kwargs, ): super().__init__(**kwargs) + if use_pretrained_backbone: + raise ValueError("Pretrained backbones are not supported yet.") - if backbone_config is None: + if backbone_config is not None and backbone is not None: + raise ValueError("You can't specify both `backbone` and `backbone_config`.") + + if backbone_config is None and backbone is None: logger.info("`backbone_config` is `None`. Initializing the config with a `BiT` backbone.") backbone_config = { "global_padding": "same", @@ -132,6 +145,8 @@ class ViTHybridConfig(PretrainedConfig): self.backbone_featmap_shape = backbone_featmap_shape self.backbone_config = backbone_config + self.backbone = backbone + self.use_pretrained_backbone = use_pretrained_backbone self.hidden_size = hidden_size self.num_hidden_layers = num_hidden_layers self.num_attention_heads = num_attention_heads diff --git a/src/transformers/models/vitmatte/configuration_vitmatte.py b/src/transformers/models/vitmatte/configuration_vitmatte.py index 562abbe5e5..608b606c9b 100644 --- a/src/transformers/models/vitmatte/configuration_vitmatte.py +++ b/src/transformers/models/vitmatte/configuration_vitmatte.py @@ -42,6 +42,12 @@ class VitMatteConfig(PretrainedConfig): Args: backbone_config (`PretrainedConfig` or `dict`, *optional*, defaults to `VitDetConfig()`): The configuration of the backbone model. + backbone (`str`, *optional*): + Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this + will load the corresponding pretrained weights from the timm or transformers library. 
If `use_pretrained_backbone` + is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights. + use_pretrained_backbone (`bool`, *optional*, defaults to `False`): + Whether to use pretrained weights for the backbone. hidden_size (`int`, *optional*, defaults to 384): The number of input channels of the decoder. batch_norm_eps (`float`, *optional*, defaults to 1e-05): @@ -73,6 +79,8 @@ class VitMatteConfig(PretrainedConfig): def __init__( self, backbone_config: PretrainedConfig = None, + backbone=None, + use_pretrained_backbone=False, hidden_size: int = 384, batch_norm_eps: float = 1e-5, initializer_range: float = 0.02, @@ -82,7 +90,13 @@ class VitMatteConfig(PretrainedConfig): ): super().__init__(**kwargs) - if backbone_config is None: + if use_pretrained_backbone: + raise ValueError("Pretrained backbones are not supported yet.") + + if backbone_config is not None and backbone is not None: + raise ValueError("You can't specify both `backbone` and `backbone_config`.") + + if backbone_config is None and backbone is None: logger.info("`backbone_config` is `None`. 
Initializing the config with the default `VitDet` backbone.") backbone_config = CONFIG_MAPPING["vitdet"](out_features=["stage4"]) elif isinstance(backbone_config, dict): @@ -91,6 +105,8 @@ class VitMatteConfig(PretrainedConfig): backbone_config = config_class.from_dict(backbone_config) self.backbone_config = backbone_config + self.backbone = backbone + self.use_pretrained_backbone = use_pretrained_backbone self.batch_norm_eps = batch_norm_eps self.hidden_size = hidden_size self.initializer_range = initializer_range diff --git a/src/transformers/utils/backbone_utils.py b/src/transformers/utils/backbone_utils.py index 3dfccbb77a..22c35c3f9b 100644 --- a/src/transformers/utils/backbone_utils.py +++ b/src/transformers/utils/backbone_utils.py @@ -286,3 +286,56 @@ class BackboneConfigMixin: output["out_features"] = output.pop("_out_features") output["out_indices"] = output.pop("_out_indices") return output + + +def load_backbone(config): + """ + Loads the backbone model from a config object. + + If the config is from the backbone model itself, then we return a backbone model with randomly initialized + weights. + + If the config is from the parent model of the backbone model itself, then we load the pretrained backbone weights + if specified. + """ + from transformers import AutoBackbone, AutoConfig + + backbone_config = getattr(config, "backbone_config", None) + use_timm_backbone = getattr(config, "use_timm_backbone", None) + use_pretrained_backbone = getattr(config, "use_pretrained_backbone", None) + backbone_checkpoint = getattr(config, "backbone", None) + + # If there is a backbone_config and a backbone checkpoint, and use_pretrained_backbone=False then the desired + # behaviour is ill-defined: do you want to load from the checkpoint's config or the backbone_config? 
+ if backbone_config is not None and backbone_checkpoint is not None and use_pretrained_backbone is not None: + raise ValueError("Cannot specify both config.backbone_config and config.backbone") + + # If any of the following are set, then the config passed in is from a model which contains a backbone. + if ( + backbone_config is None + and use_timm_backbone is None + and backbone_checkpoint is None + and backbone_checkpoint is None + ): + return AutoBackbone.from_config(config=config) + + # config from the parent model that has a backbone + if use_timm_backbone: + if backbone_checkpoint is None: + raise ValueError("config.backbone must be set if use_timm_backbone is True") + # Because of how timm backbones were originally added to models, we need to pass in use_pretrained_backbone + # to determine whether to load the pretrained weights. + backbone = AutoBackbone.from_pretrained( + backbone_checkpoint, use_timm_backbone=use_timm_backbone, use_pretrained_backbone=use_pretrained_backbone + ) + elif use_pretrained_backbone: + if backbone_checkpoint is None: + raise ValueError("config.backbone must be set if use_pretrained_backbone is True") + backbone = AutoBackbone.from_pretrained(backbone_checkpoint) + else: + if backbone_config is None and backbone_checkpoint is None: + raise ValueError("Either config.backbone_config or config.backbone must be set") + if backbone_config is None: + backbone_config = AutoConfig.from_pretrained(backbone_checkpoint) + backbone = AutoBackbone.from_config(config=backbone_config) + return backbone diff --git a/tests/models/conditional_detr/test_modeling_conditional_detr.py b/tests/models/conditional_detr/test_modeling_conditional_detr.py index 657b202fbf..0bb9388d59 100644 --- a/tests/models/conditional_detr/test_modeling_conditional_detr.py +++ b/tests/models/conditional_detr/test_modeling_conditional_detr.py @@ -134,6 +134,8 @@ class ConditionalDetrModelTester: num_labels=self.num_labels, use_timm_backbone=False,
backbone_config=resnet_config, + backbone=None, + use_pretrained_backbone=False, ) def prepare_config_and_inputs_for_common(self): diff --git a/tests/models/deformable_detr/test_modeling_deformable_detr.py b/tests/models/deformable_detr/test_modeling_deformable_detr.py index ffb1fc175c..38c42c55c3 100644 --- a/tests/models/deformable_detr/test_modeling_deformable_detr.py +++ b/tests/models/deformable_detr/test_modeling_deformable_detr.py @@ -149,7 +149,9 @@ class DeformableDetrModelTester: encoder_n_points=self.encoder_n_points, decoder_n_points=self.decoder_n_points, use_timm_backbone=False, + backbone=None, backbone_config=resnet_config, + use_pretrained_backbone=False, ) def prepare_config_and_inputs_for_common(self): @@ -518,6 +520,8 @@ class DeformableDetrModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineT # let's pick a random timm backbone config.backbone = "tf_mobilenetv3_small_075" + config.use_timm_backbone = True + config.backbone_config = None for model_class in self.all_model_classes: model = model_class(config) diff --git a/tests/models/deta/test_modeling_deta.py b/tests/models/deta/test_modeling_deta.py index 8025e65bd0..d8e16fca49 100644 --- a/tests/models/deta/test_modeling_deta.py +++ b/tests/models/deta/test_modeling_deta.py @@ -157,6 +157,7 @@ class DetaModelTester: assign_first_stage=assign_first_stage, assign_second_stage=assign_second_stage, backbone_config=resnet_config, + backbone=None, ) def prepare_config_and_inputs_for_common(self, model_class_name="DetaModel"): diff --git a/tests/models/detr/test_modeling_detr.py b/tests/models/detr/test_modeling_detr.py index 9578e9cd90..de30d9db9b 100644 --- a/tests/models/detr/test_modeling_detr.py +++ b/tests/models/detr/test_modeling_detr.py @@ -130,6 +130,8 @@ class DetrModelTester: num_labels=self.num_labels, use_timm_backbone=False, backbone_config=resnet_config, + backbone=None, + use_pretrained_backbone=False, ) def prepare_config_and_inputs_for_common(self): @@ -622,7 +624,7 @@ class 
DetrModelIntegrationTestsTimmBackbone(unittest.TestCase): torch_device ) expected_number_of_segments = 5 - expected_first_segment = {"id": 1, "label_id": 17, "was_fused": False, "score": 0.994096} + expected_first_segment = {"id": 1, "label_id": 17, "was_fused": False, "score": 0.994097} number_of_unique_segments = len(torch.unique(results["segmentation"])) self.assertTrue( diff --git a/tests/models/dpt/test_modeling_dpt_auto_backbone.py b/tests/models/dpt/test_modeling_dpt_auto_backbone.py index aa240f0599..b2408465e4 100644 --- a/tests/models/dpt/test_modeling_dpt_auto_backbone.py +++ b/tests/models/dpt/test_modeling_dpt_auto_backbone.py @@ -95,6 +95,7 @@ class DPTModelTester: def get_config(self): return DPTConfig( backbone_config=self.get_backbone_config(), + backbone=None, neck_hidden_sizes=self.neck_hidden_sizes, fusion_hidden_size=self.fusion_hidden_size, ) diff --git a/tests/models/dpt/test_modeling_dpt_hybrid.py b/tests/models/dpt/test_modeling_dpt_hybrid.py index 6898637951..2621c7438b 100644 --- a/tests/models/dpt/test_modeling_dpt_hybrid.py +++ b/tests/models/dpt/test_modeling_dpt_hybrid.py @@ -130,6 +130,7 @@ class DPTModelTester: initializer_range=self.initializer_range, is_hybrid=self.is_hybrid, backbone_config=backbone_config, + backbone=None, backbone_featmap_shape=self.backbone_featmap_shape, neck_hidden_sizes=self.neck_hidden_sizes, ) diff --git a/tests/models/mask2former/test_modeling_mask2former.py b/tests/models/mask2former/test_modeling_mask2former.py index 1c48469479..fd9a513ab0 100644 --- a/tests/models/mask2former/test_modeling_mask2former.py +++ b/tests/models/mask2former/test_modeling_mask2former.py @@ -114,6 +114,7 @@ class Mask2FormerModelTester: config.backbone_config.hidden_size = 16 config.backbone_config.num_channels = self.num_channels config.backbone_config.num_heads = [1, 1, 2, 2] + config.backbone = None config.hidden_dim = self.hidden_dim config.mask_feature_size = self.hidden_dim diff --git 
a/tests/models/maskformer/test_modeling_maskformer.py b/tests/models/maskformer/test_modeling_maskformer.py index 7e48d76142..16ff3caed4 100644 --- a/tests/models/maskformer/test_modeling_maskformer.py +++ b/tests/models/maskformer/test_modeling_maskformer.py @@ -102,6 +102,7 @@ class MaskFormerModelTester: hidden_size=32, num_heads=[1, 1, 2, 2], ), + backbone=None, decoder_config=DetrConfig( decoder_ffn_dim=64, decoder_layers=self.num_hidden_layers, diff --git a/tests/models/oneformer/test_modeling_oneformer.py b/tests/models/oneformer/test_modeling_oneformer.py index cb00170799..538ab33cbf 100644 --- a/tests/models/oneformer/test_modeling_oneformer.py +++ b/tests/models/oneformer/test_modeling_oneformer.py @@ -133,6 +133,7 @@ class OneFormerModelTester: config.backbone_config.hidden_size = 16 config.backbone_config.num_channels = self.num_channels config.backbone_config.num_heads = [1, 1, 2, 2] + config.backbone = None config.hidden_dim = self.hidden_dim config.mask_dim = self.hidden_dim diff --git a/tests/models/table_transformer/test_modeling_table_transformer.py b/tests/models/table_transformer/test_modeling_table_transformer.py index 851ef36a1a..bb869d9422 100644 --- a/tests/models/table_transformer/test_modeling_table_transformer.py +++ b/tests/models/table_transformer/test_modeling_table_transformer.py @@ -131,6 +131,8 @@ class TableTransformerModelTester: num_labels=self.num_labels, use_timm_backbone=False, backbone_config=resnet_config, + backbone=None, + use_pretrained_backbone=False, ) def prepare_config_and_inputs_for_common(self): diff --git a/tests/models/tvp/test_modeling_tvp.py b/tests/models/tvp/test_modeling_tvp.py index 14ec02ed6f..c7bcc148a1 100644 --- a/tests/models/tvp/test_modeling_tvp.py +++ b/tests/models/tvp/test_modeling_tvp.py @@ -124,6 +124,7 @@ class TVPModelTester: ) return TvpConfig( backbone_config=resnet_config, + backbone=None, alpha=self.alpha, beta=self.beta, visual_prompter_type=self.visual_prompter_type, diff --git 
a/tests/models/upernet/test_modeling_upernet.py b/tests/models/upernet/test_modeling_upernet.py index aeeba191b6..c51b254ed5 100644 --- a/tests/models/upernet/test_modeling_upernet.py +++ b/tests/models/upernet/test_modeling_upernet.py @@ -105,6 +105,7 @@ class UperNetModelTester: def get_config(self): return UperNetConfig( backbone_config=self.get_backbone_config(), + backbone=None, hidden_size=64, pool_scales=[1, 2, 3, 6], use_auxiliary_head=True, diff --git a/tests/models/vit_hybrid/test_modeling_vit_hybrid.py b/tests/models/vit_hybrid/test_modeling_vit_hybrid.py index 870a4c8335..567394c979 100644 --- a/tests/models/vit_hybrid/test_modeling_vit_hybrid.py +++ b/tests/models/vit_hybrid/test_modeling_vit_hybrid.py @@ -122,6 +122,7 @@ class ViTHybridModelTester: initializer_range=self.initializer_range, backbone_featmap_shape=self.backbone_featmap_shape, backbone_config=backbone_config, + backbone=None, ) def create_and_check_model(self, config, pixel_values, labels): diff --git a/tests/models/vitmatte/test_modeling_vitmatte.py b/tests/models/vitmatte/test_modeling_vitmatte.py index c9446b116f..c93e82bafb 100644 --- a/tests/models/vitmatte/test_modeling_vitmatte.py +++ b/tests/models/vitmatte/test_modeling_vitmatte.py @@ -111,6 +111,7 @@ class VitMatteModelTester: def get_config(self): return VitMatteConfig( backbone_config=self.get_backbone_config(), + backbone=None, hidden_size=self.hidden_size, fusion_hidden_sizes=self.fusion_hidden_sizes, ) diff --git a/tests/utils/test_backbone_utils.py b/tests/utils/test_backbone_utils.py index 488cd46759..0c3ff4866e 100644 --- a/tests/utils/test_backbone_utils.py +++ b/tests/utils/test_backbone_utils.py @@ -16,11 +16,21 @@ import unittest import pytest +from transformers import DetrConfig, MaskFormerConfig +from transformers.testing_utils import require_torch, slow from transformers.utils.backbone_utils import ( BackboneMixin, get_aligned_output_features_output_indices, + load_backbone, verify_out_features_out_indices, ) 
+from transformers.utils.import_utils import is_torch_available + + +if is_torch_available(): + import torch + + from transformers import BertPreTrainedModel class BackboneUtilsTester(unittest.TestCase): @@ -126,3 +136,75 @@ class BackboneUtilsTester(unittest.TestCase): backbone.out_indices = [-3, -1] self.assertEqual(backbone.out_features, ["a", "c"]) self.assertEqual(backbone.out_indices, [-3, -1]) + + @slow + @require_torch + def test_load_backbone_in_new_model(self): + """ + Tests that new model can be created, with its weights instantiated and pretrained backbone weights loaded. + """ + + # Inherit from PreTrainedModel to ensure that the weights are initialized + class NewModel(BertPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.backbone = load_backbone(config) + self.layer_0 = torch.nn.Linear(config.hidden_size, config.hidden_size) + self.layer_1 = torch.nn.Linear(config.hidden_size, config.hidden_size) + + def get_equal_not_equal_weights(model_0, model_1): + equal_weights = [] + not_equal_weights = [] + for (k0, v0), (k1, v1) in zip(model_0.named_parameters(), model_1.named_parameters()): + self.assertEqual(k0, k1) + weights_are_equal = torch.allclose(v0, v1) + if weights_are_equal: + equal_weights.append(k0) + else: + not_equal_weights.append(k0) + return equal_weights, not_equal_weights + + config = MaskFormerConfig(use_pretrained_backbone=False, backbone="microsoft/resnet-18") + model_0 = NewModel(config) + model_1 = NewModel(config) + equal_weights, not_equal_weights = get_equal_not_equal_weights(model_0, model_1) + + # Norm layers are always initialized with the same weights + equal_weights = [w for w in equal_weights if "normalization" not in w] + self.assertEqual(len(equal_weights), 0) + self.assertEqual(len(not_equal_weights), 24) + + # Now we create a new model with backbone weights that are pretrained + config.use_pretrained_backbone = True + model_0 = NewModel(config) + model_1 = NewModel(config) + equal_weights, 
not_equal_weights = get_equal_not_equal_weights(model_0, model_1) + + # Norm layers are always initialized with the same weights + equal_weights = [w for w in equal_weights if "normalization" not in w] + self.assertEqual(len(equal_weights), 20) + # Linear layers are still initialized randomly + self.assertEqual(len(not_equal_weights), 4) + + # Check loading in timm backbone + config = DetrConfig(use_pretrained_backbone=False, backbone="resnet18", use_timm_backbone=True) + model_0 = NewModel(config) + model_1 = NewModel(config) + equal_weights, not_equal_weights = get_equal_not_equal_weights(model_0, model_1) + + # Norm layers are always initialized with the same weights + equal_weights = [w for w in equal_weights if "bn" not in w and "downsample.1" not in w] + self.assertEqual(len(equal_weights), 0) + self.assertEqual(len(not_equal_weights), 24) + + # Now we create a new model with backbone weights that are pretrained + config.use_pretrained_backbone = True + model_0 = NewModel(config) + model_1 = NewModel(config) + equal_weights, not_equal_weights = get_equal_not_equal_weights(model_0, model_1) + + # Norm layers are always initialized with the same weights + equal_weights = [w for w in equal_weights if "bn" not in w and "downsample.1" not in w] + self.assertEqual(len(equal_weights), 20) + # Linear layers are still initialized randomly + self.assertEqual(len(not_equal_weights), 4) diff --git a/utils/check_config_attributes.py b/utils/check_config_attributes.py index 2089906086..f1d8b74141 100644 --- a/utils/check_config_attributes.py +++ b/utils/check_config_attributes.py @@ -219,6 +219,7 @@ def check_attribute_being_used(config_class, attributes, default_value, source_s "out_features", "out_indices", "sampling_rate", + "use_pretrained_backbone", ] attributes_used_in_generation = ["encoder_no_repeat_ngram_size"]