diff --git a/src/transformers/models/auto/auto_factory.py b/src/transformers/models/auto/auto_factory.py
index 92dbb006f6..216e4f1d90 100644
--- a/src/transformers/models/auto/auto_factory.py
+++ b/src/transformers/models/auto/auto_factory.py
@@ -602,10 +602,6 @@ class _BaseAutoBackboneClass(_BaseAutoModelClass):
 
         config = kwargs.pop("config", TimmBackboneConfig())
 
-        use_timm = kwargs.pop("use_timm_backbone", True)
-        if not use_timm:
-            raise ValueError("`use_timm_backbone` must be `True` for timm backbones")
-
         if kwargs.get("out_features", None) is not None:
             raise ValueError("Cannot specify `out_features` for timm backbones")
 
@@ -627,7 +623,8 @@ class _BaseAutoBackboneClass(_BaseAutoModelClass):
 
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
-        if kwargs.get("use_timm_backbone", False):
+        use_timm_backbone = kwargs.pop("use_timm_backbone", False)
+        if use_timm_backbone:
             return cls._load_timm_backbone_from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
 
         return super().from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
diff --git a/src/transformers/models/conditional_detr/configuration_conditional_detr.py b/src/transformers/models/conditional_detr/configuration_conditional_detr.py
index 5f9e49db6c..a5cc3d5303 100644
--- a/src/transformers/models/conditional_detr/configuration_conditional_detr.py
+++ b/src/transformers/models/conditional_detr/configuration_conditional_detr.py
@@ -93,11 +93,11 @@ class ConditionalDetrConfig(PretrainedConfig):
         position_embedding_type (`str`, *optional*, defaults to `"sine"`):
             Type of position embeddings to be used on top of the image features. One of `"sine"` or `"learned"`.
         backbone (`str`, *optional*, defaults to `"resnet50"`):
-            Name of convolutional backbone to use in case `use_timm_backbone` = `True`. Supports any convolutional
-            backbone from the timm package. For a list of all available models, see [this
-            page](https://rwightman.github.io/pytorch-image-models/#load-a-pretrained-model).
+            Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this
+            will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone`
+            is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights.
         use_pretrained_backbone (`bool`, *optional*, defaults to `True`):
-            Whether to use pretrained weights for the backbone. Only supported when `use_timm_backbone` = `True`.
+            Whether to use pretrained weights for the backbone.
         dilation (`bool`, *optional*, defaults to `False`):
             Whether to replace stride with dilation in the last convolutional block (DC5). Only supported when
             `use_timm_backbone` = `True`.
@@ -180,6 +180,14 @@ class ConditionalDetrConfig(PretrainedConfig):
         focal_alpha=0.25,
         **kwargs,
     ):
+        if not use_timm_backbone and use_pretrained_backbone:
+            raise ValueError(
+                "Loading pretrained backbone weights from the transformers library is not supported yet. `use_timm_backbone` must be set to `True` when `use_pretrained_backbone=True`"
+            )
+
+        if backbone_config is not None and backbone is not None:
+            raise ValueError("You can't specify both `backbone` and `backbone_config`.")
+
         if backbone_config is not None and use_timm_backbone:
             raise ValueError("You can't specify both `backbone_config` and `use_timm_backbone`.")
 
diff --git a/src/transformers/models/deformable_detr/configuration_deformable_detr.py b/src/transformers/models/deformable_detr/configuration_deformable_detr.py
index a6161061d9..e9a4cde2df 100644
--- a/src/transformers/models/deformable_detr/configuration_deformable_detr.py
+++ b/src/transformers/models/deformable_detr/configuration_deformable_detr.py
@@ -85,11 +85,11 @@ class DeformableDetrConfig(PretrainedConfig):
         position_embedding_type (`str`, *optional*, defaults to `"sine"`):
             Type of position embeddings to be used on top of the image features. One of `"sine"` or `"learned"`.
         backbone (`str`, *optional*, defaults to `"resnet50"`):
-            Name of convolutional backbone to use in case `use_timm_backbone` = `True`. Supports any convolutional
-            backbone from the timm package. For a list of all available models, see [this
-            page](https://rwightman.github.io/pytorch-image-models/#load-a-pretrained-model).
+            Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this
+            will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone`
+            is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights.
         use_pretrained_backbone (`bool`, *optional*, defaults to `True`):
-            Whether to use pretrained weights for the backbone. Only supported when `use_timm_backbone` = `True`.
+            Whether to use pretrained weights for the backbone.
         dilation (`bool`, *optional*, defaults to `False`):
             Whether to replace stride with dilation in the last convolutional block (DC5). Only supported when
             `use_timm_backbone` = `True`.
@@ -196,6 +196,14 @@ class DeformableDetrConfig(PretrainedConfig):
         disable_custom_kernels=False,
         **kwargs,
     ):
+        if not use_timm_backbone and use_pretrained_backbone:
+            raise ValueError(
+                "Loading pretrained backbone weights from the transformers library is not supported yet. `use_timm_backbone` must be set to `True` when `use_pretrained_backbone=True`"
+            )
+
+        if backbone_config is not None and backbone is not None:
+            raise ValueError("You can't specify both `backbone` and `backbone_config`.")
+
         if backbone_config is not None and use_timm_backbone:
             raise ValueError("You can't specify both `backbone_config` and `use_timm_backbone`.")
 
diff --git a/src/transformers/models/deta/configuration_deta.py b/src/transformers/models/deta/configuration_deta.py
index 8a89a6ddc0..1ade9465a9 100644
--- a/src/transformers/models/deta/configuration_deta.py
+++ b/src/transformers/models/deta/configuration_deta.py
@@ -40,6 +40,12 @@ class DetaConfig(PretrainedConfig):
     Args:
         backbone_config (`PretrainedConfig` or `dict`, *optional*, defaults to `ResNetConfig()`):
             The configuration of the backbone model.
+        backbone (`str`, *optional*):
+            Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this
+            will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone`
+            is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights.
+        use_pretrained_backbone (`bool`, *optional*, `False`):
+            Whether to use pretrained weights for the backbone.
         num_queries (`int`, *optional*, defaults to 900):
             Number of object queries, i.e. detection slots. This is the maximal number of objects [`DetaModel`] can
             detect in a single image. In case `two_stage` is set to `True`, we use `two_stage_num_proposals` instead.
@@ -138,6 +144,8 @@ class DetaConfig(PretrainedConfig):
     def __init__(
         self,
         backbone_config=None,
+        backbone=None,
+        use_pretrained_backbone=False,
         num_queries=900,
         max_position_embeddings=2048,
         encoder_layers=6,
@@ -177,7 +185,13 @@ class DetaConfig(PretrainedConfig):
         focal_alpha=0.25,
         **kwargs,
     ):
-        if backbone_config is None:
+        if use_pretrained_backbone:
+            raise ValueError("Pretrained backbones are not supported yet.")
+
+        if backbone_config is not None and backbone is not None:
+            raise ValueError("You can't specify both `backbone` and `backbone_config`.")
+
+        if backbone_config is None and backbone is None:
             logger.info("`backbone_config` is `None`. Initializing the config with the default `ResNet` backbone.")
             backbone_config = CONFIG_MAPPING["resnet"](out_features=["stage2", "stage3", "stage4"])
         else:
@@ -187,6 +201,8 @@ class DetaConfig(PretrainedConfig):
                 backbone_config = config_class.from_dict(backbone_config)
 
         self.backbone_config = backbone_config
+        self.backbone = backbone
+        self.use_pretrained_backbone = use_pretrained_backbone
         self.num_queries = num_queries
         self.max_position_embeddings = max_position_embeddings
         self.d_model = d_model
diff --git a/src/transformers/models/detr/configuration_detr.py b/src/transformers/models/detr/configuration_detr.py
index fadd9ce087..acaf0dfe1e 100644
--- a/src/transformers/models/detr/configuration_detr.py
+++ b/src/transformers/models/detr/configuration_detr.py
@@ -93,11 +93,11 @@ class DetrConfig(PretrainedConfig):
         position_embedding_type (`str`, *optional*, defaults to `"sine"`):
             Type of position embeddings to be used on top of the image features. One of `"sine"` or `"learned"`.
         backbone (`str`, *optional*, defaults to `"resnet50"`):
-            Name of convolutional backbone to use in case `use_timm_backbone` = `True`. Supports any convolutional
-            backbone from the timm package. For a list of all available models, see [this
-            page](https://rwightman.github.io/pytorch-image-models/#load-a-pretrained-model).
-        use_pretrained_backbone (`bool`, *optional*, defaults to `True`):
-            Whether to use pretrained weights for the backbone. Only supported when `use_timm_backbone` = `True`.
+            Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this
+            will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone`
+            is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights.
+        use_pretrained_backbone (`bool`, *optional*, `True`):
+            Whether to use pretrained weights for the backbone.
         dilation (`bool`, *optional*, defaults to `False`):
             Whether to replace stride with dilation in the last convolutional block (DC5). Only supported when
             `use_timm_backbone` = `True`.
@@ -177,6 +177,14 @@ class DetrConfig(PretrainedConfig):
         eos_coefficient=0.1,
         **kwargs,
     ):
+        if not use_timm_backbone and use_pretrained_backbone:
+            raise ValueError(
+                "Loading pretrained backbone weights from the transformers library is not supported yet. `use_timm_backbone` must be set to `True` when `use_pretrained_backbone=True`"
+            )
+
+        if backbone_config is not None and backbone is not None:
+            raise ValueError("You can't specify both `backbone` and `backbone_config`.")
+
         if backbone_config is not None and use_timm_backbone:
             raise ValueError("You can't specify both `backbone_config` and `use_timm_backbone`.")
 
diff --git a/src/transformers/models/dpt/configuration_dpt.py b/src/transformers/models/dpt/configuration_dpt.py
index e668bb7f02..0b6366659b 100644
--- a/src/transformers/models/dpt/configuration_dpt.py
+++ b/src/transformers/models/dpt/configuration_dpt.py
@@ -111,6 +111,12 @@ class DPTConfig(PretrainedConfig):
         backbone_config (`Union[Dict[str, Any], PretrainedConfig]`, *optional*):
             The configuration of the backbone model. Only used in case `is_hybrid` is `True` or in case you want to
             leverage the [`AutoBackbone`] API.
+        backbone (`str`, *optional*):
+            Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this
+            will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone`
+            is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights.
+        use_pretrained_backbone (`bool`, *optional*, defaults to `False`):
+            Whether to use pretrained weights for the backbone.
 
     Example:
 
@@ -161,6 +167,8 @@ class DPTConfig(PretrainedConfig):
         backbone_featmap_shape=[1, 1024, 24, 24],
         neck_ignore_stages=[0, 1],
         backbone_config=None,
+        backbone=None,
+        use_pretrained_backbone=False,
         **kwargs,
     ):
         super().__init__(**kwargs)
@@ -168,9 +176,15 @@ class DPTConfig(PretrainedConfig):
         self.hidden_size = hidden_size
         self.is_hybrid = is_hybrid
 
+        if use_pretrained_backbone:
+            raise ValueError("Pretrained backbones are not supported yet.")
+
+        if backbone_config is not None and backbone is not None:
+            raise ValueError("You can't specify both `backbone` and `backbone_config`.")
+
         use_autobackbone = False
         if self.is_hybrid:
-            if backbone_config is None:
+            if backbone_config is None and backbone is None:
                 logger.info("Initializing the config with a `BiT` backbone.")
                 backbone_config = {
                     "global_padding": "same",
@@ -213,6 +227,8 @@ class DPTConfig(PretrainedConfig):
             self.backbone_featmap_shape = None
             self.neck_ignore_stages = []
 
+        self.backbone = backbone
+        self.use_pretrained_backbone = use_pretrained_backbone
         self.num_hidden_layers = None if use_autobackbone else num_hidden_layers
         self.num_attention_heads = None if use_autobackbone else num_attention_heads
         self.intermediate_size = None if use_autobackbone else intermediate_size
diff --git a/src/transformers/models/mask2former/configuration_mask2former.py b/src/transformers/models/mask2former/configuration_mask2former.py
index a7ca3dbc50..7202e551a0 100644
--- a/src/transformers/models/mask2former/configuration_mask2former.py
+++ b/src/transformers/models/mask2former/configuration_mask2former.py
@@ -47,6 +47,12 @@ class Mask2FormerConfig(PretrainedConfig):
         backbone_config (`PretrainedConfig` or `dict`, *optional*, defaults to `SwinConfig()`):
             The configuration of the backbone model. If unset, the configuration corresponding to
             `swin-base-patch4-window12-384` will be used.
+        backbone (`str`, *optional*):
+            Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this
+            will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone`
+            is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights.
+        use_pretrained_backbone (`bool`, *optional*, `False`):
+            Whether to use pretrained weights for the backbone.
         feature_size (`int`, *optional*, defaults to 256):
             The features (channels) of the resulting feature maps.
         mask_feature_size (`int`, *optional*, defaults to 256):
@@ -154,9 +160,17 @@ class Mask2FormerConfig(PretrainedConfig):
         use_auxiliary_loss: bool = True,
         feature_strides: List[int] = [4, 8, 16, 32],
         output_auxiliary_logits: bool = None,
+        backbone=None,
+        use_pretrained_backbone=False,
         **kwargs,
     ):
-        if backbone_config is None:
+        if use_pretrained_backbone:
+            raise ValueError("Pretrained backbones are not supported yet.")
+
+        if backbone_config is not None and backbone is not None:
+            raise ValueError("You can't specify both `backbone` and `backbone_config`.")
+
+        if backbone_config is None and backbone is None:
             logger.info("`backbone_config` is `None`. Initializing the config with the default `Swin` backbone.")
             backbone_config = CONFIG_MAPPING["swin"](
                 image_size=224,
@@ -177,7 +191,7 @@ class Mask2FormerConfig(PretrainedConfig):
             backbone_config = config_class.from_dict(backbone_config)
 
         # verify that the backbone is supported
-        if backbone_config.model_type not in self.backbones_supported:
+        if backbone_config is not None and backbone_config.model_type not in self.backbones_supported:
             logger.warning_once(
                 f"Backbone {backbone_config.model_type} is not a supported model and may not be compatible with Mask2Former. "
                 f"Supported model types: {','.join(self.backbones_supported)}"
@@ -212,6 +226,8 @@ class Mask2FormerConfig(PretrainedConfig):
         self.feature_strides = feature_strides
         self.output_auxiliary_logits = output_auxiliary_logits
         self.num_hidden_layers = decoder_layers
+        self.backbone = backbone
+        self.use_pretrained_backbone = use_pretrained_backbone
 
         super().__init__(**kwargs)
 
diff --git a/src/transformers/models/maskformer/configuration_maskformer.py b/src/transformers/models/maskformer/configuration_maskformer.py
index f3d83a0bbf..3d2814dbfd 100644
--- a/src/transformers/models/maskformer/configuration_maskformer.py
+++ b/src/transformers/models/maskformer/configuration_maskformer.py
@@ -57,6 +57,12 @@ class MaskFormerConfig(PretrainedConfig):
         backbone_config (`Dict`, *optional*):
             The configuration passed to the backbone, if unset, the configuration corresponding to
             `swin-base-patch4-window12-384` will be used.
+        backbone (`str`, *optional*):
+            Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this
+            will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone`
+            is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights.
+        use_pretrained_backbone (`bool`, *optional*, `False`):
+            Whether to use pretrained weights for the backbone.
         decoder_config (`Dict`, *optional*):
             The configuration passed to the transformer decoder model, if unset the base config for `detr-resnet-50`
             will be used.
@@ -114,9 +120,17 @@ class MaskFormerConfig(PretrainedConfig):
         cross_entropy_weight: float = 1.0,
         mask_weight: float = 20.0,
         output_auxiliary_logits: Optional[bool] = None,
+        backbone: Optional[str] = None,
+        use_pretrained_backbone: bool = False,
         **kwargs,
     ):
-        if backbone_config is None:
+        if use_pretrained_backbone:
+            raise ValueError("Pretrained backbones are not supported yet.")
+
+        if backbone_config is not None and backbone is not None:
+            raise ValueError("You can't specify both `backbone` and `backbone_config`.")
+
+        if backbone_config is None and backbone is None:
             # fall back to https://huggingface.co/microsoft/swin-base-patch4-window12-384-in22k
             backbone_config = SwinConfig(
                 image_size=384,
@@ -136,7 +150,7 @@ class MaskFormerConfig(PretrainedConfig):
             backbone_config = config_class.from_dict(backbone_config)
 
         # verify that the backbone is supported
-        if backbone_config.model_type not in self.backbones_supported:
+        if backbone_config is not None and backbone_config.model_type not in self.backbones_supported:
             logger.warning_once(
                 f"Backbone {backbone_config.model_type} is not a supported model and may not be compatible with MaskFormer. "
                 f"Supported model types: {','.join(self.backbones_supported)}"
@@ -177,6 +191,8 @@ class MaskFormerConfig(PretrainedConfig):
 
         self.num_attention_heads = self.decoder_config.encoder_attention_heads
         self.num_hidden_layers = self.decoder_config.num_hidden_layers
+        self.backbone = backbone
+        self.use_pretrained_backbone = use_pretrained_backbone
         super().__init__(**kwargs)
 
     @classmethod
diff --git a/src/transformers/models/oneformer/configuration_oneformer.py b/src/transformers/models/oneformer/configuration_oneformer.py
index 672f1dcba8..6cf54947de 100644
--- a/src/transformers/models/oneformer/configuration_oneformer.py
+++ b/src/transformers/models/oneformer/configuration_oneformer.py
@@ -44,6 +44,12 @@ class OneFormerConfig(PretrainedConfig):
     Args:
         backbone_config (`PretrainedConfig`, *optional*, defaults to `SwinConfig`):
             The configuration of the backbone model.
+        backbone (`str`, *optional*):
+            Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this
+            will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone`
+            is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights.
+        use_pretrained_backbone (`bool`, *optional*, defaults to `False`):
+            Whether to use pretrained weights for the backbone.
         ignore_value (`int`, *optional*, defaults to 255):
             Values to be ignored in GT label while calculating loss.
         num_queries (`int`, *optional*, defaults to 150):
@@ -144,6 +150,8 @@ class OneFormerConfig(PretrainedConfig):
     def __init__(
         self,
         backbone_config: Optional[Dict] = None,
+        backbone: Optional[str] = None,
+        use_pretrained_backbone: bool = False,
         ignore_value: int = 255,
         num_queries: int = 150,
         no_object_weight: int = 0.1,
@@ -186,7 +194,13 @@ class OneFormerConfig(PretrainedConfig):
         common_stride: int = 4,
         **kwargs,
     ):
-        if backbone_config is None:
+        if use_pretrained_backbone:
+            raise ValueError("Pretrained backbones are not supported yet.")
+
+        if backbone_config is not None and backbone is not None:
+            raise ValueError("You can't specify both `backbone` and `backbone_config`.")
+
+        if backbone_config is None and backbone is None:
             logger.info("`backbone_config` is unset. Initializing the config with the default `Swin` backbone.")
             backbone_config = CONFIG_MAPPING["swin"](
                 image_size=224,
@@ -206,7 +220,8 @@ class OneFormerConfig(PretrainedConfig):
             backbone_config = config_class.from_dict(backbone_config)
 
         self.backbone_config = backbone_config
-
+        self.backbone = backbone
+        self.use_pretrained_backbone = use_pretrained_backbone
         self.ignore_value = ignore_value
         self.num_queries = num_queries
         self.no_object_weight = no_object_weight
diff --git a/src/transformers/models/table_transformer/configuration_table_transformer.py b/src/transformers/models/table_transformer/configuration_table_transformer.py
index d79734b383..5a97ce05b3 100644
--- a/src/transformers/models/table_transformer/configuration_table_transformer.py
+++ b/src/transformers/models/table_transformer/configuration_table_transformer.py
@@ -92,12 +92,12 @@ class TableTransformerConfig(PretrainedConfig):
             Whether auxiliary decoding losses (loss at each decoder layer) are to be used.
         position_embedding_type (`str`, *optional*, defaults to `"sine"`):
             Type of position embeddings to be used on top of the image features. One of `"sine"` or `"learned"`.
-        backbone (`str`, *optional*, defaults to `"resnet50"`):
-            Name of convolutional backbone to use in case `use_timm_backbone` = `True`. Supports any convolutional
-            backbone from the timm package. For a list of all available models, see [this
-            page](https://rwightman.github.io/pytorch-image-models/#load-a-pretrained-model).
-        use_pretrained_backbone (`bool`, *optional*, defaults to `True`):
-            Whether to use pretrained weights for the backbone. Only supported when `use_timm_backbone` = `True`.
+        backbone (`str`, *optional*):
+            Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this
+            will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone`
+            is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights.
+        use_pretrained_backbone (`bool`, *optional*, `True`):
+            Whether to use pretrained weights for the backbone.
         dilation (`bool`, *optional*, defaults to `False`):
             Whether to replace stride with dilation in the last convolutional block (DC5). Only supported when
             `use_timm_backbone` = `True`.
@@ -178,6 +178,14 @@ class TableTransformerConfig(PretrainedConfig):
         eos_coefficient=0.1,
         **kwargs,
     ):
+        if not use_timm_backbone and use_pretrained_backbone:
+            raise ValueError(
+                "Loading pretrained backbone weights from the transformers library is not supported yet. `use_timm_backbone` must be set to `True` when `use_pretrained_backbone=True`"
+            )
+
+        if backbone_config is not None and backbone is not None:
+            raise ValueError("You can't specify both `backbone` and `backbone_config`.")
+
         if backbone_config is not None and use_timm_backbone:
             raise ValueError("You can't specify both `backbone_config` and `use_timm_backbone`.")
 
diff --git a/src/transformers/models/tvp/configuration_tvp.py b/src/transformers/models/tvp/configuration_tvp.py
index dfb0a5f998..954ee4e90c 100644
--- a/src/transformers/models/tvp/configuration_tvp.py
+++ b/src/transformers/models/tvp/configuration_tvp.py
@@ -43,6 +43,12 @@ class TvpConfig(PretrainedConfig):
     Args:
         backbone_config (`PretrainedConfig` or `dict`, *optional*):
             The configuration of the backbone model.
+        backbone (`str`, *optional*):
+            Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this
+            will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone`
+            is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights.
+        use_pretrained_backbone (`bool`, *optional*, defaults to `False`):
+            Whether to use pretrained weights for the backbone.
         distance_loss_weight (`float`, *optional*, defaults to 1.0):
             The weight of distance loss.
         duration_loss_weight (`float`, *optional*, defaults to 0.1):
@@ -95,6 +101,8 @@ class TvpConfig(PretrainedConfig):
     def __init__(
         self,
         backbone_config=None,
+        backbone=None,
+        use_pretrained_backbone=False,
         distance_loss_weight=1.0,
         duration_loss_weight=0.1,
         visual_prompter_type="framepad",
@@ -118,8 +126,13 @@ class TvpConfig(PretrainedConfig):
         **kwargs,
     ):
         super().__init__(**kwargs)
+        if use_pretrained_backbone:
+            raise ValueError("Pretrained backbones are not supported yet.")
 
-        if backbone_config is None:
+        if backbone_config is not None and backbone is not None:
+            raise ValueError("You can't specify both `backbone` and `backbone_config`.")
+
+        if backbone_config is None and backbone is None:
             logger.info("`backbone_config` is `None`. Initializing the config with the default `ResNet` backbone.")
             backbone_config = CONFIG_MAPPING["resnet"](out_features=["stage4"])
         elif isinstance(backbone_config, dict):
@@ -128,6 +141,8 @@ class TvpConfig(PretrainedConfig):
             backbone_config = config_class.from_dict(backbone_config)
 
         self.backbone_config = backbone_config
+        self.backbone = backbone
+        self.use_pretrained_backbone = use_pretrained_backbone
         self.distance_loss_weight = distance_loss_weight
         self.duration_loss_weight = duration_loss_weight
         self.visual_prompter_type = visual_prompter_type
diff --git a/src/transformers/models/upernet/configuration_upernet.py b/src/transformers/models/upernet/configuration_upernet.py
index ba4afad10f..c4e6f8168f 100644
--- a/src/transformers/models/upernet/configuration_upernet.py
+++ b/src/transformers/models/upernet/configuration_upernet.py
@@ -36,6 +36,12 @@ class UperNetConfig(PretrainedConfig):
     Args:
         backbone_config (`PretrainedConfig` or `dict`, *optional*, defaults to `ResNetConfig()`):
             The configuration of the backbone model.
+        backbone (`str`, *optional*):
+            Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this
+            will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone`
+            is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights.
+        use_pretrained_backbone (`bool`, *optional*, `False`):
+            Whether to use pretrained weights for the backbone.
         hidden_size (`int`, *optional*, defaults to 512):
             The number of hidden units in the convolutional layers.
         initializer_range (`float`, *optional*, defaults to 0.02):
@@ -75,6 +81,8 @@ class UperNetConfig(PretrainedConfig):
     def __init__(
         self,
         backbone_config=None,
+        backbone=None,
+        use_pretrained_backbone=False,
         hidden_size=512,
         initializer_range=0.02,
         pool_scales=[1, 2, 3, 6],
@@ -88,8 +96,13 @@ class UperNetConfig(PretrainedConfig):
         **kwargs,
     ):
         super().__init__(**kwargs)
+        if use_pretrained_backbone:
+            raise ValueError("Pretrained backbones are not supported yet.")
 
-        if backbone_config is None:
+        if backbone_config is not None and backbone is not None:
+            raise ValueError("You can't specify both `backbone` and `backbone_config`.")
+
+        if backbone_config is None and backbone is None:
             logger.info("`backbone_config` is `None`. Initializing the config with the default `ResNet` backbone.")
             backbone_config = CONFIG_MAPPING["resnet"](out_features=["stage1", "stage2", "stage3", "stage4"])
         elif isinstance(backbone_config, dict):
@@ -98,6 +111,8 @@ class UperNetConfig(PretrainedConfig):
             backbone_config = config_class.from_dict(backbone_config)
 
         self.backbone_config = backbone_config
+        self.backbone = backbone
+        self.use_pretrained_backbone = use_pretrained_backbone
         self.hidden_size = hidden_size
         self.initializer_range = initializer_range
         self.pool_scales = pool_scales
diff --git a/src/transformers/models/vit_hybrid/configuration_vit_hybrid.py b/src/transformers/models/vit_hybrid/configuration_vit_hybrid.py
index 0b8a0da75f..b0a37617dc 100644
--- a/src/transformers/models/vit_hybrid/configuration_vit_hybrid.py
+++ b/src/transformers/models/vit_hybrid/configuration_vit_hybrid.py
@@ -42,6 +42,12 @@ class ViTHybridConfig(PretrainedConfig):
     Args:
         backbone_config (`Union[Dict[str, Any], PretrainedConfig]`, *optional*):
             The configuration of the backbone in a dictionary or the config object of the backbone.
+        backbone (`str`, *optional*):
+            Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this
+            will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone`
+            is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights.
+        use_pretrained_backbone (`bool`, *optional*, defaults to `False`):
+            Whether to use pretrained weights for the backbone.
         hidden_size (`int`, *optional*, defaults to 768):
             Dimensionality of the encoder layers and the pooler layer.
         num_hidden_layers (`int`, *optional*, defaults to 12):
@@ -92,6 +98,8 @@ class ViTHybridConfig(PretrainedConfig):
     def __init__(
         self,
         backbone_config=None,
+        backbone=None,
+        use_pretrained_backbone=False,
         hidden_size=768,
         num_hidden_layers=12,
         num_attention_heads=12,
@@ -109,8 +117,13 @@ class ViTHybridConfig(PretrainedConfig):
         **kwargs,
     ):
         super().__init__(**kwargs)
+        if use_pretrained_backbone:
+            raise ValueError("Pretrained backbones are not supported yet.")
 
-        if backbone_config is None:
+        if backbone_config is not None and backbone is not None:
+            raise ValueError("You can't specify both `backbone` and `backbone_config`.")
+
+        if backbone_config is None and backbone is None:
             logger.info("`backbone_config` is `None`. Initializing the config with a `BiT` backbone.")
             backbone_config = {
                 "global_padding": "same",
@@ -132,6 +145,8 @@ class ViTHybridConfig(PretrainedConfig):
 
         self.backbone_featmap_shape = backbone_featmap_shape
         self.backbone_config = backbone_config
+        self.backbone = backbone
+        self.use_pretrained_backbone = use_pretrained_backbone
         self.hidden_size = hidden_size
         self.num_hidden_layers = num_hidden_layers
         self.num_attention_heads = num_attention_heads
diff --git a/src/transformers/models/vitmatte/configuration_vitmatte.py b/src/transformers/models/vitmatte/configuration_vitmatte.py
index 562abbe5e5..608b606c9b 100644
--- a/src/transformers/models/vitmatte/configuration_vitmatte.py
+++ b/src/transformers/models/vitmatte/configuration_vitmatte.py
@@ -42,6 +42,12 @@ class VitMatteConfig(PretrainedConfig):
     Args:
         backbone_config (`PretrainedConfig` or `dict`, *optional*, defaults to `VitDetConfig()`):
             The configuration of the backbone model.
+        backbone (`str`, *optional*):
+            Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this
+            will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone`
+            is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights.
+        use_pretrained_backbone (`bool`, *optional*, defaults to `False`):
+            Whether to use pretrained weights for the backbone.
         hidden_size (`int`, *optional*, defaults to 384):
             The number of input channels of the decoder.
         batch_norm_eps (`float`, *optional*, defaults to 1e-05):
@@ -73,6 +79,8 @@ class VitMatteConfig(PretrainedConfig):
     def __init__(
         self,
         backbone_config: PretrainedConfig = None,
+        backbone=None,
+        use_pretrained_backbone=False,
         hidden_size: int = 384,
         batch_norm_eps: float = 1e-5,
         initializer_range: float = 0.02,
@@ -82,7 +90,13 @@ class VitMatteConfig(PretrainedConfig):
     ):
         super().__init__(**kwargs)
 
-        if backbone_config is None:
+        if use_pretrained_backbone:
+            raise ValueError("Pretrained backbones are not supported yet.")
+
+        if backbone_config is not None and backbone is not None:
+            raise ValueError("You can't specify both `backbone` and `backbone_config`.")
+
+        if backbone_config is None and backbone is None:
             logger.info("`backbone_config` is `None`. Initializing the config with the default `VitDet` backbone.")
             backbone_config = CONFIG_MAPPING["vitdet"](out_features=["stage4"])
         elif isinstance(backbone_config, dict):
@@ -91,6 +105,8 @@ class VitMatteConfig(PretrainedConfig):
             backbone_config = config_class.from_dict(backbone_config)
 
         self.backbone_config = backbone_config
+        self.backbone = backbone
+        self.use_pretrained_backbone = use_pretrained_backbone
         self.batch_norm_eps = batch_norm_eps
         self.hidden_size = hidden_size
         self.initializer_range = initializer_range
diff --git a/src/transformers/utils/backbone_utils.py b/src/transformers/utils/backbone_utils.py
index 3dfccbb77a..22c35c3f9b 100644
--- a/src/transformers/utils/backbone_utils.py
+++ b/src/transformers/utils/backbone_utils.py
@@ -286,3 +286,56 @@ class BackboneConfigMixin:
         output["out_features"] = output.pop("_out_features")
         output["out_indices"] = output.pop("_out_indices")
         return output
+
+
+def load_backbone(config):
+    """
+    Loads the backbone model from a config object.
+
+    If the config is from the backbone model itself, then we return a backbone model with randomly initialized
+    weights.
+
+    If the config is from the parent model of the backbone model itself, then we load the pretrained backbone weights
+    if specified.
+    """
+    from transformers import AutoBackbone, AutoConfig
+
+    backbone_config = getattr(config, "backbone_config", None)
+    use_timm_backbone = getattr(config, "use_timm_backbone", None)
+    use_pretrained_backbone = getattr(config, "use_pretrained_backbone", None)
+    backbone_checkpoint = getattr(config, "backbone", None)
+
+    # If there is a backbone_config and a backbone checkpoint, and use_pretrained_backbone=False then the desired
+    # behaviour is ill-defined: do you want to load from the checkpoint's config or the backbone_config?
+    if backbone_config is not None and backbone_checkpoint is not None and use_pretrained_backbone is not None:
+        raise ValueError("Cannot specify both config.backbone_config and config.backbone")
+
+    # If any of thhe following are set, then the config passed in is from a model which contains a backbone.
+    if (
+        backbone_config is None
+        and use_timm_backbone is None
+        and backbone_checkpoint is None
+        and backbone_checkpoint is None
+    ):
+        return AutoBackbone.from_config(config=config)
+
+    # config from the parent model that has a backbone
+    if use_timm_backbone:
+        if backbone_checkpoint is None:
+            raise ValueError("config.backbone must be set if use_timm_backbone is True")
+        # Because of how timm backbones were originally added to models, we need to pass in use_pretrained_backbone
+        # to determine whether to load the pretrained weights.
+        backbone = AutoBackbone.from_pretrained(
+            backbone_checkpoint, use_timm_backbone=use_timm_backbone, use_pretrained_backbone=use_pretrained_backbone
+        )
+    elif use_pretrained_backbone:
+        if backbone_checkpoint is None:
+            raise ValueError("config.backbone must be set if use_pretrained_backbone is True")
+        backbone = AutoBackbone.from_pretrained(backbone_checkpoint)
+    else:
+        if backbone_config is None and backbone_checkpoint is None:
+            raise ValueError("Either config.backbone_config or config.backbone must be set")
+        if backbone_config is None:
+            backbone_config = AutoConfig.from_pretrained(backbone_checkpoint)
+        backbone = AutoBackbone.from_config(config=backbone_config)
+    return backbone
diff --git a/tests/models/conditional_detr/test_modeling_conditional_detr.py b/tests/models/conditional_detr/test_modeling_conditional_detr.py
index 657b202fbf..0bb9388d59 100644
--- a/tests/models/conditional_detr/test_modeling_conditional_detr.py
+++ b/tests/models/conditional_detr/test_modeling_conditional_detr.py
@@ -134,6 +134,8 @@ class ConditionalDetrModelTester:
             num_labels=self.num_labels,
             use_timm_backbone=False,
             backbone_config=resnet_config,
+            backbone=None,
+            use_pretrained_backbone=False,
         )
 
     def prepare_config_and_inputs_for_common(self):
diff --git a/tests/models/deformable_detr/test_modeling_deformable_detr.py b/tests/models/deformable_detr/test_modeling_deformable_detr.py
index ffb1fc175c..38c42c55c3 100644
--- a/tests/models/deformable_detr/test_modeling_deformable_detr.py
+++ b/tests/models/deformable_detr/test_modeling_deformable_detr.py
@@ -149,7 +149,9 @@ class DeformableDetrModelTester:
             encoder_n_points=self.encoder_n_points,
             decoder_n_points=self.decoder_n_points,
             use_timm_backbone=False,
+            backbone=None,
             backbone_config=resnet_config,
+            use_pretrained_backbone=False,
         )
 
     def prepare_config_and_inputs_for_common(self):
@@ -518,6 +520,8 @@ class DeformableDetrModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineT
 
         # let's pick a random timm backbone
         config.backbone = "tf_mobilenetv3_small_075"
+        config.use_timm_backbone = True
+        config.backbone_config = None
 
         for model_class in self.all_model_classes:
             model = model_class(config)
diff --git a/tests/models/deta/test_modeling_deta.py b/tests/models/deta/test_modeling_deta.py
index 8025e65bd0..d8e16fca49 100644
--- a/tests/models/deta/test_modeling_deta.py
+++ b/tests/models/deta/test_modeling_deta.py
@@ -157,6 +157,7 @@ class DetaModelTester:
             assign_first_stage=assign_first_stage,
             assign_second_stage=assign_second_stage,
             backbone_config=resnet_config,
+            backbone=None,
         )
 
     def prepare_config_and_inputs_for_common(self, model_class_name="DetaModel"):
diff --git a/tests/models/detr/test_modeling_detr.py b/tests/models/detr/test_modeling_detr.py
index 9578e9cd90..de30d9db9b 100644
--- a/tests/models/detr/test_modeling_detr.py
+++ b/tests/models/detr/test_modeling_detr.py
@@ -130,6 +130,8 @@ class DetrModelTester:
             num_labels=self.num_labels,
             use_timm_backbone=False,
             backbone_config=resnet_config,
+            backbone=None,
+            use_pretrained_backbone=False,
         )
 
     def prepare_config_and_inputs_for_common(self):
@@ -622,7 +624,7 @@ class DetrModelIntegrationTestsTimmBackbone(unittest.TestCase):
             torch_device
         )
         expected_number_of_segments = 5
-        expected_first_segment = {"id": 1, "label_id": 17, "was_fused": False, "score": 0.994096}
+        expected_first_segment = {"id": 1, "label_id": 17, "was_fused": False, "score": 0.994097}
 
         number_of_unique_segments = len(torch.unique(results["segmentation"]))
         self.assertTrue(
diff --git a/tests/models/dpt/test_modeling_dpt_auto_backbone.py b/tests/models/dpt/test_modeling_dpt_auto_backbone.py
index aa240f0599..b2408465e4 100644
--- a/tests/models/dpt/test_modeling_dpt_auto_backbone.py
+++ b/tests/models/dpt/test_modeling_dpt_auto_backbone.py
@@ -95,6 +95,7 @@ class DPTModelTester:
     def get_config(self):
         return DPTConfig(
             backbone_config=self.get_backbone_config(),
+            backbone=None,
             neck_hidden_sizes=self.neck_hidden_sizes,
             fusion_hidden_size=self.fusion_hidden_size,
         )
diff --git a/tests/models/dpt/test_modeling_dpt_hybrid.py b/tests/models/dpt/test_modeling_dpt_hybrid.py
index 6898637951..2621c7438b 100644
--- a/tests/models/dpt/test_modeling_dpt_hybrid.py
+++ b/tests/models/dpt/test_modeling_dpt_hybrid.py
@@ -130,6 +130,7 @@ class DPTModelTester:
             initializer_range=self.initializer_range,
             is_hybrid=self.is_hybrid,
             backbone_config=backbone_config,
+            backbone=None,
             backbone_featmap_shape=self.backbone_featmap_shape,
             neck_hidden_sizes=self.neck_hidden_sizes,
         )
diff --git a/tests/models/mask2former/test_modeling_mask2former.py b/tests/models/mask2former/test_modeling_mask2former.py
index 1c48469479..fd9a513ab0 100644
--- a/tests/models/mask2former/test_modeling_mask2former.py
+++ b/tests/models/mask2former/test_modeling_mask2former.py
@@ -114,6 +114,7 @@ class Mask2FormerModelTester:
         config.backbone_config.hidden_size = 16
         config.backbone_config.num_channels = self.num_channels
         config.backbone_config.num_heads = [1, 1, 2, 2]
+        config.backbone = None
 
         config.hidden_dim = self.hidden_dim
         config.mask_feature_size = self.hidden_dim
diff --git a/tests/models/maskformer/test_modeling_maskformer.py b/tests/models/maskformer/test_modeling_maskformer.py
index 7e48d76142..16ff3caed4 100644
--- a/tests/models/maskformer/test_modeling_maskformer.py
+++ b/tests/models/maskformer/test_modeling_maskformer.py
@@ -102,6 +102,7 @@ class MaskFormerModelTester:
                 hidden_size=32,
                 num_heads=[1, 1, 2, 2],
             ),
+            backbone=None,
             decoder_config=DetrConfig(
                 decoder_ffn_dim=64,
                 decoder_layers=self.num_hidden_layers,
diff --git a/tests/models/oneformer/test_modeling_oneformer.py b/tests/models/oneformer/test_modeling_oneformer.py
index cb00170799..538ab33cbf 100644
--- a/tests/models/oneformer/test_modeling_oneformer.py
+++ b/tests/models/oneformer/test_modeling_oneformer.py
@@ -133,6 +133,7 @@ class OneFormerModelTester:
         config.backbone_config.hidden_size = 16
         config.backbone_config.num_channels = self.num_channels
         config.backbone_config.num_heads = [1, 1, 2, 2]
+        config.backbone = None
 
         config.hidden_dim = self.hidden_dim
         config.mask_dim = self.hidden_dim
diff --git a/tests/models/table_transformer/test_modeling_table_transformer.py b/tests/models/table_transformer/test_modeling_table_transformer.py
index 851ef36a1a..bb869d9422 100644
--- a/tests/models/table_transformer/test_modeling_table_transformer.py
+++ b/tests/models/table_transformer/test_modeling_table_transformer.py
@@ -131,6 +131,8 @@ class TableTransformerModelTester:
             num_labels=self.num_labels,
             use_timm_backbone=False,
             backbone_config=resnet_config,
+            backbone=None,
+            use_pretrained_backbone=False,
         )
 
     def prepare_config_and_inputs_for_common(self):
diff --git a/tests/models/tvp/test_modeling_tvp.py b/tests/models/tvp/test_modeling_tvp.py
index 14ec02ed6f..c7bcc148a1 100644
--- a/tests/models/tvp/test_modeling_tvp.py
+++ b/tests/models/tvp/test_modeling_tvp.py
@@ -124,6 +124,7 @@ class TVPModelTester:
         )
         return TvpConfig(
             backbone_config=resnet_config,
+            backbone=None,
             alpha=self.alpha,
             beta=self.beta,
             visual_prompter_type=self.visual_prompter_type,
diff --git a/tests/models/upernet/test_modeling_upernet.py b/tests/models/upernet/test_modeling_upernet.py
index aeeba191b6..c51b254ed5 100644
--- a/tests/models/upernet/test_modeling_upernet.py
+++ b/tests/models/upernet/test_modeling_upernet.py
@@ -105,6 +105,7 @@ class UperNetModelTester:
     def get_config(self):
         return UperNetConfig(
             backbone_config=self.get_backbone_config(),
+            backbone=None,
             hidden_size=64,
             pool_scales=[1, 2, 3, 6],
             use_auxiliary_head=True,
diff --git a/tests/models/vit_hybrid/test_modeling_vit_hybrid.py b/tests/models/vit_hybrid/test_modeling_vit_hybrid.py
index 870a4c8335..567394c979 100644
--- a/tests/models/vit_hybrid/test_modeling_vit_hybrid.py
+++ b/tests/models/vit_hybrid/test_modeling_vit_hybrid.py
@@ -122,6 +122,7 @@ class ViTHybridModelTester:
             initializer_range=self.initializer_range,
             backbone_featmap_shape=self.backbone_featmap_shape,
             backbone_config=backbone_config,
+            backbone=None,
         )
 
     def create_and_check_model(self, config, pixel_values, labels):
diff --git a/tests/models/vitmatte/test_modeling_vitmatte.py b/tests/models/vitmatte/test_modeling_vitmatte.py
index c9446b116f..c93e82bafb 100644
--- a/tests/models/vitmatte/test_modeling_vitmatte.py
+++ b/tests/models/vitmatte/test_modeling_vitmatte.py
@@ -111,6 +111,7 @@ class VitMatteModelTester:
     def get_config(self):
         return VitMatteConfig(
             backbone_config=self.get_backbone_config(),
+            backbone=None,
             hidden_size=self.hidden_size,
             fusion_hidden_sizes=self.fusion_hidden_sizes,
         )
diff --git a/tests/utils/test_backbone_utils.py b/tests/utils/test_backbone_utils.py
index 488cd46759..0c3ff4866e 100644
--- a/tests/utils/test_backbone_utils.py
+++ b/tests/utils/test_backbone_utils.py
@@ -16,11 +16,21 @@ import unittest
 
 import pytest
 
+from transformers import DetrConfig, MaskFormerConfig
+from transformers.testing_utils import require_torch, slow
 from transformers.utils.backbone_utils import (
     BackboneMixin,
     get_aligned_output_features_output_indices,
+    load_backbone,
     verify_out_features_out_indices,
 )
+from transformers.utils.import_utils import is_torch_available
+
+
+if is_torch_available():
+    import torch
+
+    from transformers import BertPreTrainedModel
 
 
 class BackboneUtilsTester(unittest.TestCase):
@@ -126,3 +136,75 @@ class BackboneUtilsTester(unittest.TestCase):
         backbone.out_indices = [-3, -1]
         self.assertEqual(backbone.out_features, ["a", "c"])
         self.assertEqual(backbone.out_indices, [-3, -1])
+
+    @slow
+    @require_torch
+    def test_load_backbone_in_new_model(self):
+        """
+        Tests that new model can be created, with its weights instantiated and pretrained backbone weights loaded.
+        """
+
+        # Inherit from PreTrainedModel to ensure that the weights are initialized
+        class NewModel(BertPreTrainedModel):
+            def __init__(self, config):
+                super().__init__(config)
+                self.backbone = load_backbone(config)
+                self.layer_0 = torch.nn.Linear(config.hidden_size, config.hidden_size)
+                self.layer_1 = torch.nn.Linear(config.hidden_size, config.hidden_size)
+
+        def get_equal_not_equal_weights(model_0, model_1):
+            equal_weights = []
+            not_equal_weights = []
+            for (k0, v0), (k1, v1) in zip(model_0.named_parameters(), model_1.named_parameters()):
+                self.assertEqual(k0, k1)
+                weights_are_equal = torch.allclose(v0, v1)
+                if weights_are_equal:
+                    equal_weights.append(k0)
+                else:
+                    not_equal_weights.append(k0)
+            return equal_weights, not_equal_weights
+
+        config = MaskFormerConfig(use_pretrained_backbone=False, backbone="microsoft/resnet-18")
+        model_0 = NewModel(config)
+        model_1 = NewModel(config)
+        equal_weights, not_equal_weights = get_equal_not_equal_weights(model_0, model_1)
+
+        # Norm layers are always initialized with the same weights
+        equal_weights = [w for w in equal_weights if "normalization" not in w]
+        self.assertEqual(len(equal_weights), 0)
+        self.assertEqual(len(not_equal_weights), 24)
+
+        # Now we create a new model with backbone weights that are pretrained
+        config.use_pretrained_backbone = True
+        model_0 = NewModel(config)
+        model_1 = NewModel(config)
+        equal_weights, not_equal_weights = get_equal_not_equal_weights(model_0, model_1)
+
+        # Norm layers are always initialized with the same weights
+        equal_weights = [w for w in equal_weights if "normalization" not in w]
+        self.assertEqual(len(equal_weights), 20)
+        # Linear layers are still initialized randomly
+        self.assertEqual(len(not_equal_weights), 4)
+
+        # Check loading in timm backbone
+        config = DetrConfig(use_pretrained_backbone=False, backbone="resnet18", use_timm_backbone=True)
+        model_0 = NewModel(config)
+        model_1 = NewModel(config)
+        equal_weights, not_equal_weights = get_equal_not_equal_weights(model_0, model_1)
+
+        # Norm layers are always initialized with the same weights
+        equal_weights = [w for w in equal_weights if "bn" not in w and "downsample.1" not in w]
+        self.assertEqual(len(equal_weights), 0)
+        self.assertEqual(len(not_equal_weights), 24)
+
+        # Now we create a new model with backbone weights that are pretrained
+        config.use_pretrained_backbone = True
+        model_0 = NewModel(config)
+        model_1 = NewModel(config)
+        equal_weights, not_equal_weights = get_equal_not_equal_weights(model_0, model_1)
+
+        # Norm layers are always initialized with the same weights
+        equal_weights = [w for w in equal_weights if "bn" not in w and "downsample.1" not in w]
+        self.assertEqual(len(equal_weights), 20)
+        # Linear layers are still initialized randomly
+        self.assertEqual(len(not_equal_weights), 4)
diff --git a/utils/check_config_attributes.py b/utils/check_config_attributes.py
index 2089906086..f1d8b74141 100644
--- a/utils/check_config_attributes.py
+++ b/utils/check_config_attributes.py
@@ -219,6 +219,7 @@ def check_attribute_being_used(config_class, attributes, default_value, source_s
         "out_features",
         "out_indices",
         "sampling_rate",
+        "use_pretrained_backbone",
     ]
     attributes_used_in_generation = ["encoder_no_repeat_ngram_size"]