From aafa7ce72b65c730788c122a72a974e464409e9a Mon Sep 17 00:00:00 2001 From: amyeroberts <22614925+amyeroberts@users.noreply.github.com> Date: Fri, 26 Apr 2024 16:55:24 +0100 Subject: [PATCH] [`DETR`] Remove timm hardcoded logic in modeling files (#29038) * Enable instantiating model with pretrained backbone weights * Clarify pretrained import * Use load_backbone instead * Add backbone_kwargs to config * Fix up * Add tests * Tidy up * Enable instantiating model with pretrained backbone weights * Update tests so backbone checkpoint isn't passed in * Clarify pretrained import * Update configs - docs and validation check * Update src/transformers/utils/backbone_utils.py Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> * Clarify exception message * Update config init in tests * Add test for when use_timm_backbone=True * Use load_backbone instead * Add use_timm_backbone to the model configs * Add backbone_kwargs to config * Pass kwargs to constructors * Draft * Fix tests * Add back timm - weight naming * More tidying up * Whoops * Tidy up * Handle when kwargs are none * Update tests * Revert test changes * Deformable detr test - don't use default * Don't mutate; correct model attributes * Add some clarifying comments * nit - grammar is hard --------- Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> --- .../configuration_conditional_detr.py | 14 ++++-- .../modeling_conditional_detr.py | 18 ++++--- .../configuration_deformable_detr.py | 12 ++++- .../modeling_deformable_detr.py | 49 ++++++++++++------- .../models/detr/configuration_detr.py | 14 +++++- src/transformers/models/detr/modeling_detr.py | 16 ++++-- src/transformers/models/dpt/modeling_dpt.py | 6 +-- .../configuration_table_transformer.py | 14 +++++- .../modeling_table_transformer.py | 14 ++++-- .../timm_backbone/modeling_timm_backbone.py | 7 ++- .../test_modeling_conditional_detr.py | 10 ++++ .../test_modeling_deformable_detr.py | 11 ++++- 
tests/models/detr/test_modeling_detr.py | 11 +++++ .../test_modeling_table_transformer.py | 8 +++ 14 files changed, 156 insertions(+), 48 deletions(-) diff --git a/src/transformers/models/conditional_detr/configuration_conditional_detr.py b/src/transformers/models/conditional_detr/configuration_conditional_detr.py index 945e5edb32..4f95de3582 100644 --- a/src/transformers/models/conditional_detr/configuration_conditional_detr.py +++ b/src/transformers/models/conditional_detr/configuration_conditional_detr.py @@ -192,10 +192,16 @@ class ConditionalDetrConfig(PretrainedConfig): if backbone_config is not None and use_timm_backbone: raise ValueError("You can't specify both `backbone_config` and `use_timm_backbone`.") - if backbone_kwargs is not None and backbone_kwargs and backbone_config is not None: - raise ValueError("You can't specify both `backbone_kwargs` and `backbone_config`.") - - if not use_timm_backbone: + # We default to values which were previously hard-coded in the model. This enables configurability of the config + # while keeping the default behavior the same. + if use_timm_backbone and backbone_kwargs is None: + backbone_kwargs = {} + if dilation: + backbone_kwargs["output_stride"] = 16 + backbone_kwargs["out_indices"] = [1, 2, 3, 4] + backbone_kwargs["in_chans"] = num_channels + # Backwards compatibility + elif not use_timm_backbone and backbone in (None, "resnet50"): if backbone_config is None: logger.info("`backbone_config` is `None`. 
Initializing the config with the default `ResNet` backbone.") backbone_config = CONFIG_MAPPING["resnet"](out_features=["stage4"]) diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index d8ff371fad..d723d3866e 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -338,12 +338,12 @@ def replace_batch_norm(model): replace_batch_norm(module) -# Copied from transformers.models.detr.modeling_detr.DetrConvEncoder +# Copied from transformers.models.detr.modeling_detr.DetrConvEncoder with Detr->ConditionalDetr class ConditionalDetrConvEncoder(nn.Module): """ Convolutional backbone, using either the AutoBackbone API or one from the timm library. - nn.BatchNorm2d layers are replaced by DetrFrozenBatchNorm2d as defined above. + nn.BatchNorm2d layers are replaced by ConditionalDetrFrozenBatchNorm2d as defined above. """ @@ -352,17 +352,23 @@ class ConditionalDetrConvEncoder(nn.Module): self.config = config + # For backwards compatibility we have to use the timm library directly instead of the AutoBackbone API if config.use_timm_backbone: + # We default to values which were previously hard-coded. This enables configurability from the config + # using backbone arguments, while keeping the default behavior the same. 
requires_backends(self, ["timm"]) - kwargs = {} + kwargs = getattr(config, "backbone_kwargs", {}) + kwargs = {} if kwargs is None else kwargs.copy() + out_indices = kwargs.pop("out_indices", (1, 2, 3, 4)) + num_channels = kwargs.pop("in_chans", config.num_channels) if config.dilation: - kwargs["output_stride"] = 16 + kwargs["output_stride"] = kwargs.get("output_stride", 16) backbone = create_model( config.backbone, pretrained=config.use_pretrained_backbone, features_only=True, - out_indices=(1, 2, 3, 4), - in_chans=config.num_channels, + out_indices=out_indices, + in_chans=num_channels, **kwargs, ) else: diff --git a/src/transformers/models/deformable_detr/configuration_deformable_detr.py b/src/transformers/models/deformable_detr/configuration_deformable_detr.py index 6d32f6220d..3f3ffff69f 100644 --- a/src/transformers/models/deformable_detr/configuration_deformable_detr.py +++ b/src/transformers/models/deformable_detr/configuration_deformable_detr.py @@ -212,7 +212,16 @@ class DeformableDetrConfig(PretrainedConfig): if backbone_kwargs is not None and backbone_kwargs and backbone_config is not None: raise ValueError("You can't specify both `backbone_kwargs` and `backbone_config`.") - if not use_timm_backbone: + # We default to values which were previously hard-coded in the model. This enables configurability of the config + # while keeping the default behavior the same. + if use_timm_backbone and backbone_kwargs is None: + backbone_kwargs = {} + if dilation: + backbone_kwargs["output_stride"] = 16 + backbone_kwargs["out_indices"] = [2, 3, 4] if num_feature_levels > 1 else [4] + backbone_kwargs["in_chans"] = num_channels + # Backwards compatibility + elif not use_timm_backbone and backbone in (None, "resnet50"): if backbone_config is None: logger.info("`backbone_config` is `None`. 
Initializing the config with the default `ResNet` backbone.") backbone_config = CONFIG_MAPPING["resnet"](out_features=["stage4"]) @@ -220,6 +229,7 @@ class DeformableDetrConfig(PretrainedConfig): backbone_model_type = backbone_config.get("model_type") config_class = CONFIG_MAPPING[backbone_model_type] backbone_config = config_class.from_dict(backbone_config) + self.use_timm_backbone = use_timm_backbone self.backbone_config = backbone_config self.num_channels = num_channels diff --git a/src/transformers/models/deformable_detr/modeling_deformable_detr.py b/src/transformers/models/deformable_detr/modeling_deformable_detr.py index c0ac7cffc7..7b2bbb9b12 100755 --- a/src/transformers/models/deformable_detr/modeling_deformable_detr.py +++ b/src/transformers/models/deformable_detr/modeling_deformable_detr.py @@ -88,11 +88,31 @@ def load_cuda_kernels(): if is_vision_available(): from transformers.image_transforms import center_to_corners_format + if is_accelerate_available(): from accelerate import PartialState from accelerate.utils import reduce +if is_timm_available(): + from timm import create_model + + +if is_scipy_available(): + from scipy.optimize import linear_sum_assignment + + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "DeformableDetrConfig" +_CHECKPOINT_FOR_DOC = "sensetime/deformable-detr" + +DEFORMABLE_DETR_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "sensetime/deformable-detr", + # See all Deformable DETR models at https://huggingface.co/models?filter=deformable-detr +] + + class MultiScaleDeformableAttentionFunction(Function): @staticmethod def forward( @@ -141,21 +161,6 @@ class MultiScaleDeformableAttentionFunction(Function): return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None -if is_scipy_available(): - from scipy.optimize import linear_sum_assignment - -if is_timm_available(): - from timm import create_model - -logger = logging.get_logger(__name__) - -_CONFIG_FOR_DOC = "DeformableDetrConfig" -_CHECKPOINT_FOR_DOC = 
"sensetime/deformable-detr" - - -from ..deprecated._archive_maps import DEFORMABLE_DETR_PRETRAINED_MODEL_ARCHIVE_LIST # noqa: F401, E402 - - @dataclass class DeformableDetrDecoderOutput(ModelOutput): """ @@ -420,17 +425,23 @@ class DeformableDetrConvEncoder(nn.Module): self.config = config + # For backwards compatibility we have to use the timm library directly instead of the AutoBackbone API if config.use_timm_backbone: + # We default to values which were previously hard-coded. This enables configurability from the config + # using backbone arguments, while keeping the default behavior the same. requires_backends(self, ["timm"]) - kwargs = {} + kwargs = getattr(config, "backbone_kwargs", {}) + kwargs = {} if kwargs is None else kwargs.copy() + out_indices = kwargs.pop("out_indices", (2, 3, 4) if config.num_feature_levels > 1 else (4,)) + num_channels = kwargs.pop("in_chans", config.num_channels) if config.dilation: - kwargs["output_stride"] = 16 + kwargs["output_stride"] = kwargs.get("output_stride", 16) backbone = create_model( config.backbone, pretrained=config.use_pretrained_backbone, features_only=True, - out_indices=(2, 3, 4) if config.num_feature_levels > 1 else (4,), - in_chans=config.num_channels, + out_indices=out_indices, + in_chans=num_channels, **kwargs, ) else: diff --git a/src/transformers/models/detr/configuration_detr.py b/src/transformers/models/detr/configuration_detr.py index 9b9b5afacd..db180ef1d4 100644 --- a/src/transformers/models/detr/configuration_detr.py +++ b/src/transformers/models/detr/configuration_detr.py @@ -193,7 +193,16 @@ class DetrConfig(PretrainedConfig): if backbone_kwargs is not None and backbone_kwargs and backbone_config is not None: raise ValueError("You can't specify both `backbone_kwargs` and `backbone_config`.") - if not use_timm_backbone: + # We default to values which were previously hard-coded in the model. This enables configurability of the config + # while keeping the default behavior the same. 
+ if use_timm_backbone and backbone_kwargs is None: + backbone_kwargs = {} + if dilation: + backbone_kwargs["output_stride"] = 16 + backbone_kwargs["out_indices"] = [1, 2, 3, 4] + backbone_kwargs["in_chans"] = num_channels + # Backwards compatibility + elif not use_timm_backbone and backbone in (None, "resnet50"): if backbone_config is None: logger.info("`backbone_config` is `None`. Initializing the config with the default `ResNet` backbone.") backbone_config = CONFIG_MAPPING["resnet"](out_features=["stage4"]) @@ -201,8 +210,9 @@ class DetrConfig(PretrainedConfig): backbone_model_type = backbone_config.get("model_type") config_class = CONFIG_MAPPING[backbone_model_type] backbone_config = config_class.from_dict(backbone_config) + backbone = None # set timm attributes to None - dilation, backbone, use_pretrained_backbone = None, None, None + dilation = None self.use_timm_backbone = use_timm_backbone self.backbone_config = backbone_config diff --git a/src/transformers/models/detr/modeling_detr.py b/src/transformers/models/detr/modeling_detr.py index d7fcdfc5bc..0da702db8b 100644 --- a/src/transformers/models/detr/modeling_detr.py +++ b/src/transformers/models/detr/modeling_detr.py @@ -49,9 +49,11 @@ if is_accelerate_available(): if is_scipy_available(): from scipy.optimize import linear_sum_assignment + if is_timm_available(): from timm import create_model + if is_vision_available(): from transformers.image_transforms import center_to_corners_format @@ -345,17 +347,23 @@ class DetrConvEncoder(nn.Module): self.config = config + # For backwards compatibility we have to use the timm library directly instead of the AutoBackbone API if config.use_timm_backbone: + # We default to values which were previously hard-coded. This enables configurability from the config + # using backbone arguments, while keeping the default behavior the same. 
requires_backends(self, ["timm"]) - kwargs = {} + kwargs = getattr(config, "backbone_kwargs", {}) + kwargs = {} if kwargs is None else kwargs.copy() + out_indices = kwargs.pop("out_indices", (1, 2, 3, 4)) + num_channels = kwargs.pop("in_chans", config.num_channels) if config.dilation: - kwargs["output_stride"] = 16 + kwargs["output_stride"] = kwargs.get("output_stride", 16) backbone = create_model( config.backbone, pretrained=config.use_pretrained_backbone, features_only=True, - out_indices=(1, 2, 3, 4), - in_chans=config.num_channels, + out_indices=out_indices, + in_chans=num_channels, **kwargs, ) else: diff --git a/src/transformers/models/dpt/modeling_dpt.py b/src/transformers/models/dpt/modeling_dpt.py index aad3330279..ef6c8bb853 100755 --- a/src/transformers/models/dpt/modeling_dpt.py +++ b/src/transformers/models/dpt/modeling_dpt.py @@ -1075,10 +1075,10 @@ class DPTForDepthEstimation(DPTPreTrainedModel): super().__init__(config) self.backbone = None - if config.backbone_config is not None and config.is_hybrid is False: - self.backbone = load_backbone(config) - else: + if config.is_hybrid or config.backbone_config is None: self.dpt = DPTModel(config, add_pooling_layer=False) + else: + self.backbone = load_backbone(config) # Neck self.neck = DPTNeck(config) diff --git a/src/transformers/models/table_transformer/configuration_table_transformer.py b/src/transformers/models/table_transformer/configuration_table_transformer.py index 9a2ff6bbab..4963396024 100644 --- a/src/transformers/models/table_transformer/configuration_table_transformer.py +++ b/src/transformers/models/table_transformer/configuration_table_transformer.py @@ -193,7 +193,16 @@ class TableTransformerConfig(PretrainedConfig): if backbone_kwargs is not None and backbone_kwargs and backbone_config is not None: raise ValueError("You can't specify both `backbone_kwargs` and `backbone_config`.") - if not use_timm_backbone: + # We default to values which were previously hard-coded in the model. 
This enables configurability of the config + # while keeping the default behavior the same. + if use_timm_backbone and backbone_kwargs is None: + backbone_kwargs = {} + if dilation: + backbone_kwargs["output_stride"] = 16 + backbone_kwargs["out_indices"] = [1, 2, 3, 4] + backbone_kwargs["in_chans"] = num_channels + # Backwards compatibility + elif not use_timm_backbone and backbone in (None, "resnet50"): if backbone_config is None: logger.info("`backbone_config` is `None`. Initializing the config with the default `ResNet` backbone.") backbone_config = CONFIG_MAPPING["resnet"](out_features=["stage4"]) @@ -201,8 +210,9 @@ class TableTransformerConfig(PretrainedConfig): backbone_model_type = backbone_config.get("model_type") config_class = CONFIG_MAPPING[backbone_model_type] backbone_config = config_class.from_dict(backbone_config) + backbone = None # set timm attributes to None - dilation, backbone, use_pretrained_backbone = None, None, None + dilation = None self.use_timm_backbone = use_timm_backbone self.backbone_config = backbone_config diff --git a/src/transformers/models/table_transformer/modeling_table_transformer.py b/src/transformers/models/table_transformer/modeling_table_transformer.py index 8e577a65a5..9a684ee121 100644 --- a/src/transformers/models/table_transformer/modeling_table_transformer.py +++ b/src/transformers/models/table_transformer/modeling_table_transformer.py @@ -279,17 +279,23 @@ class TableTransformerConvEncoder(nn.Module): self.config = config + # For backwards compatibility we have to use the timm library directly instead of the AutoBackbone API if config.use_timm_backbone: + # We default to values which were previously hard-coded. This enables configurability from the config + # using backbone arguments, while keeping the default behavior the same. 
requires_backends(self, ["timm"]) - kwargs = {} + kwargs = getattr(config, "backbone_kwargs", {}) + kwargs = {} if kwargs is None else kwargs.copy() + out_indices = kwargs.pop("out_indices", (1, 2, 3, 4)) + num_channels = kwargs.pop("in_chans", config.num_channels) if config.dilation: - kwargs["output_stride"] = 16 + kwargs["output_stride"] = kwargs.get("output_stride", 16) backbone = create_model( config.backbone, pretrained=config.use_pretrained_backbone, features_only=True, - out_indices=(1, 2, 3, 4), - in_chans=config.num_channels, + out_indices=out_indices, + in_chans=num_channels, **kwargs, ) else: diff --git a/src/transformers/models/timm_backbone/modeling_timm_backbone.py b/src/transformers/models/timm_backbone/modeling_timm_backbone.py index 0c6fe67b75..e8e0b28e04 100644 --- a/src/transformers/models/timm_backbone/modeling_timm_backbone.py +++ b/src/transformers/models/timm_backbone/modeling_timm_backbone.py @@ -63,12 +63,13 @@ class TimmBackbone(PreTrainedModel, BackboneMixin): # We just take the final layer by default. This matches the default for the transformers models. out_indices = config.out_indices if getattr(config, "out_indices", None) is not None else (-1,) + in_chans = kwargs.pop("in_chans", config.num_channels) self._backbone = timm.create_model( config.backbone, pretrained=pretrained, # This is currently not possible for transformer architectures. features_only=config.features_only, - in_chans=config.num_channels, + in_chans=in_chans, out_indices=out_indices, **kwargs, ) @@ -79,7 +80,9 @@ class TimmBackbone(PreTrainedModel, BackboneMixin): # These are used to control the output of the model when called. If output_hidden_states is True, then # return_layers is modified to include all layers. 
- self._return_layers = self._backbone.return_layers + self._return_layers = { + layer["module"]: str(layer["index"]) for layer in self._backbone.feature_info.get_dicts() + } self._all_layers = {layer["module"]: str(i) for i, layer in enumerate(self._backbone.feature_info.info)} super()._init_backbone(config) diff --git a/tests/models/conditional_detr/test_modeling_conditional_detr.py b/tests/models/conditional_detr/test_modeling_conditional_detr.py index d1152ed862..c3f77614b4 100644 --- a/tests/models/conditional_detr/test_modeling_conditional_detr.py +++ b/tests/models/conditional_detr/test_modeling_conditional_detr.py @@ -444,7 +444,9 @@ class ConditionalDetrModelTest(ModelTesterMixin, GenerationTesterMixin, Pipeline # let's pick a random timm backbone config.backbone = "tf_mobilenetv3_small_075" + config.backbone_config = None config.use_timm_backbone = True + config.backbone_kwargs = {"out_indices": [2, 3, 4]} for model_class in self.all_model_classes: model = model_class(config) @@ -460,6 +462,14 @@ class ConditionalDetrModelTest(ModelTesterMixin, GenerationTesterMixin, Pipeline self.model_tester.num_labels, ) self.assertEqual(outputs.logits.shape, expected_shape) + # Confirm out_indices was propagated to backbone + self.assertEqual(len(model.model.backbone.conv_encoder.intermediate_channel_sizes), 3) + elif model_class.__name__ == "ConditionalDetrForSegmentation": + # Confirm out_indices was propagated to backbone + self.assertEqual(len(model.conditional_detr.model.backbone.conv_encoder.intermediate_channel_sizes), 3) + else: + # Confirm out_indices was propagated to backbone + self.assertEqual(len(model.backbone.conv_encoder.intermediate_channel_sizes), 3) self.assertTrue(outputs) diff --git a/tests/models/deformable_detr/test_modeling_deformable_detr.py b/tests/models/deformable_detr/test_modeling_deformable_detr.py index 7a83c4f1ed..36be099790 100644 --- a/tests/models/deformable_detr/test_modeling_deformable_detr.py +++ 
b/tests/models/deformable_detr/test_modeling_deformable_detr.py @@ -521,8 +521,9 @@ class DeformableDetrModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineT # let's pick a random timm backbone config.backbone = "tf_mobilenetv3_small_075" - config.use_timm_backbone = True config.backbone_config = None + config.use_timm_backbone = True + config.backbone_kwargs = {"out_indices": [1, 2, 3, 4]} for model_class in self.all_model_classes: model = model_class(config) @@ -538,6 +539,14 @@ class DeformableDetrModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineT self.model_tester.num_labels, ) self.assertEqual(outputs.logits.shape, expected_shape) + # Confirm out_indices was propagated to backbone + self.assertEqual(len(model.model.backbone.conv_encoder.intermediate_channel_sizes), 4) + elif model_class.__name__ == "ConditionalDetrForSegmentation": + # Confirm out_indices was propagated to backbone + self.assertEqual(len(model.deformable_detr.model.backbone.conv_encoder.intermediate_channel_sizes), 4) + else: + # Confirm out_indices was propagated to backbone + self.assertEqual(len(model.backbone.conv_encoder.intermediate_channel_sizes), 4) self.assertTrue(outputs) diff --git a/tests/models/detr/test_modeling_detr.py b/tests/models/detr/test_modeling_detr.py index 59b071e031..27092c626d 100644 --- a/tests/models/detr/test_modeling_detr.py +++ b/tests/models/detr/test_modeling_detr.py @@ -444,6 +444,9 @@ class DetrModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin # let's pick a random timm backbone config.backbone = "tf_mobilenetv3_small_075" + config.backbone_config = None + config.use_timm_backbone = True + config.backbone_kwargs = {"out_indices": [2, 3, 4]} for model_class in self.all_model_classes: model = model_class(config) @@ -459,6 +462,14 @@ class DetrModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin self.model_tester.num_labels + 1, ) self.assertEqual(outputs.logits.shape, expected_shape) + # Confirm 
out_indices was propagated to backbone + self.assertEqual(len(model.model.backbone.conv_encoder.intermediate_channel_sizes), 3) + elif model_class.__name__ == "DetrForSegmentation": + # Confirm out_indices was propagated to backbone + self.assertEqual(len(model.detr.model.backbone.conv_encoder.intermediate_channel_sizes), 3) + else: + # Confirm out_indices was propagated to backbone + self.assertEqual(len(model.backbone.conv_encoder.intermediate_channel_sizes), 3) self.assertTrue(outputs) diff --git a/tests/models/table_transformer/test_modeling_table_transformer.py b/tests/models/table_transformer/test_modeling_table_transformer.py index 79da1d1910..d323083eb7 100644 --- a/tests/models/table_transformer/test_modeling_table_transformer.py +++ b/tests/models/table_transformer/test_modeling_table_transformer.py @@ -456,6 +456,9 @@ class TableTransformerModelTest(ModelTesterMixin, GenerationTesterMixin, Pipelin # let's pick a random timm backbone config.backbone = "tf_mobilenetv3_small_075" + config.backbone_config = None + config.use_timm_backbone = True + config.backbone_kwargs = {"out_indices": [2, 3, 4]} for model_class in self.all_model_classes: model = model_class(config) @@ -471,6 +474,11 @@ class TableTransformerModelTest(ModelTesterMixin, GenerationTesterMixin, Pipelin self.model_tester.num_labels + 1, ) self.assertEqual(outputs.logits.shape, expected_shape) + # Confirm out_indices was propagated to backbone + self.assertEqual(len(model.model.backbone.conv_encoder.intermediate_channel_sizes), 3) + else: + # Confirm out_indices was propagated to backbone + self.assertEqual(len(model.backbone.conv_encoder.intermediate_channel_sizes), 3) self.assertTrue(outputs)