Add AutoBackbone + ResNetBackbone (#20229)

* Add ResNetBackbone
* Define channels and strides as property
* Remove file
* Add test for backbone
* Update BackboneOutput class
* Remove strides property
* Fix docstring
* Add backbones to SHOULD_HAVE_THEIR_OWN_PAGE
* Fix auto mapping name
* Add sanity check for out_features
* Set stage names based on depths
* Update to tuple

Co-authored-by: Niels Rogge <nielsrogge@Nielss-MacBook-Pro.local>
2022-11-17 15:43:20 +01:00
parent 904ac21020
commit 6b217c52e6
10 changed files with 160 additions and 2 deletions
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -920,6 +920,7 @@ else:
            "MODEL_WITH_LM_HEAD_MAPPING",
            "MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING",
            "AutoModel",
+            "AutoBackbone",
            "AutoModelForAudioClassification",
            "AutoModelForAudioFrameClassification",
            "AutoModelForAudioXVector",
@@ -1877,6 +1878,7 @@ else:
            "ResNetForImageClassification",
            "ResNetModel",
            "ResNetPreTrainedModel",
+            "ResNetBackbone",
        ]
    )
    _import_structure["models.retribert"].extend(
@@ -3946,6 +3948,7 @@ if TYPE_CHECKING:
            MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING,
            MODEL_MAPPING,
            MODEL_WITH_LM_HEAD_MAPPING,
+            AutoBackbone,
            AutoModel,
            AutoModelForAudioClassification,
            AutoModelForAudioFrameClassification,
@@ -4730,6 +4733,7 @@ if TYPE_CHECKING:
        )
        from .models.resnet import (
            RESNET_PRETRAINED_MODEL_ARCHIVE_LIST,
+            ResNetBackbone,
            ResNetForImageClassification,
            ResNetModel,
            ResNetPreTrainedModel,
--- a/src/transformers/modeling_outputs.py
+++ b/src/transformers/modeling_outputs.py
@@ -1263,3 +1263,16 @@ class XVectorOutput(ModelOutput):
    embeddings: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None
+
+
+@dataclass
+class BackboneOutput(ModelOutput):
+    """
+    Base class for outputs of backbones.
+
+    Args:
+        feature_maps (`tuple(torch.FloatTensor)` of shape `(batch_size, num_channels, height, width)`):
+            Feature maps of the stages selected for output, one tensor per selected stage.
+    """
+
+    feature_maps: Tuple[torch.FloatTensor] = None
--- a/src/transformers/models/auto/__init__.py
+++ b/src/transformers/models/auto/__init__.py
@@ -73,6 +73,7 @@ else:
        "MODEL_WITH_LM_HEAD_MAPPING",
        "MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING",
        "AutoModel",
+        "AutoBackbone",
        "AutoModelForAudioClassification",
        "AutoModelForAudioFrameClassification",
        "AutoModelForAudioXVector",
@@ -225,6 +226,7 @@ if TYPE_CHECKING:
            MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING,
            MODEL_MAPPING,
            MODEL_WITH_LM_HEAD_MAPPING,
+            AutoBackbone,
            AutoModel,
            AutoModelForAudioClassification,
            AutoModelForAudioFrameClassification,
--- a/src/transformers/models/auto/modeling_auto.py
+++ b/src/transformers/models/auto/modeling_auto.py
@@ -836,6 +836,13 @@ _MODEL_FOR_ZERO_SHOT_IMAGE_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
    ]
 )

+MODEL_FOR_BACKBONE_MAPPING_NAMES = OrderedDict(
+    [
+        # Backbone mapping: key -> name of the backbone class; wrapped into MODEL_FOR_BACKBONE_MAPPING below
+        ("resnet", "ResNetBackbone"),
+    ]
+)
+
 MODEL_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_MAPPING_NAMES)
 MODEL_FOR_PRETRAINING_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FOR_PRETRAINING_MAPPING_NAMES)
 MODEL_WITH_LM_HEAD_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_WITH_LM_HEAD_MAPPING_NAMES)
@@ -903,6 +910,8 @@ MODEL_FOR_AUDIO_FRAME_CLASSIFICATION_MAPPING = _LazyAutoMapping(
 )
 MODEL_FOR_AUDIO_XVECTOR_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FOR_AUDIO_XVECTOR_MAPPING_NAMES)

+MODEL_FOR_BACKBONE_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FOR_BACKBONE_MAPPING_NAMES)
+

 class AutoModel(_BaseAutoModelClass):
    _model_mapping = MODEL_MAPPING
@@ -1126,6 +1135,10 @@ class AutoModelForAudioXVector(_BaseAutoModelClass):
    _model_mapping = MODEL_FOR_AUDIO_XVECTOR_MAPPING


+class AutoBackbone(_BaseAutoModelClass):
+    _model_mapping = MODEL_FOR_BACKBONE_MAPPING  # lazy mapping of backbone classes (ResNet only so far)
+
+
 AutoModelForAudioXVector = auto_class_update(AutoModelForAudioXVector, head_doc="audio retrieval via x-vector")


--- a/src/transformers/models/resnet/__init__.py
+++ b/src/transformers/models/resnet/__init__.py
@@ -36,6 +36,7 @@ else:
        "ResNetForImageClassification",
        "ResNetModel",
        "ResNetPreTrainedModel",
+        "ResNetBackbone",
    ]

 try:
@@ -63,6 +64,7 @@ if TYPE_CHECKING:
    else:
        from .modeling_resnet import (
            RESNET_PRETRAINED_MODEL_ARCHIVE_LIST,
+            ResNetBackbone,
            ResNetForImageClassification,
            ResNetModel,
            ResNetPreTrainedModel,
--- a/src/transformers/models/resnet/configuration_resnet.py
+++ b/src/transformers/models/resnet/configuration_resnet.py
@@ -58,6 +58,9 @@ class ResNetConfig(PretrainedConfig):
            are supported.
        downsample_in_first_stage (`bool`, *optional*, defaults to `False`):
            If `True`, the first stage will downsample the inputs using a `stride` of 2.
+        out_features (`List[str]`, *optional*):
+            If used as backbone, list of features to output. Can be any of `"stem"`, `"stage1"`, `"stage2"`,
+            `"stage3"`, `"stage4"`.

    Example:
    ```python
@@ -85,6 +88,7 @@ class ResNetConfig(PretrainedConfig):
        layer_type="bottleneck",
        hidden_act="relu",
        downsample_in_first_stage=False,
+        out_features=None,
        **kwargs
    ):
        super().__init__(**kwargs)
@@ -97,6 +101,16 @@ class ResNetConfig(PretrainedConfig):
        self.layer_type = layer_type
        self.hidden_act = hidden_act
        self.downsample_in_first_stage = downsample_in_first_stage
+        self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(depths) + 1)]
+        if out_features is not None:
+            if not isinstance(out_features, list):
+                raise ValueError("out_features should be a list")
+            for feature in out_features:
+                if feature not in self.stage_names:
+                    raise ValueError(
+                        f"Feature {feature} is not a valid feature name. Valid names are {self.stage_names}"
+                    )
+        self.out_features = out_features


 class ResNetOnnxConfig(OnnxConfig):
--- a/src/transformers/models/resnet/modeling_resnet.py
+++ b/src/transformers/models/resnet/modeling_resnet.py
@@ -23,12 +23,19 @@ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

 from ...activations import ACT2FN
 from ...modeling_outputs import (
+    BackboneOutput,
    BaseModelOutputWithNoAttention,
    BaseModelOutputWithPoolingAndNoAttention,
    ImageClassifierOutputWithNoAttention,
 )
 from ...modeling_utils import PreTrainedModel
-from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
+from ...utils import (
+    add_code_sample_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+    replace_return_docstrings,
+)
 from .configuration_resnet import ResNetConfig


@@ -416,3 +423,69 @@ class ResNetForImageClassification(ResNetPreTrainedModel):
            return (loss,) + output if loss is not None else output

        return ImageClassifierOutputWithNoAttention(loss=loss, logits=logits, hidden_states=outputs.hidden_states)
+
+
+@add_start_docstrings(
+    """
+    ResNet backbone, to be used with frameworks like DETR and MaskFormer.
+    """,
+    RESNET_START_DOCSTRING,
+)
+class ResNetBackbone(ResNetPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+
+        self.stage_names = config.stage_names
+        self.resnet = ResNetModel(config)
+
+        # `out_features` defaults to `None` in `ResNetConfig`; fall back to the last stage so that `channels`
+        # and `forward` never iterate over / test membership in `None`.
+        self.out_features = config.out_features if config.out_features is not None else [self.stage_names[-1]]
+
+        # Map every stage name to its channel count. Derived from the config instead of hard-coding four stages,
+        # so configs with fewer or more entries in `hidden_sizes` work as well.
+        self.out_feature_channels = {"stem": config.embedding_size}
+        self.out_feature_channels.update(zip(self.stage_names[1:], config.hidden_sizes))
+
+        # initialize weights and apply final processing
+        self.post_init()
+
+    @property
+    def channels(self):
+        """Channel count of each entry of `out_features`, in that order."""
+        return [self.out_feature_channels[name] for name in self.out_features]
+
+    @add_start_docstrings_to_model_forward(RESNET_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=BackboneOutput, config_class=_CONFIG_FOR_DOC)
+    def forward(self, pixel_values: Optional[torch.FloatTensor] = None) -> BackboneOutput:
+        """
+        Returns:
+
+        Examples:
+
+        ```python
+        >>> from transformers import AutoImageProcessor, AutoBackbone
+        >>> import torch
+        >>> from PIL import Image
+        >>> import requests
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> processor = AutoImageProcessor.from_pretrained("microsoft/resnet-50")
+        >>> model = AutoBackbone.from_pretrained("microsoft/resnet-50")
+
+        >>> inputs = processor(image, return_tensors="pt")
+
+        >>> outputs = model(**inputs)
+        ```"""
+        # hidden states are what the feature maps are picked from, so always request them
+        outputs = self.resnet(pixel_values, output_hidden_states=True, return_dict=True)
+        hidden_states = outputs.hidden_states
+
+        # `hidden_states[idx]` is paired with `self.stage_names[idx]`; keep only the requested stages
+        feature_maps = tuple(
+            hidden_states[idx] for idx, stage in enumerate(self.stage_names) if stage in self.out_features
+        )
+
+        return BackboneOutput(feature_maps=feature_maps)
--- a/src/transformers/utils/dummy_pt_objects.py
+++ b/src/transformers/utils/dummy_pt_objects.py
@@ -437,6 +437,13 @@ MODEL_MAPPING = None
 MODEL_WITH_LM_HEAD_MAPPING = None


+class AutoBackbone(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
 class AutoModel(metaclass=DummyObject):
    _backends = ["torch"]

@@ -4523,6 +4530,13 @@ def load_tf_weights_in_rembert(*args, **kwargs):
 RESNET_PRETRAINED_MODEL_ARCHIVE_LIST = None


+class ResNetBackbone(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
 class ResNetForImageClassification(metaclass=DummyObject):
    _backends = ["torch"]

--- a/tests/models/resnet/test_modeling_resnet.py
+++ b/tests/models/resnet/test_modeling_resnet.py
@@ -30,7 +30,7 @@ if is_torch_available():
    import torch
    from torch import nn

-    from transformers import ResNetForImageClassification, ResNetModel
+    from transformers import ResNetBackbone, ResNetForImageClassification, ResNetModel
    from transformers.models.resnet.modeling_resnet import RESNET_PRETRAINED_MODEL_ARCHIVE_LIST


@@ -55,6 +55,7 @@ class ResNetModelTester:
        hidden_act="relu",
        num_labels=3,
        scope=None,
+        out_features=["stage1", "stage2", "stage3", "stage4"],
    ):
        self.parent = parent
        self.batch_size = batch_size
@@ -69,6 +70,7 @@ class ResNetModelTester:
        self.num_labels = num_labels
        self.scope = scope
        self.num_stages = len(hidden_sizes)
+        self.out_features = out_features

    def prepare_config_and_inputs(self):
        pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
@@ -89,6 +91,7 @@ class ResNetModelTester:
            depths=self.depths,
            hidden_act=self.hidden_act,
            num_labels=self.num_labels,
+            out_features=self.out_features,
        )

    def create_and_check_model(self, config, pixel_values, labels):
@@ -110,6 +113,19 @@ class ResNetModelTester:
        result = model(pixel_values, labels=labels)
        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))

+    def create_and_check_backbone(self, config, pixel_values, labels):
+        model = ResNetBackbone(config=config)
+        model.to(torch_device)
+        model.eval()
+        result = model(pixel_values)
+
+        # verify feature maps: one per requested stage; the hard-coded shape assumes the tester's default sizes
+        self.parent.assertEqual(len(result.feature_maps), len(config.out_features))
+        self.parent.assertListEqual(list(result.feature_maps[0].shape), [3, 10, 8, 8])
+
+        # verify channels: equals `hidden_sizes` only when `out_features` lists exactly the non-stem stages
+        self.parent.assertListEqual(model.channels, config.hidden_sizes)
+
    def prepare_config_and_inputs_for_common(self):
        config_and_inputs = self.prepare_config_and_inputs()
        config, pixel_values, labels = config_and_inputs
@@ -176,6 +192,10 @@ class ResNetModelTest(ModelTesterMixin, unittest.TestCase):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_model(*config_and_inputs)

+    def test_backbone(self):  # runs the ResNetBackbone checks defined on the model tester
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_backbone(*config_and_inputs)
+
    def test_initialization(self):
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

--- a/utils/check_repo.py
+++ b/utils/check_repo.py
@@ -47,6 +47,7 @@ PRIVATE_MODELS = [
 # Being in this list is an exception and should **not** be the rule.
 IGNORE_NON_TESTED = PRIVATE_MODELS.copy() + [
    # models to ignore for not tested
+    "ResNetBackbone",  # Backbones have their own tests.
    "CLIPSegDecoder",  # Building part of bigger (tested) model.
    "TableTransformerEncoder",  # Building part of bigger (tested) model.
    "TableTransformerDecoder",  # Building part of bigger (tested) model.
@@ -668,6 +669,8 @@ SHOULD_HAVE_THEIR_OWN_PAGE = [
    "PyTorchBenchmarkArguments",
    "TensorFlowBenchmark",
    "TensorFlowBenchmarkArguments",
+    "ResNetBackbone",
+    "AutoBackbone",
 ]