From 9858ecd706e3622bacf47e6efbbc0cdf3432ab67 Mon Sep 17 00:00:00 2001 From: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> Date: Thu, 8 Dec 2022 17:39:32 +0100 Subject: [PATCH] [`ViTHybrid`] Fix `accelerate` slow tests (#20679) * fix failing `accelerate` tests * make fixup * smaller values * even lower --- .../vit_hybrid/configuration_vit_hybrid.py | 4 +++ .../models/vit_hybrid/modeling_vit_hybrid.py | 9 +++--- .../vit_hybrid/test_modeling_vit_hybrid.py | 32 ++++++++++++++++++- 3 files changed, 39 insertions(+), 6 deletions(-) diff --git a/src/transformers/models/vit_hybrid/configuration_vit_hybrid.py b/src/transformers/models/vit_hybrid/configuration_vit_hybrid.py index 025488d364..abc9920782 100644 --- a/src/transformers/models/vit_hybrid/configuration_vit_hybrid.py +++ b/src/transformers/models/vit_hybrid/configuration_vit_hybrid.py @@ -71,6 +71,8 @@ class ViTHybridConfig(PretrainedConfig): Whether to add a bias to the queries, keys and values. backbone_config (`Union[Dict[str, Any], PretrainedConfig]`, *optional*, defaults to `None`): The configuration of the backbone in a dictionary or the config object of the backbone. + backbone_featmap_shape (`List[int]`, *optional*, defaults to `[1, 1024, 24, 24]`): + Used only for the `hybrid` embedding type. The shape of the feature maps of the backbone. 
Example: @@ -103,6 +105,7 @@ class ViTHybridConfig(PretrainedConfig): image_size=224, patch_size=1, num_channels=3, + backbone_featmap_shape=[1, 1024, 24, 24], qkv_bias=True, **kwargs ): @@ -128,6 +131,7 @@ class ViTHybridConfig(PretrainedConfig): backbone_config_class = BitConfig backbone_config = backbone_config_class(**backbone_config) + self.backbone_featmap_shape = backbone_featmap_shape self.backbone_config = backbone_config self.hidden_size = hidden_size self.num_hidden_layers = num_hidden_layers diff --git a/src/transformers/models/vit_hybrid/modeling_vit_hybrid.py b/src/transformers/models/vit_hybrid/modeling_vit_hybrid.py index ef32f03ad9..5fe0db3d83 100644 --- a/src/transformers/models/vit_hybrid/modeling_vit_hybrid.py +++ b/src/transformers/models/vit_hybrid/modeling_vit_hybrid.py @@ -166,11 +166,10 @@ class ViTHybridPatchEmbeddings(nn.Module): feature_dim = self.backbone.channels[-1] if feature_size is None: - dummy_image = torch.zeros(1, num_channels, image_size[0], image_size[1]) - with torch.no_grad(): - feature_map = self.backbone(dummy_image).feature_maps[-1] - feature_size = feature_map.shape[-2:] - feature_dim = feature_map.shape[1] + feature_map = config.backbone_featmap_shape + + feature_size = feature_map[-2:] + feature_dim = feature_map[1] else: feature_size = ( feature_size if isinstance(feature_size, collections.abc.Iterable) else (feature_size, feature_size) diff --git a/tests/models/vit_hybrid/test_modeling_vit_hybrid.py b/tests/models/vit_hybrid/test_modeling_vit_hybrid.py index 8926090e05..cf8d4b48e2 100644 --- a/tests/models/vit_hybrid/test_modeling_vit_hybrid.py +++ b/tests/models/vit_hybrid/test_modeling_vit_hybrid.py @@ -19,7 +19,7 @@ import inspect import unittest from transformers import ViTHybridConfig -from transformers.testing_utils import require_torch, require_vision, slow, torch_device +from transformers.testing_utils import require_accelerate, require_torch, require_vision, slow, torch_device from transformers.utils import 
cached_property, is_torch_available, is_vision_available from ...test_configuration_common import ConfigTester @@ -57,6 +57,7 @@ class ViTHybridModelTester: attention_probs_dropout_prob=0.1, type_sequence_label_size=10, initializer_range=0.02, + backbone_featmap_shape=[1, 16, 4, 4], scope=None, ): self.parent = parent @@ -76,6 +77,7 @@ class ViTHybridModelTester: self.type_sequence_label_size = type_sequence_label_size self.initializer_range = initializer_range self.scope = scope + self.backbone_featmap_shape = backbone_featmap_shape # in ViT hybrid, the seq length equals the number of patches + 1 (we add 1 for the [CLS] token) # the number of patches is based on the feature map of the backbone, which by default uses an output stride @@ -95,6 +97,16 @@ class ViTHybridModelTester: return config, pixel_values, labels def get_config(self): + backbone_config = { + "global_padding": "same", + "layer_type": "bottleneck", + "depths": [3, 4, 9], + "out_features": ["stage1", "stage2", "stage3"], + "embedding_dynamic_padding": True, + "hidden_sizes": [4, 8, 16, 32], + "num_groups": 2, + } + return ViTHybridConfig( image_size=self.image_size, patch_size=self.patch_size, @@ -108,6 +120,8 @@ class ViTHybridModelTester: attention_probs_dropout_prob=self.attention_probs_dropout_prob, is_decoder=False, initializer_range=self.initializer_range, + backbone_featmap_shape=self.backbone_featmap_shape, + backbone_config=backbone_config, ) def create_and_check_model(self, config, pixel_values, labels): @@ -229,3 +243,19 @@ class ViTModelIntegrationTest(unittest.TestCase): expected_slice = torch.tensor([-1.9090, -0.4993, -0.2389]).to(torch_device) self.assertTrue(torch.allclose(outputs.logits[0, :3], expected_slice, atol=1e-4)) + + @slow + @require_accelerate + def test_accelerate_inference(self): + feature_extractor = ViTHybridImageProcessor.from_pretrained("google/vit-hybrid-base-bit-384") + model = ViTHybridForImageClassification.from_pretrained("google/vit-hybrid-base-bit-384", 
device_map="auto") + + image = prepare_img() + + inputs = feature_extractor(images=image, return_tensors="pt") + outputs = model(**inputs) + logits = outputs.logits + # model predicts one of the 1000 ImageNet classes + predicted_class_idx = logits.argmax(-1).item() + + self.assertEqual(model.config.id2label[predicted_class_idx], "tabby, tabby cat")