From 5a95ed5ca0826c867e35e52f698db4d8fc907bcb Mon Sep 17 00:00:00 2001 From: Cyril Vallez Date: Wed, 18 Jun 2025 09:46:22 +0200 Subject: [PATCH] =?UTF-8?q?=F0=9F=9A=A8=F0=9F=9A=A8=20Fix=20initialization?= =?UTF-8?q?=20of=20Mask2Former=20(#38864)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Correctly fix init Co-authored-by: BUI Van Tuan * add back the block, breaking BC but this is correct author's code * override the test for params needing it --------- Co-authored-by: BUI Van Tuan --- .../mask2former/modeling_mask2former.py | 20 ++----- src/transformers/utils/backbone_utils.py | 7 +-- .../test_modeling_deformable_detr.py | 17 +++--- .../mask2former/test_modeling_mask2former.py | 56 ++++++++++++++++++- 4 files changed, 68 insertions(+), 32 deletions(-) diff --git a/src/transformers/models/mask2former/modeling_mask2former.py b/src/transformers/models/mask2former/modeling_mask2former.py index b1573c1ced..db942a3ac6 100644 --- a/src/transformers/models/mask2former/modeling_mask2former.py +++ b/src/transformers/models/mask2former/modeling_mask2former.py @@ -2127,30 +2127,20 @@ class Mask2FormerPreTrainedModel(PreTrainedModel): for p in module.parameters(): if p.dim() > 1: nn.init.xavier_uniform_(p, gain=xavier_std) - - elif isinstance(module, Mask2FormerPixelLevelModule): - for submodule in module.modules(): - if isinstance(submodule, (nn.Conv2d, nn.Linear)): - submodule.weight.data.normal_(mean=0.0, std=std) - if submodule.bias is not None: - submodule.bias.data.zero_() + module.cross_attn.in_proj_bias.data.zero_() elif isinstance(module, Mask2FormerPixelDecoder): - for p in module.parameters(): - if p.dim() > 1: - nn.init.xavier_uniform_(p) nn.init.normal_(module.level_embed, std=0) - elif isinstance(module, Mask2FormerPixelDecoderEncoderOnly): - for p in module.parameters(): - if p.dim() > 1: - nn.init.xavier_uniform_(p) - elif isinstance(module, (nn.Linear, nn.Conv2d, nn.BatchNorm2d)): module.weight.data.normal_(mean=0.0, std=std) if module.bias is not None: module.bias.data.zero_() + elif isinstance(module, (nn.LayerNorm, nn.GroupNorm)): + module.weight.data.fill_(1.0) + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): module.weight.data.normal_(mean=0.0, std=std) if module.padding_idx is not None: diff --git a/src/transformers/utils/backbone_utils.py b/src/transformers/utils/backbone_utils.py index 2374455aca..29588f9fe0 100644 --- a/src/transformers/utils/backbone_utils.py +++ b/src/transformers/utils/backbone_utils.py @@ -324,12 +324,7 @@ def load_backbone(config): raise ValueError("Cannot specify both config.backbone_config and config.backbone") # If any of thhe following are set, then the config passed in is from a model which contains a backbone. - if ( - backbone_config is None - and use_timm_backbone is None - and backbone_checkpoint is None - and backbone_checkpoint is None - ): + if backbone_config is None and use_timm_backbone is None and backbone_checkpoint is None: return AutoBackbone.from_config(config=config, **backbone_kwargs) # config from the parent model that has a backbone diff --git a/tests/models/deformable_detr/test_modeling_deformable_detr.py b/tests/models/deformable_detr/test_modeling_deformable_detr.py index 7c1c7ee1b0..7052b74957 100644 --- a/tests/models/deformable_detr/test_modeling_deformable_detr.py +++ b/tests/models/deformable_detr/test_modeling_deformable_detr.py @@ -590,15 +590,14 @@ class DeformableDetrModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.Te model = model_class(config=configs_no_init) for name, param in model.named_parameters(): if param.requires_grad: - if param.requires_grad: - if ( - "level_embed" in name - or "sampling_offsets.bias" in name - or "value_proj" in name - or "output_proj" in name - or "reference_points" in name - ): - continue + if ( + "level_embed" in name + or "sampling_offsets.bias" in name + or "value_proj" in name + or "output_proj" in name + or "reference_points" in name + ): + continue self.assertIn( ((param.data.mean() * 1e9).round() / 1e9).item(), [0.0, 1.0], diff --git a/tests/models/mask2former/test_modeling_mask2former.py b/tests/models/mask2former/test_modeling_mask2former.py index 475395c11f..5762a1f6ff 100644 --- a/tests/models/mask2former/test_modeling_mask2former.py +++ b/tests/models/mask2former/test_modeling_mask2former.py @@ -18,7 +18,7 @@ import unittest import numpy as np from tests.test_modeling_common import floats_tensor -from transformers import Mask2FormerConfig, is_torch_available, is_vision_available +from transformers import AutoModelForImageClassification, Mask2FormerConfig, is_torch_available, is_vision_available from transformers.pytorch_utils import is_torch_greater_or_equal_than_2_4 from transformers.testing_utils import ( require_timm, @@ -33,7 +33,7 @@ from transformers.testing_utils import ( from transformers.utils import cached_property from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin +from ...test_modeling_common import ModelTesterMixin, _config_zero_init from ...test_pipeline_mixin import PipelineTesterMixin @@ -350,6 +350,58 @@ class Mask2FormerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestC elif model.__class__.__name__ == "Mask2FormerForUniversalSegmentation": self.assertEqual(model.model.pixel_level_module.encoder.out_indices, [1, 2, 3]) + def test_initialization(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + configs_no_init = _config_zero_init(config) + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + for name, param in model.named_parameters(): + if param.requires_grad: + if ( + "self_attn.sampling_offsets.bias" in name + or "self_attn.value_proj.weight" in name + or "self_attn.output_proj.weight" in name + ): + continue + self.assertIn( + ((param.data.mean() * 1e9).round() / 1e9).item(), + [0.0, 1.0], + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + + def test_initialization_pretrained_backbone(self): + backbone_name = "microsoft/resnet-18" + + # load Mask2Former config with a pretrained backbone + config = Mask2FormerConfig( + backbone=backbone_name, + use_pretrained_backbone=True, + ) + + # load pretrained backbone + backbone_model = AutoModelForImageClassification.from_pretrained(backbone_name, device_map=torch_device) + + def params_match(params1, params2): + return all((p1 == p2).all() for p1, p2 in zip(params1, params2)) + + for model_class in self.all_model_classes: + model = model_class(config).to(torch_device).eval() + if model.__class__.__name__ == "Mask2FormerModel": + self.assertTrue( + params_match( + backbone_model.base_model.encoder.parameters(), + model.pixel_level_module.encoder.encoder.parameters(), + ) + ) + elif model.__class__.__name__ == "Mask2FormerForUniversalSegmentation": + self.assertTrue( + params_match( + backbone_model.base_model.encoder.parameters(), + model.model.pixel_level_module.encoder.encoder.parameters(), + ) + ) + TOLERANCE = 1e-4