From 9d98706b3f6f14940c713d1a84bac22ef1e083ed Mon Sep 17 00:00:00 2001
From: Yih-Dar <2521628+ydshieh@users.noreply.github.com>
Date: Wed, 10 Jul 2024 14:25:24 +0200
Subject: [PATCH] Fix failed tests in #31851 (#31879)

* Revert "Revert "Fix `_init_weights` for `ResNetPreTrainedModel`" (#31868)"

This reverts commit b45dd5de9c8426db5dbda1797a4790566a278919.

* fix

* [test_all] check

* fix

* [test_all] check

* fix

* [test_all] check

* fix

* [test_all] check

* fix

* [test_all] check

* fix

* [test_all] check

* fix

* [test_all] check

* fix

* [test_all] check

* fix

* [test_all] check

* fix

* [test_all] check

* fix

* [test_all] check

* fix

* [test_all] check

---------

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
---
 src/transformers/models/bit/modeling_bit.py   |  7 ++
 .../models/regnet/modeling_regnet.py          |  8 ++
 .../models/resnet/modeling_resnet.py          |  8 ++
 .../models/rt_detr/modeling_rt_detr_resnet.py |  8 ++
 tests/test_modeling_common.py                 | 97 +++++++++++++++++--
 5 files changed, 118 insertions(+), 10 deletions(-)

diff --git a/src/transformers/models/bit/modeling_bit.py b/src/transformers/models/bit/modeling_bit.py
index d015db4956..e1d1fcda41 100644
--- a/src/transformers/models/bit/modeling_bit.py
+++ b/src/transformers/models/bit/modeling_bit.py
@@ -660,6 +660,13 @@ class BitPreTrainedModel(PreTrainedModel):
     def _init_weights(self, module):
         if isinstance(module, nn.Conv2d):
             nn.init.kaiming_normal_(module.weight, mode="fan_out", nonlinearity="relu")
+        # copied from the `reset_parameters` method of `class Linear(Module)` in `torch`.
+        elif isinstance(module, nn.Linear):
+            nn.init.kaiming_uniform_(module.weight, a=math.sqrt(5))
+            if module.bias is not None:
+                fan_in, _ = nn.init._calculate_fan_in_and_fan_out(module.weight)
+                bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0
+                nn.init.uniform_(module.bias, -bound, bound)
         elif isinstance(module, (nn.BatchNorm2d, nn.GroupNorm)):
             nn.init.constant_(module.weight, 1)
             nn.init.constant_(module.bias, 0)
diff --git a/src/transformers/models/regnet/modeling_regnet.py b/src/transformers/models/regnet/modeling_regnet.py
index 2a348c792a..9420fb5eda 100644
--- a/src/transformers/models/regnet/modeling_regnet.py
+++ b/src/transformers/models/regnet/modeling_regnet.py
@@ -14,6 +14,7 @@
 # limitations under the License.
 """PyTorch RegNet model."""
 
+import math
 from typing import Optional
 
 import torch
@@ -284,6 +285,13 @@ class RegNetPreTrainedModel(PreTrainedModel):
     def _init_weights(self, module):
         if isinstance(module, nn.Conv2d):
             nn.init.kaiming_normal_(module.weight, mode="fan_out", nonlinearity="relu")
+        # copied from the `reset_parameters` method of `class Linear(Module)` in `torch`.
+        elif isinstance(module, nn.Linear):
+            nn.init.kaiming_uniform_(module.weight, a=math.sqrt(5))
+            if module.bias is not None:
+                fan_in, _ = nn.init._calculate_fan_in_and_fan_out(module.weight)
+                bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0
+                nn.init.uniform_(module.bias, -bound, bound)
         elif isinstance(module, (nn.BatchNorm2d, nn.GroupNorm)):
             nn.init.constant_(module.weight, 1)
             nn.init.constant_(module.bias, 0)
diff --git a/src/transformers/models/resnet/modeling_resnet.py b/src/transformers/models/resnet/modeling_resnet.py
index c7cf0e03c7..ccd4fac175 100644
--- a/src/transformers/models/resnet/modeling_resnet.py
+++ b/src/transformers/models/resnet/modeling_resnet.py
@@ -14,6 +14,7 @@
 # limitations under the License.
 """PyTorch ResNet model."""
 
+import math
 from typing import Optional
 
 import torch
@@ -274,6 +275,13 @@ class ResNetPreTrainedModel(PreTrainedModel):
     def _init_weights(self, module):
         if isinstance(module, nn.Conv2d):
             nn.init.kaiming_normal_(module.weight, mode="fan_out", nonlinearity="relu")
+        # copied from the `reset_parameters` method of `class Linear(Module)` in `torch`.
+        elif isinstance(module, nn.Linear):
+            nn.init.kaiming_uniform_(module.weight, a=math.sqrt(5))
+            if module.bias is not None:
+                fan_in, _ = nn.init._calculate_fan_in_and_fan_out(module.weight)
+                bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0
+                nn.init.uniform_(module.bias, -bound, bound)
         elif isinstance(module, (nn.BatchNorm2d, nn.GroupNorm)):
             nn.init.constant_(module.weight, 1)
             nn.init.constant_(module.bias, 0)
diff --git a/src/transformers/models/rt_detr/modeling_rt_detr_resnet.py b/src/transformers/models/rt_detr/modeling_rt_detr_resnet.py
index 75102efab3..84427dd240 100644
--- a/src/transformers/models/rt_detr/modeling_rt_detr_resnet.py
+++ b/src/transformers/models/rt_detr/modeling_rt_detr_resnet.py
@@ -17,6 +17,7 @@ PyTorch RTDetr specific ResNet model. The main difference between hugginface Res
 See https://github.com/lyuwenyu/RT-DETR/blob/5b628eaa0a2fc25bdafec7e6148d5296b144af85/rtdetr_pytorch/src/nn/backbone/presnet.py#L126 for details.
 """
 
+import math
 from typing import Optional
 
 from torch import Tensor, nn
@@ -323,6 +324,13 @@ class RTDetrResNetPreTrainedModel(PreTrainedModel):
     def _init_weights(self, module):
         if isinstance(module, nn.Conv2d):
             nn.init.kaiming_normal_(module.weight, mode="fan_out", nonlinearity="relu")
+        # copied from the `reset_parameters` method of `class Linear(Module)` in `torch`.
+        elif isinstance(module, nn.Linear):
+            nn.init.kaiming_uniform_(module.weight, a=math.sqrt(5))
+            if module.bias is not None:
+                fan_in, _ = nn.init._calculate_fan_in_and_fan_out(module.weight)
+                bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0
+                nn.init.uniform_(module.bias, -bound, bound)
         elif isinstance(module, (nn.BatchNorm2d, nn.GroupNorm)):
             nn.init.constant_(module.weight, 1)
             nn.init.constant_(module.bias, 0)
diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py
index 299d99280b..d082968ba2 100755
--- a/tests/test_modeling_common.py
+++ b/tests/test_modeling_common.py
@@ -3167,9 +3167,68 @@ class ModelTesterMixin:
         configs_no_init = _config_zero_init(config)
 
         for model_class in self.all_model_classes:
-            if model_class.__name__ not in get_values(MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES):
+            mappings = [
+                MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES,
+                MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES,
+                MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES,
+                MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING_NAMES,
+            ]
+            is_classication_model = any(model_class.__name__ in get_values(mapping) for mapping in mappings)
+
+            if not is_classication_model:
                 continue
 
+            # TODO: ydshieh
+            is_special_classes = model_class.__name__ in [
+                "wav2vec2.masked_spec_embed",
+                "Wav2Vec2ForSequenceClassification",
+                "CLIPForImageClassification",
+                "RegNetForImageClassification",
+                "ResNetForImageClassification",
+                "UniSpeechSatForSequenceClassification",
+                "Wav2Vec2BertForSequenceClassification",
+                "PvtV2ForImageClassification",
+                "Wav2Vec2ConformerForSequenceClassification",
+                "WavLMForSequenceClassification",
+                "SwiftFormerForImageClassification",
+                "SEWForSequenceClassification",
+                "BitForImageClassification",
+                "SEWDForSequenceClassification",
+                "SiglipForImageClassification",
+                "HubertForSequenceClassification",
+                "Swinv2ForImageClassification",
+                "Data2VecAudioForSequenceClassification",
+                "UniSpeechForSequenceClassification",
+                "PvtForImageClassification",
+            ]
+            special_param_names = [
+                r"^bit\.",
+                r"^classifier\.weight",
+                r"^classifier\.bias",
+                r"^classifier\..+\.weight",
+                r"^classifier\..+\.bias",
+                r"^data2vec_audio\.",
+                r"^dist_head\.",
+                r"^head\.",
+                r"^hubert\.",
+                r"^pvt\.",
+                r"^pvt_v2\.",
+                r"^regnet\.",
+                r"^resnet\.",
+                r"^sew\.",
+                r"^sew_d\.",
+                r"^swiftformer\.",
+                r"^swinv2\.",
+                r"^transformers\.models\.swiftformer\.",
+                r"^unispeech\.",
+                r"^unispeech_sat\.",
+                r"^vision_model\.",
+                r"^wav2vec2\.",
+                r"^wav2vec2_bert\.",
+                r"^wav2vec2_conformer\.",
+                r"^wavlm\.",
+            ]
+
             with self.subTest(msg=f"Testing {model_class}"):
                 with tempfile.TemporaryDirectory() as tmp_dir:
                     model = model_class(configs_no_init)
@@ -3177,23 +3236,41 @@ class ModelTesterMixin:
 
                     # Fails when we don't set ignore_mismatched_sizes=True
                     with self.assertRaises(RuntimeError):
-                        new_model = AutoModelForSequenceClassification.from_pretrained(tmp_dir, num_labels=42)
+                        new_model = model_class.from_pretrained(tmp_dir, num_labels=42)
 
                     logger = logging.get_logger("transformers.modeling_utils")
 
                     with CaptureLogger(logger) as cl:
-                        new_model = AutoModelForSequenceClassification.from_pretrained(
-                            tmp_dir, num_labels=42, ignore_mismatched_sizes=True
-                        )
+                        new_model = model_class.from_pretrained(tmp_dir, num_labels=42, ignore_mismatched_sizes=True)
                     self.assertIn("the shapes did not match", cl.out)
 
                     for name, param in new_model.named_parameters():
                         if param.requires_grad:
-                            self.assertIn(
-                                ((param.data.mean() * 1e9).round() / 1e9).item(),
-                                [0.0, 1.0],
-                                msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                            )
+                            param_mean = ((param.data.mean() * 1e9).round() / 1e9).item()
+                            if not (
+                                is_special_classes
+                                and any(len(re.findall(target, name)) > 0 for target in special_param_names)
+                            ):
+                                self.assertIn(
+                                    param_mean,
+                                    [0.0, 1.0],
+                                    msg=f"Parameter {name} of model {model_class} seems not properly initialized",
+                                )
+                            else:
+                                # Here we allow the parameters' mean to be in the range [-5.0, 5.0] instead of being
+                                # either `0.0` or `1.0`, because their initializations are not using
+                                # `config.initializer_factor` (or something similar). The purpose of this test is simply
+                                # to make sure they are properly initialized (to avoid very large value or even `nan`).
+                                self.assertGreaterEqual(
+                                    param_mean,
+                                    -5.0,
+                                    msg=f"Parameter {name} of model {model_class} seems not properly initialized",
+                                )
+                                self.assertLessEqual(
+                                    param_mean,
+                                    5.0,
+                                    msg=f"Parameter {name} of model {model_class} seems not properly initialized",
+                                )
 
     def test_matched_shapes_have_loaded_weights_when_some_mismatched_shapes_exist(self):
         # 1. Create a dummy class. Should have buffers as well? To make sure we test __init__