diff --git a/examples/modular-transformers/configuration_my_new_model.py b/examples/modular-transformers/configuration_my_new_model.py
index 49d27f7789..ff359fa416 100644
--- a/examples/modular-transformers/configuration_my_new_model.py
+++ b/examples/modular-transformers/configuration_my_new_model.py
@@ -125,8 +125,6 @@ class MyNewModelConfig(PretrainedConfig):
     >>> # Accessing the model configuration
     >>> configuration = model.config
     ```
-        new_param (`int`, *optional*, defaults to `False`):
-            A fun new parameter
     """
 
     model_type = "my_new_model"
diff --git a/examples/modular-transformers/modeling_new_task_model.py b/examples/modular-transformers/modeling_new_task_model.py
index 15865a2c16..c116b55d4d 100644
--- a/examples/modular-transformers/modeling_new_task_model.py
+++ b/examples/modular-transformers/modeling_new_task_model.py
@@ -437,32 +437,6 @@ class NewTaskModelForNewTask(NewTaskModelPreTrainedModel, GenerationMixin):
         num_logits_to_keep: int = 0,
     ) -> Union[tuple, NewTaskModelCausalLMOutputWithPast]:
         r"""
-        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
-            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
-            config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
-            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.
-
-        Example:
-
-        ```python
-        >>> from PIL import Image
-        >>> import requests
-        >>> from transformers import AutoProcessor, NewTaskModelForNewTask
-
-        >>> model = NewTaskModelForNewTask.from_pretrained("google/new_task_model2-3b-mix-224")
-        >>> processor = AutoProcessor.from_pretrained("google/new_task_model2-3b-mix-224")
-
-        >>> prompt = "Where is the cat standing?"
-        >>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
-        >>> image = Image.open(requests.get(url, stream=True).raw)
-
-        >>> inputs = processor(images=image, text=prompt,  return_tensors="pt")
-
-        >>> # Generate
-        >>> generate_ids = model.generate(**inputs,)
-        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
-        "Where is the cat standing?\nsnow"
-        ```
         Returns:
         """
         vlm_outputs = super().forward(
diff --git a/examples/modular-transformers/modular_my_new_model.py b/examples/modular-transformers/modular_my_new_model.py
index c1ea8b0a72..58b74cd7eb 100644
--- a/examples/modular-transformers/modular_my_new_model.py
+++ b/examples/modular-transformers/modular_my_new_model.py
@@ -2,11 +2,122 @@ from transformers.models.llama.configuration_llama import LlamaConfig
 
 
 # Example where we only want to only add a new config argument and new arg doc
-# here there is no `ARG` so we are gonna take parent doc
 class MyNewModelConfig(LlamaConfig):
     r"""
-    new_param (`int`, *optional*, defaults to `False`):
-        A fun new parameter
+    This is the configuration class to store the configuration of a [`MyNewModelModel`]. It is used to instantiate a MyNewModel
+    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+    defaults will yield a similar configuration to that of the MyNewModel-7B.
+    e.g. [meta-my_new_model/MyNewModel-2-7b-hf](https://huggingface.co/meta-my_new_model/MyNewModel-2-7b-hf)
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 32000):
+            Vocabulary size of the MyNewModel model. Defines the number of different tokens that can be represented by the
+            `input_ids` passed when calling [`MyNewModelModel`].
+        hidden_size (`int`, *optional*, defaults to 4096):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 11008):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 32):
+            Number of hidden layers in the Transformer decoder.
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        num_key_value_heads (`int`, *optional*):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details, check out [this
+            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
+            `num_attention_heads`.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the decoder.
+        max_position_embeddings (`int`, *optional*, defaults to 2048):
+            The maximum sequence length that this model might ever be used with. MyNewModel 1 supports up to 2048 tokens,
+            MyNewModel 2 up to 4096, CodeLlama up to 16384.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        pad_token_id (`int`, *optional*):
+            Padding token id.
+        bos_token_id (`int`, *optional*, defaults to 1):
+            Beginning of stream token id.
+        eos_token_id (`int`, *optional*, defaults to 2):
+            End of stream token id.
+        pretraining_tp (`int`, *optional*, defaults to 1):
+            Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this
+            document](https://huggingface.co/docs/transformers/main/perf_train_gpu_many#tensor-parallelism) to
+            understand more about it. This value is necessary to ensure exact reproducibility of the pretraining
+            results. Please refer to [this issue](https://github.com/pytorch/pytorch/issues/76232).
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether to tie weight embeddings
+        rope_theta (`float`, *optional*, defaults to 10000.0):
+            The base period of the RoPE embeddings.
+        rope_scaling (`Dict`, *optional*):
+            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
+            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
+            accordingly.
+            Expected contents:
+                `rope_type` (`str`):
+                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
+                    'my_new_model3'], with 'default' being the original RoPE implementation.
+                `factor` (`float`, *optional*):
+                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
+                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
+                    original maximum pre-trained length.
+                `original_max_position_embeddings` (`int`, *optional*):
+                    Used with 'dynamic', 'longrope' and 'my_new_model3'. The original max position embeddings used during
+                    pretraining.
+                `attention_factor` (`float`, *optional*):
+                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
+                    computation. If unspecified, it defaults to value recommended by the implementation, using the
+                    `factor` field to infer the suggested value.
+                `beta_fast` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 32.
+                `beta_slow` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 1.
+                `short_factor` (`list[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `long_factor` (`list[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to long contexts (>
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `low_freq_factor` (`float`, *optional*):
+                    Only used with 'my_new_model3'. Scaling factor applied to low frequency components of the RoPE
+                `high_freq_factor` (`float`, *optional*):
+                    Only used with 'my_new_model3'. Scaling factor applied to high frequency components of the RoPE
+        attention_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use a bias in the query, key, value and output projection layers during self-attention.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        mlp_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use a bias in up_proj, down_proj and gate_proj layers in the MLP layers.
+        head_dim (`int`, *optional*):
+            The attention head dimension. If None, it will default to hidden_size // num_attention_heads
+
+    ```python
+    >>> from transformers import MyNewModelModel, MyNewModelConfig
+
+    >>> # Initializing a MyNewModel my_new_model-7b style configuration
+    >>> configuration = MyNewModelConfig()
+
+    >>> # Initializing a model from the my_new_model-7b style configuration
+    >>> model = MyNewModelModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```
     """
 
     def __init__(self, mlp_bias=True, new_param=0, **super_kwargs):
diff --git a/src/transformers/models/d_fine/modeling_d_fine.py b/src/transformers/models/d_fine/modeling_d_fine.py
index 068519dbf4..acadb52872 100644
--- a/src/transformers/models/d_fine/modeling_d_fine.py
+++ b/src/transformers/models/d_fine/modeling_d_fine.py
@@ -1674,21 +1674,7 @@ class DFineForObjectDetection(DFinePreTrainedModel):
         return_dict: Optional[bool] = None,
         **kwargs,
     ) -> Union[tuple[torch.FloatTensor], DFineObjectDetectionOutput]:
-        r"""
-        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
-            Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you
-            can choose to directly pass a flattened representation of an image.
-        decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*):
-            Optionally, instead of initializing the queries with a tensor of zeros, you can choose to directly pass an
-            embedded representation.
-        labels (`list[Dict]` of len `(batch_size,)`, *optional*):
-            Labels for computing the bipartite matching loss. List of dicts, each dictionary containing at least the
-            following 2 keys: 'class_labels' and 'boxes' (the class labels and bounding boxes of an image in the batch
-            respectively). The class labels themselves should be a `torch.LongTensor` of len `(number of bounding boxes
-            in the image,)` and the boxes a `torch.FloatTensor` of shape `(number of bounding boxes in the image, 4)`.
-
-        Examples:
-
+        """
         ```python
         >>> import torch
         >>> from transformers.image_utils import load_image
@@ -1729,7 +1715,8 @@ class DFineForObjectDetection(DFinePreTrainedModel):
         Detected cat with confidence 0.956 at location [11.71, 53.52, 316.64, 472.33]
         Detected remote with confidence 0.947 at location [40.46, 73.7, 175.62, 117.57]
         Detected sofa with confidence 0.918 at location [0.59, 1.88, 640.25, 474.74]
-        ```"""
+        ```
+        """
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
diff --git a/src/transformers/models/diffllama/modeling_diffllama.py b/src/transformers/models/diffllama/modeling_diffllama.py
index 51e3c4512e..4c36cc0f85 100644
--- a/src/transformers/models/diffllama/modeling_diffllama.py
+++ b/src/transformers/models/diffllama/modeling_diffllama.py
@@ -729,11 +729,6 @@ class DiffLlamaForCausalLM(DiffLlamaPreTrainedModel, GenerationMixin):
         **kwargs: Unpack[TransformersKwargs],
     ) -> CausalLMOutputWithPast:
         r"""
-        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
-            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
-            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
-            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
-
         Example:
 
         ```python
diff --git a/src/transformers/models/dinov2_with_registers/modeling_dinov2_with_registers.py b/src/transformers/models/dinov2_with_registers/modeling_dinov2_with_registers.py
index adbb34c2fd..eaa2fd313f 100644
--- a/src/transformers/models/dinov2_with_registers/modeling_dinov2_with_registers.py
+++ b/src/transformers/models/dinov2_with_registers/modeling_dinov2_with_registers.py
@@ -746,7 +746,7 @@ class Dinov2WithRegistersBackbone(Dinov2WithRegistersPreTrainedModel, BackboneMi
         output_attentions: Optional[bool] = None,
         return_dict: Optional[bool] = None,
     ) -> BackboneOutput:
-        r"""
+        """
         Examples:
 
         ```python
diff --git a/src/transformers/models/emu3/modeling_emu3.py b/src/transformers/models/emu3/modeling_emu3.py
index 4d050a0bbb..24a26f3777 100644
--- a/src/transformers/models/emu3/modeling_emu3.py
+++ b/src/transformers/models/emu3/modeling_emu3.py
@@ -1292,11 +1292,6 @@ class Emu3ForCausalLM(Emu3PreTrainedModel, GenerationMixin):
         **kwargs: Unpack[TransformersKwargs],
     ) -> CausalLMOutputWithPast:
         r"""
-        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
-            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
-            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
-            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
-
         Example:
 
         ```python
diff --git a/src/transformers/models/falcon_h1/modeling_falcon_h1.py b/src/transformers/models/falcon_h1/modeling_falcon_h1.py
index 181167a9aa..d156dcc94e 100644
--- a/src/transformers/models/falcon_h1/modeling_falcon_h1.py
+++ b/src/transformers/models/falcon_h1/modeling_falcon_h1.py
@@ -1530,11 +1530,6 @@ class FalconH1ForCausalLM(FalconH1PreTrainedModel, GenerationMixin):
         **kwargs,
     ) -> Union[tuple, CausalLMOutputWithPast]:
         r"""
-        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
-            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
-            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
-            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
-
         Example:
 
         ```python
diff --git a/src/transformers/models/gemma/modeling_gemma.py b/src/transformers/models/gemma/modeling_gemma.py
index 13a65ae661..85d20faf52 100644
--- a/src/transformers/models/gemma/modeling_gemma.py
+++ b/src/transformers/models/gemma/modeling_gemma.py
@@ -479,11 +479,6 @@ class GemmaForCausalLM(GemmaPreTrainedModel, GenerationMixin):
         **kwargs: Unpack[TransformersKwargs],
     ) -> CausalLMOutputWithPast:
         r"""
-        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
-            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
-            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
-            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
-
         Example:
 
         ```python
diff --git a/src/transformers/models/gemma2/modeling_gemma2.py b/src/transformers/models/gemma2/modeling_gemma2.py
index 506fda8f7f..25bdfb7ab8 100644
--- a/src/transformers/models/gemma2/modeling_gemma2.py
+++ b/src/transformers/models/gemma2/modeling_gemma2.py
@@ -553,11 +553,6 @@ class Gemma2ForCausalLM(Gemma2PreTrainedModel, GenerationMixin):
         **kwargs,
     ) -> CausalLMOutputWithPast:
         r"""
-        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
-            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
-            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
-            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
-
         Example:
 
         ```python
diff --git a/src/transformers/models/gemma3/modeling_gemma3.py b/src/transformers/models/gemma3/modeling_gemma3.py
index 4560ff8cf3..38b108bc87 100644
--- a/src/transformers/models/gemma3/modeling_gemma3.py
+++ b/src/transformers/models/gemma3/modeling_gemma3.py
@@ -658,11 +658,6 @@ class Gemma3ForCausalLM(Gemma3PreTrainedModel, GenerationMixin):
         **kwargs,
     ) -> CausalLMOutputWithPast:
         r"""
-        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
-            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
-            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
-            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
-
         Example:
 
         ```python
diff --git a/src/transformers/models/gemma3n/modeling_gemma3n.py b/src/transformers/models/gemma3n/modeling_gemma3n.py
index 6fa6140716..1c98bc5000 100644
--- a/src/transformers/models/gemma3n/modeling_gemma3n.py
+++ b/src/transformers/models/gemma3n/modeling_gemma3n.py
@@ -1830,11 +1830,6 @@ class Gemma3nForCausalLM(Gemma3nPreTrainedModel, GenerationMixin):
         **kwargs,
     ) -> CausalLMOutputWithPast:
         r"""
-        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
-            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
-            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
-            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
-
         Example:
 
         ```python
diff --git a/src/transformers/models/glm4v/processing_glm4v.py b/src/transformers/models/glm4v/processing_glm4v.py
index 5a0f5d94d8..534f744fc3 100644
--- a/src/transformers/models/glm4v/processing_glm4v.py
+++ b/src/transformers/models/glm4v/processing_glm4v.py
@@ -39,13 +39,13 @@ class Glm4vImagesKwargs(ImagesKwargs):
 
 
 class Glm4vProcessorKwargs(ProcessingKwargs, total=False):
-    images_kwargs: Glm4vImagesKwargs
-    videos_kwargs: Glm4vVideosProcessorKwargs
     _defaults = {
         "text_kwargs": {
             "padding": False,
         },
     }
+    images_kwargs: Glm4vImagesKwargs
+    videos_kwargs: Glm4vVideosProcessorKwargs
 
 
 class Glm4vProcessor(ProcessorMixin):
diff --git a/src/transformers/models/helium/modeling_helium.py b/src/transformers/models/helium/modeling_helium.py
index 354e9dbd5f..51fc1f07e9 100644
--- a/src/transformers/models/helium/modeling_helium.py
+++ b/src/transformers/models/helium/modeling_helium.py
@@ -472,11 +472,6 @@ class HeliumForCausalLM(HeliumPreTrainedModel, GenerationMixin):
         **kwargs: Unpack[TransformersKwargs],
     ) -> CausalLMOutputWithPast:
         r"""
-        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
-            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
-            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
-            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
-
         Example:
 
         ```python
diff --git a/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py b/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py
index 12cbdd7933..30023430ec 100644
--- a/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py
+++ b/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py
@@ -1501,41 +1501,6 @@ class InstructBlipVideoForConditionalGeneration(InstructBlipVideoPreTrainedModel
         **kwargs: Unpack[TransformersKwargs],
     ) -> Union[tuple, InstructBlipVideoForConditionalGenerationModelOutput]:
         r"""
-        qformer_input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
-            Indices of input sequence tokens in the vocabulary of the Q-Former. Input tokens can optionally be provided
-            to serve as text prompt, which the Q-Former model will encode.
-
-            Indices can be obtained using [`InstructBlipVideoProcessor`]. See [`InstructBlipVideoProcessor.__call__`] for
-            details.
-
-            [What are input IDs?](../glossary#input-ids)
-        qformer_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
-            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
-
-            - 1 for tokens that are **not masked**,
-            - 0 for tokens that are **masked**.
-
-            [What are attention masks?](../glossary#attention-mask)
-        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
-            Indices of input sequence tokens in the vocabulary of the language model. Input tokens can optionally be
-            provided to serve as text prompt, which the language model can continue.
-
-            Indices can be obtained using [`InstructBlipVideoProcessor`]. See [`InstructBlipVideoProcessor.__call__`] for
-            details.
-
-            [What are input IDs?](../glossary#input-ids)
-        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
-            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
-            be used by default.
-
-            Only relevant in case an encoder-decoder language model (like T5) is used.
-        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
-            Labels for computing the language modeling loss. Indices should be in `[-100, 0, ..., config.vocab_size -
-            1]`. All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
-            config.vocab_size]`
-
-        Examples:
-
         ```python
         >>> from transformers import InstructBlipVideoProcessor, InstructBlipVideoForConditionalGeneration
         >>> import torch
diff --git a/src/transformers/models/internvl/modeling_internvl.py b/src/transformers/models/internvl/modeling_internvl.py
index e4ce2cc7b7..718300161f 100644
--- a/src/transformers/models/internvl/modeling_internvl.py
+++ b/src/transformers/models/internvl/modeling_internvl.py
@@ -901,11 +901,6 @@ class InternVLForConditionalGeneration(InternVLPreTrainedModel, GenerationMixin)
         **kwargs: Unpack[TransformersKwargs],
     ) -> Union[tuple, InternVLCausalLMOutputWithPast]:
         r"""
-        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
-            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
-            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
-            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
-
         Example:
 
         ```python
diff --git a/src/transformers/models/mlcd/modeling_mlcd.py b/src/transformers/models/mlcd/modeling_mlcd.py
index 23c212d2d3..73d3bf9402 100644
--- a/src/transformers/models/mlcd/modeling_mlcd.py
+++ b/src/transformers/models/mlcd/modeling_mlcd.py
@@ -223,12 +223,11 @@ def apply_rotary_pos_emb_vision(
 
 
 class MLCDAttention(nn.Module):
-    """Multi-headed attention from 'Attention Is All You Need' paper
-    Multi-headed attention with RoPE. Refer to papers:
-        - Attention is all you need:
-            https://huggingface.co/papers/1706.03762
-        - RoFormer: Enhanced Transformer with Rotary Position Embedding:
-            https://huggingface.co/papers/2104.09864
+    """Multi-headed attention with RoPE. Refer to papers:
+    - Attention is all you need:
+        https://huggingface.co/papers/1706.03762
+    - RoFormer: Enhanced Transformer with Rotary Position Embedding:
+        https://huggingface.co/papers/2104.09864
     """
 
     def __init__(self, config: MLCDVisionConfig):
diff --git a/src/transformers/models/sam_hq/configuration_sam_hq.py b/src/transformers/models/sam_hq/configuration_sam_hq.py
index 1322bc560d..7987510b88 100644
--- a/src/transformers/models/sam_hq/configuration_sam_hq.py
+++ b/src/transformers/models/sam_hq/configuration_sam_hq.py
@@ -221,8 +221,6 @@ class SamHQMaskDecoderConfig(PretrainedConfig):
             The dimensionality of the hidden states in the IoU head module.
         layer_norm_eps (`float`, *optional*, defaults to 1e-06):
             The epsilon used by the layer normalization layers.
-
-
         vit_dim (`int`, *optional*, defaults to 768):
             Dimensionality of the Vision Transformer (ViT) used in the `SamHQMaskDecoder` module.
     """
diff --git a/src/transformers/models/sam_hq/modular_sam_hq.py b/src/transformers/models/sam_hq/modular_sam_hq.py
index 5dc501dc80..67772cb6c4 100644
--- a/src/transformers/models/sam_hq/modular_sam_hq.py
+++ b/src/transformers/models/sam_hq/modular_sam_hq.py
@@ -77,8 +77,37 @@ class SamHQVisionConfig(SamVisionConfig):
 
 class SamHQMaskDecoderConfig(SamMaskDecoderConfig):
     r"""
-    vit_dim (`int`, *optional*, defaults to 768):
-        Dimensionality of the Vision Transformer (ViT) used in the `SamHQMaskDecoder` module.
+    This is the configuration class to store the configuration of a [`SamHQMaskDecoder`]. It is used to instantiate a SAM_HQ
+    mask decoder according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the SAM_HQ-vit-h
+    [facebook/sam_hq-vit-huge](https://huggingface.co/facebook/sam_hq-vit-huge) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        hidden_size (`int`, *optional*, defaults to 256):
+            Dimensionality of the hidden states.
+        hidden_act (`str`, *optional*, defaults to `"relu"`):
+            The non-linear activation function used inside the `SamHQMaskDecoder` module.
+        mlp_dim (`int`, *optional*, defaults to 2048):
+            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        num_hidden_layers (`int`, *optional*, defaults to 2):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 8):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        attention_downsample_rate (`int`, *optional*, defaults to 2):
+            The downsampling rate of the attention layer.
+        num_multimask_outputs (`int`, *optional*, defaults to 3):
+            The number of outputs from the `SamHQMaskDecoder` module. In the Segment Anything paper, this is set to 3.
+        iou_head_depth (`int`, *optional*, defaults to 3):
+            The number of layers in the IoU head module.
+        iou_head_hidden_dim (`int`, *optional*, defaults to 256):
+            The dimensionality of the hidden states in the IoU head module.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the layer normalization layers.
+        vit_dim (`int`, *optional*, defaults to 768):
+            Dimensionality of the Vision Transformer (ViT) used in the `SamHQMaskDecoder` module.
     """
 
     def __init__(
diff --git a/src/transformers/models/smolvlm/modeling_smolvlm.py b/src/transformers/models/smolvlm/modeling_smolvlm.py
index f6a8b6ac46..bbe5a98154 100644
--- a/src/transformers/models/smolvlm/modeling_smolvlm.py
+++ b/src/transformers/models/smolvlm/modeling_smolvlm.py
@@ -874,16 +874,6 @@ class SmolVLMForConditionalGeneration(SmolVLMPreTrainedModel, GenerationMixin):
         **kwargs: Unpack[TransformersKwargs],
     ) -> Union[tuple, SmolVLMCausalLMOutputWithPast]:
         r"""
-        pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*):
-            Mask to avoid performing attention on padding pixel indices.
-        image_hidden_states (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
-            The hidden states of the image encoder after modality projection.
-        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
-            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
-            config.vocab_size]` or `model.image_token_id` (where `model` is your instance of `SmolVLMForConditionalGeneration`).
-            Tokens with indices set to `model.image_token_id` are ignored (masked), the loss is only
-            computed for the tokens with labels in `[0, ..., config.vocab_size]`.
-
         Example:
 
         ```python
diff --git a/src/transformers/models/t5gemma/configuration_t5gemma.py b/src/transformers/models/t5gemma/configuration_t5gemma.py
index 2280848735..ff51537d81 100644
--- a/src/transformers/models/t5gemma/configuration_t5gemma.py
+++ b/src/transformers/models/t5gemma/configuration_t5gemma.py
@@ -26,81 +26,80 @@ from ...configuration_utils import PretrainedConfig, layer_type_validation
 
 class T5GemmaModuleConfig(PretrainedConfig):
     r"""
-        This is the configuration class to store the configuration of a [`T5GemmaModuleModel`]. It is used to instantiate an T5GemmaModule
-        model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
-        defaults will yield a similar configuration to that of the T5GemmaModule-7B.
-        e.g. [google/t5_gemma_module-7b](https://huggingface.co/google/t5_gemma_module-7b)
-        Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
-        documentation from [`PretrainedConfig`] for more information.
-        Args:
-            vocab_size (`int`, *optional*, defaults to 256000):
-                Vocabulary size of the T5GemmaModule model. Defines the number of different tokens that can be represented by the
-                `inputs_ids` passed when calling [`T5GemmaModuleModel`]
-            hidden_size (`int`, *optional*, defaults to 2304):
-                Dimension of the hidden representations.
-            intermediate_size (`int`, *optional*, defaults to 9216):
-                Dimension of the MLP representations.
-            num_hidden_layers (`int`, *optional*, defaults to 26):
-                Number of hidden layers in the Transformer decoder.
-            num_attention_heads (`int`, *optional*, defaults to 8):
-                Number of attention heads for each attention layer in the Transformer decoder.
-            num_key_value_heads (`int`, *optional*, defaults to 4):
-                This is the number of key_value heads that should be used to implement Grouped Query Attention. If
-                `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
-                `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
-                converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
-                by meanpooling all the original heads within that group. For more details, check out [this
-                paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
-                `num_attention_heads`.
-            head_dim (`int`, *optional*, defaults to 256):
-                The attention head dimension.
-            hidden_activation (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
-                The non-linear activation function (function or string) in the decoder. Will default to `"gelu_pytorch_tanh"`
-                if not specified. `"gelu_pytorch_tanh"` uses an approximation of the `"gelu"` activation function.
-            max_position_embeddings (`int`, *optional*, defaults to 8192):
-                The maximum sequence length that this model might ever be used with.
-            initializer_range (`float`, *optional*, defaults to 0.02):
-                The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-            rms_norm_eps (`float`, *optional*, defaults to 1e-06):
-                The epsilon used by the rms normalization layers.
-            use_cache (`bool`, *optional*, defaults to `True`):
-                Whether or not the model should return the last key/values attentions (not used by all models). Only
-                relevant if `config.is_decoder=True`.
-            pad_token_id (`int`, *optional*, defaults to 0):
-                Padding token id.
-            eos_token_id (`int`, *optional*, defaults to 1):
-                End of stream token id.
-            bos_token_id (`int`, *optional*, defaults to 2):
-                Beginning of stream token id.
-            tie_word_embeddings (`bool`, *optional*, defaults to `True`):
-                Whether to tie weight embeddings
-            rope_theta (`float`, *optional*, defaults to 10000.0):
-                The base period of the RoPE embeddings.
-            attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
-                Whether to use a bias in the query, key, value and output projection layers during self-attention.
-            attention_dropout (`float`, *optional*, defaults to 0.0):
-                The dropout ratio for the attention probabilities.
-            query_pre_attn_scalar (`float`, *optional*, defaults to 256):
-                scaling factor used on the attention scores
-            sliding_window (`int`, *optional*, defaults to 4096):
-                in T5GemmaModule, every other layer uses sliding window attention. This is the size of the sliding window.
-            layer_types (`list`, *optional*):
-                Attention pattern for each layer.
-            final_logit_softcapping (`float`, *optional*, defaults to 30.0):
-                scaling factor when applying tanh softcapping on the logits.
-            attn_logit_softcapping (`float`, *optional*, defaults to 50.0):
-                scaling factor when applying tanh softcapping on the attention scores.
+    This is the configuration class to store the configuration of a [`T5GemmaModuleModel`]. It is used to instantiate an T5GemmaModule
+    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+    defaults will yield a similar configuration to that of the T5GemmaModule-7B.
+    e.g. [google/t5_gemma_module-7b](https://huggingface.co/google/t5_gemma_module-7b)
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+    Args:
+        vocab_size (`int`, *optional*, defaults to 256000):
+            Vocabulary size of the T5GemmaModule model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`T5GemmaModuleModel`]
+        hidden_size (`int`, *optional*, defaults to 2304):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 9216):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 26):
+            Number of hidden layers in the Transformer decoder.
+        num_attention_heads (`int`, *optional*, defaults to 8):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        num_key_value_heads (`int`, *optional*, defaults to 4):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details, check out [this
+            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
+            `num_attention_heads`.
+        head_dim (`int`, *optional*, defaults to 256):
+            The attention head dimension.
+        hidden_activation (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
+            The non-linear activation function (function or string) in the decoder. Will default to `"gelu_pytorch_tanh"`
+            if not specified. `"gelu_pytorch_tanh"` uses an approximation of the `"gelu"` activation function.
+        max_position_embeddings (`int`, *optional*, defaults to 8192):
+            The maximum sequence length that this model might ever be used with.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        pad_token_id (`int`, *optional*, defaults to 0):
+            Padding token id.
+        eos_token_id (`int`, *optional*, defaults to 1):
+            End of stream token id.
+        bos_token_id (`int`, *optional*, defaults to 2):
+            Beginning of stream token id.
+        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
+            Whether to tie weight embeddings
+        rope_theta (`float`, *optional*, defaults to 10000.0):
+            The base period of the RoPE embeddings.
+        attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
+            Whether to use a bias in the query, key, value and output projection layers during self-attention.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        query_pre_attn_scalar (`float`, *optional*, defaults to 256):
+            scaling factor used on the attention scores
+        sliding_window (`int`, *optional*, defaults to 4096):
+            in T5GemmaModule, every other layer uses sliding window attention. This is the size of the sliding window.
+        layer_types (`list`, *optional*):
+            Attention pattern for each layer.
+        final_logit_softcapping (`float`, *optional*, defaults to 30.0):
+            scaling factor when applying tanh softcapping on the logits.
+        attn_logit_softcapping (`float`, *optional*, defaults to 50.0):
+            scaling factor when applying tanh softcapping on the attention scores.
 
-        ```python
-        >>> from transformers import T5GemmaModuleModel, T5GemmaModuleConfig
-        >>> # Initializing a T5GemmaModule t5_gemma_module-7b style configuration
-        >>> configuration = T5GemmaModuleConfig()
-        >>> # Initializing a model from the t5_gemma_module-7b style configuration
-        >>> model = T5GemmaModuleModel(configuration)
-        >>> # Accessing the model configuration
-        >>> configuration = model.config
-        ```
-    Module config (encoder or decoder): the same as Gemma2Config."""
+    ```python
+    >>> from transformers import T5GemmaModuleModel, T5GemmaModuleConfig
+    >>> # Initializing a T5GemmaModule t5_gemma_module-7b style configuration
+    >>> configuration = T5GemmaModuleConfig()
+    >>> # Initializing a model from the t5_gemma_module-7b style configuration
+    >>> model = T5GemmaModuleModel(configuration)
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
 
     model_type = "t5_gemma_module"
     keys_to_ignore_at_inference = ["past_key_values"]
diff --git a/src/transformers/models/t5gemma/modular_t5gemma.py b/src/transformers/models/t5gemma/modular_t5gemma.py
index 603d485359..56533855ee 100644
--- a/src/transformers/models/t5gemma/modular_t5gemma.py
+++ b/src/transformers/models/t5gemma/modular_t5gemma.py
@@ -68,10 +68,7 @@ logger = logging.get_logger(__name__)
 
 
 class T5GemmaModuleConfig(Gemma2Config):
-    """Module config (encoder or decoder): the same as Gemma2Config."""
-
-    def __init__(self, **super_kwargs):
-        super().__init__(**super_kwargs)
+    pass
 
 
 class T5GemmaConfig(PretrainedConfig):
diff --git a/src/transformers/models/zamba2/modeling_zamba2.py b/src/transformers/models/zamba2/modeling_zamba2.py
index 13df73c09e..c565808363 100644
--- a/src/transformers/models/zamba2/modeling_zamba2.py
+++ b/src/transformers/models/zamba2/modeling_zamba2.py
@@ -319,17 +319,6 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
 
 class Zamba2Attention(nn.Module):
     """
-    Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer
-    and "Generating Long Sequences with Sparse Transformers".
-
-    Adapted from transformers.models.mistral.modeling_mistral.MistralAttention:
-    The input dimension here is attention_hidden_size = 2 * hidden_size, and head_dim = attention_hidden_size // num_heads.
-    The extra factor of 2 comes from the input being the concatenation of original_hidden_states with the output of the previous (mamba) layer
-    (see fig. 2 in https://huggingface.co/papers/2405.16712).
-    Additionally, replaced
-    attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) with
-    attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim/2)
-
     Multi-headed attention from 'Attention Is All You Need' paper.
 
     Adapted from transformers.models.mistral.modeling_mistral.MistralAttention:
diff --git a/tests/repo_utils/modular/test_conversion_order.py b/tests/repo_utils/modular/test_conversion_order.py
index 7f44684b7b..265873c604 100644
--- a/tests/repo_utils/modular/test_conversion_order.py
+++ b/tests/repo_utils/modular/test_conversion_order.py
@@ -23,7 +23,6 @@ FILES_TO_PARSE = [
     os.path.join(MODEL_ROOT, "rt_detr", "modular_rt_detr.py"),
     os.path.join(MODEL_ROOT, "qwen2", "modular_qwen2.py"),
     os.path.join(MODEL_ROOT, "qwen3", "modular_qwen3.py"),
-    os.path.join(MODEL_ROOT, "qwen3", "modular_qwen3_moe.py"),
     os.path.join(MODEL_ROOT, "llava_next_video", "modular_llava_next_video.py"),
     os.path.join(MODEL_ROOT, "cohere2", "modular_cohere2.py"),
     os.path.join(MODEL_ROOT, "modernbert", "modular_modernbert.py"),
diff --git a/utils/modular_model_converter.py b/utils/modular_model_converter.py
index 187fb60afb..7585695e65 100644
--- a/utils/modular_model_converter.py
+++ b/utils/modular_model_converter.py
@@ -249,90 +249,13 @@ class ReplaceMethodCallTransformer(cst.CSTTransformer):
         return updated_node
 
 
-def get_docstring_indent(docstring):
-    # Match the first line after the opening triple quotes
-    match = re.search(r'(?:"""|\'\'\'|```)\n(\s+)', docstring)
-    if match:
-        # Return the indentation spaces captured
-        return len(match.group(1))
-    return 0
-
-
-def is_full_docstring(original_docstring: str, new_docstring: str, original_level: int) -> bool:
-    """Check if `new_docstring` is a full docstring, or if it is only part of a docstring that should then
-    be merged with the existing old one.
-    """
-    # libcst returns the docstrinbgs with literal `r"""` quotes in front
-    new_docstring = new_docstring.split('"""', 1)[1]
-    # The docstring contains Args definition, so it is self-contained
-    if re.search(r"\n\s*Args:\n", new_docstring):
-        return True
-    elif re.search(r"\n\s*Args:\n", original_docstring):
-        return False
-    # Check if the docstring contains args docstring (meaning it is self contained):
-    param_pattern = re.compile(
-        # |--- Group 1 ---|| Group 2 ||- Group 3 -||---------- Group 4 ----------|
-        rf"^\s{{0,{original_level}}}(\w+)\s*\(\s*([^, \)]*)(\s*.*?)\s*\)\s*:\s*((?:(?!\n^\s{{0,{original_level}}}\w+\s*\().)*)",
-        re.DOTALL | re.MULTILINE,
-    )
-    match_object = param_pattern.search(new_docstring)
-    if match_object is not None:
-        return True
-    # If it contains Returns, but starts with text indented with an additional 4 spaces before, it is self-contained
-    # (this is the scenario when using `@add_start_docstrings_to_model_forward`, but adding more args to docstring)
-    match_object = re.search(r"\n([^\S\n]*)Returns:\n", new_docstring)
-    if match_object is not None:
-        full_indent = match_object.group(1)
-        striped_doc = new_docstring.strip("\n")
-        if striped_doc.startswith(full_indent + " " * 4) or striped_doc.startswith(full_indent + "\t"):
-            return True
-    return False
-
-
-def merge_docstrings(original_docstring, updated_docstring):
-    original_level = get_docstring_indent(original_docstring)
-    if not is_full_docstring(original_docstring, updated_docstring, original_level):
-        # Split the docstring at the example section, assuming `"""` is used to define the docstring
-        parts = original_docstring.split("```")
-        if "```" in updated_docstring and len(parts) > 1:
-            updated_docstring = updated_docstring.lstrip('r"')
-            new_parts = updated_docstring.split("```")
-            if len(new_parts) != 3:
-                raise ValueError("There should only be one example, and it should have opening and closing '```'")
-            parts[1] = new_parts[1]
-            updated_docstring = "".join(
-                [
-                    f"\n{original_level * ' '}```",
-                    parts[1],
-                    "```",
-                    parts[2],
-                ]
-            )
-            docstring_opening, original_start_docstring = parts[0].rstrip(" \n").split('"""')[:2]
-            new_start_docstring = new_parts[0].rstrip(" \n")
-            docstring_opening += '"""'
-            if new_start_docstring.startswith(original_start_docstring):
-                updated_docstring = new_start_docstring + "\n" + updated_docstring
-            elif original_start_docstring.endswith(new_start_docstring):
-                updated_docstring = original_start_docstring + "\n" + updated_docstring
-            else:
-                updated_docstring = original_start_docstring + "\n" + new_start_docstring + "\n" + updated_docstring
-            updated_docstring = docstring_opening + updated_docstring
-        elif updated_docstring not in original_docstring:
-            # add tabulation if we are at the lowest level.
-            if re.search(r"\n\s*.*\(.*\)\:\n\s*\w", updated_docstring):
-                updated_docstring = updated_docstring.replace("\n    ", "\n        ")
-            updated_docstring = original_docstring.rstrip('"') + "\n" + updated_docstring.lstrip('r"\n')
-    return updated_docstring
-
-
 class SuperTransformer(cst.CSTTransformer):
     METADATA_DEPENDENCIES = (ParentNodeProvider,)
 
-    def __init__(self, python_module: cst.Module, original_methods, updated_methods, all_bases=None):
+    def __init__(self, python_module: cst.Module, original_modeling_methods, modular_methods, all_bases=None):
         self.python_module = python_module
-        self.original_methods = original_methods
-        self.updated_methods = updated_methods
+        self.original_modeling_methods = original_modeling_methods
+        self.modular_methods = modular_methods
         self.all_assign_target = {}
         self.deleted_targets = {}  # child node can delete some arguments
         self.all_bases = all_bases or []
@@ -414,53 +337,39 @@ class SuperTransformer(cst.CSTTransformer):
                 break
         return new_body
 
-    def replace_super_calls(self, node: cst.IndentedBlock, func_name: str) -> cst.CSTNode:
+    def replace_super_calls(self, node: cst.BaseSuite, func_name: str) -> cst.BaseSuite:
         """Updates the body of the input `node`'s `func_name` function by replacing calls
         to super().func_name() with the source code of the parent class' `func_name`.
         It keeps everything that is defined before `super().func_name()`.
         """
-        self.has_docstring = False
-        parent_has_docstring = False
-        if func_name in self.original_methods:
-            parent_has_docstring = m.matches(self.original_methods[func_name].body.body[0], DOCSTRING_NODE)
         new_body = []
-        has_super_call = False
+        modular_node_body = node.body
 
-        for i, expr in enumerate(node.body):
+        for i, expr in enumerate(modular_node_body):
             if is_call_to_super(expr, func_name):
-                has_super_call = True
-                new_body.extend(self.update_body(self.original_methods[func_name].body.body, node.body[i + 1 :]))
+                original_modeling_method_body = self.original_modeling_methods[func_name].body.body
+                new_body.extend(self.update_body(original_modeling_method_body, modular_node_body[i + 1 :]))
                 new_body = self._fix_init_location(new_body)
+                return node.with_changes(body=new_body)
             else:
                 expr = expr.visit(self.transformer)
-            if m.matches(expr, DOCSTRING_NODE):
-                self.has_docstring = True
-                if parent_has_docstring:  # actually here we ought to de-duplicate?
-                    original_docstring = self.original_methods[func_name].body.body[0].body[0].value.value
-                    updated_docstring = expr.body[0].value.value
-                    merged_doc = merge_docstrings(original_docstring, updated_docstring)
-                    new_node = [expr.with_changes(body=[cst.Expr(value=cst.SimpleString(value=merged_doc))])]
-                else:
-                    new_node = [expr]
-                new_body.extend(new_node)
-            elif not m.matches(expr, m.SimpleStatementLine(body=[m.Del()])) and not has_super_call:
+            if not m.matches(expr, m.SimpleStatementLine(body=[m.Del()])):
                 new_body.append(expr)
-        if not self.has_docstring and parent_has_docstring:
-            new_body = [self.original_methods[func_name].body.body[0]] + new_body
+
         return node.with_changes(body=new_body)
 
-    def leave_FunctionDef(self, original_node: cst.Call, updated_node: cst.Call) -> cst.CSTNode:
-        if updated_node.name.value in self.updated_methods:
-            name = updated_node.name.value
+    def leave_FunctionDef(self, original_node: cst.FunctionDef, updated_node: cst.FunctionDef) -> cst.FunctionDef:
+        name = updated_node.name.value
+        if name in self.modular_methods:
             new_body = self.replace_super_calls(updated_node.body, name)
             return updated_node.with_changes(body=new_body, params=updated_node.params)
         return updated_node
 
-    def leave_Return(self, original_node: cst.Return, updated_node: cst.Return) -> cst.CSTNode:
+    def leave_Return(self, original_node: cst.Return, updated_node: cst.Return) -> cst.Return:
         """ "When a return statement is reached, it is replaced with the unrolled super code"""
         if m.matches(updated_node.value, m.Call(func=m.Attribute(attr=m.Name("super")))):
             func_def = self.get_metadata(ParentNodeProvider, original_node)
-            if m.matched(func_def, m.FunctionDef()) and func_def.name.value in self.original_methods:
+            if m.matched(func_def, m.FunctionDef()) and func_def.name.value in self.original_modeling_methods:
                 updated_return_value = updated_node.value.with_changes(
                     args=[
                         cst.Arg(
@@ -979,55 +888,52 @@ def common_partial_suffix(str1: str, str2: str) -> str:
 
 
 def replace_class_node(
-    mapper: ModelFileMapper, class_node: cst.ClassDef, renamed_super_class: str, original_super_class: str
-):
+    mapper: ModelFileMapper, modular_class_node: cst.ClassDef, renamed_super_class: str, original_super_class: str
+) -> cst.ClassDef:
     """
     Replace a class node which inherits from another modeling class. This function works in the following way:
-    - start from the base class node of the inherited class (a cst.Node)
-    - replace all methods of the base node with the methods defined in the child class
-    - append all new methods defined in the child class
+    - start from the methods and class attributes of the original modeling code node, and replace their definition
+    if overridden in the modular
+    - append all new methods and class attributes defined in the child class
+    - method and class docstrings/decorators come from the modular file if defined there, else from the original modeling
     - replace all calls to super() with the unravelled code
 
-                    |    ```python                          |               |    ```python
-                    |    class GemmaModel(LlamaModel):      |               |       class GemmaModel(nn.Module):
-                    |        def __init__(self):            |               |           def __init__(self):
-    Going from:     |            super().__init__()         |       to:     |               super().__init__(config)
-                    |            self.dropout = 0.2         |               |               self.dropout = 0.2
-                    |     ```                               |               |               self.padding_idx = config.pad_token_id
-                                                                            |               self.vocab_size = config.vocab_size
-                                                                            |               self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
-                                                                            |               self.layers = nn.ModuleList(
-                                                                            |                   [LlamaDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
-                                                                            |               )
-                                                                            |               self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
-                                                                            |               self.gradient_checkpointing = False
-                                                                            |               # Initialize weights and apply final processing
-                                                                            |               self.post_init()
-                                                                            |     ```
-    """
-    all_bases = [get_full_attribute_name(k.value) for k in class_node.bases]
-    if any(base is None for base in all_bases):
-        raise ValueError(f"Could not parse the name of the bases for {class_node.name.value}")
+    Args:
+        mapper (`ModelFileMapper`):
+            The mapper corresponding to the visited file from which the modular class node inherits.
+        modular_class_node (`cst.ClassDef`):
+            The class node as found in the modular file.
+        renamed_super_class (`str`):
+            The name of the class from which `modular_class_node` inherits after automatic renaming.
+        original_super_class (`str`):
+            The name of the class from which `modular_class_node` inherits before automatic renaming.
 
-    original_node = mapper.classes[renamed_super_class]
+    Returns:
+        A new class node corresponding to the modular definition.
+    """
+    all_bases = [get_full_attribute_name(k.value) for k in modular_class_node.bases]
+    if any(base is None for base in all_bases):
+        raise ValueError(f"Could not parse the name of the bases for {modular_class_node.name.value}")
+
+    original_modeling_node = mapper.classes[renamed_super_class]
     # Always use the new name of the class (in case we use e.g. `ColPaliForRetrieval` inheriting from `PaliGemmaForConditionalGeneration`)
-    new_name = class_node.name
+    new_class_name = modular_class_node.name
 
     # If the new class name is different from the renamed super class name, we need to update the docstrings/comments accordingly
-    if new_name.value != renamed_super_class:
-        common_suffix = common_partial_suffix(new_name.value, renamed_super_class)
+    if new_class_name.value != renamed_super_class:
+        common_suffix = common_partial_suffix(new_class_name.value, renamed_super_class)
         # Note that this works even without common prefix, in which case it does not replace anything
-        old, new = renamed_super_class.replace(common_suffix, ""), new_name.value.replace(common_suffix, "")
-        temp_module = cst.Module(body=[original_node])
-        original_node = temp_module.visit(
+        old, new = renamed_super_class.replace(common_suffix, ""), new_class_name.value.replace(common_suffix, "")
+        temp_module = cst.Module(body=[original_modeling_node])
+        original_modeling_node = temp_module.visit(
             ReplaceNameTransformer(get_lowercase_name(old), get_lowercase_name(new), only_doc=True)
         ).body[0]
 
     # If we explicitly passed a new base with common suffix to an old base, it is for switching the prefix
     # e.g. if the "natural" parent class is `PreTrainedModel` but we wanted to rename it to `PreTrainedVisionModel`
     additional_bases = [base for base in all_bases if base != original_super_class]
-    new_bases = []
-    for original_base in original_node.bases:
+    new_class_bases = []
+    for original_base in original_modeling_node.bases:
         new_base = original_base
         # we only potentially switch base for Name-based bases, not Attribute
         if m.matches(original_base.value, m.Name()):
@@ -1038,106 +944,125 @@ def replace_class_node(
                     new_name_node = original_base.value.with_changes(value=additional_base_name)
                     new_base = original_base.with_changes(value=new_name_node)
                     break
-        new_bases.append(new_base)
+        new_class_bases.append(new_base)
 
-    original_methods = {
-        f.name.value if hasattr(f, "name") else mapper.python_module.code_for_node(f): f
-        for f in original_node.body.body
+    # Use class decorators redefined in modular file if any
+    new_class_decorators = (
+        modular_class_node.decorators if len(modular_class_node.decorators) > 0 else original_modeling_node.decorators
+    )
+
+    # Compute new class docstring
+    original_modeling_docstring = [
+        node for node in original_modeling_node.body.body if m.matches(node, DOCSTRING_NODE)
+    ]
+    modular_docstring = [node for node in modular_class_node.body.body if m.matches(node, DOCSTRING_NODE)]
+    # Use class docstring in modular if any, else original modeling code docstring
+    new_class_docstring = modular_docstring if len(modular_docstring) > 0 else original_modeling_docstring
+
+    # Compute new class attributes
+    original_modeling_class_attributes = {
+        node.body[0].targets[0].target.value: node
+        for node in original_modeling_node.body.body
+        if m.matches(node, m.SimpleStatementLine(body=[m.Assign()]))
     }
-    updated_methods = {
-        f.name.value if hasattr(f, "name") else mapper.python_module.code_for_node(f): f for f in class_node.body.body
+    original_modeling_class_attributes.update(
+        {
+            node.body[0].target.value: node
+            for node in original_modeling_node.body.body
+            if m.matches(node, m.SimpleStatementLine(body=[m.AnnAssign()]))
+        }
+    )
+    modular_class_attributes = {
+        node.body[0].targets[0].target.value: node
+        for node in modular_class_node.body.body
+        if m.matches(node, m.SimpleStatementLine(body=[m.Assign()]))
     }
-    end_meth = []
+    modular_class_attributes.update(
+        {
+            node.body[0].target.value: node
+            for node in modular_class_node.body.body
+            if m.matches(node, m.SimpleStatementLine(body=[m.AnnAssign()]))
+        }
+    )
+    # Use all original modeling attributes, and potentially override some with values in the modular
+    new_class_attributes = list({**original_modeling_class_attributes, **modular_class_attributes}.values())
 
-    assign_targets = {}
-    docstring_node = []
-    # Iterate directly from node.body as there can be property/setters with same names which are overwritten when we use a dict
-    for func in original_node.body.body:
-        name = func.name.value if hasattr(func, "name") else mapper.python_module.code_for_node(func)
-        if m.matches(func, m.FunctionDef()) and name in updated_methods and updated_methods[name] is not None:
-            new_params = updated_methods[name].params
-            # Replace the method in the replacement class, preserving decorators
-            kwarg_name = getattr(updated_methods[name].params, "star_kwarg", None)
-            if kwarg_name and kwarg_name.name.value == "super_kwargs":
-                parent_params = {k.name.value: k for k in func.params.params}
-                parent_params.update({k.name.value: k for k in new_params.params[1:]})
-                new_params = new_params.with_changes(
-                    params=list(parent_params.values()), star_kwarg=func.params.star_kwarg
-                )
-            # Keep decorators in `modular_xxx.py` if any, else original decorators
-            new_decorators = (
-                updated_methods[name].decorators if len(updated_methods[name].decorators) > 0 else func.decorators
-            )
+    original_modeling_methods = {
+        node.name.value: node for node in original_modeling_node.body.body if m.matches(node, m.FunctionDef())
+    }
+    modular_methods = {
+        node.name.value: node for node in modular_class_node.body.body if m.matches(node, m.FunctionDef())
+    }
 
-            # Keep return annotation in `modular_xxx.py` if any, else original return annotation
-            new_return_annotation = updated_methods[name].returns if updated_methods[name].returns else func.returns
+    new_class_methods = []
+    # Iterate over the methods of the original modeling code, and add them to the list of methods to add
+    for name, node in original_modeling_methods.items():
+        # If the method was redefined in modular, make appropriate changes to the node
+        if name in modular_methods:
+            # Get the corresponding method node in modular
+            modular_node = modular_methods[name]
 
-            if not re.match(
-                r"\ndef .*\(.*\):\n    raise.*Error\(.*",
-                mapper.python_module.code_for_node(updated_methods[name]),
-            ):
-                func = func.with_changes(
-                    body=updated_methods[name].body,
-                    params=new_params,
-                    decorators=new_decorators,
-                    returns=new_return_annotation,
-                )
-            else:
+            # If we match the pattern, we should avoid inheriting the method
+            if re.match(r"\ndef .*\(.*\):\n    raise.*Error\(.*", mapper.python_module.code_for_node(modular_node)):
                 continue
 
-        if m.matches(func, m.SimpleStatementLine(body=[m.Assign()])):
-            target = mapper.python_module.code_for_node(func.body[0].targets[0])
-            assign_targets[target] = func
-        elif m.matches(func, m.SimpleStatementLine(body=[m.AnnAssign()])):
-            target = mapper.python_module.code_for_node(func.body[0].target)
-            assign_targets[target] = func
-        elif m.matches(func, DOCSTRING_NODE):
-            docstring_node = [func]
-        else:
-            end_meth.append(func)
+            # Compute new method docstring
+            modeling_docstring = [node_ for node_ in node.body.body if m.matches(node_, DOCSTRING_NODE)]
+            modular_docstring = [node_ for node_ in modular_node.body.body if m.matches(node_, DOCSTRING_NODE)]
+            # Use method docstring in modular if any, else original modeling code docstring
+            new_body = (
+                modular_node.body.body
+                if len(modular_docstring) > 0
+                else modeling_docstring + list(modular_node.body.body)
+            )
+            new_body = modular_node.body.with_changes(body=new_body)
+
+            # Use arguments as defined in the modular
+            new_params = modular_node.params
+
+            # If using the `**super_kwargs` syntax in modular, merge any existing modular arg with all the original modeling ones
+            kwarg_name = getattr(modular_node.params, "star_kwarg", None)
+            if kwarg_name and kwarg_name.name.value == "super_kwargs":
+                original_modeling_params = {k.name.value: k for k in node.params.params}
+                modular_params = {k.name.value: k for k in new_params.params[1:]}
+                new_param_list = list({**original_modeling_params, **modular_params}.values())
+                new_params = new_params.with_changes(params=new_param_list, star_kwarg=node.params.star_kwarg)
+
+            # Keep decorators in modular if any, else original decorators
+            new_decorators = modular_node.decorators if len(modular_node.decorators) > 0 else node.decorators
+
+            # Keep return annotation in modular if any, else original return annotation
+            new_return_annotation = modular_node.returns if modular_node.returns else node.returns
+
+            # Update the method node
+            node = node.with_changes(
+                body=new_body,
+                params=new_params,
+                decorators=new_decorators,
+                returns=new_return_annotation,
+            )
+
+        new_class_methods.append(node)
 
     # Port new methods that are defined only in modular-file and append at the end
-    for func in class_node.body.body:
-        name = func.name.value if hasattr(func, "name") else mapper.python_module.code_for_node(func)
-        if m.matches(func, DOCSTRING_NODE):  # This processes the docstring of the class!
-            # Extract the original docstring
-            updated_docstring = func.body[0].value.value
-            if len(docstring_node) == 0:  # If the original docstring is empty, just create one from the updated.
-                docstring_node = [
-                    cst.SimpleStatementLine(body=[cst.Expr(value=cst.SimpleString(value=updated_docstring))])
-                ]
-            else:
-                original_docstring = docstring_node[0].body[0].value.value
-                merged_doc = merge_docstrings(original_docstring, updated_docstring)
-                # Update the docstring in the original function
-                docstring_node = [
-                    docstring_node[0].with_changes(body=[cst.Expr(value=cst.SimpleString(value=merged_doc))])
-                ]
-        if name not in original_methods and func is not None and isinstance(func, cst.FunctionDef):
-            end_meth.append(func)
-        if m.matches(func, m.SimpleStatementLine(body=[m.Assign()])):
-            # TODO we only use single assign might cause issues
-            target = mapper.python_module.code_for_node(func.body[0].targets[0])
-            assign_targets[target] = func
-        if m.matches(func, m.SimpleStatementLine(body=[m.AnnAssign()])):
-            target = mapper.python_module.code_for_node(func.body[0].target)
-            assign_targets[target] = func
-    end_meth = docstring_node + list(assign_targets.values()) + end_meth
+    for name, node in modular_methods.items():
+        if name not in original_modeling_methods:
+            new_class_methods.append(node)
 
-    # Replace the calls to `super()` with the unrolled code
-    result_node = original_node.with_changes(body=cst.IndentedBlock(body=end_meth))
+    # Recreate the whole new class body
+    new_class_body = new_class_docstring + new_class_attributes + new_class_methods
+
+    # Replace the calls to `super()` of the redefined modular methods with the unrolled code
+    result_node = original_modeling_node.with_changes(body=cst.IndentedBlock(body=new_class_body))
     temp_module = cst.Module(body=[result_node])
     new_module = MetadataWrapper(temp_module)
     new_replacement_class = new_module.visit(
-        SuperTransformer(temp_module, original_methods, updated_methods, all_bases)
+        SuperTransformer(temp_module, original_modeling_methods, modular_methods, all_bases)
     )
-    new_replacement_body = new_replacement_class.body[0].body  # get the indented block
+    new_class_body = new_replacement_class.body[0].body  # get the indented block
 
-    # Use decorators redefined in `modular_xxx.py` if any
-    new_decorators = class_node.decorators if len(class_node.decorators) > 0 else original_node.decorators
-
-    return original_node.with_changes(
-        body=new_replacement_body, decorators=new_decorators, bases=new_bases, name=new_name
+    return original_modeling_node.with_changes(
+        body=new_class_body, decorators=new_class_decorators, bases=new_class_bases, name=new_class_name
     )