From 920f34a77245adb2995e99f15f52591e1b8f3e97 Mon Sep 17 00:00:00 2001
From: Nikos Antoniou <57691096+nikosanto13@users.noreply.github.com>
Date: Tue, 21 Jan 2025 09:06:44 +0200
Subject: [PATCH] modular_model_converter bugfix on assignments (#35642)

* added bugfix in modular converter to keep modular assignments for docstrings, expected outputs etc.

* revert stracoder2 docstring copying, add forward in EMU3 to enable docstring assingment, remove verbatim assignments in modular converter

* added _FOR_DOC in assignments to keep, corrected wrong checkpoint name in ijepa's configuration
---
 .../models/bamba/modeling_bamba.py            |   1 +
 .../models/cohere/modeling_cohere.py          |   1 +
 .../models/cohere2/modeling_cohere2.py        |   1 +
 src/transformers/models/emu3/modeling_emu3.py | 166 +++++++++---------
 src/transformers/models/emu3/modular_emu3.py  |   4 +
 .../models/ijepa/configuration_ijepa.py       |   2 +-
 .../models/ijepa/modeling_ijepa.py            |   7 +-
 .../models/moonshine/modeling_moonshine.py    |   1 -
 src/transformers/models/phi/modeling_phi.py   |   2 +-
 src/transformers/models/phi/modular_phi.py    |   3 +
 .../models/starcoder2/modeling_starcoder2.py  |   1 +
 utils/modular_model_converter.py              |  22 +--
 12 files changed, 113 insertions(+), 98 deletions(-)

diff --git a/src/transformers/models/bamba/modeling_bamba.py b/src/transformers/models/bamba/modeling_bamba.py
index 3001718173..20a3247be2 100644
--- a/src/transformers/models/bamba/modeling_bamba.py
+++ b/src/transformers/models/bamba/modeling_bamba.py
@@ -61,6 +61,7 @@ else:
 
 
 logger = logging.get_logger(__name__)
+
 _CONFIG_FOR_DOC = "BambaConfig"
 
 
diff --git a/src/transformers/models/cohere/modeling_cohere.py b/src/transformers/models/cohere/modeling_cohere.py
index 0d0d6ccf9e..9c7207adfc 100644
--- a/src/transformers/models/cohere/modeling_cohere.py
+++ b/src/transformers/models/cohere/modeling_cohere.py
@@ -52,6 +52,7 @@ from .configuration_cohere import CohereConfig
 
 
 logger = logging.get_logger(__name__)
+
 _CONFIG_FOR_DOC = "CohereConfig"
 
 
diff --git a/src/transformers/models/cohere2/modeling_cohere2.py b/src/transformers/models/cohere2/modeling_cohere2.py
index 1dde941fe0..0b38c89d75 100644
--- a/src/transformers/models/cohere2/modeling_cohere2.py
+++ b/src/transformers/models/cohere2/modeling_cohere2.py
@@ -43,6 +43,7 @@ from .configuration_cohere2 import Cohere2Config
 
 
 logger = logging.get_logger(__name__)
+
 _CONFIG_FOR_DOC = "Cohere2Config"
 
 
diff --git a/src/transformers/models/emu3/modeling_emu3.py b/src/transformers/models/emu3/modeling_emu3.py
index dd8c333b9e..b42a222f6c 100644
--- a/src/transformers/models/emu3/modeling_emu3.py
+++ b/src/transformers/models/emu3/modeling_emu3.py
@@ -1257,7 +1257,7 @@ class Emu3RotaryEmbedding(nn.Module):
         return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
 
 
-EMU3_INPUTS_DOCSTRING = r"""
+EMU3_TEXT_INPUTS_DOCSTRING = r"""
     Args:
         input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
             Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
@@ -1292,19 +1292,15 @@ EMU3_INPUTS_DOCSTRING = r"""
             config.n_positions - 1]`.
 
             [What are position IDs?](../glossary#position-ids)
-        past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
+        past_key_values (`Cache`, *optional*):
             Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
             blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
             returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
 
-            Two formats are allowed:
-            - a [`~cache_utils.Cache`] instance, see our
-            [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache);
-            - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
-            shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
-            cache format.
+            Has to be an instance of [`~cache_utils.Cache`] instance, see our
+            [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).
 
-            The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
+            The model will output the same cache type that is fed as input. If no `past_key_values` are passed, the
             legacy cache format will be returned.
 
             If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
@@ -1366,7 +1362,7 @@ class Emu3TextModel(Emu3PreTrainedModel):
     def set_input_embeddings(self, value):
         self.embed_tokens = value
 
-    @add_start_docstrings_to_model_forward(EMU3_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(EMU3_TEXT_INPUTS_DOCSTRING)
     def forward(
         self,
         input_ids: torch.LongTensor = None,
@@ -1598,77 +1594,6 @@ class Emu3TextModel(Emu3PreTrainedModel):
 class KwargsForCausalLM(FlashAttentionKwargs, LossKwargs): ...
 
 
-EMU3_TEXT_INPUTS_DOCSTRING = r"""
-    Args:
-        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
-            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
-            it.
-
-            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
-            [`PreTrainedTokenizer.__call__`] for details.
-
-            [What are input IDs?](../glossary#input-ids)
-        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
-            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
-
-            - 1 for tokens that are **not masked**,
-            - 0 for tokens that are **masked**.
-
-            [What are attention masks?](../glossary#attention-mask)
-
-            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
-            [`PreTrainedTokenizer.__call__`] for details.
-
-            If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
-            `past_key_values`).
-
-            If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
-            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
-            information on the default strategy.
-
-            - 1 indicates the head is **not masked**,
-            - 0 indicates the head is **masked**.
-        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
-            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
-            config.n_positions - 1]`.
-
-            [What are position IDs?](../glossary#position-ids)
-        past_key_values (`Cache`, *optional*):
-            Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
-            blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
-            returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
-
-            Has to be an instance of [`~cache_utils.Cache`] instance, see our
-            [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).
-
-            The model will output the same cache type that is fed as input. If no `past_key_values` are passed, the
-            legacy cache format will be returned.
-
-            If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
-            have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
-            of shape `(batch_size, sequence_length)`.
-        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
-            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
-            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
-            model's internal embedding lookup matrix.
-        use_cache (`bool`, *optional*):
-            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
-            `past_key_values`).
-        output_attentions (`bool`, *optional*):
-            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
-            tensors for more detail.
-        output_hidden_states (`bool`, *optional*):
-            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
-            more detail.
-        return_dict (`bool`, *optional*):
-            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
-        cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
-            Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,
-            this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
-            the complete sequence length.
-"""
-
-
 class Emu3ForCausalLM(Emu3PreTrainedModel, GenerationMixin):
     _tied_weights_keys = ["lm_head.weight"]
     _tp_plan = {"lm_head": "colwise_rep"}
@@ -1790,6 +1715,85 @@ class Emu3ForCausalLM(Emu3PreTrainedModel, GenerationMixin):
         )
 
 
+EMU3_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+            it.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        pixel_values (`torch.FloatTensor` of shape `(batch_size, max_num_images, max_num_tiles, channels, image_size, image_size)):
+            The tensors corresponding to the input images. Pixel values can be obtained using
+            [`AutoImageProcessor`]. See [`Emu3ImageProcessor.__call__`] for details ([]`Emu3Processor`] uses
+            [`Emu3ImageProcessor`] for processing images).
+        image_sizes (`torch.LongTensor` of shape `(batch_size, 2)`):
+                The sizes of the images in the batch, being (height, width) for each image. Image sizes can be obtained using
+            [`AutoImageProcessor`]. See [`Emu3ImageProcessor.__call__`] for details ([]`Emu3Processor`] uses
+            [`Emu3ImageProcessor`] for processing images).
+        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
+            `past_key_values`).
+
+            If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
+            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
+            information on the default strategy.
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+            config.n_positions - 1]`.
+
+            [What are position IDs?](../glossary#position-ids)
+        past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
+            Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
+            blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
+            returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
+
+            Has to be an instance of [`~cache_utils.Cache`] instance, see our
+            [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).
+
+            The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
+            legacy cache format will be returned.
+
+            If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
+            have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
+            of shape `(batch_size, sequence_length)`.
+        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+            model's internal embedding lookup matrix.
+        use_cache (`bool`, *optional*):
+            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+            `past_key_values`).
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+        cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
+            Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,
+            this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
+            the complete sequence length.
+"""
+
+
 class Emu3ForConditionalGeneration(Emu3PreTrainedModel, GenerationMixin):
     _tied_weights_keys = ["text_model.lm_head.weight"]
 
diff --git a/src/transformers/models/emu3/modular_emu3.py b/src/transformers/models/emu3/modular_emu3.py
index ff57146939..aacf52fe31 100644
--- a/src/transformers/models/emu3/modular_emu3.py
+++ b/src/transformers/models/emu3/modular_emu3.py
@@ -1059,6 +1059,10 @@ class Emu3TextModel(LlamaModel, Emu3PreTrainedModel):
             [Emu3DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
         )
 
+    @add_start_docstrings_to_model_forward(EMU3_TEXT_INPUTS_DOCSTRING)
+    def forward(self, **super_kwargs):
+        super().forward(**super_kwargs)
+
 
 class Emu3ForCausalLM(LlamaForCausalLM, Emu3PreTrainedModel, GenerationMixin):
     config_class = Emu3TextConfig
diff --git a/src/transformers/models/ijepa/configuration_ijepa.py b/src/transformers/models/ijepa/configuration_ijepa.py
index 338532e68d..e59920338a 100644
--- a/src/transformers/models/ijepa/configuration_ijepa.py
+++ b/src/transformers/models/ijepa/configuration_ijepa.py
@@ -22,7 +22,7 @@ class IJepaConfig(PretrainedConfig):
     This is the configuration class to store the configuration of a [`IJepaModel`]. It is used to instantiate an IJEPA
     model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
     defaults will yield a similar configuration to that of the I-JEPA
-    [google/ijepa-base-patch16-224](https://huggingface.co/google/ijepa-base-patch16-224) architecture.
+    [facebook/ijepa_vith14_1k](https://huggingface.co/facebook/ijepa_vith14_1k) architecture.
 
     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
     documentation from [`PretrainedConfig`] for more information.
diff --git a/src/transformers/models/ijepa/modeling_ijepa.py b/src/transformers/models/ijepa/modeling_ijepa.py
index df254455ba..e01290b089 100644
--- a/src/transformers/models/ijepa/modeling_ijepa.py
+++ b/src/transformers/models/ijepa/modeling_ijepa.py
@@ -527,7 +527,9 @@ IJEPA_INPUTS_DOCSTRING = r"""
         return_dict (`bool`, *optional*):
             Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
 """
-_EXPECTED_OUTPUT_SHAPE = [1, 197, 768]
+
+
+_EXPECTED_OUTPUT_SHAPE = [1, 256, 1280]
 
 
 IJEPA_START_DOCSTRING = r"""
@@ -640,8 +642,7 @@ class IJepaModel(IJepaPreTrainedModel):
         )
 
 
-# Image classification docstring
-_IMAGE_CLASS_CHECKPOINT = "google/ijepa-base-patch16-224"
+_IMAGE_CLASS_CHECKPOINT = "facebook/ijepa_vith14_1k"
 _IMAGE_CLASS_EXPECTED_OUTPUT = "Egyptian cat"
 
 
diff --git a/src/transformers/models/moonshine/modeling_moonshine.py b/src/transformers/models/moonshine/modeling_moonshine.py
index 2c50d760df..03d2b8d00d 100644
--- a/src/transformers/models/moonshine/modeling_moonshine.py
+++ b/src/transformers/models/moonshine/modeling_moonshine.py
@@ -49,7 +49,6 @@ from .configuration_moonshine import MoonshineConfig
 
 
 logger = logging.get_logger(__name__)
-
 _CONFIG_FOR_DOC = "MoonshineConfig"
 
 
diff --git a/src/transformers/models/phi/modeling_phi.py b/src/transformers/models/phi/modeling_phi.py
index c81fbbd013..33439dff75 100644
--- a/src/transformers/models/phi/modeling_phi.py
+++ b/src/transformers/models/phi/modeling_phi.py
@@ -36,7 +36,7 @@ from .configuration_phi import PhiConfig
 
 logger = logging.get_logger(__name__)
 
-_CHECKPOINT_FOR_DOC = "meta-phi/Phi-2-7b-hf"
+_CHECKPOINT_FOR_DOC = "microsoft/phi-1"
 _CONFIG_FOR_DOC = "PhiConfig"
 
 
diff --git a/src/transformers/models/phi/modular_phi.py b/src/transformers/models/phi/modular_phi.py
index d8480a7ad6..1b98d939bf 100644
--- a/src/transformers/models/phi/modular_phi.py
+++ b/src/transformers/models/phi/modular_phi.py
@@ -26,6 +26,9 @@ from .configuration_phi import PhiConfig
 
 logger = logging.get_logger(__name__)
 
+_CHECKPOINT_FOR_DOC = "microsoft/phi-1"
+_CONFIG_FOR_DOC = "PhiConfig"
+
 
 class PhiAttention(LlamaAttention):
     def __init__(self, config: PhiConfig, layer_idx: int):
diff --git a/src/transformers/models/starcoder2/modeling_starcoder2.py b/src/transformers/models/starcoder2/modeling_starcoder2.py
index 8620c7d69d..500f96e3e3 100644
--- a/src/transformers/models/starcoder2/modeling_starcoder2.py
+++ b/src/transformers/models/starcoder2/modeling_starcoder2.py
@@ -56,6 +56,7 @@ from .configuration_starcoder2 import Starcoder2Config
 
 logger = logging.get_logger(__name__)
 _CHECKPOINT_FOR_DOC = "bigcode/starcoder2-7b"
+
 _CONFIG_FOR_DOC = "Starcoder2Config"
 
 
diff --git a/utils/modular_model_converter.py b/utils/modular_model_converter.py
index 8126d130ae..9730b6d2cb 100644
--- a/utils/modular_model_converter.py
+++ b/utils/modular_model_converter.py
@@ -268,7 +268,7 @@ def merge_docstrings(original_docstring, updated_docstring):
             updated_docstring = "".join(
                 [
                     parts[0].rstrip(" \n") + new_parts[0],
-                    f"\n{original_level*' '}```",
+                    f"\n{original_level * ' '}```",
                     parts[1],
                     "```",
                     parts[2],
@@ -515,10 +515,8 @@ def find_all_dependencies(
     return all_dependencies_with_parent
 
 
-# These top-level variables will always use the value in the `modular_xxx.py` file
-ASSIGNMENTS_TO_KEEP = {
-    "_CHECKPOINT_FOR_DOC",
-}
+# Top-level variables that match the following patterns will always use the value in the `modular_xxx.py` file
+ASSIGNMENTS_REGEX_TO_KEEP = [r"_CHECKPOINT", r"_EXPECTED", r"_FOR_DOC"]
 
 
 class ClassDependencyMapper(CSTVisitor):
@@ -828,12 +826,14 @@ class ModelFileMapper(ModuleMapper):
     def _merge_assignments(self, assignments: dict[str, cst.CSTNode], object_mapping: dict[str, set]):
         """Update the global nodes with the assignment from the modular file.
 
-        Merging rule: if any assignment with the same name was redefined in the modular, we use it and its dependencies ONLY if it is
-        in `ASSIGNMENTS_TO_KEEP`. Otherwise, we use the original value and dependencies. This rule was chosen to avoid having to rewrite the
+        Merging rule: if any assignment with the same name was redefined in the modular, we use it and its dependencies ONLY if it matches
+        a pattern in `ASSIGNMENTS_REGEX_TO_KEEP`. Otherwise, we use the original value and dependencies. This rule was chosen to avoid having to rewrite the
         big docstrings.
         """
         for assignment, node in assignments.items():
-            if assignment in ASSIGNMENTS_TO_KEEP or assignment not in self.assignments:
+            should_keep = any(re.search(pattern, assignment) for pattern in ASSIGNMENTS_REGEX_TO_KEEP)
+
+            if should_keep or assignment not in self.assignments:
                 self.assignments[assignment] = node
                 if assignment in object_mapping:
                     self.object_dependency_mapping[assignment] = object_mapping[assignment]
@@ -1404,7 +1404,7 @@ class ModularFileMapper(ModuleMapper):
             ]
             if len(modeling_bases) > 1:
                 raise ValueError(
-                    f"{class_name} was defined with more than 1 model-specific super class. This is unsupported. We found {*modeling_bases,}."
+                    f"{class_name} was defined with more than 1 model-specific super class. This is unsupported. We found {(*modeling_bases,)}."
                 )
             if len(modeling_bases) == 1:
                 filename = self.model_specific_imported_objects[modeling_bases[0]]
@@ -1432,7 +1432,7 @@ class ModularFileMapper(ModuleMapper):
             if final_name != cased_default_name and has_prefix_collision:
                 if len(prefixes_counter) > 1:
                     logger.warning(
-                        f"We detected multiple prefix names when inheriting from {file}: {*set(prefixes_counter),}. However, the "
+                        f"We detected multiple prefix names when inheriting from {file}: {(*set(prefixes_counter),)}. However, the "
                         f"most used one, '{final_name}', is already present in the source file and will likely cause consistency "
                         f"issues. For this reason we fallback to the default prefix '{cased_default_name}' when grabbing args "
                         "and dependencies. Make sure to subclass the intermediate classes with the prefix you want (if different "
@@ -1448,7 +1448,7 @@ class ModularFileMapper(ModuleMapper):
                 final_name = cased_default_name
             elif len(prefixes_counter) > 1:
                 logger.warning(
-                    f"We detected multiple prefix names when inheriting from {file}: {*set(prefixes_counter),}. We will only "
+                    f"We detected multiple prefix names when inheriting from {file}: {(*set(prefixes_counter),)}. We will only "
                     f"use the most used '{final_name}' prefix when grabbing args and dependencies. Make sure to subclass the "
                     f"intermediate classes with the prefix you want (if different from '{final_name}') or use a single prefix "
                     "in all the modular (best)."