Modular transformers: modularity and inheritance for new model additions (#33248)

* update exampel * update * push the converted diff files for testing and ci * correct one example * fix class attributes and docstring * nits * oups * fixed config! * update * nitd * class attributes are not matched against the other, this is missing * fixed overwriting self.xxx now onto the attributes I think * partial fix, now order with docstring * fix docstring order? * more fixes * update * fix missing docstrings! * examples don't all work yet * fixup * nit * updated * hick * update * delete * update * update * update * fix * all default * no local import * fix more diff * some fix related to "safe imports" * push fixed * add helper! * style * add a check * all by default * add the * update * FINALLY! * nit * fix config dependencies * man that is it * fix fix * update diffs * fix the last issue * re-default to all * alll the fixes * nice * fix properties vs setter * fixup * updates * update dependencies * make sure to install what needs to be installed * fixup * quick fix for now * fix! * fixup * update * update * updates * whitespaces * nit * fix * simplify everything, and make it file agnostic (should work for image processors) * style * finish fixing all import issues * fixup * empty modeling should not be written! * Add logic to find who depends on what * update * cleanup * update * update gemma to support positions * some small nits * this is the correct docstring for gemma2 * fix merging of docstrings * update * fixup * update * take doc into account * styling * update * fix hidden activation * more fixes * final fixes! * fixup * fixup instruct blip video * update * fix bugs * align gemma2 with the rest as well * updats * revert * update * more reversiom * grind * more * arf * update * order will matter * finish del stuff * update * rename to modular * fixup * nits * update makefile * fixup * update order of the checks! 
* fix * fix docstring that has a call inside * fiix conversion check * style * add some initial documentation * update * update doc * some fixup * updates * yups * Mostly todo gimme a minut * update * fixup * revert some stuff * Review docs for the modular transformers (#33472) Docs * good update * fixup * mmm current updates lead to this code * okay, this fixes it * cool * fixes * update * nit * updates * nits * fix doc * update * revert bad changes * update * updates * proper update * update * update? * up * update * cool * nits * nits * bon bon * fix * ? * minimise changes * update * update * update * updates? * fixed gemma2 * kind of a hack * nits * update * remove `diffs` in favor of `modular` * fix make fix copies --------- Co-authored-by: Lysandre Debut <hi@lysand.re>
2024-09-24 15:54:07 +02:00
parent 75b7485cc7
commit 317e069ee7
41 changed files with 6515 additions and 789 deletions
--- a/utils/check_modular_conversion.py
+++ b/utils/check_modular_conversion.py
@@ -0,0 +1,76 @@
+import argparse
+import difflib
+import glob
+import logging
+from io import StringIO
+
+# The converter under test, plus rich for coloured console output and diff syntax highlighting
+from modular_model_converter import convert_modular_file
+from rich.console import Console
+from rich.syntax import Syntax
+
+
+logging.basicConfig()
+logging.getLogger().setLevel(logging.ERROR)
+console = Console()
+
+
+def process_file(modular_file_path, generated_modeling_content, file_type="modeling", fix_and_overwrite=False):
+    """Compare one generated file with its on-disk counterpart; return 1 if they differ and 0 otherwise.
+
+    `file_type` is both the key into `generated_modeling_content` and the file-name prefix that replaces
+    `modular`. With `fix_and_overwrite`, a differing file is rewritten instead of having its diff printed.
+    """
+    file_path = modular_file_path.replace("modular_", f"{file_type}_")
+    # utf-8 is explicit: generated sources can hold non-ASCII text that a locale-dependent codec may reject
+    with open(file_path, "r", encoding="utf-8") as modeling_file:
+        content = modeling_file.read()
+    output_content = StringIO(generated_modeling_content[file_type][0]).read()
+    diff_list = list(
+        difflib.unified_diff(
+            output_content.splitlines(),
+            content.splitlines(),
+            fromfile=f"{file_path}_generated",
+            tofile=f"{file_path}",
+            lineterm="",
+        )
+    )
+    if not diff_list:
+        console.print(f"[bold green]No differences found for {file_path}.[/bold green]")
+        return 0
+    if fix_and_overwrite:
+        with open(file_path, "w", encoding="utf-8") as modeling_file:
+            modeling_file.write(output_content)
+        console.print(f"[bold blue]Overwritten {file_path} with the generated content.[/bold blue]")
+    else:
+        console.print(f"\n[bold red]Differences found between the generated code and {file_path}:[/bold red]\n")
+        console.print(Syntax("\n".join(diff_list), "diff", theme="ansi_dark", line_numbers=True))
+    return 1
+
+
+def compare_files(modular_file_path, fix_and_overwrite=False):
+    """Convert `modular_file_path` and return how many of its generated files differ from disk."""
+    generated = convert_modular_file(modular_file_path)
+    # One `process_file` call per generated file type; each call returns 1 on mismatch and 0 otherwise.
+    return sum(
+        process_file(modular_file_path, generated, file_type, fix_and_overwrite) for file_type in generated
+    )
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Compare modular_xxx.py files with modeling_xxx.py files.")
+    # Each path must stay a plain string: `type=list` would turn every argument into a list of characters.
+    parser.add_argument(
+        "--files", default=["all"], type=str, nargs="+", help="List of modular_xxx.py files to compare."
+    )
+    parser.add_argument(
+        "--fix_and_overwrite", action="store_true", help="Overwrite the modeling_xxx.py file if differences are found."
+    )
+    args = parser.parse_args()
+    if args.files == ["all"]:
+        args.files = glob.glob("src/transformers/models/**/modular_*.py", recursive=True)
+    # Every file is processed before failing, so a single run reports all mismatches.
+    non_matching_files = sum(compare_files(path, args.fix_and_overwrite) for path in args.files)
+
+    if non_matching_files and not args.fix_and_overwrite:
+        raise ValueError("Some diff and their modeling code did not match.")
--- a/utils/create_dependency_mapping.py
+++ b/utils/create_dependency_mapping.py
@@ -0,0 +1,69 @@
+import ast
+from collections import defaultdict, deque
+
+
+# Kahn-style topological sort over a {node: dependencies} mapping
+def topological_sort(dependencies):
+    """Return the keys of `dependencies` ordered so that resolvable dependencies come before their dependents.
+
+    Only keys of `dependencies` are ever emitted. A node with a dependency that is not itself a key never
+    reaches in-degree zero; such nodes are appended at the end, in the mapping's iteration order.
+    """
+    graph = defaultdict(list)  # dep -> nodes that depend on it
+    in_degree = defaultdict(int)  # node -> number of dependencies not yet emitted
+
+    for node, deps in dependencies.items():
+        for dep in deps:
+            graph[dep].append(node)
+            in_degree[node] += 1
+
+    # Start from the nodes that depend on nothing
+    zero_in_degree_queue = deque([node for node in dependencies if in_degree[node] == 0])
+
+    sorted_list = []
+    while zero_in_degree_queue:
+        current = zero_in_degree_queue.popleft()
+        sorted_list.append(current)
+
+        # Emitting `current` satisfies one dependency of each node that depends on it
+        for neighbor in graph[current]:
+            in_degree[neighbor] -= 1
+            if in_degree[neighbor] == 0:
+                zero_in_degree_queue.append(neighbor)
+
+    # Append whatever the loop could not reach; the set keeps the membership test O(1) instead of a list scan
+    emitted = set(sorted_list)
+    sorted_list.extend(node for node in dependencies if node not in emitted)
+    return sorted_list
+
+
+# Collect the `transformers` modules that a file pulls in through `from ... import ...`
+def extract_classes_and_imports(file_path):
+    """Return the set of modules imported via `from X import ...` whose dotted name contains "transformers".
+
+    Despite the function's name, no class information is collected.
+    """
+    # Python sources are read as utf-8 explicitly so parsing does not depend on the platform's default codec
+    with open(file_path, "r", encoding="utf-8") as file:
+        tree = ast.parse(file.read(), filename=file_path)
+    # `import x` statements carry no module name and `from . import x` has `module=None`: both are skipped
+    from_imports = (node.module for node in ast.walk(tree) if isinstance(node, ast.ImportFrom))
+    return {module for module in from_imports if module and "transformers" in module}
+
+
+# Build the {file path: imported modules} mapping consumed by `topological_sort`
+def map_dependencies(py_files):
+    """Map every path in `py_files` to the set of modules found by `extract_classes_and_imports`.
+
+    Each set also contains `None`, so every file has an entry with at least one (placeholder) dependency.
+    """
+    dependencies = defaultdict(set)
+    for path in py_files:
+        dependencies[path] |= {None} | extract_classes_and_imports(path)
+    return dependencies
+
+
+def find_priority_list(py_files):
+    """Return the paths of `py_files` in the reverse of the order `topological_sort` gives their dependencies."""
+    ordering = topological_sort(map_dependencies(py_files))
+    return list(reversed(ordering))
--- a/utils/modular_model_converter.py
+++ b/utils/modular_model_converter.py
@@ -20,21 +20,23 @@ from typing import Dict

 import libcst as cst
 from check_copies import run_ruff
+from create_dependency_mapping import find_priority_list
 from libcst import ClassDef, CSTTransformer, CSTVisitor
 from libcst import matchers as m
 from libcst.metadata import MetadataWrapper, ParentNodeProvider, PositionProvider, ScopeProvider

 from transformers import logging
+from transformers.models.auto.configuration_auto import CONFIG_MAPPING_NAMES


 logger = logging.get_logger(__name__)


 AUTO_GENERATED_MESSAGE = """#           🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
-#               This file was automatically generated from <path_to_diff_file.py>.
+#               This file was automatically generated from <path_to_modular_file.py>.
 #         Do NOT edit this file manually as any edits will be overwritten by the generation of
-#         the file from the diff. If any change should be done, please apply the change to the
-#                                    diff.py file directly.
+#         the file from the modular. If any change should be done, please apply the change to the
+#                           modular_xxx.py file directly. One of our CI enforces this
 #           🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
 """

@@ -82,12 +84,16 @@ class ClassFinder(CSTVisitor):
        self.function_def = {}                          # stores global scope function definition
        self.assignments = {}                           # LLAMA_DOCSTRING
        self.class_dependency_mapping = {}              # "LlamaModel":["LlamaDecoderLayer, "LlamaRMSNorm", "LlamaPreTrainedModel"], "LlamaDecoderLayer":["LlamaAttention","Llama"]
+        self.first_lvl_dependency_mapping = {}              # "LlamaModel":["LlamaDecoderLayer, "LlamaRMSNorm", "LlamaPreTrainedModel"], "LlamaDecoderLayer":["LlamaAttention","Llama"]
        # fmt: on

    def _update_class_dependency(self, name, value):
        """Update the dependency mapping for `name` with `value` by appending the previous
        dependencies to the new `value`.
        """
+        dep = set(self.first_lvl_dependency_mapping.get(name, set())) | set({value})
+        self.first_lvl_dependency_mapping[name] = dep
+
        dep = set(self.class_dependency_mapping.get(value, set()))
        dep |= set(self.class_dependency_mapping.get(name, {})) | set({value})
        self.class_dependency_mapping[name] = dep
@@ -146,7 +152,16 @@ class ClassFinder(CSTVisitor):
    def leave_Decorator(self, node):
        if hasattr(node.decorator, "args"):
            for k in node.decorator.args:
-                if k.value.value in self.assignments:
+                if m.matches(k.value, m.Call(func=m.Attribute(value=m.Name()))):  # and k.value.func.value.value:
+                    if k.value.func.value.value not in self.assignments:
+                        raise ValueError(
+                            f"We detected a call to {k.value.func.value.value}, but it was not assigned. See the list of assigments {self.assignments.keys()}"
+                        )
+                    parent = self.get_metadata(cst.metadata.ParentNodeProvider, node)
+                    scope = self.get_metadata(cst.metadata.ScopeProvider, node)
+                    name = scope._name_prefix.split(".")[0] if scope._name_prefix != "" else parent.name.value
+                    self._update_class_dependency(name, k.value.func.value.value)
+                elif m.matches(k, m.Arg(value=m.Name())) and k.value.value in self.assignments:
                    parent = self.get_metadata(cst.metadata.ParentNodeProvider, node)
                    scope = self.get_metadata(cst.metadata.ScopeProvider, node)
                    name = scope._name_prefix.split(".")[0] if scope._name_prefix != "" else parent.name.value
@@ -178,6 +193,10 @@ class ReplaceNameTransformer(m.MatcherDecoratableTransformer):
        self.old_name = old_name
        self.new_name = new_name
        self.default_name = "".join(x.title() for x in new_name.split("_"))
+        if self.new_name in CONFIG_MAPPING_NAMES:
+            self.default_name = CONFIG_MAPPING_NAMES[self.new_name].replace(
+                "Config", ""
+            )  # the best source of truth for class names. Could also just use the ones de
        self.patterns = {
            old_name: new_name,
            old_name.upper(): new_name.upper(),
@@ -193,7 +212,8 @@ class ReplaceNameTransformer(m.MatcherDecoratableTransformer):

        def replace(match):
            word = match.group(0)
-            return self.patterns.get(word, self.default_name)
+            result = self.patterns.get(word, self.default_name)
+            return result

        return compiled_regex.sub(replace, text)

@@ -227,35 +247,102 @@ DOCSTRING_NODE = m.SimpleStatementLine(
 )


+def SUPER_CALL_NODE(func_name):  # libcst matcher for a call of the form `super().<func_name>(...)`
+    return m.Call(func=m.Attribute(value=m.Call(func=m.Name("super")), attr=m.Name(func_name)))
+
+
+def get_docstring_indent(docstring):
+    # Width of the whitespace run that follows the first opening delimiter (triple quotes or a code fence)
+    # which is immediately followed by a newline; 0 when nothing matches.
+    opening = re.search(r'(?:"""|\'\'\'|```)\n(\s+)', docstring)
+    # NOTE(review): `\s+` also consumes newlines, so blank lines after the delimiter add to the count
+    indent = opening.group(1) if opening else ""
+    return len(indent)
+
+
+def merge_docstrings(original_docstring, updated_docstring):
+    # Merges the update into the original; an update with its own 8-space-indented `Args:` is returned unchanged
+    original_level = get_docstring_indent(original_docstring)
+    if "        Args:\n        " not in updated_docstring:
+        # Split the docstring at the example section, assuming `"""` is used to define the docstring
+        parts = original_docstring.split("```")
+        if "```" in updated_docstring and len(parts) > 1:
+            updated_docstring = updated_docstring.lstrip('r"')  # NOTE(review): drops every leading `r`/`"` char
+            new_parts = updated_docstring.split("```")
+            if len(new_parts) != 3:
+                raise ValueError("There should only be one example, and it should have opening and closing '```'")
+            parts[1] = new_parts[1]  # the example of the update replaces the original one
+            updated_docstring = "".join(
+                [
+                    parts[0].rstrip(" \n") + new_parts[0],
+                    f"\n{original_level*' '}```",
+                    parts[1],
+                    "```",
+                    parts[2],  # NOTE(review): assumes the original also has a closing fence
+                ]
+            )
+        elif updated_docstring not in original_docstring:
+            # add tabulation if we are at the lowest level.
+            if re.search(r"\n\s*.*\(.*\)\:\n\s*\w", updated_docstring):
+                updated_docstring = updated_docstring.replace("\n    ", "\n        ")
+            updated_docstring = original_docstring.rstrip('"') + "\n" + updated_docstring.lstrip('r"\n')
+    return updated_docstring
+
+
 class SuperTransformer(cst.CSTTransformer):
    METADATA_DEPENDENCIES = (ParentNodeProvider,)

-    def __init__(self, python_module: cst.Module, original_methods, updated_methods):
+    def __init__(self, python_module: cst.Module, original_methods, updated_methods, class_name=""):
        self.python_module = python_module
        self.original_methods = original_methods
        self.updated_methods = updated_methods
+        self.all_assign_target = {}
+        self.deleted_targets = {}  # child node can delete some arguments
+        self.class_name = class_name

    def update_body(self, existing_body, new_statements):
        """
        Helper method to update the body by removing duplicates before adding new statements.
+        `existing_body` is the body of the original method, the parent class
+        `new_statements` are the additional statements
        """
        deduplicated_new_body = []
        existing_nodes = set()
+        for node in new_statements:
+            if m.matches(node, m.SimpleStatementLine(body=[m.Assign()])):
+                target = self.python_module.code_for_node(node.body[0].targets[0].target)
+                self.all_assign_target[target] = node
+            if m.matches(node, m.SimpleStatementLine(body=[m.Del()])):
+                target = self.python_module.code_for_node(node.body[0].target)
+                self.deleted_targets[target] = node
+                continue
+
+        for stmt in existing_body:
+            if m.matches(stmt, m.SimpleStatementLine(body=[m.Assign()])):
+                target = self.python_module.code_for_node(stmt.body[0].targets[0].target)
+                if target in self.deleted_targets:
+                    logger.warning(f"Deleted the assign for {target}")
+                    continue
+                if target in self.all_assign_target:
+                    stmt = self.all_assign_target[target]
+            comment_less_code = re.sub(r"#.*", "", self.python_module.code_for_node(stmt)).strip()
+            comment_less_code = re.sub(r"\ *\n", "\n", comment_less_code).strip()
+            deduplicated_new_body.append(stmt)
+            existing_nodes.add(comment_less_code)
+
        for node in new_statements:
            code = self.python_module.code_for_node(node)
            comment_less_code = re.sub(r"#.*", "", code).strip()
            comment_less_code = re.sub(r"\ *\n", "\n", comment_less_code).strip()
-            existing_nodes.add(comment_less_code)
-        for stmt in existing_body:
-            comment_less_code = re.sub(r"#.*", "", self.python_module.code_for_node(stmt)).strip()
-            comment_less_code = re.sub(r"\ *\n", "\n", comment_less_code).strip()
-            if comment_less_code not in existing_nodes:
-                if m.matches(stmt, DOCSTRING_NODE) and self.has_docstring:
-                    continue
-                deduplicated_new_body.append(stmt)
-                existing_nodes.add(stmt)
-            else:
-                logger.info(f"\nFound duplicate {self.python_module.code_for_node(stmt)}")
+            if (
+                node not in deduplicated_new_body
+                and "super().__init__" not in comment_less_code
+                and comment_less_code not in existing_nodes
+            ):
+                if not m.matches(node, m.SimpleStatementLine(body=[m.Del()])):
+                    # HACK here to fix the pos_init() that has to be last we kinda do this.
+                    deduplicated_new_body = deduplicated_new_body[:-1] + [node] + deduplicated_new_body[-1:]
+                    existing_nodes.add(comment_less_code)
        return deduplicated_new_body

    def replace_super_calls(self, node: cst.IndentedBlock, func_name: str) -> cst.CSTNode:
@@ -263,26 +350,37 @@ class SuperTransformer(cst.CSTTransformer):
        to super().func_name() with the source code of the parent class' `func_name`.
        It keeps everything that is defined before `super().func_name()`.
        """
-        new_body = []
        self.has_docstring = False
-        for expr in node.body:
-            self.has_docstring = m.matches(node.body[0], DOCSTRING_NODE)
+        parent_has_docstring = False
+        if func_name in self.original_methods:
+            parent_has_docstring = m.matches(self.original_methods[func_name].body.body[0], DOCSTRING_NODE)
+        new_body = []
+        has_super_call = False
+        for idx, expr in enumerate(node.body):
            if m.matches(
                expr,
                m.SimpleStatementLine(
-                    body=[
-                        m.Return(
-                            value=m.Call(func=m.Attribute(value=m.Call(func=m.Name("super")), attr=m.Name(func_name)))
-                        )
-                        | m.Expr(
-                            value=m.Call(func=m.Attribute(value=m.Call(func=m.Name("super")), attr=m.Name(func_name)))
-                        )
-                    ]
+                    body=[m.Return(SUPER_CALL_NODE(func_name)) | m.Expr(SUPER_CALL_NODE(func_name))]
                ),
            ):
+                if idx != 0 and func_name == "__init__":
+                    raise ValueError(f"The call to super() in {self.class_name} should be at the top of the init")
                new_body.extend(self.update_body(self.original_methods[func_name].body.body, node.body))
-            else:
+                has_super_call = True
+            elif m.matches(expr, DOCSTRING_NODE):
+                self.has_docstring = True
+                if parent_has_docstring:  # actually here we ought to de-duplicate?
+                    original_docstring = self.original_methods[func_name].body.body[0].body[0].value.value
+                    updated_docstring = expr.body[0].value.value
+                    merged_doc = merge_docstrings(original_docstring, updated_docstring)
+                    new_node = [expr.with_changes(body=[cst.Expr(value=cst.SimpleString(value=merged_doc))])]
+                else:
+                    new_node = [expr]
+                new_body.extend(new_node)
+            elif not m.matches(expr, m.SimpleStatementLine(body=[m.Del()])) and not has_super_call:
                new_body.append(expr)
+        if not self.has_docstring and parent_has_docstring:
+            new_body = [self.original_methods[func_name].body.body[0]] + new_body
        return node.with_changes(body=new_body)

    def leave_FunctionDef(self, original_node: cst.Call, updated_node: cst.Call) -> cst.CSTNode:
@@ -330,14 +428,22 @@ def replace_call_to_super(class_finder: ClassFinder, updated_node: cst.ClassDef,
                                                                            |     ```
    """
    original_node = class_finder.classes[class_name]
-    original_methods = {f.name.value if hasattr(f, "name") else f: f for f in original_node.body.body}
-    updated_methods = {f.name.value if hasattr(f, "name") else f: f for f in updated_node.body.body}
+    original_methods = {
+        f.name.value if hasattr(f, "name") else class_finder.python_module.code_for_node(f): f
+        for f in original_node.body.body
+    }
+    updated_methods = {
+        f.name.value if hasattr(f, "name") else class_finder.python_module.code_for_node(f): f
+        for f in updated_node.body.body
+    }
    end_meth = []

+    assign_targets = {}
+    docstring_node = []
    # Iterate directly from node.body as there can be property/setters with same names which are overwritten when we use a dict
    for func in original_node.body.body:
-        name = func.name.value if hasattr(func, "name") else func
-        if name in updated_methods and updated_methods[name] is not None:
+        name = func.name.value if hasattr(func, "name") else class_finder.python_module.code_for_node(func)
+        if m.matches(func, m.FunctionDef()) and name in updated_methods and updated_methods[name] is not None:
            new_params = updated_methods[name].params
            # Replace the method in the replacement class, preserving decorators
            kwarg_name = getattr(updated_methods[name].params, "star_kwarg", None)
@@ -348,22 +454,61 @@ def replace_call_to_super(class_finder: ClassFinder, updated_node: cst.ClassDef,
                    params=list(parent_params.values()), star_kwarg=func.params.star_kwarg
                )
            func = func.with_changes(body=updated_methods[name].body, params=new_params)
-        end_meth.append(func)
+        if m.matches(func, m.SimpleStatementLine(body=[m.Assign()])):
+            target = class_finder.python_module.code_for_node(func.body[0].targets[0])
+            assign_targets[target] = func
+        elif m.matches(func, m.SimpleStatementLine(body=[m.AnnAssign()])):
+            target = class_finder.python_module.code_for_node(func.body[0].target)
+            assign_targets[target] = func
+        elif m.matches(func, DOCSTRING_NODE):
+            docstring_node = [func]
+        else:
+            end_meth.append(func)

-    # Port new methods that are defined only in diff-file and append at the end
-    for name, func in updated_methods.items():
+    # Port new methods that are defined only in modular-file and append at the end
+    for func in updated_node.body.body:
+        name = func.name.value if hasattr(func, "name") else class_finder.python_module.code_for_node(func)
+        if m.matches(func, DOCSTRING_NODE):  # This processes the docstring of the class!
+            # Extract the original docstring
+            updated_docstring = func.body[0].value.value
+            original_docstring = docstring_node[0].body[0].value.value
+            merged_doc = merge_docstrings(original_docstring, updated_docstring)
+            # Update the docstring in the original function
+            docstring_node = [
+                docstring_node[0].with_changes(body=[cst.Expr(value=cst.SimpleString(value=merged_doc))])
+            ]
        if name not in original_methods and func is not None and isinstance(func, cst.FunctionDef):
            end_meth.append(func)
+        if m.matches(func, m.SimpleStatementLine(body=[m.Assign()])):
+            # TODO we only use single assign might cause issues
+            target = class_finder.python_module.code_for_node(func.body[0].targets[0])
+            assign_targets[target] = func
+        if m.matches(func, m.SimpleStatementLine(body=[m.AnnAssign()])):
+            target = class_finder.python_module.code_for_node(func.body[0].target)
+            assign_targets[target] = func
+    end_meth = docstring_node + list(assign_targets.values()) + end_meth

    result_node = original_node.with_changes(body=cst.IndentedBlock(body=end_meth))
    temp_module = cst.Module(body=[result_node])
    new_module = MetadataWrapper(temp_module)
-    new_replacement_class = new_module.visit(SuperTransformer(temp_module, original_methods, updated_methods))
+    new_replacement_class = new_module.visit(
+        SuperTransformer(temp_module, original_methods, updated_methods, class_name)
+    )
    new_replacement_body = new_replacement_class.body[0].body  # get the indented block
+
    return original_node.with_changes(body=new_replacement_body)


-class DiffConverterTransformer(CSTTransformer):
+TYPE_TO_FILE_TYPE = {  # NOTE(review): presumably class-name suffix -> key of `ModularConverterTransformer.files`
+    "Config": "configuration",
+    "Tokenizer": "tokenization",
+    "Processor": "processing",  # `files` has a "processing" entry and no "processor" one
+    "ImageProcessor": "image_processing",
+    "FeatureExtractor": "feature_extractor",
+}
+
+
+class ModularConverterTransformer(CSTTransformer):
    METADATA_DEPENDENCIES = (ParentNodeProvider, ScopeProvider, PositionProvider)

    def __init__(self, python_module, new_name, given_old_name=None, given_new_name=None):
@@ -378,11 +523,21 @@ class DiffConverterTransformer(CSTTransformer):
        self.transformers_imports = {}      # maps the imports name like "from transformers.models.xxx" to the parsed AST module
        self.imported_mapping = {}          # stores the name of the imported classes, with their source {"LlamaModel":"transformers.model.llama.modeling_llama"}
        self.visited_module = {}            # modules visited like "transformers.models.llama.modeling_llama"
-        self.new_body = {}                  # store the new body, all global scope nodes should be added here
        self.inserted_deps = []             # nodes inserted via super dependency
        self.all_imports = []               # just stores all of the imports
+        self.all_safe_imports = []          # stores the import under simple statements
        self.global_scope_index = 0
        # fmt: on
+        self.files = {  # mapping for different component bodies
+            "modeling": {},
+            "configuration": {},
+            "tokenization": {},
+            "processing": {},
+            "image_processing": {},
+            "feature_extractor": {},
+        }
+        self.match_patterns = "|".join(self.files.keys())
+        self.all_functions = {}

    def visit_ImportFrom(self, node: cst.ImportFrom) -> None:
        """When visiting imports from `transformers.models.xxx` we need to:
@@ -393,7 +548,7 @@ class DiffConverterTransformer(CSTTransformer):
        import_statement = self.python_module.code_for_node(node.module)
        if m.matches(node.module, m.Attribute()):
            for imported_ in node.names:
-                _import = re.search(r"transformers\.models\..*\.(modeling|configuration)_.*", import_statement)
+                _import = re.search(rf"(transformers\.models\..|..)*\.({self.match_patterns})_.*", import_statement)
                if _import:
                    source = _import.groups()[0]
                    if source == "modeling" and "Config" in self.python_module.code_for_node(imported_):
@@ -401,44 +556,38 @@ class DiffConverterTransformer(CSTTransformer):
                            f"You are importing {self.python_module.code_for_node(imported_)} from the modeling file. Import from the `configuration_xxxx.py` file instead"
                        )
                    if import_statement not in self.transformers_imports:
+                        if "models" not in import_statement:
+                            import_statement = "models." + import_statement
+                        if "transformers" not in import_statement:
+                            import_statement = "transformers." + import_statement
                        source_code = get_module_source_from_name(import_statement)
                        tree = cst.parse_module(source_code)
                        self.transformers_imports[import_statement] = tree
                    imported_class = self.python_module.code_for_node(imported_.name)
                    self.imported_mapping[imported_class] = import_statement
-
-    def leave_FunctionDef(self, original_node, node):
-        parent_node = self.get_metadata(cst.metadata.ParentNodeProvider, original_node)
-        if m.matches(parent_node, m.Module()):
-            self.global_scope_index += 100
-            self.new_body[node.name.value] = {"insert_idx": self.global_scope_index, "node": node}
-        return node
+        if m.matches(node.module, m.Name()):
+            if "transformers" == import_statement:
+                raise ValueError(
+                    f"You are importing from {import_statement} directly using global imports. Import from the correct local path"
+                )

    def leave_SimpleStatementLine(self, original_node, updated_node):
        parent_node = self.get_metadata(cst.metadata.ParentNodeProvider, original_node)
        if m.matches(parent_node, m.Module()):
            if m.matches(updated_node, m.SimpleStatementLine(body=[m.Import()])):
-                if parent_node not in self.all_imports:
+                if updated_node not in self.all_imports:
                    self.all_imports.append(updated_node)
                return updated_node
            elif m.matches(updated_node, m.SimpleStatementLine(body=[m.ImportFrom()])):
                full_statement = self.python_module.code_for_node(updated_node.body[0].module)
-                if re.search(r"transformers\.models\..*\.(modeling|configuration)_.*", full_statement):
+                if re.search(
+                    rf"(transformers\.models\..|..)*\.({self.match_patterns})_.*", full_statement
+                ):  # OR MATCH ..llama.modeling_llama
                    return cst.RemoveFromParent()
-                if parent_node not in self.all_imports:
+                if updated_node not in self.all_imports:
                    self.all_imports.append(updated_node)
                return updated_node
            self.global_scope_index += 100
-            if m.matches(updated_node, m.SimpleStatementLine(body=[m.Assign()])):
-                # TODO This only works for single target assigns!
-                node_name = updated_node.body[0].targets[0].target.value
-            else:
-                node_name = self.python_module.code_for_node(updated_node.body[0])
-            self.new_body[node_name] = {
-                "insert_idx": self.global_scope_index,
-                "node": updated_node,
-            }
-            self.config_body = [updated_node]
        return updated_node

    def leave_ClassDef(self, original_node, updated_node):
@@ -454,6 +603,7 @@ class DiffConverterTransformer(CSTTransformer):
        """
        class_name = original_node.name.value
        bases = [k.value.value for k in original_node.bases if k.value.value in self.imported_mapping]
+        all_bases = [k.value.value for k in original_node.bases]
        self.global_scope_index += 100
        for super_class in bases:
            if super_class not in self.imported_mapping:
@@ -469,7 +619,7 @@ class DiffConverterTransformer(CSTTransformer):
                raise ValueError(
                    f"Tried parsing the name of the imported package from {super_file_name}, could not extract the model name"
                )
-
+            file_type = re.search(r"models?\.\w*?\.(\w*?)_", super_file_name).groups()[0]
            visited_module = self.visited_module
            if super_file_name not in visited_module:  # only extract classes once
                class_finder = find_classes_in_file(
@@ -490,22 +640,47 @@ class DiffConverterTransformer(CSTTransformer):

            list_dependencies = sorted(list_dependencies.items(), key=lambda x: x[1], reverse=True)
            start_insert_idx = self.global_scope_index
+            file_to_update = self.files[file_type]
+            is_empty_node = self.python_module.code_for_node(original_node.body) == "pass\n"
            for dependency, _ in list_dependencies:
+                # we can write to the correct body, using the source of the parent class
                node = class_finder.global_nodes.get(dependency, None)
-                if node is not None and "Config" not in class_name:
-                    if dependency not in self.new_body:
+                if node is not None:
+                    if dependency not in file_to_update:
                        start_insert_idx -= 1
-                        self.new_body[dependency] = {"insert_idx": start_insert_idx, "node": node}
+                        file_to_update[dependency] = {"insert_idx": start_insert_idx, "node": node}
                    elif dependency not in self.inserted_deps:
                        # make sure the node is written after its dependencies
-                        start_insert_idx = self.new_body[dependency]["insert_idx"] - 1
+                        start_insert_idx = file_to_update[dependency]["insert_idx"] - 1
+                        if (
+                            dependency in file_to_update.keys()
+                            and dependency in class_finder.first_lvl_dependency_mapping[class_name]
+                        ):
+                            # If dependency is defined, but not used, raise error
+                            calls = m.findall(original_node, m.Call(func=m.Name(dependency)))
+                            if not calls and not is_empty_node and dependency not in all_bases:
+                                raise ValueError(
+                                    f"""You defined `{dependency}` in the modular_{self.model_name}.py, it should be used
+                                    when you define `{class_name}`, as it is one of it's direct dependencies. Make sure
+                                    you use it in the `__init__` function."""
+                                )
                    self.inserted_deps.append(dependency)
+
            if len(list_dependencies) > 0:
                updated_node = replace_call_to_super(class_finder, updated_node, class_name)
-        if "Config" in class_name:
-            self.config_body += [updated_node]
+            else:
+                raise ValueError(
+                    f"Unable to find dependencies for {super_class} in {super_file_name}. Here are the dependencies found: {class_finder.class_dependency_mapping}. (The automatic renaming might have gone wrong!)"
+                )
+
+        # Now, if a class was defined without parents, we look for the name
+        match_pattern = "|".join(TYPE_TO_FILE_TYPE.keys())
+        match = re.search(rf"({match_pattern})$", class_name)
+        if match:
+            key = TYPE_TO_FILE_TYPE[match.group(1)]
+            self.files[key][class_name] = {"insert_idx": self.global_scope_index, "node": updated_node}
        else:
-            self.new_body[class_name] = {"insert_idx": self.global_scope_index, "node": updated_node}
+            self.files["modeling"][class_name] = {"insert_idx": self.global_scope_index, "node": updated_node}
        return updated_node

    def leave_If(self, original_node, node):
@@ -513,66 +688,69 @@ class DiffConverterTransformer(CSTTransformer):
        if m.matches(parent_node, m.Module()):
            full_statement = self.python_module.code_for_node(original_node.test)
            if re.search(r"[\s\S]*is_.*available", full_statement):
-                self.all_imports.append(node)
+                self.all_safe_imports.append(node)
            elif full_statement not in self.new_body:
                self.new_body[node] = {"insert_idx": self.global_scope_index, "node": node}
        return node

    def leave_Module(self, original_node: cst.Assign, node):
        imports = {self.python_module.code_for_node(k): k for k in self.all_imports}
-        dependency_imports = {}
-        config_imports = []
-        for visiter in self.visited_module.values():
-            dependency_imports.update({self.python_module.code_for_node(k): k for k in visiter.imports.values()})
+        dependency_imports = {file_type: imports.copy() for file_type in self.files}
+        for super_file_name, visiter in self.visited_module.items():
+            file_type = re.search(r"models?\.\w*?\.(\w*?)_", super_file_name).groups()[0]
+            dependency_imports[file_type].update(
+                {self.python_module.code_for_node(k): k for k in visiter.imports.values()}
+            )

-        # manually clean up if it's importing a config from configuration file (ruff doesn't do that)
-        config_imports = []
-        for i in list(dependency_imports.values()):
-            if (
-                hasattr(i.body[0], "module")
-                and isinstance(i.body[0].module, cst.Name)
-                and f"configuration_{self.model_name}" in i.body[0].module.value
-            ):
-                pass
-            else:
-                config_imports.append(i)
+        for file, body in self.files.items():
+            new_body = [k[1]["node"] for k in sorted(body.items(), key=lambda x: x[1]["insert_idx"])]
+            if len(new_body) > 0:
+                if file in dependency_imports.keys():
+                    new_body = list(dependency_imports[file].values()) + new_body
+                self.files[file] = cst.Module(body=[*new_body], header=node.header)
+        return node

-        if hasattr(self, "config_body"):
-            self.config_body = list(imports.values()) + config_imports + self.config_body
-        dependency_imports.update(imports)
-        new_body = list(dependency_imports.values())
-        if len(self.new_body.keys()) > 0:
-            new_body += [k[1]["node"] for k in sorted(self.new_body.items(), key=lambda x: x[1]["insert_idx"])]
+
+def convert_modular_file(modular_file, old_model_name=None, new_model_name=None, cst_transformers=None):
+    """Convert `modular_file`; returns `{file_type: [formatted_code, ruffed_code]}`, or `{}` for a non-modular name."""
+    name_match = re.search(r"modular_(.*)(?=\.py$)", modular_file)
+    if name_match is None:
+        print(f"modular pattern not found in {modular_file}, exiting")
+        return {}
+    model_name = name_match.group(1)
+    # Python sources are UTF-8 by default, so do not depend on the platform's locale encoding
+    with open(modular_file, "r", encoding="utf-8") as source_file:
+        module = cst.parse_module(source_file.read())
+    wrapper = MetadataWrapper(module)
+    if cst_transformers is None:
+        cst_transformers = ModularConverterTransformer(module, model_name, old_model_name, new_model_name)
+    wrapper.visit(cst_transformers)
+    output = {}
+    for file_type, node in cst_transformers.files.items():
+        if node == {}:
+            # still the empty mapping: nothing to emit for this file type
+            continue
+        ruffed_code = run_ruff(AUTO_GENERATED_MESSAGE + node.code, True)
+        output[file_type] = [run_ruff(ruffed_code, False), ruffed_code]
+    return output
+
+
+def save_modeling_file(modular_file, converted_file):
+    """Write each `[formatted_code, ruffed_code]` entry to `modular_file` with `modular_` replaced by `<file_type>_`."""
+    for file_type, codes in converted_file.items():
+        target_path = modular_file.replace("modular_", f"{file_type}_")
+        formatted_lines = codes[0].strip().split("\n")
+        if codes[0].strip() and any(not line.strip().startswith("#") for line in formatted_lines):
+            with open(target_path, "w", encoding="utf-8") as f:
+                f.write(codes[0])
        else:
-            new_body = []
-        return node.with_changes(body=[*new_body])
-
-
-def convert_file(diff_file, old_model_name=None, new_model_name=None, cst_transformers=None):
-    model_name = re.search(r"diff_(.*)(?=\.py$)", diff_file).groups()[0]
-    # Parse the Python file
-    with open(diff_file, "r") as file:
-        code = file.read()
-    module = cst.parse_module(code)
-    wrapper = MetadataWrapper(module)
-    if cst_transformers is None:
-        cst_transformers = DiffConverterTransformer(module, model_name, old_model_name, new_model_name)
-    new_mod = wrapper.visit(cst_transformers)
-    ruffed_code = run_ruff(new_mod.code, True)
-    formatted_code = run_ruff(ruffed_code, False)
-    if len(formatted_code.strip()) > 0:
-        with open(diff_file.replace("diff_", "modeling_"), "w") as f:
-            f.write(AUTO_GENERATED_MESSAGE + formatted_code)
-
-    if hasattr(cst_transformers, "config_body"):
-        config_module = cst.Module(body=[*cst_transformers.config_body], header=new_mod.header)
-        with open(diff_file.replace("diff_", "configuration_"), "w") as f:
-            ruffed_code = run_ruff(config_module.code, True)
-            formatted_code = run_ruff(ruffed_code, False)
-            f.write(AUTO_GENERATED_MESSAGE + formatted_code)
-
-    # TODO optimize by re-using the class_finder
-    return cst_transformers
+            # The formatted code is empty or comment-only: fall back to the unformatted code, but only
+            # when that text (the one actually written) contains something besides `#` comment lines.
+            ruffed_lines = codes[1].strip().split("\n")
+            if codes[1].strip() and any(not line.strip().startswith("#") for line in ruffed_lines):
+                logger.warning("The modeling code contains errors, it's written without formatting")
+                with open(target_path, "w", encoding="utf-8") as f:
+                    f.write(codes[1])


 if __name__ == "__main__":
@@ -581,22 +759,24 @@ if __name__ == "__main__":
        "--files_to_parse",
        default=["all"],
        nargs="+",
-        help="A list of `diff_xxxx` files that should be converted to single model file",
+        help="A list of `modular_xxxx` files that should be converted to single model file",
    )
    parser.add_argument(
        "--old_model_name",
        required=False,
-        help="The name of the model from which the copying is done in CamelCase. If not provided is inferred from diff-file",
+        help="The name of the model from which the copying is done in CamelCase. If not provided is inferred from modular-file",
    )
    parser.add_argument(
        "--new_model_name",
        required=False,
-        help="The name of the new model being added in CamelCase. If not provided is inferred from diff-file",
+        help="The name of the new model being added in CamelCase. If not provided is inferred from modular-file",
    )
    args = parser.parse_args()
    if args.files_to_parse == ["all"]:
-        args.files_to_parse = glob.glob("src/transformers/models/**/diff_*.py", recursive=True)
-    for file_name in args.files_to_parse:
+        args.files_to_parse = glob.glob("src/transformers/models/**/modular_*.py", recursive=True)
+
+    for file_name in find_priority_list(args.files_to_parse):
        print(f"Converting {file_name} to a single model single file format")
        module_path = file_name.replace("/", ".").replace(".py", "").replace("src.", "")
-        converter = convert_file(file_name, args.old_model_name, args.new_model_name)
+        converted_files = convert_modular_file(file_name, args.old_model_name, args.new_model_name)
+        converter = save_modeling_file(file_name, converted_files)