Modular transformers: modularity and inheritance for new model additions (#33248)

* update exampel

* update

* push the converted diff files for testing and ci

* correct one example

* fix class attributes and docstring

* nits

* oups

* fixed config!

* update

* nitd

* class attributes are not matched against the other, this is missing

* fixed overwriting self.xxx now onto the attributes I think

* partial fix, now order with docstring

* fix docstring order?

* more fixes

* update

* fix missing docstrings!

* examples don't all work yet

* fixup

* nit

* updated

* hick

* update

* delete

* update

* update

* update

* fix

* all default

* no local import

* fix more diff

* some fix related to "safe imports"

* push fixed

* add helper!

* style

* add a check

* all by default

* add the

* update

* FINALLY!

* nit

* fix config dependencies

* man that is it

* fix fix

* update diffs

* fix the last issue

* re-default to all

* alll the fixes

* nice

* fix properties vs setter

* fixup

* updates

* update dependencies

* make sure to install what needs to be installed

* fixup

* quick fix for now

* fix!

* fixup

* update

* update

* updates

* whitespaces

* nit

* fix

* simplify everything, and make it file agnostic (should work for image processors)

* style

* finish fixing all import issues

* fixup

* empty modeling should not be written!

* Add logic to find who depends on what

* update

* cleanup

* update

* update gemma to support positions

* some small nits

* this is the correct docstring for gemma2

* fix merging of docstrings

* update

* fixup

* update

* take doc into account

* styling

* update

* fix hidden activation

* more fixes

* final fixes!

* fixup

* fixup instruct  blip video

* update

* fix bugs

* align gemma2 with the rest as well

* updats

* revert

* update

* more reversiom

* grind

* more

* arf

* update

* order will matter

* finish del stuff

* update

* rename to modular

* fixup

* nits

* update makefile

* fixup

* update order of the checks!

* fix

* fix docstring that has a call inside

* fiix conversion check

* style

* add some initial documentation

* update

* update doc

* some fixup

* updates

* yups

* Mostly todo gimme a minut

* update

* fixup

* revert some stuff

* Review docs for the modular transformers (#33472)

Docs

* good update

* fixup

* mmm current updates lead to this code

* okay, this fixes it

* cool

* fixes

* update

* nit

* updates

* nits

* fix doc

* update

* revert bad changes

* update

* updates

* proper update

* update

* update?

* up

* update

* cool

* nits

* nits

* bon bon

* fix

* ?

* minimise changes

* update

* update

* update

* updates?

* fixed gemma2

* kind of a hack

* nits

* update

* remove `diffs` in favor of `modular`

* fix make fix copies

---------

Co-authored-by: Lysandre Debut <hi@lysand.re>
This commit is contained in:
Arthur
2024-09-24 15:54:07 +02:00
committed by GitHub
parent 75b7485cc7
commit 317e069ee7
41 changed files with 6515 additions and 789 deletions

View File

@@ -0,0 +1,76 @@
import argparse
import difflib
import glob
import logging
from io import StringIO
# Console for rich printing
from modular_model_converter import convert_modular_file
from rich.console import Console
from rich.syntax import Syntax
logging.basicConfig()
logging.getLogger().setLevel(logging.ERROR)
console = Console()
def process_file(modular_file_path, generated_modeling_content, file_type="modeling_", fix_and_overwrite=False):
file_path = modular_file_path.replace("modular_", f"{file_type}_")
# Read the actual modeling file
with open(file_path, "r") as modeling_file:
content = modeling_file.read()
output_buffer = StringIO(generated_modeling_content[file_type][0])
output_buffer.seek(0)
output_content = output_buffer.read()
diff = difflib.unified_diff(
output_content.splitlines(),
content.splitlines(),
fromfile=f"{file_path}_generated",
tofile=f"{file_path}",
lineterm="",
)
diff_list = list(diff)
# Check for differences
if diff_list:
if fix_and_overwrite:
with open(file_path, "w") as modeling_file:
modeling_file.write(generated_modeling_content[file_type][0])
console.print(f"[bold blue]Overwritten {file_path} with the generated content.[/bold blue]")
else:
console.print(f"\n[bold red]Differences found between the generated code and {file_path}:[/bold red]\n")
diff_text = "\n".join(diff_list)
syntax = Syntax(diff_text, "diff", theme="ansi_dark", line_numbers=True)
console.print(syntax)
return 1
else:
console.print(f"[bold green]No differences found for {file_path}.[/bold green]")
return 0
def compare_files(modular_file_path, fix_and_overwrite=False):
# Generate the expected modeling content
generated_modeling_content = convert_modular_file(modular_file_path)
diff = 0
for file_type in generated_modeling_content.keys():
diff += process_file(modular_file_path, generated_modeling_content, file_type, fix_and_overwrite)
return diff
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Compare modular_xxx.py files with modeling_xxx.py files.")
parser.add_argument(
"--files", default=["all"], type=list, nargs="+", help="List of modular_xxx.py files to compare."
)
parser.add_argument(
"--fix_and_overwrite", action="store_true", help="Overwrite the modeling_xxx.py file if differences are found."
)
args = parser.parse_args()
if args.files == ["all"]:
args.files = glob.glob("src/transformers/models/**/modular_*.py", recursive=True)
non_matching_files = 0
for modular_file_path in args.files:
non_matching_files += compare_files(modular_file_path, args.fix_and_overwrite)
if non_matching_files and not args.fix_and_overwrite:
raise ValueError("Some diff and their modeling code did not match.")

View File

@@ -0,0 +1,69 @@
import ast
from collections import defaultdict, deque
# Function to perform topological sorting
def topological_sort(dependencies):
# Create a graph and in-degree count for each node
graph = defaultdict(list)
in_degree = defaultdict(int)
# Build the graph
for node, deps in dependencies.items():
for dep in deps:
graph[dep].append(node) # node depends on dep
in_degree[node] += 1 # increase in-degree of node
# Add all nodes with zero in-degree to the queue
zero_in_degree_queue = deque([node for node in dependencies if in_degree[node] == 0])
sorted_list = []
# Perform topological sorting
while zero_in_degree_queue:
current = zero_in_degree_queue.popleft()
sorted_list.append(current)
# For each node that current points to, reduce its in-degree
for neighbor in graph[current]:
in_degree[neighbor] -= 1
if in_degree[neighbor] == 0:
zero_in_degree_queue.append(neighbor)
# Handle nodes that have no dependencies and were not initially part of the loop
for node in dependencies:
if node not in sorted_list:
sorted_list.append(node)
return sorted_list
# Function to extract class and import info from a file
def extract_classes_and_imports(file_path):
with open(file_path, "r") as file:
tree = ast.parse(file.read(), filename=file_path)
imports = set()
for node in ast.walk(tree):
if isinstance(node, (ast.Import, ast.ImportFrom)):
module = node.module if isinstance(node, ast.ImportFrom) else None
if module and "transformers" in module:
imports.add(module)
return imports
# Function to map dependencies between classes
def map_dependencies(py_files):
dependencies = defaultdict(set)
# First pass: Extract all classes and map to files
for file_path in py_files:
dependencies[file_path].add(None)
class_to_file = extract_classes_and_imports(file_path)
for module in class_to_file:
dependencies[file_path].add(module)
return dependencies
def find_priority_list(py_files):
dependencies = map_dependencies(py_files)
ordered_classes = topological_sort(dependencies)
return ordered_classes[::-1]

View File

@@ -20,21 +20,23 @@ from typing import Dict
import libcst as cst
from check_copies import run_ruff
from create_dependency_mapping import find_priority_list
from libcst import ClassDef, CSTTransformer, CSTVisitor
from libcst import matchers as m
from libcst.metadata import MetadataWrapper, ParentNodeProvider, PositionProvider, ScopeProvider
from transformers import logging
from transformers.models.auto.configuration_auto import CONFIG_MAPPING_NAMES
logger = logging.get_logger(__name__)
AUTO_GENERATED_MESSAGE = """# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
# This file was automatically generated from <path_to_diff_file.py>.
# This file was automatically generated from <path_to_modular_file.py>.
# Do NOT edit this file manually as any edits will be overwritten by the generation of
# the file from the diff. If any change should be done, please apply the change to the
# diff.py file directly.
# the file from the modular. If any change should be done, please apply the change to the
# modular_xxx.py file directly. One of our CI enforces this
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
"""
@@ -82,12 +84,16 @@ class ClassFinder(CSTVisitor):
self.function_def = {} # stores global scope function definition
self.assignments = {} # LLAMA_DOCSTRING
self.class_dependency_mapping = {} # "LlamaModel":["LlamaDecoderLayer, "LlamaRMSNorm", "LlamaPreTrainedModel"], "LlamaDecoderLayer":["LlamaAttention","Llama"]
self.first_lvl_dependency_mapping = {} # "LlamaModel":["LlamaDecoderLayer, "LlamaRMSNorm", "LlamaPreTrainedModel"], "LlamaDecoderLayer":["LlamaAttention","Llama"]
# fmt: on
def _update_class_dependency(self, name, value):
"""Update the dependency mapping for `name` with `value` by appending the previous
dependencies to the new `value`.
"""
dep = set(self.first_lvl_dependency_mapping.get(name, set())) | set({value})
self.first_lvl_dependency_mapping[name] = dep
dep = set(self.class_dependency_mapping.get(value, set()))
dep |= set(self.class_dependency_mapping.get(name, {})) | set({value})
self.class_dependency_mapping[name] = dep
@@ -146,7 +152,16 @@ class ClassFinder(CSTVisitor):
def leave_Decorator(self, node):
if hasattr(node.decorator, "args"):
for k in node.decorator.args:
if k.value.value in self.assignments:
if m.matches(k.value, m.Call(func=m.Attribute(value=m.Name()))): # and k.value.func.value.value:
if k.value.func.value.value not in self.assignments:
raise ValueError(
f"We detected a call to {k.value.func.value.value}, but it was not assigned. See the list of assigments {self.assignments.keys()}"
)
parent = self.get_metadata(cst.metadata.ParentNodeProvider, node)
scope = self.get_metadata(cst.metadata.ScopeProvider, node)
name = scope._name_prefix.split(".")[0] if scope._name_prefix != "" else parent.name.value
self._update_class_dependency(name, k.value.func.value.value)
elif m.matches(k, m.Arg(value=m.Name())) and k.value.value in self.assignments:
parent = self.get_metadata(cst.metadata.ParentNodeProvider, node)
scope = self.get_metadata(cst.metadata.ScopeProvider, node)
name = scope._name_prefix.split(".")[0] if scope._name_prefix != "" else parent.name.value
@@ -178,6 +193,10 @@ class ReplaceNameTransformer(m.MatcherDecoratableTransformer):
self.old_name = old_name
self.new_name = new_name
self.default_name = "".join(x.title() for x in new_name.split("_"))
if self.new_name in CONFIG_MAPPING_NAMES:
self.default_name = CONFIG_MAPPING_NAMES[self.new_name].replace(
"Config", ""
) # the best source of truth for class names. Could also just use the ones de
self.patterns = {
old_name: new_name,
old_name.upper(): new_name.upper(),
@@ -193,7 +212,8 @@ class ReplaceNameTransformer(m.MatcherDecoratableTransformer):
def replace(match):
word = match.group(0)
return self.patterns.get(word, self.default_name)
result = self.patterns.get(word, self.default_name)
return result
return compiled_regex.sub(replace, text)
@@ -227,35 +247,102 @@ DOCSTRING_NODE = m.SimpleStatementLine(
)
def SUPER_CALL_NODE(func_name):
return m.Call(func=m.Attribute(value=m.Call(func=m.Name("super")), attr=m.Name(func_name)))
def get_docstring_indent(docstring):
# Match the first line after the opening triple quotes
match = re.search(r'(?:"""|\'\'\'|```)\n(\s+)', docstring)
if match:
# Return the indentation spaces captured
return len(match.group(1))
return 0
def merge_docstrings(original_docstring, updated_docstring):
# indent_level = get_docstring_indent(updated_docstring)
original_level = get_docstring_indent(original_docstring)
if " Args:\n " not in updated_docstring:
# Split the docstring at the example section, assuming `"""` is used to define the docstring
parts = original_docstring.split("```")
if "```" in updated_docstring and len(parts) > 1:
updated_docstring = updated_docstring.lstrip('r"')
new_parts = updated_docstring.split("```")
if len(new_parts) != 3:
raise ValueError("There should only be one example, and it should have opening and closing '```'")
parts[1] = new_parts[1]
updated_docstring = "".join(
[
parts[0].rstrip(" \n") + new_parts[0],
f"\n{original_level*' '}```",
parts[1],
"```",
parts[2],
]
)
elif updated_docstring not in original_docstring:
# add tabulation if we are at the lowest level.
if re.search(r"\n\s*.*\(.*\)\:\n\s*\w", updated_docstring):
updated_docstring = updated_docstring.replace("\n ", "\n ")
updated_docstring = original_docstring.rstrip('"') + "\n" + updated_docstring.lstrip('r"\n')
return updated_docstring
class SuperTransformer(cst.CSTTransformer):
METADATA_DEPENDENCIES = (ParentNodeProvider,)
def __init__(self, python_module: cst.Module, original_methods, updated_methods):
def __init__(self, python_module: cst.Module, original_methods, updated_methods, class_name=""):
self.python_module = python_module
self.original_methods = original_methods
self.updated_methods = updated_methods
self.all_assign_target = {}
self.deleted_targets = {} # child node can delete some arguments
self.class_name = class_name
def update_body(self, existing_body, new_statements):
"""
Helper method to update the body by removing duplicates before adding new statements.
`existing_body` is the body of the original method, the parent class
`new_statements` are the additional statements
"""
deduplicated_new_body = []
existing_nodes = set()
for node in new_statements:
if m.matches(node, m.SimpleStatementLine(body=[m.Assign()])):
target = self.python_module.code_for_node(node.body[0].targets[0].target)
self.all_assign_target[target] = node
if m.matches(node, m.SimpleStatementLine(body=[m.Del()])):
target = self.python_module.code_for_node(node.body[0].target)
self.deleted_targets[target] = node
continue
for stmt in existing_body:
if m.matches(stmt, m.SimpleStatementLine(body=[m.Assign()])):
target = self.python_module.code_for_node(stmt.body[0].targets[0].target)
if target in self.deleted_targets:
logger.warning(f"Deleted the assign for {target}")
continue
if target in self.all_assign_target:
stmt = self.all_assign_target[target]
comment_less_code = re.sub(r"#.*", "", self.python_module.code_for_node(stmt)).strip()
comment_less_code = re.sub(r"\ *\n", "\n", comment_less_code).strip()
deduplicated_new_body.append(stmt)
existing_nodes.add(comment_less_code)
for node in new_statements:
code = self.python_module.code_for_node(node)
comment_less_code = re.sub(r"#.*", "", code).strip()
comment_less_code = re.sub(r"\ *\n", "\n", comment_less_code).strip()
existing_nodes.add(comment_less_code)
for stmt in existing_body:
comment_less_code = re.sub(r"#.*", "", self.python_module.code_for_node(stmt)).strip()
comment_less_code = re.sub(r"\ *\n", "\n", comment_less_code).strip()
if comment_less_code not in existing_nodes:
if m.matches(stmt, DOCSTRING_NODE) and self.has_docstring:
continue
deduplicated_new_body.append(stmt)
existing_nodes.add(stmt)
else:
logger.info(f"\nFound duplicate {self.python_module.code_for_node(stmt)}")
if (
node not in deduplicated_new_body
and "super().__init__" not in comment_less_code
and comment_less_code not in existing_nodes
):
if not m.matches(node, m.SimpleStatementLine(body=[m.Del()])):
# HACK here to fix the pos_init() that has to be last we kinda do this.
deduplicated_new_body = deduplicated_new_body[:-1] + [node] + deduplicated_new_body[-1:]
existing_nodes.add(comment_less_code)
return deduplicated_new_body
def replace_super_calls(self, node: cst.IndentedBlock, func_name: str) -> cst.CSTNode:
@@ -263,26 +350,37 @@ class SuperTransformer(cst.CSTTransformer):
to super().func_name() with the source code of the parent class' `func_name`.
It keeps everything that is defined before `super().func_name()`.
"""
new_body = []
self.has_docstring = False
for expr in node.body:
self.has_docstring = m.matches(node.body[0], DOCSTRING_NODE)
parent_has_docstring = False
if func_name in self.original_methods:
parent_has_docstring = m.matches(self.original_methods[func_name].body.body[0], DOCSTRING_NODE)
new_body = []
has_super_call = False
for idx, expr in enumerate(node.body):
if m.matches(
expr,
m.SimpleStatementLine(
body=[
m.Return(
value=m.Call(func=m.Attribute(value=m.Call(func=m.Name("super")), attr=m.Name(func_name)))
)
| m.Expr(
value=m.Call(func=m.Attribute(value=m.Call(func=m.Name("super")), attr=m.Name(func_name)))
)
]
body=[m.Return(SUPER_CALL_NODE(func_name)) | m.Expr(SUPER_CALL_NODE(func_name))]
),
):
if idx != 0 and func_name == "__init__":
raise ValueError(f"The call to super() in {self.class_name} should be at the top of the init")
new_body.extend(self.update_body(self.original_methods[func_name].body.body, node.body))
else:
has_super_call = True
elif m.matches(expr, DOCSTRING_NODE):
self.has_docstring = True
if parent_has_docstring: # actually here we ought to de-duplicate?
original_docstring = self.original_methods[func_name].body.body[0].body[0].value.value
updated_docstring = expr.body[0].value.value
merged_doc = merge_docstrings(original_docstring, updated_docstring)
new_node = [expr.with_changes(body=[cst.Expr(value=cst.SimpleString(value=merged_doc))])]
else:
new_node = [expr]
new_body.extend(new_node)
elif not m.matches(expr, m.SimpleStatementLine(body=[m.Del()])) and not has_super_call:
new_body.append(expr)
if not self.has_docstring and parent_has_docstring:
new_body = [self.original_methods[func_name].body.body[0]] + new_body
return node.with_changes(body=new_body)
def leave_FunctionDef(self, original_node: cst.Call, updated_node: cst.Call) -> cst.CSTNode:
@@ -330,14 +428,22 @@ def replace_call_to_super(class_finder: ClassFinder, updated_node: cst.ClassDef,
| ```
"""
original_node = class_finder.classes[class_name]
original_methods = {f.name.value if hasattr(f, "name") else f: f for f in original_node.body.body}
updated_methods = {f.name.value if hasattr(f, "name") else f: f for f in updated_node.body.body}
original_methods = {
f.name.value if hasattr(f, "name") else class_finder.python_module.code_for_node(f): f
for f in original_node.body.body
}
updated_methods = {
f.name.value if hasattr(f, "name") else class_finder.python_module.code_for_node(f): f
for f in updated_node.body.body
}
end_meth = []
assign_targets = {}
docstring_node = []
# Iterate directly from node.body as there can be property/setters with same names which are overwritten when we use a dict
for func in original_node.body.body:
name = func.name.value if hasattr(func, "name") else func
if name in updated_methods and updated_methods[name] is not None:
name = func.name.value if hasattr(func, "name") else class_finder.python_module.code_for_node(func)
if m.matches(func, m.FunctionDef()) and name in updated_methods and updated_methods[name] is not None:
new_params = updated_methods[name].params
# Replace the method in the replacement class, preserving decorators
kwarg_name = getattr(updated_methods[name].params, "star_kwarg", None)
@@ -348,22 +454,61 @@ def replace_call_to_super(class_finder: ClassFinder, updated_node: cst.ClassDef,
params=list(parent_params.values()), star_kwarg=func.params.star_kwarg
)
func = func.with_changes(body=updated_methods[name].body, params=new_params)
end_meth.append(func)
if m.matches(func, m.SimpleStatementLine(body=[m.Assign()])):
target = class_finder.python_module.code_for_node(func.body[0].targets[0])
assign_targets[target] = func
elif m.matches(func, m.SimpleStatementLine(body=[m.AnnAssign()])):
target = class_finder.python_module.code_for_node(func.body[0].target)
assign_targets[target] = func
elif m.matches(func, DOCSTRING_NODE):
docstring_node = [func]
else:
end_meth.append(func)
# Port new methods that are defined only in diff-file and append at the end
for name, func in updated_methods.items():
# Port new methods that are defined only in modular-file and append at the end
for func in updated_node.body.body:
name = func.name.value if hasattr(func, "name") else class_finder.python_module.code_for_node(func)
if m.matches(func, DOCSTRING_NODE): # This processes the docstring of the class!
# Extract the original docstring
updated_docstring = func.body[0].value.value
original_docstring = docstring_node[0].body[0].value.value
merged_doc = merge_docstrings(original_docstring, updated_docstring)
# Update the docstring in the original function
docstring_node = [
docstring_node[0].with_changes(body=[cst.Expr(value=cst.SimpleString(value=merged_doc))])
]
if name not in original_methods and func is not None and isinstance(func, cst.FunctionDef):
end_meth.append(func)
if m.matches(func, m.SimpleStatementLine(body=[m.Assign()])):
# TODO we only use single assign might cause issues
target = class_finder.python_module.code_for_node(func.body[0].targets[0])
assign_targets[target] = func
if m.matches(func, m.SimpleStatementLine(body=[m.AnnAssign()])):
target = class_finder.python_module.code_for_node(func.body[0].target)
assign_targets[target] = func
end_meth = docstring_node + list(assign_targets.values()) + end_meth
result_node = original_node.with_changes(body=cst.IndentedBlock(body=end_meth))
temp_module = cst.Module(body=[result_node])
new_module = MetadataWrapper(temp_module)
new_replacement_class = new_module.visit(SuperTransformer(temp_module, original_methods, updated_methods))
new_replacement_class = new_module.visit(
SuperTransformer(temp_module, original_methods, updated_methods, class_name)
)
new_replacement_body = new_replacement_class.body[0].body # get the indented block
return original_node.with_changes(body=new_replacement_body)
class DiffConverterTransformer(CSTTransformer):
TYPE_TO_FILE_TYPE = {
"Config": "configuration",
"Tokenizer": "tokenization",
"Processor": "processor",
"ImageProcessor": "image_processing",
"FeatureExtractor": "feature_extractor",
}
class ModularConverterTransformer(CSTTransformer):
METADATA_DEPENDENCIES = (ParentNodeProvider, ScopeProvider, PositionProvider)
def __init__(self, python_module, new_name, given_old_name=None, given_new_name=None):
@@ -378,11 +523,21 @@ class DiffConverterTransformer(CSTTransformer):
self.transformers_imports = {} # maps the imports name like "from transformers.models.xxx" to the parsed AST module
self.imported_mapping = {} # stores the name of the imported classes, with their source {"LlamaModel":"transformers.model.llama.modeling_llama"}
self.visited_module = {} # modules visited like "transformers.models.llama.modeling_llama"
self.new_body = {} # store the new body, all global scope nodes should be added here
self.inserted_deps = [] # nodes inserted via super dependency
self.all_imports = [] # just stores all of the imports
self.all_safe_imports = [] # stores the import under simple statements
self.global_scope_index = 0
# fmt: on
self.files = { # mapping for different component bodies
"modeling": {},
"configuration": {},
"tokenization": {},
"processing": {},
"image_processing": {},
"feature_extractor": {},
}
self.match_patterns = "|".join(self.files.keys())
self.all_functions = {}
def visit_ImportFrom(self, node: cst.ImportFrom) -> None:
"""When visiting imports from `transformers.models.xxx` we need to:
@@ -393,7 +548,7 @@ class DiffConverterTransformer(CSTTransformer):
import_statement = self.python_module.code_for_node(node.module)
if m.matches(node.module, m.Attribute()):
for imported_ in node.names:
_import = re.search(r"transformers\.models\..*\.(modeling|configuration)_.*", import_statement)
_import = re.search(rf"(transformers\.models\..|..)*\.({self.match_patterns})_.*", import_statement)
if _import:
source = _import.groups()[0]
if source == "modeling" and "Config" in self.python_module.code_for_node(imported_):
@@ -401,44 +556,38 @@ class DiffConverterTransformer(CSTTransformer):
f"You are importing {self.python_module.code_for_node(imported_)} from the modeling file. Import from the `configuration_xxxx.py` file instead"
)
if import_statement not in self.transformers_imports:
if "models" not in import_statement:
import_statement = "models." + import_statement
if "transformers" not in import_statement:
import_statement = "transformers." + import_statement
source_code = get_module_source_from_name(import_statement)
tree = cst.parse_module(source_code)
self.transformers_imports[import_statement] = tree
imported_class = self.python_module.code_for_node(imported_.name)
self.imported_mapping[imported_class] = import_statement
def leave_FunctionDef(self, original_node, node):
parent_node = self.get_metadata(cst.metadata.ParentNodeProvider, original_node)
if m.matches(parent_node, m.Module()):
self.global_scope_index += 100
self.new_body[node.name.value] = {"insert_idx": self.global_scope_index, "node": node}
return node
if m.matches(node.module, m.Name()):
if "transformers" == import_statement:
raise ValueError(
f"You are importing from {import_statement} directly using global imports. Import from the correct local path"
)
def leave_SimpleStatementLine(self, original_node, updated_node):
parent_node = self.get_metadata(cst.metadata.ParentNodeProvider, original_node)
if m.matches(parent_node, m.Module()):
if m.matches(updated_node, m.SimpleStatementLine(body=[m.Import()])):
if parent_node not in self.all_imports:
if updated_node not in self.all_imports:
self.all_imports.append(updated_node)
return updated_node
elif m.matches(updated_node, m.SimpleStatementLine(body=[m.ImportFrom()])):
full_statement = self.python_module.code_for_node(updated_node.body[0].module)
if re.search(r"transformers\.models\..*\.(modeling|configuration)_.*", full_statement):
if re.search(
rf"(transformers\.models\..|..)*\.({self.match_patterns})_.*", full_statement
): # OR MATCH ..llama.modeling_llama
return cst.RemoveFromParent()
if parent_node not in self.all_imports:
if updated_node not in self.all_imports:
self.all_imports.append(updated_node)
return updated_node
self.global_scope_index += 100
if m.matches(updated_node, m.SimpleStatementLine(body=[m.Assign()])):
# TODO This only works for single target assigns!
node_name = updated_node.body[0].targets[0].target.value
else:
node_name = self.python_module.code_for_node(updated_node.body[0])
self.new_body[node_name] = {
"insert_idx": self.global_scope_index,
"node": updated_node,
}
self.config_body = [updated_node]
return updated_node
def leave_ClassDef(self, original_node, updated_node):
@@ -454,6 +603,7 @@ class DiffConverterTransformer(CSTTransformer):
"""
class_name = original_node.name.value
bases = [k.value.value for k in original_node.bases if k.value.value in self.imported_mapping]
all_bases = [k.value.value for k in original_node.bases]
self.global_scope_index += 100
for super_class in bases:
if super_class not in self.imported_mapping:
@@ -469,7 +619,7 @@ class DiffConverterTransformer(CSTTransformer):
raise ValueError(
f"Tried parsing the name of the imported package from {super_file_name}, could not extract the model name"
)
file_type = re.search(r"models?\.\w*?\.(\w*?)_", super_file_name).groups()[0]
visited_module = self.visited_module
if super_file_name not in visited_module: # only extract classes once
class_finder = find_classes_in_file(
@@ -490,22 +640,47 @@ class DiffConverterTransformer(CSTTransformer):
list_dependencies = sorted(list_dependencies.items(), key=lambda x: x[1], reverse=True)
start_insert_idx = self.global_scope_index
file_to_update = self.files[file_type]
is_empty_node = self.python_module.code_for_node(original_node.body) == "pass\n"
for dependency, _ in list_dependencies:
# we can write to the correct body, using the source of the parent class
node = class_finder.global_nodes.get(dependency, None)
if node is not None and "Config" not in class_name:
if dependency not in self.new_body:
if node is not None:
if dependency not in file_to_update:
start_insert_idx -= 1
self.new_body[dependency] = {"insert_idx": start_insert_idx, "node": node}
file_to_update[dependency] = {"insert_idx": start_insert_idx, "node": node}
elif dependency not in self.inserted_deps:
# make sure the node is written after its dependencies
start_insert_idx = self.new_body[dependency]["insert_idx"] - 1
start_insert_idx = file_to_update[dependency]["insert_idx"] - 1
if (
dependency in file_to_update.keys()
and dependency in class_finder.first_lvl_dependency_mapping[class_name]
):
# If dependency is defined, but not used, raise error
calls = m.findall(original_node, m.Call(func=m.Name(dependency)))
if not calls and not is_empty_node and dependency not in all_bases:
raise ValueError(
f"""You defined `{dependency}` in the modular_{self.model_name}.py, it should be used
when you define `{class_name}`, as it is one of it's direct dependencies. Make sure
you use it in the `__init__` function."""
)
self.inserted_deps.append(dependency)
if len(list_dependencies) > 0:
updated_node = replace_call_to_super(class_finder, updated_node, class_name)
if "Config" in class_name:
self.config_body += [updated_node]
else:
raise ValueError(
f"Unable to find dependencies for {super_class} in {super_file_name}. Here are the dependencies found: {class_finder.class_dependency_mapping}. (The automatic renaming might have gone wrong!)"
)
# Now, if a class was defined without parents, we look for the name
match_pattern = "|".join(TYPE_TO_FILE_TYPE.keys())
match = re.search(rf"({match_pattern})$", class_name)
if match:
key = TYPE_TO_FILE_TYPE[match.group(1)]
self.files[key][class_name] = {"insert_idx": self.global_scope_index, "node": updated_node}
else:
self.new_body[class_name] = {"insert_idx": self.global_scope_index, "node": updated_node}
self.files["modeling"][class_name] = {"insert_idx": self.global_scope_index, "node": updated_node}
return updated_node
def leave_If(self, original_node, node):
@@ -513,66 +688,69 @@ class DiffConverterTransformer(CSTTransformer):
if m.matches(parent_node, m.Module()):
full_statement = self.python_module.code_for_node(original_node.test)
if re.search(r"[\s\S]*is_.*available", full_statement):
self.all_imports.append(node)
self.all_safe_imports.append(node)
elif full_statement not in self.new_body:
self.new_body[node] = {"insert_idx": self.global_scope_index, "node": node}
return node
def leave_Module(self, original_node: cst.Assign, node):
imports = {self.python_module.code_for_node(k): k for k in self.all_imports}
dependency_imports = {}
config_imports = []
for visiter in self.visited_module.values():
dependency_imports.update({self.python_module.code_for_node(k): k for k in visiter.imports.values()})
dependency_imports = {file_type: imports.copy() for file_type in self.files}
for super_file_name, visiter in self.visited_module.items():
file_type = re.search(r"models?\.\w*?\.(\w*?)_", super_file_name).groups()[0]
dependency_imports[file_type].update(
{self.python_module.code_for_node(k): k for k in visiter.imports.values()}
)
# manually clean up if it's importing a config from configuration file (ruff doesn't do that)
config_imports = []
for i in list(dependency_imports.values()):
if (
hasattr(i.body[0], "module")
and isinstance(i.body[0].module, cst.Name)
and f"configuration_{self.model_name}" in i.body[0].module.value
):
pass
else:
config_imports.append(i)
for file, body in self.files.items():
new_body = [k[1]["node"] for k in sorted(body.items(), key=lambda x: x[1]["insert_idx"])]
if len(new_body) > 0:
if file in dependency_imports.keys():
new_body = list(dependency_imports[file].values()) + new_body
self.files[file] = cst.Module(body=[*new_body], header=node.header)
return node
if hasattr(self, "config_body"):
self.config_body = list(imports.values()) + config_imports + self.config_body
dependency_imports.update(imports)
new_body = list(dependency_imports.values())
if len(self.new_body.keys()) > 0:
new_body += [k[1]["node"] for k in sorted(self.new_body.items(), key=lambda x: x[1]["insert_idx"])]
def convert_modular_file(modular_file, old_model_name=None, new_model_name=None, cst_transformers=None):
pattern = re.search(r"modular_(.*)(?=\.py$)", modular_file)
output = {}
if pattern is not None:
model_name = pattern.groups()[0]
# Parse the Python file
with open(modular_file, "r") as file:
code = file.read()
module = cst.parse_module(code)
wrapper = MetadataWrapper(module)
if cst_transformers is None:
cst_transformers = ModularConverterTransformer(module, model_name, old_model_name, new_model_name)
wrapper.visit(cst_transformers)
for file, node in cst_transformers.files.items():
if node != {}:
ruffed_code = run_ruff(AUTO_GENERATED_MESSAGE + node.code, True)
formatted_code = run_ruff(ruffed_code, False)
output[file] = [formatted_code, ruffed_code]
return output
else:
print(f"modular pattern not found in {modular_file}, exiting")
return {}
def save_modeling_file(modular_file, converted_file):
for file_type in converted_file.keys():
non_comment_lines = len(
[line for line in converted_file[file_type][0].strip().split("\n") if not line.strip().startswith("#")]
)
if len(converted_file[file_type][0].strip()) > 0 and non_comment_lines > 0:
with open(modular_file.replace("modular_", f"{file_type}_"), "w") as f:
f.write(converted_file[file_type][0])
else:
new_body = []
return node.with_changes(body=[*new_body])
def convert_file(diff_file, old_model_name=None, new_model_name=None, cst_transformers=None):
model_name = re.search(r"diff_(.*)(?=\.py$)", diff_file).groups()[0]
# Parse the Python file
with open(diff_file, "r") as file:
code = file.read()
module = cst.parse_module(code)
wrapper = MetadataWrapper(module)
if cst_transformers is None:
cst_transformers = DiffConverterTransformer(module, model_name, old_model_name, new_model_name)
new_mod = wrapper.visit(cst_transformers)
ruffed_code = run_ruff(new_mod.code, True)
formatted_code = run_ruff(ruffed_code, False)
if len(formatted_code.strip()) > 0:
with open(diff_file.replace("diff_", "modeling_"), "w") as f:
f.write(AUTO_GENERATED_MESSAGE + formatted_code)
if hasattr(cst_transformers, "config_body"):
config_module = cst.Module(body=[*cst_transformers.config_body], header=new_mod.header)
with open(diff_file.replace("diff_", "configuration_"), "w") as f:
ruffed_code = run_ruff(config_module.code, True)
formatted_code = run_ruff(ruffed_code, False)
f.write(AUTO_GENERATED_MESSAGE + formatted_code)
# TODO optimize by re-using the class_finder
return cst_transformers
non_comment_lines = len(
[line for line in converted_file[file_type][0].strip().split("\n") if not line.strip().startswith("#")]
)
if len(converted_file[file_type][1].strip()) > 0 and non_comment_lines > 0:
logger.warning("The modeling code contains errors, it's written without formatting")
with open(modular_file.replace("modular_", f"{file_type}_"), "w") as f:
f.write(converted_file[file_type][1])
if __name__ == "__main__":
@@ -581,22 +759,24 @@ if __name__ == "__main__":
"--files_to_parse",
default=["all"],
nargs="+",
help="A list of `diff_xxxx` files that should be converted to single model file",
help="A list of `modular_xxxx` files that should be converted to single model file",
)
parser.add_argument(
"--old_model_name",
required=False,
help="The name of the model from which the copying is done in CamelCase. If not provided is inferred from diff-file",
help="The name of the model from which the copying is done in CamelCase. If not provided is inferred from modular-file",
)
parser.add_argument(
"--new_model_name",
required=False,
help="The name of the new model being added in CamelCase. If not provided is inferred from diff-file",
help="The name of the new model being added in CamelCase. If not provided is inferred from modular-file",
)
args = parser.parse_args()
if args.files_to_parse == ["all"]:
args.files_to_parse = glob.glob("src/transformers/models/**/diff_*.py", recursive=True)
for file_name in args.files_to_parse:
args.files_to_parse = glob.glob("src/transformers/models/**/modular_*.py", recursive=True)
for file_name in find_priority_list(args.files_to_parse):
print(f"Converting {file_name} to a single model single file format")
module_path = file_name.replace("/", ".").replace(".py", "").replace("src.", "")
converter = convert_file(file_name, args.old_model_name, args.new_model_name)
converted_files = convert_modular_file(file_name, args.old_model_name, args.new_model_name)
converter = save_modeling_file(file_name, converted_files)