More utils doc (#25457)

* Document and clean more utils. * More documentation and fixes * Switch to Lysandre's token * Address review comments * Actually put else
2023-08-17 07:58:35 +02:00
parent 36f183ebab
commit 2defb6b048
9 changed files with 411 additions and 84 deletions
--- a/utils/custom_init_isort.py
+++ b/utils/custom_init_isort.py
@@ -12,12 +12,35 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+"""
+Utility that sorts the imports in the custom inits of Transformers. Transformers uses init files that delay the
+import of an object to when it's actually needed. This is to avoid the main init importing all models, which would
+make the line `import transformers` very slow when the user has all optional dependencies installed. The inits with
+delayed imports have two halves: one definining a dictionary `_import_structure` which maps modules to the name of the
+objects in each module, and one in `TYPE_CHECKING` which looks like a normal init for type-checkers. `isort` or `ruff`
+properly sort the second half which looks like traditionl imports, the goal of this script is to sort the first half.

+Use from the root of the repo with:
+
+```bash
+python utils/custom_init_isort.py
+```
+
+which will auto-sort the imports (used in `make style`).
+
+For a check only (as used in `make quality`) run:
+
+```bash
+python utils/custom_init_isort.py --check_only
+```
+"""
 import argparse
 import os
 import re
+from typing import Any, Callable, List, Optional


+# Path is defined with the intent you should run this script from the root of the repo.
 PATH_TO_TRANSFORMERS = "src/transformers"

 # Pattern that looks at the indentation in a line.
@@ -32,17 +55,30 @@ _re_strip_line = re.compile(r'^\s*"([^"]+)",\s*$')
 _re_bracket_content = re.compile(r"\[([^\]]+)\]")


-def get_indent(line):
-    """Returns the indent in `line`."""
+def get_indent(line: str) -> str:
+    """Returns the indent in  given line (as string)."""
    search = _re_indent.search(line)
    return "" if search is None else search.groups()[0]


-def split_code_in_indented_blocks(code, indent_level="", start_prompt=None, end_prompt=None):
+def split_code_in_indented_blocks(
+    code: str, indent_level: str = "", start_prompt: Optional[str] = None, end_prompt: Optional[str] = None
+) -> List[str]:
    """
-    Split `code` into its indented blocks, starting at `indent_level`. If provided, begins splitting after
-    `start_prompt` and stops at `end_prompt` (but returns what's before `start_prompt` as a first block and what's
-    after `end_prompt` as a last block, so `code` is always the same as joining the result of this function).
+    Split some code into its indented blocks, starting at a given level.
+
+    Args:
+        code (`str`): The code to split.
+        indent_level (`str`): The indent level (as string) to use for identifying the blocks to split.
+        start_prompt (`str`, *optional*): If provided, only starts splitting at the line where this text is.
+        end_prompt (`str`, *optional*): If provided, stops splitting at a line where this text is.
+
+    Warning:
+        The text before `start_prompt` or after `end_prompt` (if provided) is not ignored, just not split. The input `code`
+        can thus be retrieved by joining the result.
+
+    Returns:
+        `List[str]`: The list of blocks.
    """
    # Let's split the code into lines and move to start_index.
    index = 0
@@ -54,12 +90,17 @@ def split_code_in_indented_blocks(code, indent_level="", start_prompt=None, end_
    else:
        blocks = []

-    # We split into blocks until we get to the `end_prompt` (or the end of the block).
+    # This variable contains the block treated at a given time.
    current_block = [lines[index]]
    index += 1
+    # We split into blocks until we get to the `end_prompt` (or the end of the file).
    while index < len(lines) and (end_prompt is None or not lines[index].startswith(end_prompt)):
+        # We have a non-empty line with the proper indent -> start of a new block
        if len(lines[index]) > 0 and get_indent(lines[index]) == indent_level:
+            # Store the current block in the result and rest. There are two cases: the line is part of the block (like
+            # a closing parenthesis) or not.
            if len(current_block) > 0 and get_indent(current_block[-1]).startswith(indent_level + " "):
+                # Line is part of the current block
                current_block.append(lines[index])
                blocks.append("\n".join(current_block))
                if index < len(lines) - 1:
@@ -68,9 +109,11 @@ def split_code_in_indented_blocks(code, indent_level="", start_prompt=None, end_
                else:
                    current_block = []
            else:
+                # Line is not part of the current block
                blocks.append("\n".join(current_block))
                current_block = [lines[index]]
        else:
+            # Just add the line to the current block
            current_block.append(lines[index])
        index += 1

@@ -85,8 +128,10 @@ def split_code_in_indented_blocks(code, indent_level="", start_prompt=None, end_
    return blocks


-def ignore_underscore(key):
-    "Wraps a `key` (that maps an object to string) to lower case and remove underscores."
+def ignore_underscore_and_lowercase(key: Callable[[Any], str]) -> Callable[[Any], str]:
+    """
+    Wraps a key function (as used in a sort) to lowercase and ignore underscores.
+    """

    def _inner(x):
        return key(x).lower().replace("_", "")
@@ -94,8 +139,21 @@ def ignore_underscore(key):
    return _inner


-def sort_objects(objects, key=None):
-    "Sort a list of `objects` following the rules of isort. `key` optionally maps an object to a str."
+def sort_objects(objects: List[Any], key: Optional[Callable[[Any], str]] = None) -> List[Any]:
+    """
+    Sort a list of objects following the rules of isort (all uppercased first, camel-cased second and lower-cased
+    last).
+
+    Args:
+        objects (`List[Any]`):
+            The list of objects to sort.
+        key (`Callable[[Any], str]`, *optional*):
+            A function taking an object as input and returning a string, used to sort them by alphabetical order.
+            If not provided, will default to noop (so a `key` must be provided if the `objects` are not of type string).
+
+    Returns:
+        `List[Any]`: The sorted list with the same elements as in the inputs
+    """

    # If no key is provided, we use a noop.
    def noop(x):
@@ -110,18 +168,26 @@ def sort_objects(objects, key=None):
    # Functions begin with a lowercase, they go last.
    functions = [obj for obj in objects if not key(obj)[0].isupper()]

-    key1 = ignore_underscore(key)
+    # Then we sort each group.
+    key1 = ignore_underscore_and_lowercase(key)
    return sorted(constants, key=key1) + sorted(classes, key=key1) + sorted(functions, key=key1)


-def sort_objects_in_import(import_statement):
+def sort_objects_in_import(import_statement: str) -> str:
    """
-    Return the same `import_statement` but with objects properly sorted.
+    Sorts the imports in a single import statement.
+
+    Args:
+        import_statement (`str`): The import statement in which to sort the imports.
+
+    Returns:
+        `str`: The same as the input, but with objects properly sorted.
    """

    # This inner function sort imports between [ ].
    def _replace(match):
        imports = match.groups()[0]
+        # If there is one import only, nothing to do.
        if "," not in imports:
            return f"[{imports}]"
        keys = [part.strip().replace('"', "") for part in imports.split(",")]
@@ -165,13 +231,18 @@ def sort_objects_in_import(import_statement):
        return import_statement


-def sort_imports(file, check_only=True):
+def sort_imports(file: str, check_only: bool = True):
    """
-    Sort `_import_structure` imports in `file`, `check_only` determines if we only check or overwrite.
+    Sort the imports defined in the `_import_structure` of a given init.
+
+    Args:
+        file (`str`): The path to the init to check/fix.
+        check_only (`bool`, *optional*, defaults to `True`): Whether or not to just check (and not auto-fix) the init.
    """
    with open(file, encoding="utf-8") as f:
        code = f.read()

+    # If the file is not a custom init, there is nothing to do.
    if "_import_structure" not in code:
        return

@@ -234,6 +305,12 @@ def sort_imports(file, check_only=True):


 def sort_imports_in_all_inits(check_only=True):
+    """
+    Sort the imports defined in the `_import_structure` of all inits in the repo.
+
+    Args:
+        check_only (`bool`, *optional*, defaults to `True`): Whether or not to just check (and not auto-fix) the init.
+    """
    failures = []
    for root, _, files in os.walk(PATH_TO_TRANSFORMERS):
        if "__init__.py" in files: