Adds use_repr to model_addition_debugger_context (#37984)

* Adds use_repr to model_addition_debugger_context * Updating docs for use_repr option
2025-05-23 05:35:13 -04:00
parent 38f9c5b15b
commit 9eb0a37c9e
2 changed files with 175 additions and 67 deletions
--- a/src/transformers/model_debugging_utils.py
+++ b/src/transformers/model_debugging_utils.py
@@ -21,6 +21,8 @@ from contextlib import contextmanager, redirect_stdout
 from io import StringIO
 from typing import Optional

+from safetensors.torch import save_file
+
 from transformers.utils.import_utils import requires

 from .utils import is_torch_available
@@ -65,64 +67,94 @@ def _dtensor_repr(x):
    return "DTensor(non-rank0)"


-def _serialize_io(value):
+def _serialize_tensor_like_io(
+    value, debug_path: Optional[str] = None, use_repr: bool = True, path_to_value: Optional[str] = None
+):
+    """
+    Converts Tensors and DTensors to a JSON-serializable dictionary representation.
+
+    Args:
+        value: Any Python object, often including torch Tensors, lists, dicts, etc.
+        debug_path (`str`, *optional*, defaults to `None`): Directory to dump debug JSON and SafeTensors files.
+        use_repr (bool, *optional*, defaults to `True`): Whether to save a `repr()`-ized version of the tensor as the
+            `value` property in the asscoiated FULL_TENSORS.json file, or to store the full tensors in separate
+            SafeTensors file and store the relative path to that file in the `value` property in the dictionary.
+        path_to_value (`str`, *optional*, defaults to `None`): The file name for the SafeTensors file holding the full
+            tensor value if `use_repr=False`.
+
+    Returns:
+        A nested Python structure (list, dict, or sanitized string) that is safe to json.dump.
+    """
+    torch.set_printoptions(sci_mode=True)
+
+    if use_repr:
+        value_out = _repr_to_list(value)
+    elif path_to_value:
+        if not path_to_value.endswith(".safetensors"):
+            path_to_value += ".safetensors"
+
+        filepath = os.path.join(debug_path, path_to_value) if debug_path else path_to_value
+        save_file({"data": value.contiguous().detach().cpu()}, filepath)
+        value_out = f"./{path_to_value}"
+    else:
+        raise ValueError(f"{use_repr=} and {path_to_value=} cannot both be falsy.")
+
+    out = {
+        "shape": repr(value.shape),
+        "dtype": repr(value.dtype),
+        "value": value_out,
+    }
+    if value.dtype in {torch.float16, torch.float32, torch.bfloat16}:
+        out.update(
+            {
+                "mean": _sanitize_repr_for_diff(repr(value.mean())),
+                "std": _sanitize_repr_for_diff(repr(value.std())),
+                "min": _sanitize_repr_for_diff(repr(value.min())),
+                "max": _sanitize_repr_for_diff(repr(value.max())),
+            }
+        )
+    return out
+
+
+def _serialize_io(value, debug_path: Optional[str] = None, use_repr: bool = True, path_to_value: Optional[str] = None):
    """
    Recursively build a JSON-serializable Python structure from `value`.
-    Tensors and DTensors become sanitized repr strings.
+    Tensors and DTensors become either sanitized repr strings, or are saved to disk as SafeTensors files and their
+    relative paths are recorded in the returned Python structure.
    Lists/tuples/dicts are recursed into.
    All memory addresses are replaced with a stable placeholder.

    Args:
        value: Any Python object, often including torch Tensors, lists, dicts, etc.
+        debug_path (`str`, *optional*, defaults to `None`): Directory to dump debug JSON and SafeTensors files.
+        use_repr (bool, *optional*, defaults to `True`): Whether to save a `repr()`-ized version of the tensors as the
+            `value` property in the asscoiated FULL_TENSORS.json file, or to store full tensors in separate SafeTensors
+            files and store the relative path to that file in the `value` property.
+        path_to_value (`str`, *optional*, defaults to `None`): The file name for the SafeTensors file holding the full
+            tensor value if `use_repr=False`.

    Returns:
        A nested Python structure (list, dict, or sanitized string) that is safe to json.dump.
    """
    if isinstance(value, (list, tuple)):
-        return [_serialize_io(v) for v in value]
+        return [
+            _serialize_io(v, debug_path=debug_path, use_repr=use_repr, path_to_value=f"{path_to_value}_{i}")
+            for i, v in enumerate(value)
+        ]

    if isinstance(value, dict):
-        return {k: _serialize_io(v) for k, v in value.items()}
+        return {
+            k: _serialize_io(v, debug_path=debug_path, use_repr=use_repr, path_to_value=f"{path_to_value}_{k}")
+            for k, v in value.items()
+        }

    if hasattr(value, "_local_tensor"):
-        # DTensor-like handling, just use local tensor attribute
-        torch.set_printoptions(sci_mode=True)
-        val_repr = _repr_to_list(value)
-        out = {
-            "shape": repr(value._local_tensor.shape),
-            "dtype": repr(value._local_tensor.dtype),
-            "value": val_repr,
-        }
-        if value._local_tensor.dtype in {torch.float16, torch.float32, torch.bfloat16}:
-            value = value._local_tensor.clone()
-            out.update(
-                {
-                    "mean": _sanitize_repr_for_diff(repr(value.mean())),
-                    "std": _sanitize_repr_for_diff(repr(value.std())),
-                    "min": _sanitize_repr_for_diff(repr(value.min())),
-                    "max": _sanitize_repr_for_diff(repr(value.max())),
-                }
-            )
-        return out
+        return _serialize_tensor_like_io(
+            value._local_tensor, debug_path=debug_path, use_repr=use_repr, path_to_value=path_to_value
+        )

    if isinstance(value, torch.Tensor):
-        torch.set_printoptions(sci_mode=True)
-        val_repr = _repr_to_list(value)
-        out = {
-            "shape": repr(value.shape),
-            "dtype": repr(value.dtype),
-            "value": val_repr,
-        }
-        if value.dtype in {torch.float16, torch.float32, torch.bfloat16}:
-            out.update(
-                {
-                    "mean": _sanitize_repr_for_diff(repr(value.mean())),
-                    "std": _sanitize_repr_for_diff(repr(value.std())),
-                    "min": _sanitize_repr_for_diff(repr(value.min())),
-                    "max": _sanitize_repr_for_diff(repr(value.max())),
-                }
-            )
-        return out
+        return _serialize_tensor_like_io(value, debug_path=debug_path, use_repr=use_repr, path_to_value=path_to_value)

    return _sanitize_repr_for_diff(repr(value))

@@ -199,7 +231,7 @@ def log_model_debug_trace(debug_path, model):
            os.makedirs(debug_path, exist_ok=True)
            base = os.path.join(debug_path, model._debugger_module_dump_name + "_debug_tree")
        except Exception as e:
-            raise ValueError(f"Unexpected or existing debug_path={debug_path}. {e}")
+            raise ValueError(f"Unexpected or existing debug_path={debug_path}.") from e
    else:
        base = model._debugger_module_dump_name + "_debug_tree"

@@ -240,6 +272,7 @@ def _attach_debugger_logic(
    model,
    debug_path: Optional[str] = ".",
    do_prune_layers: Optional[bool] = True,
+    use_repr: bool = True,
 ):
    """
    Attaches a debugging wrapper to every module in the model.
@@ -250,6 +283,9 @@ def _attach_debugger_logic(
        model (`PreTrainedModel`, `nn.Module`): Model to wrap.
        debug_path (`str`): Optional directory to dump debug JSON files.
        do_prune_layers (`bool`, *optional*, defaults to `True`): Whether to prune intermediate layers.
+        use_repr (bool, *optional*, defaults to `True`): Whether to save a `repr()`-ized version of the tensors as the
+            `value` property in the asscoiated FULL_TENSORS.json file, or to store full tensors in separate SafeTensors
+            files and store the relative path to that file in the `value` property.
    """
    class_name = model.__class__.__name__

@@ -258,6 +294,12 @@ def _attach_debugger_logic(
    model._debugger_model_call_stack = []
    model._debugger_module_dump_name = class_name  # used for final JSON filename

+    if debug_path:
+        try:
+            os.makedirs(debug_path, exist_ok=True)
+        except Exception as e:
+            raise ValueError(f"Unexpected or existing debug_path={debug_path}.") from e
+
    def wrap_forward(module, full_path):
        orig_forward = module.forward

@@ -268,7 +310,12 @@ def _attach_debugger_logic(
                dict_inputs = {k: dict_inputs[k] for k in dict_inputs if len(dict_inputs[k]) > 0}
                node = {
                    "module_path": full_path,
-                    "inputs": _serialize_io(dict_inputs),
+                    "inputs": _serialize_io(
+                        dict_inputs,
+                        debug_path=debug_path,
+                        use_repr=use_repr,
+                        path_to_value=f"{full_path}_inputs",
+                    ),
                    "outputs": None,
                    "children": [],
                }
@@ -280,7 +327,12 @@ def _attach_debugger_logic(
                if sum(1 for _ in module.named_children()) > 0:
                    node["outputs"] = None
                else:
-                    node["outputs"] = _serialize_io(out)
+                    node["outputs"] = _serialize_io(
+                        out,
+                        debug_path=debug_path,
+                        use_repr=use_repr,
+                        path_to_value=f"{full_path}_outputs",
+                    )

                finished = model._debugger_model_call_stack.pop()
                # prune empty vertices here as well (mostly empty children nodes)
@@ -307,7 +359,12 @@ def _attach_debugger_logic(
        if _is_rank_zero():
            top_node = {
                "module_path": f"{class_name} (top-level)",
-                "inputs": _serialize_io({"args": inps, "kwargs": kws}),
+                "inputs": _serialize_io(
+                    {"args": inps, "kwargs": kws},
+                    debug_path=debug_path,
+                    use_repr=use_repr,
+                    path_to_value=f"{class_name}_inputs",
+                ),
                "outputs": None,
                "children": [],
            }
@@ -315,7 +372,12 @@ def _attach_debugger_logic(

        out = real_top_forward(*inps, **kws)
        if _is_rank_zero() and model._debugger_model_call_stack:
-            top_node["outputs"] = _serialize_io(out)
+            top_node["outputs"] = _serialize_io(
+                out,
+                debug_path=debug_path,
+                use_repr=use_repr,
+                path_to_value=f"{class_name}_outputs",
+            )
            finished = model._debugger_model_call_stack.pop()
            model._call_tree["inputs"] = finished["inputs"]
            model._call_tree["outputs"] = finished["outputs"]
@@ -335,11 +397,21 @@ def _attach_debugger_logic(

@requires(backends=("torch",))
@contextmanager
-def model_addition_debugger_context(model, debug_path: Optional[str] = None, do_prune_layers: Optional[bool] = True):
+def model_addition_debugger_context(
+    model,
+    debug_path: Optional[str] = None,
+    do_prune_layers: Optional[bool] = True,
+    use_repr: Optional[bool] = True,
+):
    """
    # Model addition debugger - context manager for model adders
    This context manager is a power user tool intended for model adders.
-    It tracks all forward calls within a model forward and logs a slice of each input and output on a nested Json.
+
+    It tracks all forward calls within a model forward and logs a slice of each input and output on a nested JSON file.
+    If `use_repr=True` (the default), the JSON file will record a `repr()`-ized version of the tensors as a list of
+    strings. If `use_repr=False`, the full tensors will be stored in spearate SafeTensors files and the JSON file will
+    provide a relative path to that file.
+
    To note, this context manager enforces `torch.no_grad()`.

    ## Usage
@@ -348,10 +420,10 @@ def model_addition_debugger_context(model, debug_path: Optional[str] = None, do_

    ```python
    import torch
+
    from PIL import Image
-    import requests
-    from transformers import LlavaProcessor, LlavaForConditionalGeneration
-    from transformers.model_debugging_utils import model_addition_debugger_context
+    from transformers import LlavaProcessor, LlavaForConditionalGeneration, model_addition_debugger_context
+
    torch.random.manual_seed(673)

    # load pretrained model and processor
@@ -376,7 +448,7 @@ def model_addition_debugger_context(model, debug_path: Optional[str] = None, do_
    """
    orig_forwards = {m: m.forward for _, m in model.named_modules()}
    orig_forwards[model] = model.forward
-    _attach_debugger_logic(model, debug_path, do_prune_layers)
+    _attach_debugger_logic(model, debug_path, do_prune_layers, use_repr)
    try:
        yield model
    finally: