🎯 Trackio integration (#38814)

* First attempt * fix * fix * Enhance TrackioCallback to log GPU memory usage and allocation * Enhance Trackio integration in callbacks and training arguments documentation * re order * remove unused lines * fix torch optional
2025-07-22 14:50:20 -07:00
parent c6d0500d15
commit 6e9972962f
7 changed files with 150 additions and 9 deletions
--- a/docs/source/en/main_classes/callback.md
+++ b/docs/source/en/main_classes/callback.md
@@ -33,6 +33,7 @@ By default, `TrainingArguments.report_to` is set to `"all"`, so a [`Trainer`] wi
  it's the second one).
 - [`~integrations.TensorBoardCallback`] if tensorboard is accessible (either through PyTorch >= 1.4
  or tensorboardX).
+- [`~integrations.TrackioCallback`] if [trackio](https://github.com/gradio-app/trackio) is installed.
 - [`~integrations.WandbCallback`] if [wandb](https://www.wandb.com/) is installed.
 - [`~integrations.CometCallback`] if [comet_ml](https://www.comet.com/site/) is installed.
 - [`~integrations.MLflowCallback`] if [mlflow](https://www.mlflow.org/) is installed.
@@ -72,6 +73,9 @@ Here is the list of the available [`TrainerCallback`] in the library:

 [[autodoc]] integrations.TensorBoardCallback

+[[autodoc]] integrations.TrackioCallback
+    - setup
+
 [[autodoc]] integrations.WandbCallback
    - setup

--- a/src/transformers/init.py
+++ b/src/transformers/init.py
@@ -127,6 +127,7 @@ _import_structure = {
        "is_sigopt_available",
        "is_swanlab_available",
        "is_tensorboard_available",
+        "is_trackio_available",
        "is_wandb_available",
    ],
    "loss": [],
@@ -759,6 +760,7 @@ if TYPE_CHECKING:
        is_sigopt_available,
        is_swanlab_available,
        is_tensorboard_available,
+        is_trackio_available,
        is_wandb_available,
    )
    from .integrations.executorch import (
--- a/src/transformers/integrations/init.py
+++ b/src/transformers/integrations/init.py
@@ -90,6 +90,7 @@ _import_structure = {
        "NeptuneMissingConfiguration",
        "SwanLabCallback",
        "TensorBoardCallback",
+        "TrackioCallback",
        "WandbCallback",
        "get_available_reporting_integrations",
        "get_reporting_integration_callbacks",
@@ -110,6 +111,7 @@ _import_structure = {
        "is_sigopt_available",
        "is_swanlab_available",
        "is_tensorboard_available",
+        "is_trackio_available",
        "is_wandb_available",
        "rewrite_logs",
        "run_hp_search_optuna",
@@ -224,6 +226,7 @@ if TYPE_CHECKING:
        NeptuneMissingConfiguration,
        SwanLabCallback,
        TensorBoardCallback,
+        TrackioCallback,
        WandbCallback,
        get_available_reporting_integrations,
        get_reporting_integration_callbacks,
@@ -244,6 +247,7 @@ if TYPE_CHECKING:
        is_sigopt_available,
        is_swanlab_available,
        is_tensorboard_available,
+        is_trackio_available,
        is_wandb_available,
        rewrite_logs,
        run_hp_search_optuna,
--- a/src/transformers/integrations/integration_utils.py
+++ b/src/transformers/integrations/integration_utils.py
@@ -56,6 +56,7 @@ logger = logging.get_logger(__name__)

 if is_torch_available():
    import torch
+    import torch.distributed as dist

 # comet_ml requires to be imported before any ML frameworks
 _MIN_COMET_VERSION = "3.43.2"
@@ -111,6 +112,10 @@ def is_wandb_available():
    return importlib.util.find_spec("wandb") is not None


+def is_trackio_available():
+    """Return `True` when the `trackio` package can be located by the import system, without importing it."""
+    trackio_spec = importlib.util.find_spec("trackio")
+    return trackio_spec is not None
+
+
 def is_clearml_available():
    return importlib.util.find_spec("clearml") is not None

@@ -630,6 +635,8 @@ def get_available_reporting_integrations():
        integrations.append("clearml")
    if is_swanlab_available():
        integrations.append("swanlab")
+    if is_trackio_available():
+        integrations.append("trackio")
    return integrations


@@ -1033,6 +1040,115 @@ class WandbCallback(TrainerCallback):
            self._wandb.log(metrics)


+class TrackioCallback(TrainerCallback):
+    """
+    A [`TrainerCallback`] that logs metrics to Trackio.
+
+    The Trackio run is created lazily by `setup`, which is triggered by the first training, logging or prediction
+    event. Only the world-zero process initializes, logs to and finishes the Trackio run.
+    """
+
+    def __init__(self):
+        # Fail fast with an actionable message rather than a bare ImportError from the import below.
+        if not is_trackio_available():
+            raise RuntimeError("TrackioCallback requires trackio to be installed. Run `pip install trackio`.")
+        # Imported here (not at module level) so the module can be imported without trackio installed.
+        import trackio
+
+        self._trackio = trackio
+        self._initialized = False
+
+    def setup(self, args, state, model, **kwargs):
+        """
+        Setup the optional Trackio integration.
+
+        To customize the setup you can also override the following environment variables:
+
+        Environment:
+        - **TRACKIO_PROJECT** (`str`, *optional*, defaults to `"huggingface"`):
+            The name of the project (can be an existing project to continue tracking or a new project to start tracking
+            from scratch).
+        - **TRACKIO_SPACE_ID** (`str`, *optional*, defaults to `None`):
+            If set, the project will be logged to a Hugging Face Space instead of a local directory. Should be a
+            complete Space name like `"username/reponame"` or `"orgname/reponame"`, or just `"reponame"` in which case
+            the Space will be created in the currently-logged-in Hugging Face user's namespace. If the Space does not
+            exist, it will be created. If the Space already exists, the project will be logged to it.
+        """
+        if state.is_world_process_zero:
+            # Build the run config. On key collisions the training arguments win over the model config, and both
+            # win over the `peft_config` entry.
+            combined_dict = {**args.to_dict()}
+
+            if hasattr(model, "config") and model.config is not None:
+                model_config = model.config if isinstance(model.config, dict) else model.config.to_dict()
+                combined_dict = {**model_config, **combined_dict}
+            if hasattr(model, "peft_config") and model.peft_config is not None:
+                combined_dict = {"peft_config": model.peft_config, **combined_dict}
+
+            self._trackio.init(
+                project=os.getenv("TRACKIO_PROJECT", "huggingface"),
+                name=args.run_name,
+                space_id=os.getenv("TRACKIO_SPACE_ID"),
+                resume="allow",
+            )
+
+            # Add config parameters (run may have been created manually)
+            self._trackio.config.update(combined_dict, allow_val_change=True)
+
+            # Add number of model parameters to trackio config
+            try:
+                self._trackio.config["model/num_parameters"] = model.num_parameters()
+            except AttributeError:
+                logger.info("Could not log the number of model parameters in Trackio due to an AttributeError.")
+        # Marked on every process (not only world-zero) so that `setup` is not re-entered on later events.
+        self._initialized = True
+
+    def on_train_begin(self, args, state, control, model=None, **kwargs):
+        """Run `setup` at the start of training if it has not run yet."""
+        if not self._initialized:
+            self.setup(args, state, model, **kwargs)
+
+    def on_train_end(self, args: TrainingArguments, state, control, model=None, processing_class=None, **kwargs):
+        """Finish the Trackio run on the world-zero process once training ends."""
+        if state.is_world_process_zero and self._initialized:
+            self._trackio.finish()
+
+    def on_log(self, args, state, control, model=None, logs=None, **kwargs):
+        """
+        Log `logs` (minus the end-of-training summary scalars), per-GPU memory statistics when available, and the
+        current global step to Trackio from the world-zero process.
+        """
+        # Summary values that are excluded from the logged payload.
+        single_value_scalars = [
+            "train_runtime",
+            "train_samples_per_second",
+            "train_steps_per_second",
+            "train_loss",
+            "total_flos",
+        ]
+
+        # Must exist even when GPU statistics cannot be collected (no torch, no CUDA, or no initialized process
+        # group); it is merged unconditionally into the logged payload below.
+        gpu_memory_logs = {}
+        if is_torch_available() and torch.cuda.is_available() and dist.is_available() and dist.is_initialized():
+            device_idx = torch.cuda.current_device()
+            total_memory = torch.cuda.get_device_properties(device_idx).total_memory
+            memory_allocated = torch.cuda.memory_allocated(device_idx)
+
+            local_gpu_logs = {
+                f"gpu/{device_idx}/allocated_memory": memory_allocated / (1024**3),  # GB
+                f"gpu/{device_idx}/memory_usage": memory_allocated / total_memory,  # ratio
+            }
+
+            # Collective call: it is issued before the world-zero check so that every process takes part.
+            gathered_logs = [None] * dist.get_world_size()
+            dist.all_gather_object(gathered_logs, local_gpu_logs)
+            # NOTE: keys only carry the local device index, so processes reporting the same index overwrite
+            # each other here (the last gathered entry wins).
+            gpu_memory_logs = {k: v for d in gathered_logs for k, v in d.items()}
+
+        if not self._initialized:
+            self.setup(args, state, model)
+        if state.is_world_process_zero:
+            # `logs` defaults to `None` in the signature; treat that as "nothing to log" instead of failing.
+            non_scalar_logs = {k: v for k, v in (logs or {}).items() if k not in single_value_scalars}
+            non_scalar_logs = rewrite_logs(non_scalar_logs)
+            self._trackio.log({**non_scalar_logs, **gpu_memory_logs, "train/global_step": state.global_step})
+
+    def on_save(self, args, state, control, **kwargs):
+        """Nothing is reported to Trackio on save."""
+        return
+
+    def on_predict(self, args, state, control, metrics, **kwargs):
+        """Log prediction `metrics` to Trackio from the world-zero process."""
+        # Defensive guard: `__init__` either sets `_trackio` or raises.
+        if self._trackio is None:
+            return
+        if not self._initialized:
+            # `setup` takes `model` positionally; fall back to `None` when the caller did not pass one.
+            self.setup(args, state, kwargs.pop("model", None), **kwargs)
+        if state.is_world_process_zero:
+            metrics = rewrite_logs(metrics)
+            self._trackio.log(metrics)
+
+
 class CometCallback(TrainerCallback):
    """
    A [`TrainerCallback`] that sends the logs to [Comet ML](https://www.comet.com/site/).
@@ -2329,6 +2445,7 @@ INTEGRATION_TO_CALLBACK = {
    "mlflow": MLflowCallback,
    "neptune": NeptuneCallback,
    "tensorboard": TensorBoardCallback,
+    "trackio": TrackioCallback,
    "wandb": WandbCallback,
    "codecarbon": CodeCarbonCallback,
    "clearml": ClearMLCallback,
--- a/src/transformers/testing_utils.py
+++ b/src/transformers/testing_utils.py
@@ -59,6 +59,7 @@ from .integrations import (
    is_sigopt_available,
    is_swanlab_available,
    is_tensorboard_available,
+    is_trackio_available,
    is_wandb_available,
 )
 from .integrations.deepspeed import is_deepspeed_available
@@ -1274,6 +1275,16 @@ def require_swanlab(test_case):
    return unittest.skipUnless(is_swanlab_available(), "test requires swanlab")(test_case)


+def require_trackio(test_case):
+    """
+    Decorator marking a test that requires trackio.
+
+    The decorated test is skipped when trackio isn't installed.
+
+    """
+    skip_without_trackio = unittest.skipUnless(is_trackio_available(), "test requires trackio")
+    return skip_without_trackio(test_case)
+
+
 def require_wandb(test_case):
    """
    Decorator marking a test that requires wandb.
--- a/src/transformers/training_args.py
+++ b/src/transformers/training_args.py
@@ -438,9 +438,9 @@ class TrainingArguments:
            use the corresponding output (usually index 2) as the past state and feed it to the model at the next
            training step under the keyword argument `mems`.
        run_name (`str`, *optional*, defaults to `output_dir`):
-            A descriptor for the run. Typically used for [wandb](https://www.wandb.com/),
-            [mlflow](https://www.mlflow.org/), [comet](https://www.comet.com/site) and [swanlab](https://swanlab.cn)
-            logging. If not specified, will be the same as `output_dir`.
+            A descriptor for the run. Typically used for [trackio](https://github.com/gradio-app/trackio),
+            [wandb](https://www.wandb.com/), [mlflow](https://www.mlflow.org/), [comet](https://www.comet.com/site) and
+            [swanlab](https://swanlab.cn) logging. If not specified, will be the same as `output_dir`.
        disable_tqdm (`bool`, *optional*):
            Whether or not to disable the tqdm progress bars and table of metrics produced by
            [`~notebook.NotebookTrainingTracker`] in Jupyter Notebooks. Will default to `True` if the logging level is
@@ -626,8 +626,8 @@ class TrainingArguments:
        report_to (`str` or `list[str]`, *optional*, defaults to `"all"`):
            The list of integrations to report the results and logs to. Supported platforms are `"azure_ml"`,
            `"clearml"`, `"codecarbon"`, `"comet_ml"`, `"dagshub"`, `"dvclive"`, `"flyte"`, `"mlflow"`, `"neptune"`,
-            `"swanlab"`, `"tensorboard"`, and `"wandb"`. Use `"all"` to report to all integrations installed, `"none"`
-            for no integrations.
+            `"swanlab"`, `"tensorboard"`, `"trackio"` and `"wandb"`. Use `"all"` to report to all integrations
+            installed, `"none"` for no integrations.
        ddp_find_unused_parameters (`bool`, *optional*):
            When using distributed training, the value of the flag `find_unused_parameters` passed to
            `DistributedDataParallel`. Will default to `False` if gradient checkpointing is used, `True` otherwise.
@@ -1182,7 +1182,10 @@ class TrainingArguments:
    run_name: Optional[str] = field(
        default=None,
        metadata={
-            "help": "An optional descriptor for the run. Notably used for wandb, mlflow comet and swanlab logging."
+            "help": (
+                "An optional descriptor for the run. Notably used for trackio, wandb, mlflow, comet and swanlab "
+                "logging."
+            )
        },
    )
    disable_tqdm: Optional[bool] = field(
@@ -2838,8 +2841,8 @@ class TrainingArguments:
            report_to (`str` or `list[str]`, *optional*, defaults to `"all"`):
                The list of integrations to report the results and logs to. Supported platforms are `"azure_ml"`,
                `"clearml"`, `"codecarbon"`, `"comet_ml"`, `"dagshub"`, `"dvclive"`, `"flyte"`, `"mlflow"`,
-                `"neptune"`, `"swanlab"`, `"tensorboard"`, and `"wandb"`. Use `"all"` to report to all integrations
-                installed, `"none"` for no integrations.
+                `"neptune"`, `"swanlab"`, `"tensorboard"`, `"trackio"` and `"wandb"`. Use `"all"` to report to all
+                integrations installed, `"none"` for no integrations.
            first_step (`bool`, *optional*, defaults to `False`):
                Whether to log and evaluate the first `global_step` or not.
            nan_inf_filter (`bool`, *optional*, defaults to `True`):
--- a/src/transformers/training_args_tf.py
+++ b/src/transformers/training_args_tf.py
@@ -160,7 +160,7 @@ class TFTrainingArguments(TrainingArguments):
            Google Cloud Project name for the Cloud TPU-enabled project. If not specified, we will attempt to
            automatically detect from metadata.
        run_name (`str`, *optional*):
-            A descriptor for the run. Notably used for wandb, mlflow, comet and swanlab logging.
+            A descriptor for the run. Notably used for trackio, wandb, mlflow, comet and swanlab logging.
        xla (`bool`, *optional*):
            Whether to activate the XLA compilation or not.
    """