From 6e9972962fbc80d218234bfbd8c9b2843ef02b2b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?=
 <45557362+qgallouedec@users.noreply.github.com>
Date: Tue, 22 Jul 2025 14:50:20 -0700
Subject: [PATCH] =?UTF-8?q?=F0=9F=8E=AF=20Trackio=20integration=20(#38814)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* First attempt

* fix

* fix

* Enhance TrackioCallback to log GPU memory usage and allocation

* Enhance Trackio integration in callbacks and training arguments documentation

* reorder

* remove unused lines

* fix torch optional
---
 docs/source/en/main_classes/callback.md       |   4 +
 src/transformers/__init__.py                  |   2 +
 src/transformers/integrations/__init__.py     |   4 +
 .../integrations/integration_utils.py         | 117 ++++++++++++++++++
 src/transformers/testing_utils.py             |  11 ++
 src/transformers/training_args.py             |  19 +--
 src/transformers/training_args_tf.py          |   2 +-
 7 files changed, 150 insertions(+), 9 deletions(-)

diff --git a/docs/source/en/main_classes/callback.md b/docs/source/en/main_classes/callback.md
index 99f76b7b05..0a7c73c667 100644
--- a/docs/source/en/main_classes/callback.md
+++ b/docs/source/en/main_classes/callback.md
@@ -33,6 +33,7 @@ By default, `TrainingArguments.report_to` is set to `"all"`, so a [`Trainer`] wi
   it's the second one).
 - [`~integrations.TensorBoardCallback`] if tensorboard is accessible (either through PyTorch >= 1.4
   or tensorboardX).
+- [`~integrations.TrackioCallback`] if [trackio](https://github.com/gradio-app/trackio) is installed.
 - [`~integrations.WandbCallback`] if [wandb](https://www.wandb.com/) is installed.
 - [`~integrations.CometCallback`] if [comet_ml](https://www.comet.com/site/) is installed.
 - [`~integrations.MLflowCallback`] if [mlflow](https://www.mlflow.org/) is installed.
@@ -72,6 +73,9 @@ Here is the list of the available [`TrainerCallback`] in the library:
 
 [[autodoc]] integrations.TensorBoardCallback
 
+[[autodoc]] integrations.TrackioCallback
+    - setup
+
 [[autodoc]] integrations.WandbCallback
     - setup
 
diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index 84892590b1..f6f6fd6f6e 100644
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -127,6 +127,7 @@ _import_structure = {
         "is_sigopt_available",
         "is_swanlab_available",
         "is_tensorboard_available",
+        "is_trackio_available",
         "is_wandb_available",
     ],
     "loss": [],
@@ -759,6 +760,7 @@ if TYPE_CHECKING:
         is_sigopt_available,
         is_swanlab_available,
         is_tensorboard_available,
+        is_trackio_available,
         is_wandb_available,
     )
     from .integrations.executorch import (
diff --git a/src/transformers/integrations/__init__.py b/src/transformers/integrations/__init__.py
index 1b87a554d3..0c4d169380 100755
--- a/src/transformers/integrations/__init__.py
+++ b/src/transformers/integrations/__init__.py
@@ -90,6 +90,7 @@ _import_structure = {
         "NeptuneMissingConfiguration",
         "SwanLabCallback",
         "TensorBoardCallback",
+        "TrackioCallback",
         "WandbCallback",
         "get_available_reporting_integrations",
         "get_reporting_integration_callbacks",
@@ -110,6 +111,7 @@ _import_structure = {
         "is_sigopt_available",
         "is_swanlab_available",
         "is_tensorboard_available",
+        "is_trackio_available",
         "is_wandb_available",
         "rewrite_logs",
         "run_hp_search_optuna",
@@ -224,6 +226,7 @@ if TYPE_CHECKING:
         NeptuneMissingConfiguration,
         SwanLabCallback,
         TensorBoardCallback,
+        TrackioCallback,
         WandbCallback,
         get_available_reporting_integrations,
         get_reporting_integration_callbacks,
@@ -244,6 +247,7 @@ if TYPE_CHECKING:
         is_sigopt_available,
         is_swanlab_available,
         is_tensorboard_available,
+        is_trackio_available,
         is_wandb_available,
         rewrite_logs,
         run_hp_search_optuna,
diff --git a/src/transformers/integrations/integration_utils.py b/src/transformers/integrations/integration_utils.py
index 2b7fd9e756..8a621929c7 100755
--- a/src/transformers/integrations/integration_utils.py
+++ b/src/transformers/integrations/integration_utils.py
@@ -56,6 +56,7 @@ logger = logging.get_logger(__name__)
 
 if is_torch_available():
     import torch
+    import torch.distributed as dist
 
 # comet_ml requires to be imported before any ML frameworks
 _MIN_COMET_VERSION = "3.43.2"
@@ -111,6 +112,10 @@ def is_wandb_available():
     return importlib.util.find_spec("wandb") is not None
 
 
+def is_trackio_available():
+    return importlib.util.find_spec("trackio") is not None  # spec lookup only; trackio is not imported here
+
+
 def is_clearml_available():
     return importlib.util.find_spec("clearml") is not None
 
@@ -630,6 +635,8 @@ def get_available_reporting_integrations():
         integrations.append("clearml")
     if is_swanlab_available():
         integrations.append("swanlab")
+    if is_trackio_available():
+        integrations.append("trackio")
     return integrations
 
 
@@ -1033,6 +1040,115 @@ class WandbCallback(TrainerCallback):
             self._wandb.log(metrics)
 
 
+class TrackioCallback(TrainerCallback):
+    """
+    A [`TrainerCallback`] that logs metrics to Trackio.
+    """
+
+    def __init__(self):
+        if not is_trackio_available():
+            raise RuntimeError("TrackioCallback requires trackio to be installed. Run `pip install trackio`.")
+        import trackio
+
+        self._trackio = trackio
+        self._initialized = False
+
+    def setup(self, args, state, model, **kwargs):
+        """
+        Setup the optional Trackio integration.
+
+        To customize the setup you can also override the following environment variables:
+
+        Environment:
+        - **TRACKIO_PROJECT** (`str`, *optional*, defaults to `"huggingface"`):
+            The name of the project (can be an existing project to continue tracking or a new project to start tracking
+            from scratch).
+        - **TRACKIO_SPACE_ID** (`str`, *optional*, defaults to `None`):
+            If set, the project will be logged to a Hugging Face Space instead of a local directory. Should be a
+            complete Space name like `"username/reponame"` or `"orgname/reponame"`, or just `"reponame"` in which case
+            the Space will be created in the currently-logged-in Hugging Face user's namespace. If the Space does not
+            exist, it will be created. If the Space already exists, the project will be logged to it.
+        """
+        if state.is_world_process_zero:
+            combined_dict = {**args.to_dict()}
+
+            if hasattr(model, "config") and model.config is not None:
+                model_config = model.config if isinstance(model.config, dict) else model.config.to_dict()
+                combined_dict = {**model_config, **combined_dict}
+            if hasattr(model, "peft_config") and model.peft_config is not None:
+                combined_dict = {"peft_config": model.peft_config, **combined_dict}
+
+            self._trackio.init(
+                project=os.getenv("TRACKIO_PROJECT", "huggingface"),
+                name=args.run_name,
+                space_id=os.getenv("TRACKIO_SPACE_ID", None),
+                resume="allow",
+            )
+
+            # Add config parameters (run may have been created manually)
+            self._trackio.config.update(combined_dict, allow_val_change=True)
+
+            # Add number of model parameters to trackio config
+            try:
+                self._trackio.config["model/num_parameters"] = model.num_parameters()
+            except AttributeError:
+                logger.info("Could not log the number of model parameters in Trackio due to an AttributeError.")
+        self._initialized = True
+
+    def on_train_begin(self, args, state, control, model=None, **kwargs):
+        """Initialize the Trackio run, unless `setup` already ran."""
+        if not self._initialized:
+            self.setup(args, state, model, **kwargs)
+
+    def on_train_end(self, args: TrainingArguments, state, control, model=None, processing_class=None, **kwargs):
+        if state.is_world_process_zero and self._initialized:
+            self._trackio.finish()
+
+    def on_log(self, args, state, control, model=None, logs=None, **kwargs):
+        """Log `logs` without the `single_value_scalars` keys, plus GPU memory stats and the global step."""
+        single_value_scalars = [
+            "train_runtime",
+            "train_samples_per_second",
+            "train_steps_per_second",
+            "train_loss",
+            "total_flos",
+        ]
+
+        # Must be bound on every path: it is unpacked below even when CUDA is unavailable.
+        gpu_memory_logs = {}
+        if is_torch_available() and torch.cuda.is_available():
+            device_idx = torch.cuda.current_device()
+            total_memory = torch.cuda.get_device_properties(device_idx).total_memory
+            memory_allocated = torch.cuda.memory_allocated(device_idx)
+            gpu_memory_logs = {
+                f"gpu/{device_idx}/allocated_memory": memory_allocated / (1024**3),  # GB
+                f"gpu/{device_idx}/memory_usage": memory_allocated / total_memory,  # ratio
+            }
+            # Only gather when a process group exists; single-process runs keep their local stats.
+            if dist.is_available() and dist.is_initialized():
+                gathered_logs = [None] * dist.get_world_size()
+                dist.all_gather_object(gathered_logs, gpu_memory_logs)
+                gpu_memory_logs = {k: v for d in gathered_logs for k, v in d.items()}
+
+        if not self._initialized:
+            self.setup(args, state, model)
+        if state.is_world_process_zero:
+            non_scalar_logs = {k: v for k, v in (logs or {}).items() if k not in single_value_scalars}
+            non_scalar_logs = rewrite_logs(non_scalar_logs)
+            self._trackio.log({**non_scalar_logs, **gpu_memory_logs, "train/global_step": state.global_step})
+
+    def on_save(self, args, state, control, **kwargs):
+        """Nothing is uploaded to Trackio when a checkpoint is saved."""
+
+    def on_predict(self, args, state, control, metrics, **kwargs):
+        """Log prediction `metrics` from the main process."""
+        if not self._initialized:
+            self.setup(args, state, kwargs.pop("model", None), **kwargs)
+        if state.is_world_process_zero:
+            metrics = rewrite_logs(metrics)
+            self._trackio.log(metrics)
+
+
 class CometCallback(TrainerCallback):
     """
     A [`TrainerCallback`] that sends the logs to [Comet ML](https://www.comet.com/site/).
@@ -2329,6 +2445,7 @@ INTEGRATION_TO_CALLBACK = {
     "mlflow": MLflowCallback,
     "neptune": NeptuneCallback,
     "tensorboard": TensorBoardCallback,
+    "trackio": TrackioCallback,
     "wandb": WandbCallback,
     "codecarbon": CodeCarbonCallback,
     "clearml": ClearMLCallback,
diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py
index 0e117d71f7..d6b425cca6 100644
--- a/src/transformers/testing_utils.py
+++ b/src/transformers/testing_utils.py
@@ -59,6 +59,7 @@ from .integrations import (
     is_sigopt_available,
     is_swanlab_available,
     is_tensorboard_available,
+    is_trackio_available,
     is_wandb_available,
 )
 from .integrations.deepspeed import is_deepspeed_available
@@ -1274,6 +1275,16 @@ def require_swanlab(test_case):
     return unittest.skipUnless(is_swanlab_available(), "test requires swanlab")(test_case)
 
 
+def require_trackio(test_case):
+    """
+    Mark `test_case` as depending on the trackio package.
+
+    The test is skipped when trackio isn't installed.
+    """
+    skip_without_trackio = unittest.skipUnless(is_trackio_available(), "test requires trackio")
+    return skip_without_trackio(test_case)
+
+
 def require_wandb(test_case):
     """
     Decorator marking a test that requires wandb.
diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py
index 16056a5dde..cf5ece295e 100644
--- a/src/transformers/training_args.py
+++ b/src/transformers/training_args.py
@@ -438,9 +438,9 @@ class TrainingArguments:
             use the corresponding output (usually index 2) as the past state and feed it to the model at the next
             training step under the keyword argument `mems`.
         run_name (`str`, *optional*, defaults to `output_dir`):
-            A descriptor for the run. Typically used for [wandb](https://www.wandb.com/),
-            [mlflow](https://www.mlflow.org/), [comet](https://www.comet.com/site) and [swanlab](https://swanlab.cn)
-            logging. If not specified, will be the same as `output_dir`.
+            A descriptor for the run. Typically used for [trackio](https://github.com/gradio-app/trackio),
+            [wandb](https://www.wandb.com/), [mlflow](https://www.mlflow.org/), [comet](https://www.comet.com/site) and
+            [swanlab](https://swanlab.cn) logging. If not specified, will be the same as `output_dir`.
         disable_tqdm (`bool`, *optional*):
             Whether or not to disable the tqdm progress bars and table of metrics produced by
             [`~notebook.NotebookTrainingTracker`] in Jupyter Notebooks. Will default to `True` if the logging level is
@@ -626,8 +626,8 @@ class TrainingArguments:
         report_to (`str` or `list[str]`, *optional*, defaults to `"all"`):
             The list of integrations to report the results and logs to. Supported platforms are `"azure_ml"`,
             `"clearml"`, `"codecarbon"`, `"comet_ml"`, `"dagshub"`, `"dvclive"`, `"flyte"`, `"mlflow"`, `"neptune"`,
-            `"swanlab"`, `"tensorboard"`, and `"wandb"`. Use `"all"` to report to all integrations installed, `"none"`
-            for no integrations.
+            `"swanlab"`, `"tensorboard"`, `"trackio"` and `"wandb"`. Use `"all"` to report to all integrations
+            installed, `"none"` for no integrations.
         ddp_find_unused_parameters (`bool`, *optional*):
             When using distributed training, the value of the flag `find_unused_parameters` passed to
             `DistributedDataParallel`. Will default to `False` if gradient checkpointing is used, `True` otherwise.
@@ -1182,7 +1182,10 @@ class TrainingArguments:
     run_name: Optional[str] = field(
         default=None,
         metadata={
-            "help": "An optional descriptor for the run. Notably used for wandb, mlflow comet and swanlab logging."
+            "help": (
+                "An optional descriptor for the run. Notably used for trackio, wandb, mlflow comet and swanlab "
+                "logging."
+            )
         },
     )
     disable_tqdm: Optional[bool] = field(
@@ -2838,8 +2841,8 @@ class TrainingArguments:
             report_to (`str` or `list[str]`, *optional*, defaults to `"all"`):
                 The list of integrations to report the results and logs to. Supported platforms are `"azure_ml"`,
                 `"clearml"`, `"codecarbon"`, `"comet_ml"`, `"dagshub"`, `"dvclive"`, `"flyte"`, `"mlflow"`,
-                `"neptune"`, `"swanlab"`, `"tensorboard"`, and `"wandb"`. Use `"all"` to report to all integrations
-                installed, `"none"` for no integrations.
+                `"neptune"`, `"swanlab"`, `"tensorboard"`, `"trackio"` and `"wandb"`. Use `"all"` to report to all
+                integrations installed, `"none"` for no integrations.
             first_step (`bool`, *optional*, defaults to `False`):
                 Whether to log and evaluate the first `global_step` or not.
             nan_inf_filter (`bool`, *optional*, defaults to `True`):
diff --git a/src/transformers/training_args_tf.py b/src/transformers/training_args_tf.py
index 6bbd4b89a7..cf20503d63 100644
--- a/src/transformers/training_args_tf.py
+++ b/src/transformers/training_args_tf.py
@@ -160,7 +160,7 @@ class TFTrainingArguments(TrainingArguments):
             Google Cloud Project name for the Cloud TPU-enabled project. If not specified, we will attempt to
             automatically detect from metadata.
         run_name (`str`, *optional*):
-            A descriptor for the run. Notably used for wandb, mlflow, comet and swanlab logging.
+            A descriptor for the run. Notably used for trackio, wandb, mlflow, comet and swanlab logging.
         xla (`bool`, *optional*):
             Whether to activate the XLA compilation or not.
     """