🎯 Trackio integration (#38814)

* First attempt

* fix

* fix

* Enhance TrackioCallback to log GPU memory usage and allocation

* Enhance Trackio integration in callbacks and training arguments documentation

* re order

* remove unused lines

* fix torch optional
This commit is contained in:
Quentin Gallouédec
2025-07-22 14:50:20 -07:00
committed by GitHub
parent c6d0500d15
commit 6e9972962f
7 changed files with 150 additions and 9 deletions

View File

@@ -33,6 +33,7 @@ By default, `TrainingArguments.report_to` is set to `"all"`, so a [`Trainer`] wi
it's the second one).
- [`~integrations.TensorBoardCallback`] if tensorboard is accessible (either through PyTorch >= 1.4
or tensorboardX).
- [`~integrations.TrackioCallback`] if [trackio](https://github.com/gradio-app/trackio) is installed.
- [`~integrations.WandbCallback`] if [wandb](https://www.wandb.com/) is installed.
- [`~integrations.CometCallback`] if [comet_ml](https://www.comet.com/site/) is installed.
- [`~integrations.MLflowCallback`] if [mlflow](https://www.mlflow.org/) is installed.
@@ -72,6 +73,9 @@ Here is the list of the available [`TrainerCallback`] in the library:
[[autodoc]] integrations.TensorBoardCallback
[[autodoc]] integrations.TrackioCallback
- setup
[[autodoc]] integrations.WandbCallback
- setup

View File

@@ -127,6 +127,7 @@ _import_structure = {
"is_sigopt_available",
"is_swanlab_available",
"is_tensorboard_available",
"is_trackio_available",
"is_wandb_available",
],
"loss": [],
@@ -759,6 +760,7 @@ if TYPE_CHECKING:
is_sigopt_available,
is_swanlab_available,
is_tensorboard_available,
is_trackio_available,
is_wandb_available,
)
from .integrations.executorch import (

View File

@@ -90,6 +90,7 @@ _import_structure = {
"NeptuneMissingConfiguration",
"SwanLabCallback",
"TensorBoardCallback",
"TrackioCallback",
"WandbCallback",
"get_available_reporting_integrations",
"get_reporting_integration_callbacks",
@@ -110,6 +111,7 @@ _import_structure = {
"is_sigopt_available",
"is_swanlab_available",
"is_tensorboard_available",
"is_trackio_available",
"is_wandb_available",
"rewrite_logs",
"run_hp_search_optuna",
@@ -224,6 +226,7 @@ if TYPE_CHECKING:
NeptuneMissingConfiguration,
SwanLabCallback,
TensorBoardCallback,
TrackioCallback,
WandbCallback,
get_available_reporting_integrations,
get_reporting_integration_callbacks,
@@ -244,6 +247,7 @@ if TYPE_CHECKING:
is_sigopt_available,
is_swanlab_available,
is_tensorboard_available,
is_trackio_available,
is_wandb_available,
rewrite_logs,
run_hp_search_optuna,

View File

@@ -56,6 +56,7 @@ logger = logging.get_logger(__name__)
if is_torch_available():
import torch
import torch.distributed as dist
# comet_ml requires to be imported before any ML frameworks
_MIN_COMET_VERSION = "3.43.2"
@@ -111,6 +112,10 @@ def is_wandb_available():
return importlib.util.find_spec("wandb") is not None
def is_trackio_available():
    """
    Report whether the `trackio` package can be located by the import system.

    Only the module spec is looked up; `trackio` itself is not imported here.

    Returns:
        `bool`: `True` if a spec for `trackio` is found, `False` otherwise.
    """
    trackio_spec = importlib.util.find_spec("trackio")
    return trackio_spec is not None
def is_clearml_available():
return importlib.util.find_spec("clearml") is not None
@@ -630,6 +635,8 @@ def get_available_reporting_integrations():
integrations.append("clearml")
if is_swanlab_available():
integrations.append("swanlab")
if is_trackio_available():
integrations.append("trackio")
return integrations
@@ -1033,6 +1040,115 @@ class WandbCallback(TrainerCallback):
self._wandb.log(metrics)
class TrackioCallback(TrainerCallback):
    """
    A [`TrainerCallback`] that logs metrics to Trackio.

    The run is created lazily by [`~TrackioCallback.setup`] the first time a training, logging or prediction event
    fires. Run creation, logging and finishing are only performed when `state.is_world_process_zero` is true.
    """

    def __init__(self):
        if not is_trackio_available():
            raise RuntimeError("TrackioCallback requires trackio to be installed. Run `pip install trackio`.")
        # Imported lazily so that the module can be loaded without trackio installed.
        import trackio

        self._trackio = trackio
        self._initialized = False

    def setup(self, args, state, model=None, **kwargs):
        """
        Setup the optional Trackio integration.

        To customize the setup you can also override the following environment variables:

        Environment:
        - **TRACKIO_PROJECT** (`str`, *optional*, defaults to `"huggingface"`):
            The name of the project (can be an existing project to continue tracking or a new project to start tracking
            from scratch).
        - **TRACKIO_SPACE_ID** (`str`, *optional*, defaults to `None`):
            If set, the project will be logged to a Hugging Face Space instead of a local directory. Should be a
            complete Space name like `"username/reponame"` or `"orgname/reponame"`, or just `"reponame"` in which case
            the Space will be created in the currently-logged-in Hugging Face user's namespace. If the Space does not
            exist, it will be created. If the Space already exists, the project will be logged to it.

        Args:
            args: The training arguments; `args.to_dict()` is stored in the run config and `args.run_name` is used
                as the run name.
            state: The trainer state; the run is only created when `state.is_world_process_zero` is true.
            model (*optional*): The model being trained. Its `config` and `peft_config` attributes, when present,
                are added to the run config, as is its parameter count when `num_parameters()` is available.
        """
        if state.is_world_process_zero:
            combined_dict = {**args.to_dict()}

            # Training arguments take precedence over model config entries with the same key.
            if hasattr(model, "config") and model.config is not None:
                model_config = model.config if isinstance(model.config, dict) else model.config.to_dict()
                combined_dict = {**model_config, **combined_dict}
            if hasattr(model, "peft_config") and model.peft_config is not None:
                combined_dict = {"peft_config": model.peft_config, **combined_dict}

            self._trackio.init(
                project=os.getenv("TRACKIO_PROJECT", "huggingface"),
                name=args.run_name,
                space_id=os.getenv("TRACKIO_SPACE_ID", None),
                resume="allow",
            )

            # Add config parameters (run may have been created manually)
            self._trackio.config.update(combined_dict, allow_val_change=True)

            # Add number of model parameters to trackio config
            try:
                self._trackio.config["model/num_parameters"] = model.num_parameters()
            except AttributeError:
                logger.info("Could not log the number of model parameters in Trackio due to an AttributeError.")
        self._initialized = True

    def on_train_begin(self, args, state, control, model=None, **kwargs):
        """Create the Trackio run at the start of training if it has not been set up yet."""
        if not self._initialized:
            self.setup(args, state, model, **kwargs)

    def on_train_end(self, args: TrainingArguments, state, control, model=None, processing_class=None, **kwargs):
        """Finish the Trackio run on the main process once training ends."""
        if state.is_world_process_zero and self._initialized:
            self._trackio.finish()

    def on_log(self, args, state, control, model=None, logs=None, **kwargs):
        """
        Log the trainer's metrics, plus GPU memory statistics when CUDA is available, to Trackio.

        End-of-training summary values (runtime, throughput, final loss, total FLOs) are filtered out of `logs`
        before logging. The current global step is logged under `"train/global_step"`.
        """
        single_value_scalars = {
            "train_runtime",
            "train_samples_per_second",
            "train_steps_per_second",
            "train_loss",
            "total_flos",
        }

        # Always bind `gpu_memory_logs` so the final `log` call works on CPU-only and non-distributed runs.
        gpu_memory_logs = {}
        if is_torch_available() and torch.cuda.is_available():
            device_idx = torch.cuda.current_device()
            total_memory = torch.cuda.get_device_properties(device_idx).total_memory
            memory_allocated = torch.cuda.memory_allocated(device_idx)

            gpu_memory_logs = {
                f"gpu/{device_idx}/allocated_memory": memory_allocated / (1024**3),  # GB
                f"gpu/{device_idx}/memory_usage": memory_allocated / total_memory,  # ratio
            }
            if dist.is_available() and dist.is_initialized():
                # Collective call: it is issued before the main-process check so that every rank takes part.
                gathered_logs = [None] * dist.get_world_size()
                dist.all_gather_object(gathered_logs, gpu_memory_logs)
                gpu_memory_logs = {k: v for d in gathered_logs for k, v in d.items()}

        if not self._initialized:
            self.setup(args, state, model)
        if state.is_world_process_zero:
            # `logs` defaults to `None` in the signature; treat that as "nothing to log" instead of failing.
            non_scalar_logs = {k: v for k, v in (logs or {}).items() if k not in single_value_scalars}
            non_scalar_logs = rewrite_logs(non_scalar_logs)
            self._trackio.log({**non_scalar_logs, **gpu_memory_logs, "train/global_step": state.global_step})

    def on_save(self, args, state, control, **kwargs):
        """Nothing is sent to Trackio when a checkpoint is saved."""
        return

    def on_predict(self, args, state, control, metrics, **kwargs):
        """Log prediction metrics to Trackio from the main process."""
        if self._trackio is None:
            return
        if not self._initialized:
            # `model`, when supplied by the caller, is forwarded through `kwargs`; `setup` tolerates its absence.
            self.setup(args, state, **kwargs)
        if state.is_world_process_zero:
            metrics = rewrite_logs(metrics)
            self._trackio.log(metrics)
class CometCallback(TrainerCallback):
"""
A [`TrainerCallback`] that sends the logs to [Comet ML](https://www.comet.com/site/).
@@ -2329,6 +2445,7 @@ INTEGRATION_TO_CALLBACK = {
"mlflow": MLflowCallback,
"neptune": NeptuneCallback,
"tensorboard": TensorBoardCallback,
"trackio": TrackioCallback,
"wandb": WandbCallback,
"codecarbon": CodeCarbonCallback,
"clearml": ClearMLCallback,

View File

@@ -59,6 +59,7 @@ from .integrations import (
is_sigopt_available,
is_swanlab_available,
is_tensorboard_available,
is_trackio_available,
is_wandb_available,
)
from .integrations.deepspeed import is_deepspeed_available
@@ -1274,6 +1275,16 @@ def require_swanlab(test_case):
return unittest.skipUnless(is_swanlab_available(), "test requires swanlab")(test_case)
def require_trackio(test_case):
    """
    Decorator for tests that depend on trackio.

    The decorated test is skipped, with the reason "test requires trackio", when trackio is not installed.
    """
    skip_without_trackio = unittest.skipUnless(is_trackio_available(), "test requires trackio")
    return skip_without_trackio(test_case)
def require_wandb(test_case):
"""
Decorator marking a test that requires wandb.

View File

@@ -438,9 +438,9 @@ class TrainingArguments:
use the corresponding output (usually index 2) as the past state and feed it to the model at the next
training step under the keyword argument `mems`.
run_name (`str`, *optional*, defaults to `output_dir`):
A descriptor for the run. Typically used for [wandb](https://www.wandb.com/),
[mlflow](https://www.mlflow.org/), [comet](https://www.comet.com/site) and [swanlab](https://swanlab.cn)
logging. If not specified, will be the same as `output_dir`.
A descriptor for the run. Typically used for [trackio](https://github.com/gradio-app/trackio),
[wandb](https://www.wandb.com/), [mlflow](https://www.mlflow.org/), [comet](https://www.comet.com/site) and
[swanlab](https://swanlab.cn) logging. If not specified, will be the same as `output_dir`.
disable_tqdm (`bool`, *optional*):
Whether or not to disable the tqdm progress bars and table of metrics produced by
[`~notebook.NotebookTrainingTracker`] in Jupyter Notebooks. Will default to `True` if the logging level is
@@ -626,8 +626,8 @@ class TrainingArguments:
report_to (`str` or `list[str]`, *optional*, defaults to `"all"`):
The list of integrations to report the results and logs to. Supported platforms are `"azure_ml"`,
`"clearml"`, `"codecarbon"`, `"comet_ml"`, `"dagshub"`, `"dvclive"`, `"flyte"`, `"mlflow"`, `"neptune"`,
`"swanlab"`, `"tensorboard"`, and `"wandb"`. Use `"all"` to report to all integrations installed, `"none"`
for no integrations.
`"swanlab"`, `"tensorboard"`, `"trackio"` and `"wandb"`. Use `"all"` to report to all integrations
installed, `"none"` for no integrations.
ddp_find_unused_parameters (`bool`, *optional*):
When using distributed training, the value of the flag `find_unused_parameters` passed to
`DistributedDataParallel`. Will default to `False` if gradient checkpointing is used, `True` otherwise.
@@ -1182,7 +1182,10 @@ class TrainingArguments:
run_name: Optional[str] = field(
default=None,
metadata={
"help": "An optional descriptor for the run. Notably used for wandb, mlflow comet and swanlab logging."
"help": (
"An optional descriptor for the run. Notably used for trackio, wandb, mlflow comet and swanlab "
"logging."
)
},
)
disable_tqdm: Optional[bool] = field(
@@ -2838,8 +2841,8 @@ class TrainingArguments:
report_to (`str` or `list[str]`, *optional*, defaults to `"all"`):
The list of integrations to report the results and logs to. Supported platforms are `"azure_ml"`,
`"clearml"`, `"codecarbon"`, `"comet_ml"`, `"dagshub"`, `"dvclive"`, `"flyte"`, `"mlflow"`,
`"neptune"`, `"swanlab"`, `"tensorboard"`, and `"wandb"`. Use `"all"` to report to all integrations
installed, `"none"` for no integrations.
`"neptune"`, `"swanlab"`, `"tensorboard"`, `"trackio"` and `"wandb"`. Use `"all"` to report to all
integrations installed, `"none"` for no integrations.
first_step (`bool`, *optional*, defaults to `False`):
Whether to log and evaluate the first `global_step` or not.
nan_inf_filter (`bool`, *optional*, defaults to `True`):

View File

@@ -160,7 +160,7 @@ class TFTrainingArguments(TrainingArguments):
Google Cloud Project name for the Cloud TPU-enabled project. If not specified, we will attempt to
automatically detect from metadata.
run_name (`str`, *optional*):
A descriptor for the run. Notably used for wandb, mlflow, comet and swanlab logging.
A descriptor for the run. Notably used for trackio, wandb, mlflow, comet and swanlab logging.
xla (`bool`, *optional*):
Whether to activate the XLA compilation or not.
"""