🎯 Trackio integration (#38814)

* First attempt

* fix

* fix

* Enhance TrackioCallback to log GPU memory usage and allocation

* Enhance Trackio integration in callbacks and training arguments documentation

* reorder

* remove unused lines

* fix torch optional
This commit is contained in:
Quentin Gallouédec
2025-07-22 14:50:20 -07:00
committed by GitHub
parent c6d0500d15
commit 6e9972962f
7 changed files with 150 additions and 9 deletions

View File

@@ -33,6 +33,7 @@ By default, `TrainingArguments.report_to` is set to `"all"`, so a [`Trainer`] wi
it's the second one). it's the second one).
- [`~integrations.TensorBoardCallback`] if tensorboard is accessible (either through PyTorch >= 1.4 - [`~integrations.TensorBoardCallback`] if tensorboard is accessible (either through PyTorch >= 1.4
or tensorboardX). or tensorboardX).
- [`~integrations.TrackioCallback`] if [trackio](https://github.com/gradio-app/trackio) is installed.
- [`~integrations.WandbCallback`] if [wandb](https://www.wandb.com/) is installed. - [`~integrations.WandbCallback`] if [wandb](https://www.wandb.com/) is installed.
- [`~integrations.CometCallback`] if [comet_ml](https://www.comet.com/site/) is installed. - [`~integrations.CometCallback`] if [comet_ml](https://www.comet.com/site/) is installed.
- [`~integrations.MLflowCallback`] if [mlflow](https://www.mlflow.org/) is installed. - [`~integrations.MLflowCallback`] if [mlflow](https://www.mlflow.org/) is installed.
@@ -72,6 +73,9 @@ Here is the list of the available [`TrainerCallback`] in the library:
[[autodoc]] integrations.TensorBoardCallback [[autodoc]] integrations.TensorBoardCallback
[[autodoc]] integrations.TrackioCallback
- setup
[[autodoc]] integrations.WandbCallback [[autodoc]] integrations.WandbCallback
- setup - setup

View File

@@ -127,6 +127,7 @@ _import_structure = {
"is_sigopt_available", "is_sigopt_available",
"is_swanlab_available", "is_swanlab_available",
"is_tensorboard_available", "is_tensorboard_available",
"is_trackio_available",
"is_wandb_available", "is_wandb_available",
], ],
"loss": [], "loss": [],
@@ -759,6 +760,7 @@ if TYPE_CHECKING:
is_sigopt_available, is_sigopt_available,
is_swanlab_available, is_swanlab_available,
is_tensorboard_available, is_tensorboard_available,
is_trackio_available,
is_wandb_available, is_wandb_available,
) )
from .integrations.executorch import ( from .integrations.executorch import (

View File

@@ -90,6 +90,7 @@ _import_structure = {
"NeptuneMissingConfiguration", "NeptuneMissingConfiguration",
"SwanLabCallback", "SwanLabCallback",
"TensorBoardCallback", "TensorBoardCallback",
"TrackioCallback",
"WandbCallback", "WandbCallback",
"get_available_reporting_integrations", "get_available_reporting_integrations",
"get_reporting_integration_callbacks", "get_reporting_integration_callbacks",
@@ -110,6 +111,7 @@ _import_structure = {
"is_sigopt_available", "is_sigopt_available",
"is_swanlab_available", "is_swanlab_available",
"is_tensorboard_available", "is_tensorboard_available",
"is_trackio_available",
"is_wandb_available", "is_wandb_available",
"rewrite_logs", "rewrite_logs",
"run_hp_search_optuna", "run_hp_search_optuna",
@@ -224,6 +226,7 @@ if TYPE_CHECKING:
NeptuneMissingConfiguration, NeptuneMissingConfiguration,
SwanLabCallback, SwanLabCallback,
TensorBoardCallback, TensorBoardCallback,
TrackioCallback,
WandbCallback, WandbCallback,
get_available_reporting_integrations, get_available_reporting_integrations,
get_reporting_integration_callbacks, get_reporting_integration_callbacks,
@@ -244,6 +247,7 @@ if TYPE_CHECKING:
is_sigopt_available, is_sigopt_available,
is_swanlab_available, is_swanlab_available,
is_tensorboard_available, is_tensorboard_available,
is_trackio_available,
is_wandb_available, is_wandb_available,
rewrite_logs, rewrite_logs,
run_hp_search_optuna, run_hp_search_optuna,

View File

@@ -56,6 +56,7 @@ logger = logging.get_logger(__name__)
if is_torch_available(): if is_torch_available():
import torch import torch
import torch.distributed as dist
# comet_ml requires to be imported before any ML frameworks # comet_ml requires to be imported before any ML frameworks
_MIN_COMET_VERSION = "3.43.2" _MIN_COMET_VERSION = "3.43.2"
@@ -111,6 +112,10 @@ def is_wandb_available():
return importlib.util.find_spec("wandb") is not None return importlib.util.find_spec("wandb") is not None
def is_trackio_available():
    """Return `True` when an import spec for the `trackio` package can be found, `False` otherwise."""
    trackio_spec = importlib.util.find_spec("trackio")
    return trackio_spec is not None
def is_clearml_available(): def is_clearml_available():
return importlib.util.find_spec("clearml") is not None return importlib.util.find_spec("clearml") is not None
@@ -630,6 +635,8 @@ def get_available_reporting_integrations():
integrations.append("clearml") integrations.append("clearml")
if is_swanlab_available(): if is_swanlab_available():
integrations.append("swanlab") integrations.append("swanlab")
if is_trackio_available():
integrations.append("trackio")
return integrations return integrations
@@ -1033,6 +1040,115 @@ class WandbCallback(TrainerCallback):
self._wandb.log(metrics) self._wandb.log(metrics)
class TrackioCallback(TrainerCallback):
    """
    A [`TrainerCallback`] that logs metrics to Trackio.
    """

    def __init__(self):
        # Fail fast with an actionable message; the import below is only reached when trackio is importable.
        if not is_trackio_available():
            raise RuntimeError("TrackioCallback requires trackio to be installed. Run `pip install trackio`.")
        import trackio

        self._trackio = trackio
        self._initialized = False

    def setup(self, args, state, model, **kwargs):
        """
        Setup the optional Trackio integration.

        To customize the setup you can also override the following environment variables:

        Environment:
        - **TRACKIO_PROJECT** (`str`, *optional*, defaults to `"huggingface"`):
            The name of the project (can be an existing project to continue tracking or a new project to start tracking
            from scratch).
        - **TRACKIO_SPACE_ID** (`str`, *optional*, defaults to `None`):
            If set, the project will be logged to a Hugging Face Space instead of a local directory. Should be a
            complete Space name like `"username/reponame"` or `"orgname/reponame"`, or just `"reponame"` in which case
            the Space will be created in the currently-logged-in Hugging Face user's namespace. If the Space does not
            exist, it will be created. If the Space already exists, the project will be logged to it.
        """
        if state.is_world_process_zero:
            combined_dict = {**args.to_dict()}

            # Training arguments take precedence over model/PEFT config entries that share a key.
            if hasattr(model, "config") and model.config is not None:
                model_config = model.config if isinstance(model.config, dict) else model.config.to_dict()
                combined_dict = {**model_config, **combined_dict}
            if hasattr(model, "peft_config") and model.peft_config is not None:
                peft_config = model.peft_config
                combined_dict = {**{"peft_config": peft_config}, **combined_dict}

            self._trackio.init(
                project=os.getenv("TRACKIO_PROJECT", "huggingface"),
                name=args.run_name,
                space_id=os.getenv("TRACKIO_SPACE_ID", None),
                resume="allow",
            )
            # Add config parameters (run may have been created manually)
            self._trackio.config.update(combined_dict, allow_val_change=True)

            # Add number of model parameters to trackio config
            try:
                self._trackio.config["model/num_parameters"] = model.num_parameters()
            except AttributeError:
                logger.info("Could not log the number of model parameters in Trackio due to an AttributeError.")

        # Set on every rank so that non-main processes do not re-enter `setup` on each event.
        self._initialized = True

    def on_train_begin(self, args, state, control, model=None, **kwargs):
        """Run `setup` once at the start of training if it has not been run yet."""
        if not self._initialized:
            self.setup(args, state, model, **kwargs)

    def on_train_end(self, args: TrainingArguments, state, control, model=None, processing_class=None, **kwargs):
        """Call `trackio.finish()` on the main process once training ends."""
        if state.is_world_process_zero and self._initialized:
            self._trackio.finish()

    def on_log(self, args, state, control, model=None, logs=None, **kwargs):
        """
        Log `logs` (minus the end-of-training summary scalars) and the current global step to Trackio.

        When CUDA is available and `torch.distributed` is initialized, per-device GPU memory statistics are gathered
        from all ranks and logged alongside the metrics.
        """
        single_value_scalars = [
            "train_runtime",
            "train_samples_per_second",
            "train_steps_per_second",
            "train_loss",
            "total_flos",
        ]

        # Always defined so the `log` call below also works for single-process or CPU-only runs.
        gpu_memory_logs = {}
        if is_torch_available() and torch.cuda.is_available() and dist.is_available() and dist.is_initialized():
            device_idx = torch.cuda.current_device()
            total_memory = torch.cuda.get_device_properties(device_idx).total_memory
            memory_allocated = torch.cuda.memory_allocated(device_idx)

            local_memory_logs = {
                f"gpu/{device_idx}/allocated_memory": memory_allocated / (1024**3),  # GB
                f"gpu/{device_idx}/memory_usage": memory_allocated / total_memory,  # ratio
            }

            # Collective call: it runs before the main-process check so that every rank takes part in it.
            gathered_logs = [None] * dist.get_world_size()
            dist.all_gather_object(gathered_logs, local_memory_logs)
            gpu_memory_logs = {k: v for d in gathered_logs for k, v in d.items()}

        if not self._initialized:
            self.setup(args, state, model)
        if state.is_world_process_zero:
            # `logs` defaults to `None`; treat that as "no metrics" instead of failing on `.items()`.
            logs = logs if logs is not None else {}
            non_scalar_logs = {k: v for k, v in logs.items() if k not in single_value_scalars}
            non_scalar_logs = rewrite_logs(non_scalar_logs)
            self._trackio.log({**non_scalar_logs, **gpu_memory_logs, "train/global_step": state.global_step})

    def on_save(self, args, state, control, **kwargs):
        """Nothing is done on checkpoint save."""
        return

    def on_predict(self, args, state, control, metrics, model=None, **kwargs):
        """Log prediction `metrics` to Trackio from the main process."""
        if self._trackio is None:
            return
        if not self._initialized:
            # `setup` requires `model` positionally; pass it explicitly instead of relying on it being in `kwargs`.
            self.setup(args, state, model, **kwargs)
        if state.is_world_process_zero:
            metrics = rewrite_logs(metrics)
            self._trackio.log(metrics)
class CometCallback(TrainerCallback): class CometCallback(TrainerCallback):
""" """
A [`TrainerCallback`] that sends the logs to [Comet ML](https://www.comet.com/site/). A [`TrainerCallback`] that sends the logs to [Comet ML](https://www.comet.com/site/).
@@ -2329,6 +2445,7 @@ INTEGRATION_TO_CALLBACK = {
"mlflow": MLflowCallback, "mlflow": MLflowCallback,
"neptune": NeptuneCallback, "neptune": NeptuneCallback,
"tensorboard": TensorBoardCallback, "tensorboard": TensorBoardCallback,
"trackio": TrackioCallback,
"wandb": WandbCallback, "wandb": WandbCallback,
"codecarbon": CodeCarbonCallback, "codecarbon": CodeCarbonCallback,
"clearml": ClearMLCallback, "clearml": ClearMLCallback,

View File

@@ -59,6 +59,7 @@ from .integrations import (
is_sigopt_available, is_sigopt_available,
is_swanlab_available, is_swanlab_available,
is_tensorboard_available, is_tensorboard_available,
is_trackio_available,
is_wandb_available, is_wandb_available,
) )
from .integrations.deepspeed import is_deepspeed_available from .integrations.deepspeed import is_deepspeed_available
@@ -1274,6 +1275,16 @@ def require_swanlab(test_case):
return unittest.skipUnless(is_swanlab_available(), "test requires swanlab")(test_case) return unittest.skipUnless(is_swanlab_available(), "test requires swanlab")(test_case)
def require_trackio(test_case):
    """
    Decorator marking a test that requires trackio.

    These tests are skipped when trackio isn't installed.
    """
    skip_unless_trackio = unittest.skipUnless(is_trackio_available(), "test requires trackio")
    return skip_unless_trackio(test_case)
def require_wandb(test_case): def require_wandb(test_case):
""" """
Decorator marking a test that requires wandb. Decorator marking a test that requires wandb.

View File

@@ -438,9 +438,9 @@ class TrainingArguments:
use the corresponding output (usually index 2) as the past state and feed it to the model at the next use the corresponding output (usually index 2) as the past state and feed it to the model at the next
training step under the keyword argument `mems`. training step under the keyword argument `mems`.
run_name (`str`, *optional*, defaults to `output_dir`): run_name (`str`, *optional*, defaults to `output_dir`):
A descriptor for the run. Typically used for [wandb](https://www.wandb.com/), A descriptor for the run. Typically used for [trackio](https://github.com/gradio-app/trackio),
[mlflow](https://www.mlflow.org/), [comet](https://www.comet.com/site) and [swanlab](https://swanlab.cn) [wandb](https://www.wandb.com/), [mlflow](https://www.mlflow.org/), [comet](https://www.comet.com/site) and
logging. If not specified, will be the same as `output_dir`. [swanlab](https://swanlab.cn) logging. If not specified, will be the same as `output_dir`.
disable_tqdm (`bool`, *optional*): disable_tqdm (`bool`, *optional*):
Whether or not to disable the tqdm progress bars and table of metrics produced by Whether or not to disable the tqdm progress bars and table of metrics produced by
[`~notebook.NotebookTrainingTracker`] in Jupyter Notebooks. Will default to `True` if the logging level is [`~notebook.NotebookTrainingTracker`] in Jupyter Notebooks. Will default to `True` if the logging level is
@@ -626,8 +626,8 @@ class TrainingArguments:
report_to (`str` or `list[str]`, *optional*, defaults to `"all"`): report_to (`str` or `list[str]`, *optional*, defaults to `"all"`):
The list of integrations to report the results and logs to. Supported platforms are `"azure_ml"`, The list of integrations to report the results and logs to. Supported platforms are `"azure_ml"`,
`"clearml"`, `"codecarbon"`, `"comet_ml"`, `"dagshub"`, `"dvclive"`, `"flyte"`, `"mlflow"`, `"neptune"`, `"clearml"`, `"codecarbon"`, `"comet_ml"`, `"dagshub"`, `"dvclive"`, `"flyte"`, `"mlflow"`, `"neptune"`,
`"swanlab"`, `"tensorboard"`, and `"wandb"`. Use `"all"` to report to all integrations installed, `"none"` `"swanlab"`, `"tensorboard"`, `"trackio"` and `"wandb"`. Use `"all"` to report to all integrations
for no integrations. installed, `"none"` for no integrations.
ddp_find_unused_parameters (`bool`, *optional*): ddp_find_unused_parameters (`bool`, *optional*):
When using distributed training, the value of the flag `find_unused_parameters` passed to When using distributed training, the value of the flag `find_unused_parameters` passed to
`DistributedDataParallel`. Will default to `False` if gradient checkpointing is used, `True` otherwise. `DistributedDataParallel`. Will default to `False` if gradient checkpointing is used, `True` otherwise.
@@ -1182,7 +1182,10 @@ class TrainingArguments:
run_name: Optional[str] = field( run_name: Optional[str] = field(
default=None, default=None,
metadata={ metadata={
"help": "An optional descriptor for the run. Notably used for wandb, mlflow comet and swanlab logging." "help": (
"An optional descriptor for the run. Notably used for trackio, wandb, mlflow comet and swanlab "
"logging."
)
}, },
) )
disable_tqdm: Optional[bool] = field( disable_tqdm: Optional[bool] = field(
@@ -2838,8 +2841,8 @@ class TrainingArguments:
report_to (`str` or `list[str]`, *optional*, defaults to `"all"`): report_to (`str` or `list[str]`, *optional*, defaults to `"all"`):
The list of integrations to report the results and logs to. Supported platforms are `"azure_ml"`, The list of integrations to report the results and logs to. Supported platforms are `"azure_ml"`,
`"clearml"`, `"codecarbon"`, `"comet_ml"`, `"dagshub"`, `"dvclive"`, `"flyte"`, `"mlflow"`, `"clearml"`, `"codecarbon"`, `"comet_ml"`, `"dagshub"`, `"dvclive"`, `"flyte"`, `"mlflow"`,
`"neptune"`, `"swanlab"`, `"tensorboard"`, and `"wandb"`. Use `"all"` to report to all integrations `"neptune"`, `"swanlab"`, `"tensorboard"`, `"trackio"` and `"wandb"`. Use `"all"` to report to all
installed, `"none"` for no integrations. integrations installed, `"none"` for no integrations.
first_step (`bool`, *optional*, defaults to `False`): first_step (`bool`, *optional*, defaults to `False`):
Whether to log and evaluate the first `global_step` or not. Whether to log and evaluate the first `global_step` or not.
nan_inf_filter (`bool`, *optional*, defaults to `True`): nan_inf_filter (`bool`, *optional*, defaults to `True`):

View File

@@ -160,7 +160,7 @@ class TFTrainingArguments(TrainingArguments):
Google Cloud Project name for the Cloud TPU-enabled project. If not specified, we will attempt to Google Cloud Project name for the Cloud TPU-enabled project. If not specified, we will attempt to
automatically detect from metadata. automatically detect from metadata.
run_name (`str`, *optional*): run_name (`str`, *optional*):
A descriptor for the run. Notably used for wandb, mlflow, comet and swanlab logging. A descriptor for the run. Notably used for trackio, wandb, mlflow, comet and swanlab logging.
xla (`bool`, *optional*): xla (`bool`, *optional*):
Whether to activate the XLA compilation or not. Whether to activate the XLA compilation or not.
""" """