From 6e9972962fbc80d218234bfbd8c9b2843ef02b2b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= <45557362+qgallouedec@users.noreply.github.com> Date: Tue, 22 Jul 2025 14:50:20 -0700 Subject: [PATCH] =?UTF-8?q?=F0=9F=8E=AF=20Trackio=20integration=20(#38814)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * First attempt * fix * fix * Enhance TrackioCallback to log GPU memory usage and allocation * Enhance Trackio integration in callbacks and training arguments documentation * re order * remove unused lines * fix torch optional --- docs/source/en/main_classes/callback.md | 4 + src/transformers/__init__.py | 2 + src/transformers/integrations/__init__.py | 4 + .../integrations/integration_utils.py | 117 ++++++++++++++++++ src/transformers/testing_utils.py | 11 ++ src/transformers/training_args.py | 19 +-- src/transformers/training_args_tf.py | 2 +- 7 files changed, 150 insertions(+), 9 deletions(-) diff --git a/docs/source/en/main_classes/callback.md b/docs/source/en/main_classes/callback.md index 99f76b7b05..0a7c73c667 100644 --- a/docs/source/en/main_classes/callback.md +++ b/docs/source/en/main_classes/callback.md @@ -33,6 +33,7 @@ By default, `TrainingArguments.report_to` is set to `"all"`, so a [`Trainer`] wi it's the second one). - [`~integrations.TensorBoardCallback`] if tensorboard is accessible (either through PyTorch >= 1.4 or tensorboardX). +- [`~integrations.TrackioCallback`] if [trackio](https://github.com/gradio-app/trackio) is installed. - [`~integrations.WandbCallback`] if [wandb](https://www.wandb.com/) is installed. - [`~integrations.CometCallback`] if [comet_ml](https://www.comet.com/site/) is installed. - [`~integrations.MLflowCallback`] if [mlflow](https://www.mlflow.org/) is installed. 
@@ -72,6 +73,9 @@ Here is the list of the available [`TrainerCallback`] in the library: [[autodoc]] integrations.TensorBoardCallback +[[autodoc]] integrations.TrackioCallback + - setup + [[autodoc]] integrations.WandbCallback - setup diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 84892590b1..f6f6fd6f6e 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -127,6 +127,7 @@ _import_structure = { "is_sigopt_available", "is_swanlab_available", "is_tensorboard_available", + "is_trackio_available", "is_wandb_available", ], "loss": [], @@ -759,6 +760,7 @@ if TYPE_CHECKING: is_sigopt_available, is_swanlab_available, is_tensorboard_available, + is_trackio_available, is_wandb_available, ) from .integrations.executorch import ( diff --git a/src/transformers/integrations/__init__.py b/src/transformers/integrations/__init__.py index 1b87a554d3..0c4d169380 100755 --- a/src/transformers/integrations/__init__.py +++ b/src/transformers/integrations/__init__.py @@ -90,6 +90,7 @@ _import_structure = { "NeptuneMissingConfiguration", "SwanLabCallback", "TensorBoardCallback", + "TrackioCallback", "WandbCallback", "get_available_reporting_integrations", "get_reporting_integration_callbacks", @@ -110,6 +111,7 @@ _import_structure = { "is_sigopt_available", "is_swanlab_available", "is_tensorboard_available", + "is_trackio_available", "is_wandb_available", "rewrite_logs", "run_hp_search_optuna", @@ -224,6 +226,7 @@ if TYPE_CHECKING: NeptuneMissingConfiguration, SwanLabCallback, TensorBoardCallback, + TrackioCallback, WandbCallback, get_available_reporting_integrations, get_reporting_integration_callbacks, @@ -244,6 +247,7 @@ if TYPE_CHECKING: is_sigopt_available, is_swanlab_available, is_tensorboard_available, + is_trackio_available, is_wandb_available, rewrite_logs, run_hp_search_optuna, diff --git a/src/transformers/integrations/integration_utils.py b/src/transformers/integrations/integration_utils.py index 2b7fd9e756..8a621929c7 
100755 --- a/src/transformers/integrations/integration_utils.py +++ b/src/transformers/integrations/integration_utils.py @@ -56,6 +56,7 @@ logger = logging.get_logger(__name__) if is_torch_available(): import torch + import torch.distributed as dist # comet_ml requires to be imported before any ML frameworks _MIN_COMET_VERSION = "3.43.2" @@ -111,6 +112,10 @@ def is_wandb_available(): return importlib.util.find_spec("wandb") is not None +def is_trackio_available(): + return importlib.util.find_spec("trackio") is not None + + def is_clearml_available(): return importlib.util.find_spec("clearml") is not None @@ -630,6 +635,8 @@ def get_available_reporting_integrations(): integrations.append("clearml") if is_swanlab_available(): integrations.append("swanlab") + if is_trackio_available(): + integrations.append("trackio") return integrations @@ -1033,6 +1040,115 @@ class WandbCallback(TrainerCallback): self._wandb.log(metrics) +class TrackioCallback(TrainerCallback): + """ + A [`TrainerCallback`] that logs metrics to Trackio. + """ + + def __init__(self): + has_trackio = is_trackio_available() + if not has_trackio: + raise RuntimeError("TrackioCallback requires trackio to be installed. Run `pip install trackio`.") + if has_trackio: + import trackio + + self._trackio = trackio + self._initialized = False + + def setup(self, args, state, model, **kwargs): + """ + Setup the optional Trackio integration. + + To customize the setup you can also override the following environment variables: + + Environment: + - **TRACKIO_PROJECT** (`str`, *optional*, defaults to `"huggingface"`): + The name of the project (can be an existing project to continue tracking or a new project to start tracking + from scratch). + - **TRACKIO_SPACE_ID** (`str`, *optional*, defaults to `None`): + If set, the project will be logged to a Hugging Face Space instead of a local directory. 
Should be a + complete Space name like `"username/reponame"` or `"orgname/reponame"`, or just `"reponame"` in which case + the Space will be created in the currently-logged-in Hugging Face user's namespace. If the Space does not + exist, it will be created. If the Space already exists, the project will be logged to it. + """ + if state.is_world_process_zero: + combined_dict = {**args.to_dict()} + + if hasattr(model, "config") and model.config is not None: + model_config = model.config if isinstance(model.config, dict) else model.config.to_dict() + combined_dict = {**model_config, **combined_dict} + if hasattr(model, "peft_config") and model.peft_config is not None: + peft_config = model.peft_config + combined_dict = {**{"peft_config": peft_config}, **combined_dict} + + self._trackio.init( + project=os.getenv("TRACKIO_PROJECT", "huggingface"), + name=args.run_name, + space_id=os.getenv("TRACKIO_SPACE_ID", None), + resume="allow", + ) + + # Add config parameters (run may have been created manually) + self._trackio.config.update(combined_dict, allow_val_change=True) + + # Add number of model parameters to trackio config + try: + self._trackio.config["model/num_parameters"] = model.num_parameters() + except AttributeError: + logger.info("Could not log the number of model parameters in Trackio due to an AttributeError.") + self._initialized = True + + def on_train_begin(self, args, state, control, model=None, **kwargs): + if not self._initialized: + self.setup(args, state, model, **kwargs) + + def on_train_end(self, args: TrainingArguments, state, control, model=None, processing_class=None, **kwargs): + if state.is_world_process_zero and self._initialized: + self._trackio.finish() + + def on_log(self, args, state, control, model=None, logs=None, **kwargs): + single_value_scalars = [ + "train_runtime", + "train_samples_per_second", + "train_steps_per_second", + "train_loss", + "total_flos", + ] + + if is_torch_available() and torch.cuda.is_available() and 
dist.is_available() and dist.is_initialized(): + device_idx = torch.cuda.current_device() + total_memory = torch.cuda.get_device_properties(device_idx).total_memory + memory_allocated = torch.cuda.memory_allocated(device_idx) + + gpu_memory_logs = { + f"gpu/{device_idx}/allocated_memory": memory_allocated / (1024**3), # GB + f"gpu/{device_idx}/memory_usage": memory_allocated / total_memory, # ratio + } + + gathered_logs = [None] * dist.get_world_size() + dist.all_gather_object(gathered_logs, gpu_memory_logs) + gpu_memory_logs = {k: v for d in gathered_logs for k, v in d.items()} + + if not self._initialized: + self.setup(args, state, model) + if state.is_world_process_zero: + non_scalar_logs = {k: v for k, v in logs.items() if k not in single_value_scalars} + non_scalar_logs = rewrite_logs(non_scalar_logs) + self._trackio.log({**non_scalar_logs, **gpu_memory_logs, "train/global_step": state.global_step}) + + def on_save(self, args, state, control, **kwargs): + return + + def on_predict(self, args, state, control, metrics, **kwargs): + if self._trackio is None: + return + if not self._initialized: + self.setup(args, state, **kwargs) + if state.is_world_process_zero: + metrics = rewrite_logs(metrics) + self._trackio.log(metrics) + + class CometCallback(TrainerCallback): """ A [`TrainerCallback`] that sends the logs to [Comet ML](https://www.comet.com/site/). 
@@ -2329,6 +2445,7 @@ INTEGRATION_TO_CALLBACK = { "mlflow": MLflowCallback, "neptune": NeptuneCallback, "tensorboard": TensorBoardCallback, + "trackio": TrackioCallback, "wandb": WandbCallback, "codecarbon": CodeCarbonCallback, "clearml": ClearMLCallback, diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py index 0e117d71f7..d6b425cca6 100644 --- a/src/transformers/testing_utils.py +++ b/src/transformers/testing_utils.py @@ -59,6 +59,7 @@ from .integrations import ( is_sigopt_available, is_swanlab_available, is_tensorboard_available, + is_trackio_available, is_wandb_available, ) from .integrations.deepspeed import is_deepspeed_available @@ -1274,6 +1275,16 @@ def require_swanlab(test_case): return unittest.skipUnless(is_swanlab_available(), "test requires swanlab")(test_case) +def require_trackio(test_case): + """ + Decorator marking a test that requires trackio. + + These tests are skipped when trackio isn't installed. + + """ + return unittest.skipUnless(is_trackio_available(), "test requires trackio")(test_case) + + def require_wandb(test_case): """ Decorator marking a test that requires wandb. diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py index 16056a5dde..cf5ece295e 100644 --- a/src/transformers/training_args.py +++ b/src/transformers/training_args.py @@ -438,9 +438,9 @@ class TrainingArguments: use the corresponding output (usually index 2) as the past state and feed it to the model at the next training step under the keyword argument `mems`. run_name (`str`, *optional*, defaults to `output_dir`): - A descriptor for the run. Typically used for [wandb](https://www.wandb.com/), - [mlflow](https://www.mlflow.org/), [comet](https://www.comet.com/site) and [swanlab](https://swanlab.cn) - logging. If not specified, will be the same as `output_dir`. + A descriptor for the run. 
Typically used for [trackio](https://github.com/gradio-app/trackio), + [wandb](https://www.wandb.com/), [mlflow](https://www.mlflow.org/), [comet](https://www.comet.com/site) and + [swanlab](https://swanlab.cn) logging. If not specified, will be the same as `output_dir`. disable_tqdm (`bool`, *optional*): Whether or not to disable the tqdm progress bars and table of metrics produced by [`~notebook.NotebookTrainingTracker`] in Jupyter Notebooks. Will default to `True` if the logging level is @@ -626,8 +626,8 @@ class TrainingArguments: report_to (`str` or `list[str]`, *optional*, defaults to `"all"`): The list of integrations to report the results and logs to. Supported platforms are `"azure_ml"`, `"clearml"`, `"codecarbon"`, `"comet_ml"`, `"dagshub"`, `"dvclive"`, `"flyte"`, `"mlflow"`, `"neptune"`, - `"swanlab"`, `"tensorboard"`, and `"wandb"`. Use `"all"` to report to all integrations installed, `"none"` - for no integrations. + `"swanlab"`, `"tensorboard"`, `"trackio"` and `"wandb"`. Use `"all"` to report to all integrations + installed, `"none"` for no integrations. ddp_find_unused_parameters (`bool`, *optional*): When using distributed training, the value of the flag `find_unused_parameters` passed to `DistributedDataParallel`. Will default to `False` if gradient checkpointing is used, `True` otherwise. @@ -1182,7 +1182,10 @@ class TrainingArguments: run_name: Optional[str] = field( default=None, metadata={ - "help": "An optional descriptor for the run. Notably used for wandb, mlflow comet and swanlab logging." + "help": ( + "An optional descriptor for the run. Notably used for trackio, wandb, mlflow comet and swanlab " + "logging." + ) }, ) disable_tqdm: Optional[bool] = field( @@ -2838,8 +2841,8 @@ class TrainingArguments: report_to (`str` or `list[str]`, *optional*, defaults to `"all"`): The list of integrations to report the results and logs to. 
Supported platforms are `"azure_ml"`, `"clearml"`, `"codecarbon"`, `"comet_ml"`, `"dagshub"`, `"dvclive"`, `"flyte"`, `"mlflow"`, - `"neptune"`, `"swanlab"`, `"tensorboard"`, and `"wandb"`. Use `"all"` to report to all integrations - installed, `"none"` for no integrations. + `"neptune"`, `"swanlab"`, `"tensorboard"`, `"trackio"` and `"wandb"`. Use `"all"` to report to all + integrations installed, `"none"` for no integrations. first_step (`bool`, *optional*, defaults to `False`): Whether to log and evaluate the first `global_step` or not. nan_inf_filter (`bool`, *optional*, defaults to `True`): diff --git a/src/transformers/training_args_tf.py b/src/transformers/training_args_tf.py index 6bbd4b89a7..cf20503d63 100644 --- a/src/transformers/training_args_tf.py +++ b/src/transformers/training_args_tf.py @@ -160,7 +160,7 @@ class TFTrainingArguments(TrainingArguments): Google Cloud Project name for the Cloud TPU-enabled project. If not specified, we will attempt to automatically detect from metadata. run_name (`str`, *optional*): - A descriptor for the run. Notably used for wandb, mlflow, comet and swanlab logging. + A descriptor for the run. Notably used for trackio, wandb, mlflow, comet and swanlab logging. xla (`bool`, *optional*): Whether to activate the XLA compilation or not. """