🎯 Trackio integration (#38814)
* First attempt * fix * fix * Enhance TrackioCallback to log GPU memory usage and allocation * Enhance Trackio integration in callbacks and training arguments documentation * re order * remove unused lines * fix torch optional
This commit is contained in:
committed by
GitHub
parent
c6d0500d15
commit
6e9972962f
@@ -33,6 +33,7 @@ By default, `TrainingArguments.report_to` is set to `"all"`, so a [`Trainer`] wi
|
||||
it's the second one).
|
||||
- [`~integrations.TensorBoardCallback`] if tensorboard is accessible (either through PyTorch >= 1.4
|
||||
or tensorboardX).
|
||||
- [`~integrations.TrackioCallback`] if [trackio](https://github.com/gradio-app/trackio) is installed.
|
||||
- [`~integrations.WandbCallback`] if [wandb](https://www.wandb.com/) is installed.
|
||||
- [`~integrations.CometCallback`] if [comet_ml](https://www.comet.com/site/) is installed.
|
||||
- [`~integrations.MLflowCallback`] if [mlflow](https://www.mlflow.org/) is installed.
|
||||
@@ -72,6 +73,9 @@ Here is the list of the available [`TrainerCallback`] in the library:
|
||||
|
||||
[[autodoc]] integrations.TensorBoardCallback
|
||||
|
||||
[[autodoc]] integrations.TrackioCallback
|
||||
- setup
|
||||
|
||||
[[autodoc]] integrations.WandbCallback
|
||||
- setup
|
||||
|
||||
|
||||
@@ -127,6 +127,7 @@ _import_structure = {
|
||||
"is_sigopt_available",
|
||||
"is_swanlab_available",
|
||||
"is_tensorboard_available",
|
||||
"is_trackio_available",
|
||||
"is_wandb_available",
|
||||
],
|
||||
"loss": [],
|
||||
@@ -759,6 +760,7 @@ if TYPE_CHECKING:
|
||||
is_sigopt_available,
|
||||
is_swanlab_available,
|
||||
is_tensorboard_available,
|
||||
is_trackio_available,
|
||||
is_wandb_available,
|
||||
)
|
||||
from .integrations.executorch import (
|
||||
|
||||
@@ -90,6 +90,7 @@ _import_structure = {
|
||||
"NeptuneMissingConfiguration",
|
||||
"SwanLabCallback",
|
||||
"TensorBoardCallback",
|
||||
"TrackioCallback",
|
||||
"WandbCallback",
|
||||
"get_available_reporting_integrations",
|
||||
"get_reporting_integration_callbacks",
|
||||
@@ -110,6 +111,7 @@ _import_structure = {
|
||||
"is_sigopt_available",
|
||||
"is_swanlab_available",
|
||||
"is_tensorboard_available",
|
||||
"is_trackio_available",
|
||||
"is_wandb_available",
|
||||
"rewrite_logs",
|
||||
"run_hp_search_optuna",
|
||||
@@ -224,6 +226,7 @@ if TYPE_CHECKING:
|
||||
NeptuneMissingConfiguration,
|
||||
SwanLabCallback,
|
||||
TensorBoardCallback,
|
||||
TrackioCallback,
|
||||
WandbCallback,
|
||||
get_available_reporting_integrations,
|
||||
get_reporting_integration_callbacks,
|
||||
@@ -244,6 +247,7 @@ if TYPE_CHECKING:
|
||||
is_sigopt_available,
|
||||
is_swanlab_available,
|
||||
is_tensorboard_available,
|
||||
is_trackio_available,
|
||||
is_wandb_available,
|
||||
rewrite_logs,
|
||||
run_hp_search_optuna,
|
||||
|
||||
@@ -56,6 +56,7 @@ logger = logging.get_logger(__name__)
|
||||
|
||||
if is_torch_available():
|
||||
import torch
|
||||
import torch.distributed as dist
|
||||
|
||||
# comet_ml requires to be imported before any ML frameworks
|
||||
_MIN_COMET_VERSION = "3.43.2"
|
||||
@@ -111,6 +112,10 @@ def is_wandb_available():
|
||||
return importlib.util.find_spec("wandb") is not None
|
||||
|
||||
|
||||
def is_trackio_available():
    """Return `True` when the import system can locate the `trackio` package."""
    trackio_spec = importlib.util.find_spec("trackio")
    return trackio_spec is not None
|
||||
|
||||
|
||||
def is_clearml_available():
    """Return `True` when the import system can locate the `clearml` package."""
    clearml_spec = importlib.util.find_spec("clearml")
    return clearml_spec is not None
|
||||
|
||||
@@ -630,6 +635,8 @@ def get_available_reporting_integrations():
|
||||
integrations.append("clearml")
|
||||
if is_swanlab_available():
|
||||
integrations.append("swanlab")
|
||||
if is_trackio_available():
|
||||
integrations.append("trackio")
|
||||
return integrations
|
||||
|
||||
|
||||
@@ -1033,6 +1040,115 @@ class WandbCallback(TrainerCallback):
|
||||
self._wandb.log(metrics)
|
||||
|
||||
|
||||
class TrackioCallback(TrainerCallback):
    """
    A [`TrainerCallback`] that logs metrics to Trackio.
    """

    def __init__(self):
        """
        Import `trackio` and keep a handle on the module.

        Raises:
            RuntimeError: If `trackio` is not installed.
        """
        if not is_trackio_available():
            raise RuntimeError("TrackioCallback requires trackio to be installed. Run `pip install trackio`.")
        # Imported lazily so that trackio stays an optional dependency of the module.
        import trackio

        self._trackio = trackio
        self._initialized = False

    def setup(self, args, state, model, **kwargs):
        """
        Setup the optional Trackio integration.

        To customize the setup you can also override the following environment variables:

        Environment:
        - **TRACKIO_PROJECT** (`str`, *optional*, defaults to `"huggingface"`):
            The name of the project (can be an existing project to continue tracking or a new project to start tracking
            from scratch).
        - **TRACKIO_SPACE_ID** (`str`, *optional*, defaults to `None`):
            If set, the project will be logged to a Hugging Face Space instead of a local directory. Should be a
            complete Space name like `"username/reponame"` or `"orgname/reponame"`, or just `"reponame"` in which case
            the Space will be created in the currently-logged-in Hugging Face user's namespace. If the Space does not
            exist, it will be created. If the Space already exists, the project will be logged to it.
        """
        if state.is_world_process_zero:
            combined_dict = {**args.to_dict()}

            # On key collisions the training arguments win over the model / PEFT configuration.
            if hasattr(model, "config") and model.config is not None:
                model_config = model.config if isinstance(model.config, dict) else model.config.to_dict()
                combined_dict = {**model_config, **combined_dict}
            if hasattr(model, "peft_config") and model.peft_config is not None:
                peft_config = model.peft_config
                combined_dict = {**{"peft_config": peft_config}, **combined_dict}

            self._trackio.init(
                project=os.getenv("TRACKIO_PROJECT", "huggingface"),
                name=args.run_name,
                space_id=os.getenv("TRACKIO_SPACE_ID", None),
                resume="allow",
            )

            # Add config parameters (run may have been created manually)
            self._trackio.config.update(combined_dict, allow_val_change=True)

            # Add number of model parameters to trackio config
            try:
                self._trackio.config["model/num_parameters"] = model.num_parameters()
            except AttributeError:
                logger.info("Could not log the number of model parameters in Trackio due to an AttributeError.")
        # Marked on every rank: setup does nothing on non-main processes, so there is no reason to re-enter it.
        self._initialized = True

    def on_train_begin(self, args, state, control, model=None, **kwargs):
        """Initialize the Trackio run if it has not been set up yet."""
        if not self._initialized:
            self.setup(args, state, model, **kwargs)

    def on_train_end(self, args: TrainingArguments, state, control, model=None, processing_class=None, **kwargs):
        """Close the Trackio run on the main process once training is over."""
        if state.is_world_process_zero and self._initialized:
            self._trackio.finish()

    @staticmethod
    def _collect_gpu_memory_logs():
        """
        Build the GPU memory metrics to attach to a log entry.

        Returns an empty dict when torch or CUDA is unavailable. Otherwise reports the memory allocated on the
        current CUDA device (in GB, and as a ratio of the device's total memory). When `torch.distributed` is
        initialized, the per-rank dicts are gathered and merged, so this must be called on every rank.
        """
        if not (is_torch_available() and torch.cuda.is_available()):
            return {}

        device_idx = torch.cuda.current_device()
        total_memory = torch.cuda.get_device_properties(device_idx).total_memory
        memory_allocated = torch.cuda.memory_allocated(device_idx)

        gpu_memory_logs = {
            f"gpu/{device_idx}/allocated_memory": memory_allocated / (1024**3),  # GB
            f"gpu/{device_idx}/memory_usage": memory_allocated / total_memory,  # ratio
        }

        if dist.is_available() and dist.is_initialized():
            gathered_logs = [None] * dist.get_world_size()
            dist.all_gather_object(gathered_logs, gpu_memory_logs)
            # NOTE: keys use the device index reported by each rank; ranks reporting the same index
            # overwrite each other in the merged dict.
            gpu_memory_logs = {k: v for d in gathered_logs for k, v in d.items()}

        return gpu_memory_logs

    def on_log(self, args, state, control, model=None, logs=None, **kwargs):
        """
        Send `logs` (minus end-of-training summary scalars), GPU memory metrics and the global step to Trackio.

        Only the main process writes to Trackio, but the GPU metrics are collected on every rank because the
        distributed gather is a collective operation.
        """
        single_value_scalars = [
            "train_runtime",
            "train_samples_per_second",
            "train_steps_per_second",
            "train_loss",
            "total_flos",
        ]

        # Always bound, even on CPU-only or non-distributed runs (previously a NameError in those cases).
        gpu_memory_logs = self._collect_gpu_memory_logs()

        if not self._initialized:
            self.setup(args, state, model)
        if state.is_world_process_zero:
            # `logs` defaults to None; treat that as "nothing to report" rather than crashing.
            non_scalar_logs = {k: v for k, v in (logs or {}).items() if k not in single_value_scalars}
            non_scalar_logs = rewrite_logs(non_scalar_logs)
            self._trackio.log({**non_scalar_logs, **gpu_memory_logs, "train/global_step": state.global_step})

    def on_save(self, args, state, control, **kwargs):
        """No-op: nothing is sent to Trackio when a checkpoint is saved."""
        return

    def on_predict(self, args, state, control, metrics, **kwargs):
        """Log prediction `metrics` to Trackio from the main process."""
        if self._trackio is None:
            return
        if not self._initialized:
            # `setup` needs the model positionally; fall back to None when the caller did not provide one.
            setup_kwargs = dict(kwargs)
            model = setup_kwargs.pop("model", None)
            self.setup(args, state, model, **setup_kwargs)
        if state.is_world_process_zero:
            metrics = rewrite_logs(metrics)
            self._trackio.log(metrics)
|
||||
|
||||
|
||||
class CometCallback(TrainerCallback):
|
||||
"""
|
||||
A [`TrainerCallback`] that sends the logs to [Comet ML](https://www.comet.com/site/).
|
||||
@@ -2329,6 +2445,7 @@ INTEGRATION_TO_CALLBACK = {
|
||||
"mlflow": MLflowCallback,
|
||||
"neptune": NeptuneCallback,
|
||||
"tensorboard": TensorBoardCallback,
|
||||
"trackio": TrackioCallback,
|
||||
"wandb": WandbCallback,
|
||||
"codecarbon": CodeCarbonCallback,
|
||||
"clearml": ClearMLCallback,
|
||||
|
||||
@@ -59,6 +59,7 @@ from .integrations import (
|
||||
is_sigopt_available,
|
||||
is_swanlab_available,
|
||||
is_tensorboard_available,
|
||||
is_trackio_available,
|
||||
is_wandb_available,
|
||||
)
|
||||
from .integrations.deepspeed import is_deepspeed_available
|
||||
@@ -1274,6 +1275,16 @@ def require_swanlab(test_case):
|
||||
return unittest.skipUnless(is_swanlab_available(), "test requires swanlab")(test_case)
|
||||
|
||||
|
||||
def require_trackio(test_case):
    """
    Decorator marking a test that requires trackio.

    These tests are skipped when trackio isn't installed.
    """
    skip_unless_trackio = unittest.skipUnless(is_trackio_available(), "test requires trackio")
    return skip_unless_trackio(test_case)
|
||||
|
||||
|
||||
def require_wandb(test_case):
|
||||
"""
|
||||
Decorator marking a test that requires wandb.
|
||||
|
||||
@@ -438,9 +438,9 @@ class TrainingArguments:
|
||||
use the corresponding output (usually index 2) as the past state and feed it to the model at the next
|
||||
training step under the keyword argument `mems`.
|
||||
run_name (`str`, *optional*, defaults to `output_dir`):
|
||||
A descriptor for the run. Typically used for [wandb](https://www.wandb.com/),
|
||||
[mlflow](https://www.mlflow.org/), [comet](https://www.comet.com/site) and [swanlab](https://swanlab.cn)
|
||||
logging. If not specified, will be the same as `output_dir`.
|
||||
A descriptor for the run. Typically used for [trackio](https://github.com/gradio-app/trackio),
|
||||
[wandb](https://www.wandb.com/), [mlflow](https://www.mlflow.org/), [comet](https://www.comet.com/site) and
|
||||
[swanlab](https://swanlab.cn) logging. If not specified, will be the same as `output_dir`.
|
||||
disable_tqdm (`bool`, *optional*):
|
||||
Whether or not to disable the tqdm progress bars and table of metrics produced by
|
||||
[`~notebook.NotebookTrainingTracker`] in Jupyter Notebooks. Will default to `True` if the logging level is
|
||||
@@ -626,8 +626,8 @@ class TrainingArguments:
|
||||
report_to (`str` or `list[str]`, *optional*, defaults to `"all"`):
|
||||
The list of integrations to report the results and logs to. Supported platforms are `"azure_ml"`,
|
||||
`"clearml"`, `"codecarbon"`, `"comet_ml"`, `"dagshub"`, `"dvclive"`, `"flyte"`, `"mlflow"`, `"neptune"`,
|
||||
`"swanlab"`, `"tensorboard"`, and `"wandb"`. Use `"all"` to report to all integrations installed, `"none"`
|
||||
for no integrations.
|
||||
`"swanlab"`, `"tensorboard"`, `"trackio"` and `"wandb"`. Use `"all"` to report to all integrations
|
||||
installed, `"none"` for no integrations.
|
||||
ddp_find_unused_parameters (`bool`, *optional*):
|
||||
When using distributed training, the value of the flag `find_unused_parameters` passed to
|
||||
`DistributedDataParallel`. Will default to `False` if gradient checkpointing is used, `True` otherwise.
|
||||
@@ -1182,7 +1182,10 @@ class TrainingArguments:
|
||||
run_name: Optional[str] = field(
|
||||
default=None,
|
||||
metadata={
|
||||
"help": "An optional descriptor for the run. Notably used for wandb, mlflow comet and swanlab logging."
|
||||
"help": (
|
||||
"An optional descriptor for the run. Notably used for trackio, wandb, mlflow comet and swanlab "
|
||||
"logging."
|
||||
)
|
||||
},
|
||||
)
|
||||
disable_tqdm: Optional[bool] = field(
|
||||
@@ -2838,8 +2841,8 @@ class TrainingArguments:
|
||||
report_to (`str` or `list[str]`, *optional*, defaults to `"all"`):
|
||||
The list of integrations to report the results and logs to. Supported platforms are `"azure_ml"`,
|
||||
`"clearml"`, `"codecarbon"`, `"comet_ml"`, `"dagshub"`, `"dvclive"`, `"flyte"`, `"mlflow"`,
|
||||
`"neptune"`, `"swanlab"`, `"tensorboard"`, and `"wandb"`. Use `"all"` to report to all integrations
|
||||
installed, `"none"` for no integrations.
|
||||
`"neptune"`, `"swanlab"`, `"tensorboard"`, `"trackio"` and `"wandb"`. Use `"all"` to report to all
|
||||
integrations installed, `"none"` for no integrations.
|
||||
first_step (`bool`, *optional*, defaults to `False`):
|
||||
Whether to log and evaluate the first `global_step` or not.
|
||||
nan_inf_filter (`bool`, *optional*, defaults to `True`):
|
||||
|
||||
@@ -160,7 +160,7 @@ class TFTrainingArguments(TrainingArguments):
|
||||
Google Cloud Project name for the Cloud TPU-enabled project. If not specified, we will attempt to
|
||||
automatically detect from metadata.
|
||||
run_name (`str`, *optional*):
|
||||
A descriptor for the run. Notably used for wandb, mlflow, comet and swanlab logging.
|
||||
A descriptor for the run. Notably used for trackio, wandb, mlflow, comet and swanlab logging.
|
||||
xla (`bool`, *optional*):
|
||||
Whether to activate the XLA compilation or not.
|
||||
"""
|
||||
|
||||
Reference in New Issue
Block a user