🎯 Trackio integration (#38814)
* First attempt * fix * fix * Enhance TrackioCallback to log GPU memory usage and allocation * Enhance Trackio integration in callbacks and training arguments documentation * re order * remove unused lines * fix torch optional
This commit is contained in:
committed by
GitHub
parent
c6d0500d15
commit
6e9972962f
@@ -33,6 +33,7 @@ By default, `TrainingArguments.report_to` is set to `"all"`, so a [`Trainer`] wi
|
|||||||
it's the second one).
|
it's the second one).
|
||||||
- [`~integrations.TensorBoardCallback`] if tensorboard is accessible (either through PyTorch >= 1.4
|
- [`~integrations.TensorBoardCallback`] if tensorboard is accessible (either through PyTorch >= 1.4
|
||||||
or tensorboardX).
|
or tensorboardX).
|
||||||
|
- [`~integrations.TrackioCallback`] if [trackio](https://github.com/gradio-app/trackio) is installed.
|
||||||
- [`~integrations.WandbCallback`] if [wandb](https://www.wandb.com/) is installed.
|
- [`~integrations.WandbCallback`] if [wandb](https://www.wandb.com/) is installed.
|
||||||
- [`~integrations.CometCallback`] if [comet_ml](https://www.comet.com/site/) is installed.
|
- [`~integrations.CometCallback`] if [comet_ml](https://www.comet.com/site/) is installed.
|
||||||
- [`~integrations.MLflowCallback`] if [mlflow](https://www.mlflow.org/) is installed.
|
- [`~integrations.MLflowCallback`] if [mlflow](https://www.mlflow.org/) is installed.
|
||||||
@@ -72,6 +73,9 @@ Here is the list of the available [`TrainerCallback`] in the library:
|
|||||||
|
|
||||||
[[autodoc]] integrations.TensorBoardCallback
|
[[autodoc]] integrations.TensorBoardCallback
|
||||||
|
|
||||||
|
[[autodoc]] integrations.TrackioCallback
|
||||||
|
- setup
|
||||||
|
|
||||||
[[autodoc]] integrations.WandbCallback
|
[[autodoc]] integrations.WandbCallback
|
||||||
- setup
|
- setup
|
||||||
|
|
||||||
|
|||||||
@@ -127,6 +127,7 @@ _import_structure = {
|
|||||||
"is_sigopt_available",
|
"is_sigopt_available",
|
||||||
"is_swanlab_available",
|
"is_swanlab_available",
|
||||||
"is_tensorboard_available",
|
"is_tensorboard_available",
|
||||||
|
"is_trackio_available",
|
||||||
"is_wandb_available",
|
"is_wandb_available",
|
||||||
],
|
],
|
||||||
"loss": [],
|
"loss": [],
|
||||||
@@ -759,6 +760,7 @@ if TYPE_CHECKING:
|
|||||||
is_sigopt_available,
|
is_sigopt_available,
|
||||||
is_swanlab_available,
|
is_swanlab_available,
|
||||||
is_tensorboard_available,
|
is_tensorboard_available,
|
||||||
|
is_trackio_available,
|
||||||
is_wandb_available,
|
is_wandb_available,
|
||||||
)
|
)
|
||||||
from .integrations.executorch import (
|
from .integrations.executorch import (
|
||||||
|
|||||||
@@ -90,6 +90,7 @@ _import_structure = {
|
|||||||
"NeptuneMissingConfiguration",
|
"NeptuneMissingConfiguration",
|
||||||
"SwanLabCallback",
|
"SwanLabCallback",
|
||||||
"TensorBoardCallback",
|
"TensorBoardCallback",
|
||||||
|
"TrackioCallback",
|
||||||
"WandbCallback",
|
"WandbCallback",
|
||||||
"get_available_reporting_integrations",
|
"get_available_reporting_integrations",
|
||||||
"get_reporting_integration_callbacks",
|
"get_reporting_integration_callbacks",
|
||||||
@@ -110,6 +111,7 @@ _import_structure = {
|
|||||||
"is_sigopt_available",
|
"is_sigopt_available",
|
||||||
"is_swanlab_available",
|
"is_swanlab_available",
|
||||||
"is_tensorboard_available",
|
"is_tensorboard_available",
|
||||||
|
"is_trackio_available",
|
||||||
"is_wandb_available",
|
"is_wandb_available",
|
||||||
"rewrite_logs",
|
"rewrite_logs",
|
||||||
"run_hp_search_optuna",
|
"run_hp_search_optuna",
|
||||||
@@ -224,6 +226,7 @@ if TYPE_CHECKING:
|
|||||||
NeptuneMissingConfiguration,
|
NeptuneMissingConfiguration,
|
||||||
SwanLabCallback,
|
SwanLabCallback,
|
||||||
TensorBoardCallback,
|
TensorBoardCallback,
|
||||||
|
TrackioCallback,
|
||||||
WandbCallback,
|
WandbCallback,
|
||||||
get_available_reporting_integrations,
|
get_available_reporting_integrations,
|
||||||
get_reporting_integration_callbacks,
|
get_reporting_integration_callbacks,
|
||||||
@@ -244,6 +247,7 @@ if TYPE_CHECKING:
|
|||||||
is_sigopt_available,
|
is_sigopt_available,
|
||||||
is_swanlab_available,
|
is_swanlab_available,
|
||||||
is_tensorboard_available,
|
is_tensorboard_available,
|
||||||
|
is_trackio_available,
|
||||||
is_wandb_available,
|
is_wandb_available,
|
||||||
rewrite_logs,
|
rewrite_logs,
|
||||||
run_hp_search_optuna,
|
run_hp_search_optuna,
|
||||||
|
|||||||
@@ -56,6 +56,7 @@ logger = logging.get_logger(__name__)
|
|||||||
|
|
||||||
if is_torch_available():
|
if is_torch_available():
|
||||||
import torch
|
import torch
|
||||||
|
import torch.distributed as dist
|
||||||
|
|
||||||
# comet_ml requires to be imported before any ML frameworks
|
# comet_ml requires to be imported before any ML frameworks
|
||||||
_MIN_COMET_VERSION = "3.43.2"
|
_MIN_COMET_VERSION = "3.43.2"
|
||||||
@@ -111,6 +112,10 @@ def is_wandb_available():
|
|||||||
return importlib.util.find_spec("wandb") is not None
|
return importlib.util.find_spec("wandb") is not None
|
||||||
|
|
||||||
|
|
||||||
|
def is_trackio_available():
    """Return `True` when a `trackio` module spec can be found, `False` otherwise."""
    trackio_spec = importlib.util.find_spec("trackio")
    return trackio_spec is not None
|
||||||
|
|
||||||
|
|
||||||
def is_clearml_available():
|
def is_clearml_available():
|
||||||
return importlib.util.find_spec("clearml") is not None
|
return importlib.util.find_spec("clearml") is not None
|
||||||
|
|
||||||
@@ -630,6 +635,8 @@ def get_available_reporting_integrations():
|
|||||||
integrations.append("clearml")
|
integrations.append("clearml")
|
||||||
if is_swanlab_available():
|
if is_swanlab_available():
|
||||||
integrations.append("swanlab")
|
integrations.append("swanlab")
|
||||||
|
if is_trackio_available():
|
||||||
|
integrations.append("trackio")
|
||||||
return integrations
|
return integrations
|
||||||
|
|
||||||
|
|
||||||
@@ -1033,6 +1040,115 @@ class WandbCallback(TrainerCallback):
|
|||||||
self._wandb.log(metrics)
|
self._wandb.log(metrics)
|
||||||
|
|
||||||
|
|
||||||
|
class TrackioCallback(TrainerCallback):
    """
    A [`TrainerCallback`] that logs metrics to Trackio.

    The Trackio run is created lazily by [`~TrackioCallback.setup`] the first time a training, logging or prediction
    event is received. Only the world-process-zero rank talks to Trackio; the other ranks only take part in gathering
    GPU memory statistics.
    """

    def __init__(self):
        if not is_trackio_available():
            raise RuntimeError("TrackioCallback requires trackio to be installed. Run `pip install trackio`.")
        # Imported here (not at module level) so that this module stays importable without trackio installed.
        import trackio

        self._trackio = trackio
        self._initialized = False

    def setup(self, args, state, model, **kwargs):
        """
        Setup the optional Trackio integration.

        To customize the setup you can also override the following environment variables:

        Environment:
        - **TRACKIO_PROJECT** (`str`, *optional*, defaults to `"huggingface"`):
            The name of the project (can be an existing project to continue tracking or a new project to start tracking
            from scratch).
        - **TRACKIO_SPACE_ID** (`str`, *optional*, defaults to `None`):
            If set, the project will be logged to a Hugging Face Space instead of a local directory. Should be a
            complete Space name like `"username/reponame"` or `"orgname/reponame"`, or just `"reponame"` in which case
            the Space will be created in the currently-logged-in Hugging Face user's namespace. If the Space does not
            exist, it will be created. If the Space already exists, the project will be logged to it.
        """
        if state.is_world_process_zero:
            combined_dict = {**args.to_dict()}

            # Training arguments take precedence over model/PEFT config entries with the same key.
            if hasattr(model, "config") and model.config is not None:
                model_config = model.config if isinstance(model.config, dict) else model.config.to_dict()
                combined_dict = {**model_config, **combined_dict}
            if hasattr(model, "peft_config") and model.peft_config is not None:
                peft_config = model.peft_config
                combined_dict = {**{"peft_config": peft_config}, **combined_dict}

            self._trackio.init(
                project=os.getenv("TRACKIO_PROJECT", "huggingface"),
                name=args.run_name,
                space_id=os.getenv("TRACKIO_SPACE_ID", None),
                resume="allow",
            )

            # Add config parameters (run may have been created manually)
            self._trackio.config.update(combined_dict, allow_val_change=True)

            # Add number of model parameters to trackio config
            try:
                self._trackio.config["model/num_parameters"] = model.num_parameters()
            except AttributeError:
                logger.info("Could not log the number of model parameters in Trackio due to an AttributeError.")

        # Marked on every rank so that non-zero ranks do not re-enter `setup` on each event.
        self._initialized = True

    def on_train_begin(self, args, state, control, model=None, **kwargs):
        """Initialize the Trackio run at the start of training if it has not been set up yet."""
        if not self._initialized:
            self.setup(args, state, model, **kwargs)

    def on_train_end(self, args: TrainingArguments, state, control, model=None, processing_class=None, **kwargs):
        """Finish the Trackio run on the world-process-zero rank once training ends."""
        if state.is_world_process_zero and self._initialized:
            self._trackio.finish()

    def on_log(self, args, state, control, model=None, logs=None, **kwargs):
        """
        Log `logs`, GPU memory statistics (when CUDA is available) and the global step to Trackio.

        End-of-training summary scalars (`train_runtime`, `train_loss`, ...) are filtered out of the logged metrics.
        """
        single_value_scalars = [
            "train_runtime",
            "train_samples_per_second",
            "train_steps_per_second",
            "train_loss",
            "total_flos",
        ]
        if logs is None:
            logs = {}

        # Always defined so that CPU-only and non-distributed runs do not fail when building the payload below.
        gpu_memory_logs = {}
        if is_torch_available() and torch.cuda.is_available():
            device_idx = torch.cuda.current_device()
            total_memory = torch.cuda.get_device_properties(device_idx).total_memory
            memory_allocated = torch.cuda.memory_allocated(device_idx)

            gpu_memory_logs = {
                f"gpu/{device_idx}/allocated_memory": memory_allocated / (1024**3),  # GB
                f"gpu/{device_idx}/memory_usage": memory_allocated / total_memory,  # ratio
            }

            # Collective call: it runs before the rank-zero check because every rank has to participate.
            if dist.is_available() and dist.is_initialized():
                gathered_logs = [None] * dist.get_world_size()
                dist.all_gather_object(gathered_logs, gpu_memory_logs)
                gpu_memory_logs = {k: v for d in gathered_logs for k, v in d.items()}

        if not self._initialized:
            self.setup(args, state, model)
        if state.is_world_process_zero:
            non_scalar_logs = {k: v for k, v in logs.items() if k not in single_value_scalars}
            non_scalar_logs = rewrite_logs(non_scalar_logs)
            self._trackio.log({**non_scalar_logs, **gpu_memory_logs, "train/global_step": state.global_step})

    def on_save(self, args, state, control, **kwargs):
        """No-op: nothing is sent to Trackio when a checkpoint is saved."""
        return

    def on_predict(self, args, state, control, metrics, **kwargs):
        """Log prediction `metrics` to Trackio from the world-process-zero rank."""
        # `__init__` either sets `_trackio` or raises, so this only triggers if the attribute was cleared afterwards.
        if self._trackio is None:
            return
        if not self._initialized:
            # `setup` requires `model` positionally; fall back to `None` when the caller did not provide one.
            setup_kwargs = dict(kwargs)
            model = setup_kwargs.pop("model", None)
            self.setup(args, state, model, **setup_kwargs)
        if state.is_world_process_zero:
            metrics = rewrite_logs(metrics)
            self._trackio.log(metrics)
|
||||||
|
|
||||||
|
|
||||||
class CometCallback(TrainerCallback):
|
class CometCallback(TrainerCallback):
|
||||||
"""
|
"""
|
||||||
A [`TrainerCallback`] that sends the logs to [Comet ML](https://www.comet.com/site/).
|
A [`TrainerCallback`] that sends the logs to [Comet ML](https://www.comet.com/site/).
|
||||||
@@ -2329,6 +2445,7 @@ INTEGRATION_TO_CALLBACK = {
|
|||||||
"mlflow": MLflowCallback,
|
"mlflow": MLflowCallback,
|
||||||
"neptune": NeptuneCallback,
|
"neptune": NeptuneCallback,
|
||||||
"tensorboard": TensorBoardCallback,
|
"tensorboard": TensorBoardCallback,
|
||||||
|
"trackio": TrackioCallback,
|
||||||
"wandb": WandbCallback,
|
"wandb": WandbCallback,
|
||||||
"codecarbon": CodeCarbonCallback,
|
"codecarbon": CodeCarbonCallback,
|
||||||
"clearml": ClearMLCallback,
|
"clearml": ClearMLCallback,
|
||||||
|
|||||||
@@ -59,6 +59,7 @@ from .integrations import (
|
|||||||
is_sigopt_available,
|
is_sigopt_available,
|
||||||
is_swanlab_available,
|
is_swanlab_available,
|
||||||
is_tensorboard_available,
|
is_tensorboard_available,
|
||||||
|
is_trackio_available,
|
||||||
is_wandb_available,
|
is_wandb_available,
|
||||||
)
|
)
|
||||||
from .integrations.deepspeed import is_deepspeed_available
|
from .integrations.deepspeed import is_deepspeed_available
|
||||||
@@ -1274,6 +1275,16 @@ def require_swanlab(test_case):
|
|||||||
return unittest.skipUnless(is_swanlab_available(), "test requires swanlab")(test_case)
|
return unittest.skipUnless(is_swanlab_available(), "test requires swanlab")(test_case)
|
||||||
|
|
||||||
|
|
||||||
|
def require_trackio(test_case):
    """
    Decorator marking a test that requires trackio.

    The decorated test is skipped with the reason "test requires trackio" when `is_trackio_available()` is false.

    """
    skip_unless_trackio = unittest.skipUnless(is_trackio_available(), "test requires trackio")
    return skip_unless_trackio(test_case)
|
||||||
|
|
||||||
|
|
||||||
def require_wandb(test_case):
|
def require_wandb(test_case):
|
||||||
"""
|
"""
|
||||||
Decorator marking a test that requires wandb.
|
Decorator marking a test that requires wandb.
|
||||||
|
|||||||
@@ -438,9 +438,9 @@ class TrainingArguments:
|
|||||||
use the corresponding output (usually index 2) as the past state and feed it to the model at the next
|
use the corresponding output (usually index 2) as the past state and feed it to the model at the next
|
||||||
training step under the keyword argument `mems`.
|
training step under the keyword argument `mems`.
|
||||||
run_name (`str`, *optional*, defaults to `output_dir`):
|
run_name (`str`, *optional*, defaults to `output_dir`):
|
||||||
A descriptor for the run. Typically used for [wandb](https://www.wandb.com/),
|
A descriptor for the run. Typically used for [trackio](https://github.com/gradio-app/trackio),
|
||||||
[mlflow](https://www.mlflow.org/), [comet](https://www.comet.com/site) and [swanlab](https://swanlab.cn)
|
[wandb](https://www.wandb.com/), [mlflow](https://www.mlflow.org/), [comet](https://www.comet.com/site) and
|
||||||
logging. If not specified, will be the same as `output_dir`.
|
[swanlab](https://swanlab.cn) logging. If not specified, will be the same as `output_dir`.
|
||||||
disable_tqdm (`bool`, *optional*):
|
disable_tqdm (`bool`, *optional*):
|
||||||
Whether or not to disable the tqdm progress bars and table of metrics produced by
|
Whether or not to disable the tqdm progress bars and table of metrics produced by
|
||||||
[`~notebook.NotebookTrainingTracker`] in Jupyter Notebooks. Will default to `True` if the logging level is
|
[`~notebook.NotebookTrainingTracker`] in Jupyter Notebooks. Will default to `True` if the logging level is
|
||||||
@@ -626,8 +626,8 @@ class TrainingArguments:
|
|||||||
report_to (`str` or `list[str]`, *optional*, defaults to `"all"`):
|
report_to (`str` or `list[str]`, *optional*, defaults to `"all"`):
|
||||||
The list of integrations to report the results and logs to. Supported platforms are `"azure_ml"`,
|
The list of integrations to report the results and logs to. Supported platforms are `"azure_ml"`,
|
||||||
`"clearml"`, `"codecarbon"`, `"comet_ml"`, `"dagshub"`, `"dvclive"`, `"flyte"`, `"mlflow"`, `"neptune"`,
|
`"clearml"`, `"codecarbon"`, `"comet_ml"`, `"dagshub"`, `"dvclive"`, `"flyte"`, `"mlflow"`, `"neptune"`,
|
||||||
`"swanlab"`, `"tensorboard"`, and `"wandb"`. Use `"all"` to report to all integrations installed, `"none"`
|
`"swanlab"`, `"tensorboard"`, `"trackio"` and `"wandb"`. Use `"all"` to report to all integrations
|
||||||
for no integrations.
|
installed, `"none"` for no integrations.
|
||||||
ddp_find_unused_parameters (`bool`, *optional*):
|
ddp_find_unused_parameters (`bool`, *optional*):
|
||||||
When using distributed training, the value of the flag `find_unused_parameters` passed to
|
When using distributed training, the value of the flag `find_unused_parameters` passed to
|
||||||
`DistributedDataParallel`. Will default to `False` if gradient checkpointing is used, `True` otherwise.
|
`DistributedDataParallel`. Will default to `False` if gradient checkpointing is used, `True` otherwise.
|
||||||
@@ -1182,7 +1182,10 @@ class TrainingArguments:
|
|||||||
run_name: Optional[str] = field(
|
run_name: Optional[str] = field(
|
||||||
default=None,
|
default=None,
|
||||||
metadata={
|
metadata={
|
||||||
"help": "An optional descriptor for the run. Notably used for wandb, mlflow comet and swanlab logging."
|
"help": (
|
||||||
|
"An optional descriptor for the run. Notably used for trackio, wandb, mlflow comet and swanlab "
|
||||||
|
"logging."
|
||||||
|
)
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
disable_tqdm: Optional[bool] = field(
|
disable_tqdm: Optional[bool] = field(
|
||||||
@@ -2838,8 +2841,8 @@ class TrainingArguments:
|
|||||||
report_to (`str` or `list[str]`, *optional*, defaults to `"all"`):
|
report_to (`str` or `list[str]`, *optional*, defaults to `"all"`):
|
||||||
The list of integrations to report the results and logs to. Supported platforms are `"azure_ml"`,
|
The list of integrations to report the results and logs to. Supported platforms are `"azure_ml"`,
|
||||||
`"clearml"`, `"codecarbon"`, `"comet_ml"`, `"dagshub"`, `"dvclive"`, `"flyte"`, `"mlflow"`,
|
`"clearml"`, `"codecarbon"`, `"comet_ml"`, `"dagshub"`, `"dvclive"`, `"flyte"`, `"mlflow"`,
|
||||||
`"neptune"`, `"swanlab"`, `"tensorboard"`, and `"wandb"`. Use `"all"` to report to all integrations
|
`"neptune"`, `"swanlab"`, `"tensorboard"`, `"trackio"` and `"wandb"`. Use `"all"` to report to all
|
||||||
installed, `"none"` for no integrations.
|
integrations installed, `"none"` for no integrations.
|
||||||
first_step (`bool`, *optional*, defaults to `False`):
|
first_step (`bool`, *optional*, defaults to `False`):
|
||||||
Whether to log and evaluate the first `global_step` or not.
|
Whether to log and evaluate the first `global_step` or not.
|
||||||
nan_inf_filter (`bool`, *optional*, defaults to `True`):
|
nan_inf_filter (`bool`, *optional*, defaults to `True`):
|
||||||
|
|||||||
@@ -160,7 +160,7 @@ class TFTrainingArguments(TrainingArguments):
|
|||||||
Google Cloud Project name for the Cloud TPU-enabled project. If not specified, we will attempt to
|
Google Cloud Project name for the Cloud TPU-enabled project. If not specified, we will attempt to
|
||||||
automatically detect from metadata.
|
automatically detect from metadata.
|
||||||
run_name (`str`, *optional*):
|
run_name (`str`, *optional*):
|
||||||
A descriptor for the run. Notably used for wandb, mlflow, comet and swanlab logging.
|
A descriptor for the run. Notably used for trackio, wandb, mlflow, comet and swanlab logging.
|
||||||
xla (`bool`, *optional*):
|
xla (`bool`, *optional*):
|
||||||
Whether to activate the XLA compilation or not.
|
Whether to activate the XLA compilation or not.
|
||||||
"""
|
"""
|
||||||
|
|||||||
Reference in New Issue
Block a user