From 777b1bfe6236ca7034d5eb6358e37c2345101886 Mon Sep 17 00:00:00 2001 From: Muhammad Sakib Khan Inan Date: Tue, 15 Nov 2022 21:08:59 +0600 Subject: [PATCH] New logging support to "Trainer" Class (ClearML Logger) (#20184) * Init Update * ClearML Callbacks integration * update corrections * args reporting updated * {'tensorboard': False, 'pytorch': False} * ClearML Tests added * add clearml * output_uri=True in Task.init * reformatted integrations.py * reformatted and fixed * IF-ELSE statement issue on "has_clearml" resolved * Add clearml in main callback docs * Add additional clearml documentation * Update src/transformers/integrations.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Accept suggestion Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Accept suggestion Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Small change in comments * Make style clearml * Accept suggestion Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Co-authored-by: Victor Sonck Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- docs/source/en/main_classes/callback.mdx | 3 + examples/pytorch/README.md | 38 ++++++ .../run_image_classification_no_trainer.py | 2 +- .../language-modeling/run_clm_no_trainer.py | 2 +- .../language-modeling/run_mlm_no_trainer.py | 2 +- .../multiple-choice/run_swag_no_trainer.py | 2 +- .../question-answering/run_qa_no_trainer.py | 2 +- .../run_semantic_segmentation_no_trainer.py | 2 +- .../run_summarization_no_trainer.py | 2 +- .../run_glue_no_trainer.py | 2 +- .../run_ner_no_trainer.py | 2 +- .../translation/run_translation_no_trainer.py | 2 +- src/transformers/__init__.py | 2 + src/transformers/integrations.py | 114 ++++++++++++++++++ src/transformers/testing_utils.py | 11 ++ src/transformers/training_args.py | 4 +- 16 files changed, 180 insertions(+), 12 deletions(-) diff --git a/docs/source/en/main_classes/callback.mdx 
b/docs/source/en/main_classes/callback.mdx index 7c5b48f5d4..f591f80d25 100644 --- a/docs/source/en/main_classes/callback.mdx +++ b/docs/source/en/main_classes/callback.mdx @@ -37,6 +37,7 @@ By default a [`Trainer`] will use the following callbacks: installed. - [`~integrations.CodeCarbonCallback`] if [codecarbon](https://pypi.org/project/codecarbon/) is installed. +- [`~integrations.ClearMLCallback`] if [clearml](https://github.com/allegroai/clearml) is installed. The main class that implements callbacks is [`TrainerCallback`]. It gets the [`TrainingArguments`] used to instantiate the [`Trainer`], can access that @@ -73,6 +74,8 @@ Here is the list of the available [`TrainerCallback`] in the library: [[autodoc]] integrations.NeptuneCallback +[[autodoc]] integrations.ClearMLCallback + ## TrainerCallback [[autodoc]] TrainerCallback diff --git a/examples/pytorch/README.md b/examples/pytorch/README.md index b0b099bae2..e1c6c01c0d 100644 --- a/examples/pytorch/README.md +++ b/examples/pytorch/README.md @@ -199,6 +199,7 @@ You can easily log and monitor your runs code. The following are currently suppo * [Weights & Biases](https://docs.wandb.ai/integrations/huggingface) * [Comet ML](https://www.comet.ml/docs/python-sdk/huggingface/) * [Neptune](https://docs.neptune.ai/integrations-and-supported-tools/model-training/hugging-face) +* [ClearML](https://clear.ml/docs/latest/docs/getting_started/ds/ds_first_steps) ### Weights & Biases @@ -335,3 +336,40 @@ Now, when you start the training with `trainer.train()`, your metadata will be l | `NEPTUNE_PROJECT` | The full name of your Neptune project (`workspace-name/project-name`). To find and copy it, head to **project settings** → **Properties**. | For detailed instructions and examples, see the [Neptune docs](https://docs.neptune.ai/integrations-and-supported-tools/model-training/hugging-face). 
+ +### ClearML + +To use ClearML, install the clearml package with: + +```bash +pip install clearml +``` + +Then [create new credentials](https://app.clear.ml/settings/workspace-configuration) from the ClearML Server. You can get a free hosted server [here](https://app.clear.ml/) or [self-host your own](https://clear.ml/docs/latest/docs/deploying_clearml/clearml_server)! +After creating your new credentials, you can either copy the local snippet which you can paste after running: + +```bash +clearml-init +``` + +Or you can copy the jupyter snippet if you are in Jupyter or Colab: + +```python +%env CLEARML_WEB_HOST=https://app.clear.ml +%env CLEARML_API_HOST=https://api.clear.ml +%env CLEARML_FILES_HOST=https://files.clear.ml +%env CLEARML_API_ACCESS_KEY=*** +%env CLEARML_API_SECRET_KEY=*** +``` + + +To enable logging to ClearML, include `"clearml"` in the `report_to` of your `TrainingArguments` or script. Or just pass along `--report_to all` if you have `clearml` already installed. + +Advanced configuration is possible by setting environment variables: + +| Environment Variable | Value | +|---|---| +| CLEARML_PROJECT | Name of the project in ClearML. (default: `"HuggingFace Transformers"`) | +| CLEARML_TASK | Name of the task in ClearML. (default: `"Trainer"`) | + +Additional configuration options are available through generic [clearml environment variables](https://clear.ml/docs/latest/docs/configs/env_vars). \ No newline at end of file diff --git a/examples/pytorch/image-classification/run_image_classification_no_trainer.py b/examples/pytorch/image-classification/run_image_classification_no_trainer.py index 4ce684d981..4f0b8304b7 100644 --- a/examples/pytorch/image-classification/run_image_classification_no_trainer.py +++ b/examples/pytorch/image-classification/run_image_classification_no_trainer.py @@ -175,7 +175,7 @@ def parse_args(): default="all", help=( 'The integration to report the results and logs to. Supported platforms are `"tensorboard"`,' - ' `"wandb"` and `"comet_ml"`. Use `"all"` (default) to report to all integrations.' + ' `"wandb"`, `"comet_ml"` and `"clearml"`. 
Use `"all"` (default) to report to all integrations.' "Only applicable when `--with_tracking` is passed." ), ) diff --git a/examples/pytorch/language-modeling/run_clm_no_trainer.py b/examples/pytorch/language-modeling/run_clm_no_trainer.py index c62b895249..77785b711e 100755 --- a/examples/pytorch/language-modeling/run_clm_no_trainer.py +++ b/examples/pytorch/language-modeling/run_clm_no_trainer.py @@ -216,7 +216,7 @@ def parse_args(): default="all", help=( 'The integration to report the results and logs to. Supported platforms are `"tensorboard"`,' - ' `"wandb"` and `"comet_ml"`. Use `"all"` (default) to report to all integrations.' + ' `"wandb"`, `"comet_ml"` and `"clearml"`. Use `"all"` (default) to report to all integrations.' "Only applicable when `--with_tracking` is passed." ), ) diff --git a/examples/pytorch/language-modeling/run_mlm_no_trainer.py b/examples/pytorch/language-modeling/run_mlm_no_trainer.py index 4a2d5490e1..fafa21c4ca 100755 --- a/examples/pytorch/language-modeling/run_mlm_no_trainer.py +++ b/examples/pytorch/language-modeling/run_mlm_no_trainer.py @@ -223,7 +223,7 @@ def parse_args(): default="all", help=( 'The integration to report the results and logs to. Supported platforms are `"tensorboard"`,' - ' `"wandb"` and `"comet_ml"`. Use `"all"` (default) to report to all integrations.' + ' `"wandb"`, `"comet_ml"` and `"clearml"`. Use `"all"` (default) to report to all integrations.' "Only applicable when `--with_tracking` is passed." ), ) diff --git a/examples/pytorch/multiple-choice/run_swag_no_trainer.py b/examples/pytorch/multiple-choice/run_swag_no_trainer.py index 607328f99d..1093e92bd5 100755 --- a/examples/pytorch/multiple-choice/run_swag_no_trainer.py +++ b/examples/pytorch/multiple-choice/run_swag_no_trainer.py @@ -205,7 +205,7 @@ def parse_args(): default="all", help=( 'The integration to report the results and logs to. Supported platforms are `"tensorboard"`,' - ' `"wandb"` and `"comet_ml"`. 
Use `"all"` (default) to report to all integrations.' + ' `"wandb"`, `"comet_ml"` and `"clearml"`. Use `"all"` (default) to report to all integrations.' "Only applicable when `--with_tracking` is passed." ), ) diff --git a/examples/pytorch/question-answering/run_qa_no_trainer.py b/examples/pytorch/question-answering/run_qa_no_trainer.py index 85ef6f6a11..e4276faade 100755 --- a/examples/pytorch/question-answering/run_qa_no_trainer.py +++ b/examples/pytorch/question-answering/run_qa_no_trainer.py @@ -296,7 +296,7 @@ def parse_args(): default="all", help=( 'The integration to report the results and logs to. Supported platforms are `"tensorboard"`,' - ' `"wandb"` and `"comet_ml"`. Use `"all"` (default) to report to all integrations.' + ' `"wandb"`, `"comet_ml"` and `"clearml"`. Use `"all"` (default) to report to all integrations.' "Only applicable when `--with_tracking` is passed." ), ) diff --git a/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py b/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py index 49d6eac687..ce2e58b85a 100644 --- a/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py +++ b/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py @@ -297,7 +297,7 @@ def parse_args(): default="all", help=( 'The integration to report the results and logs to. Supported platforms are `"tensorboard"`,' - ' `"wandb"` and `"comet_ml"`. Use `"all"` (default) to report to all integrations.' + ' `"wandb"`, `"comet_ml"` and `"clearml"`. Use `"all"` (default) to report to all integrations.' "Only applicable when `--with_tracking` is passed." 
), ) diff --git a/examples/pytorch/summarization/run_summarization_no_trainer.py b/examples/pytorch/summarization/run_summarization_no_trainer.py index 6e79b2f6c2..786af5d91a 100644 --- a/examples/pytorch/summarization/run_summarization_no_trainer.py +++ b/examples/pytorch/summarization/run_summarization_no_trainer.py @@ -298,7 +298,7 @@ def parse_args(): default="all", help=( 'The integration to report the results and logs to. Supported platforms are `"tensorboard"`,' - ' `"wandb"` and `"comet_ml"`. Use `"all"` (default) to report to all integrations.' + ' `"wandb"`, `"comet_ml"` and `"clearml"`. Use `"all"` (default) to report to all integrations.' "Only applicable when `--with_tracking` is passed." ), ) diff --git a/examples/pytorch/text-classification/run_glue_no_trainer.py b/examples/pytorch/text-classification/run_glue_no_trainer.py index 8696b5c956..2aacc4d2de 100644 --- a/examples/pytorch/text-classification/run_glue_no_trainer.py +++ b/examples/pytorch/text-classification/run_glue_no_trainer.py @@ -179,7 +179,7 @@ def parse_args(): default="all", help=( 'The integration to report the results and logs to. Supported platforms are `"tensorboard"`,' - ' `"wandb"` and `"comet_ml"`. Use `"all"` (default) to report to all integrations.' + ' `"wandb"`, `"comet_ml"` and `"clearml"`. Use `"all"` (default) to report to all integrations.' "Only applicable when `--with_tracking` is passed." ), ) diff --git a/examples/pytorch/token-classification/run_ner_no_trainer.py b/examples/pytorch/token-classification/run_ner_no_trainer.py index 0f7f9ba291..02bc773fed 100755 --- a/examples/pytorch/token-classification/run_ner_no_trainer.py +++ b/examples/pytorch/token-classification/run_ner_no_trainer.py @@ -232,7 +232,7 @@ def parse_args(): default="all", help=( 'The integration to report the results and logs to. Supported platforms are `"tensorboard"`,' - ' `"wandb"` and `"comet_ml"`. Use `"all"` (default) to report to all integrations.' 
+ ' `"wandb"`, `"comet_ml"` and `"clearml"`. Use `"all"` (default) to report to all integrations.' "Only applicable when `--with_tracking` is passed." ), ) diff --git a/examples/pytorch/translation/run_translation_no_trainer.py b/examples/pytorch/translation/run_translation_no_trainer.py index eefd00b686..5d9376fb0d 100644 --- a/examples/pytorch/translation/run_translation_no_trainer.py +++ b/examples/pytorch/translation/run_translation_no_trainer.py @@ -281,7 +281,7 @@ def parse_args(): default="all", help=( 'The integration to report the results and logs to. Supported platforms are `"tensorboard"`,' - ' `"wandb"` and `"comet_ml"`. Use `"all"` (default) to report to all integrations.' + ' `"wandb"`, `"comet_ml"` and `"clearml"`. Use `"all"` (default) to report to all integrations.' "Only applicable when `--with_tracking` is passed." ), ) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index b5d9b24761..9fa910fda8 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -99,6 +99,7 @@ _import_structure = { "generation": [], "hf_argparser": ["HfArgumentParser"], "integrations": [ + "is_clearml_available", "is_comet_available", "is_neptune_available", "is_optuna_available", @@ -3239,6 +3240,7 @@ if TYPE_CHECKING: # Integrations from .integrations import ( + is_clearml_available, is_comet_available, is_neptune_available, is_optuna_available, diff --git a/src/transformers/integrations.py b/src/transformers/integrations.py index 94815d839e..73abadf11e 100644 --- a/src/transformers/integrations.py +++ b/src/transformers/integrations.py @@ -75,6 +75,10 @@ def is_wandb_available(): return importlib.util.find_spec("wandb") is not None +def is_clearml_available(): + return importlib.util.find_spec("clearml") is not None + + def is_comet_available(): return _has_comet @@ -528,6 +532,8 @@ def get_available_reporting_integrations(): integrations.append("wandb") if is_codecarbon_available(): integrations.append("codecarbon") + if 
is_clearml_available(): + integrations.append("clearml") return integrations @@ -1299,6 +1305,112 @@ class CodeCarbonCallback(TrainerCallback): self.tracker.stop() +class ClearMLCallback(TrainerCallback): + """ + A [`TrainerCallback`] that sends the logs to [ClearML](https://clear.ml/). + + Environment: + CLEARML_PROJECT (`str`, *optional*, defaults to `"HuggingFace Transformers"`): + ClearML project name. + CLEARML_TASK (`str`, *optional*, defaults to `"Trainer"`): + ClearML task name. + """ + + def __init__(self): + if is_clearml_available(): + import clearml + + self._clearml = clearml + else: + raise RuntimeError("ClearMLCallback requires 'clearml' to be installed. Run `pip install clearml`.") + + self._initialized = False + self._clearml_task = None + + def setup(self, args, state, model, tokenizer, **kwargs): + if self._clearml is None: + return + if state.is_world_process_zero: + logger.info("Automatic ClearML logging enabled.") + if self._clearml_task is None: + self._clearml_task = self._clearml.Task.init( + project_name=os.getenv("CLEARML_PROJECT", "HuggingFace Transformers"), + task_name=os.getenv("CLEARML_TASK", "Trainer"), + auto_connect_frameworks={"tensorboard": False, "pytorch": False}, + output_uri=True, + ) + self._initialized = True + logger.info("ClearML Task has been initialized.") + + self._clearml_task.connect(args, "Args") + if hasattr(model, "config") and model.config is not None: + self._clearml_task.connect(model.config, "Model Configuration") + + def on_train_begin(self, args, state, control, model=None, tokenizer=None, **kwargs): + if self._clearml is None: + return + if state.is_hyper_param_search: + self._initialized = False + if not self._initialized: + self.setup(args, state, model, tokenizer, **kwargs) + + def on_train_end(self, args, state, control, model=None, tokenizer=None, metrics=None, logs=None, **kwargs): + if self._clearml is None: + return + if self._clearml_task and state.is_world_process_zero: + # Close ClearML Task at
the end of training + self._clearml_task.close() + + def on_log(self, args, state, control, model=None, tokenizer=None, logs=None, **kwargs): + if self._clearml is None: + return + if not self._initialized: + self.setup(args, state, model, tokenizer, **kwargs) + if state.is_world_process_zero: + eval_prefix = "eval_" + eval_prefix_len = len(eval_prefix) + test_prefix = "test_" + test_prefix_len = len(test_prefix) + single_value_scalars = [ + "train_runtime", + "train_samples_per_second", + "train_steps_per_second", + "train_loss", + "total_flos", + "epoch", + ] + for k, v in logs.items(): + if isinstance(v, (int, float)): + if k in single_value_scalars: + self._clearml_task.get_logger().report_single_value(name=k, value=v) + elif k.startswith(eval_prefix): + self._clearml_task.get_logger().report_scalar( + title=k[eval_prefix_len:], series="eval", value=v, iteration=state.global_step + ) + elif k.startswith(test_prefix): + self._clearml_task.get_logger().report_scalar( + title=k[test_prefix_len:], series="test", value=v, iteration=state.global_step + ) + else: + self._clearml_task.get_logger().report_scalar( + title=k, series="train", value=v, iteration=state.global_step + ) + else: + logger.warning( + "Trainer is attempting to log a value of " + f'"{v}" of type {type(v)} for key "{k}" as a scalar. ' + "This invocation of ClearML logger's report_scalar() " + "is incorrect so we dropped this attribute." + ) + + def on_save(self, args, state, control, **kwargs): + if self._clearml_task and state.is_world_process_zero: + ckpt_dir = f"checkpoint-{state.global_step}" + artifact_path = os.path.join(args.output_dir, ckpt_dir) + logger.info(f"Logging checkpoint artifacts in {ckpt_dir}. 
This may take time.") + self._clearml_task.update_output_model(artifact_path, iteration=state.global_step, auto_delete_file=False) + + INTEGRATION_TO_CALLBACK = { "azure_ml": AzureMLCallback, "comet_ml": CometCallback, @@ -1307,6 +1419,7 @@ INTEGRATION_TO_CALLBACK = { "tensorboard": TensorBoardCallback, "wandb": WandbCallback, "codecarbon": CodeCarbonCallback, + "clearml": ClearMLCallback, } @@ -1316,4 +1429,5 @@ def get_reporting_integration_callbacks(report_to): raise ValueError( f"{integration} is not supported, only {', '.join(INTEGRATION_TO_CALLBACK.keys())} are supported." ) + return [INTEGRATION_TO_CALLBACK[integration] for integration in report_to] diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py index eb69e7d241..eba7062840 100644 --- a/src/transformers/testing_utils.py +++ b/src/transformers/testing_utils.py @@ -39,6 +39,7 @@ from transformers import logging as transformers_logging from .deepspeed import is_deepspeed_available from .integrations import ( + is_clearml_available, is_fairscale_available, is_optuna_available, is_ray_available, @@ -579,6 +580,16 @@ def require_wandb(test_case): return unittest.skipUnless(is_wandb_available(), "test requires wandb")(test_case) +def require_clearml(test_case): + """ + Decorator marking a test that requires clearml. + + These tests are skipped when clearml isn't installed. + + """ + return unittest.skipUnless(is_clearml_available(), "test requires clearml")(test_case) + + def require_soundfile(test_case): """ Decorator marking a test that requires soundfile diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py index fc5ace7526..0c3af0ae6f 100644 --- a/src/transformers/training_args.py +++ b/src/transformers/training_args.py @@ -413,8 +413,8 @@ class TrainingArguments: instance of `Dataset`. report_to (`str` or `List[str]`, *optional*, defaults to `"all"`): The list of integrations to report the results and logs to. 
Supported platforms are `"azure_ml"`, - `"comet_ml"`, `"mlflow"`, `"neptune"`, `"tensorboard"` and `"wandb"`. Use `"all"` to report to all - integrations installed, `"none"` for no integrations. + `"comet_ml"`, `"mlflow"`, `"neptune"`, `"tensorboard"`, `"clearml"` and `"wandb"`. Use `"all"` to report to + all integrations installed, `"none"` for no integrations. ddp_find_unused_parameters (`bool`, *optional*): When using distributed training, the value of the flag `find_unused_parameters` passed to `DistributedDataParallel`. Will default to `False` if gradient checkpointing is used, `True` otherwise.