From 777b1bfe6236ca7034d5eb6358e37c2345101886 Mon Sep 17 00:00:00 2001
From: Muhammad Sakib Khan Inan <sakib.khaninan@gmail.com>
Date: Tue, 15 Nov 2022 21:08:59 +0600
Subject: [PATCH] New logging support to "Trainer" Class (ClearML Logger)
 (#20184)

* Init Update

* ClearML Callbacks integration

* update corrections

* args reporting updated

* {'tensorboard': False, 'pytorch': False}

* ClearML Tests added

* add clearml

* output_uri=True in Task.init

* reformatted integrations.py

* reformatted and fixed

* IF-ELSE statement issue on "has_clearml" resolved

* Add clearml in main callback docs

* Add additional clearml documentation

* Update src/transformers/integrations.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Accept suggestion

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Accept suggestion

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Small change in comments

* Make style clearml

* Accept suggestion

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

Co-authored-by: Victor Sonck <victor.sonck@gmail.com>
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
---
 docs/source/en/main_classes/callback.mdx      |   3 +
 examples/pytorch/README.md                    |  38 ++++++
 .../run_image_classification_no_trainer.py    |   2 +-
 .../language-modeling/run_clm_no_trainer.py   |   2 +-
 .../language-modeling/run_mlm_no_trainer.py   |   2 +-
 .../multiple-choice/run_swag_no_trainer.py    |   2 +-
 .../question-answering/run_qa_no_trainer.py   |   2 +-
 .../run_semantic_segmentation_no_trainer.py   |   2 +-
 .../run_summarization_no_trainer.py           |   2 +-
 .../run_glue_no_trainer.py                    |   2 +-
 .../run_ner_no_trainer.py                     |   2 +-
 .../translation/run_translation_no_trainer.py |   2 +-
 src/transformers/__init__.py                  |   2 +
 src/transformers/integrations.py              | 114 ++++++++++++++++++
 src/transformers/testing_utils.py             |  11 ++
 src/transformers/training_args.py             |   4 +-
 16 files changed, 180 insertions(+), 12 deletions(-)

diff --git a/docs/source/en/main_classes/callback.mdx b/docs/source/en/main_classes/callback.mdx
index 7c5b48f5d4..f591f80d25 100644
--- a/docs/source/en/main_classes/callback.mdx
+++ b/docs/source/en/main_classes/callback.mdx
@@ -37,6 +37,7 @@ By default a [`Trainer`] will use the following callbacks:
   installed.
 - [`~integrations.CodeCarbonCallback`] if [codecarbon](https://pypi.org/project/codecarbon/) is
   installed.
+- [`~integrations.ClearMLCallback`] if [clearml](https://github.com/allegroai/clearml) is installed.
 
 The main class that implements callbacks is [`TrainerCallback`]. It gets the
 [`TrainingArguments`] used to instantiate the [`Trainer`], can access that
@@ -73,6 +74,8 @@ Here is the list of the available [`TrainerCallback`] in the library:
 
 [[autodoc]] integrations.NeptuneCallback
 
+[[autodoc]] integrations.ClearMLCallback
+
 ## TrainerCallback
 
 [[autodoc]] TrainerCallback
diff --git a/examples/pytorch/README.md b/examples/pytorch/README.md
index b0b099bae2..e1c6c01c0d 100644
--- a/examples/pytorch/README.md
+++ b/examples/pytorch/README.md
@@ -199,6 +199,7 @@ You can easily log and monitor your runs code. The following are currently suppo
 * [Weights & Biases](https://docs.wandb.ai/integrations/huggingface)
 * [Comet ML](https://www.comet.ml/docs/python-sdk/huggingface/)
 * [Neptune](https://docs.neptune.ai/integrations-and-supported-tools/model-training/hugging-face)
+* [ClearML](https://clear.ml/docs/latest/docs/getting_started/ds/ds_first_steps)
 
 ### Weights & Biases
 
@@ -335,3 +336,40 @@ Now, when you start the training with `trainer.train()`, your metadata will be l
 | `NEPTUNE_PROJECT` | The full name of your Neptune project (`workspace-name/project-name`). To find and copy it, head to **project settings** &rarr; **Properties**. |
 
 For detailed instructions and examples, see the [Neptune docs](https://docs.neptune.ai/integrations-and-supported-tools/model-training/hugging-face).
+
+### ClearML
+
+To use ClearML, install the clearml package with:
+
+```bash
+pip install clearml
+```
+
+Then [create new credentials]() from the ClearML Server. You can get a free hosted server [here]() or [self-host your own]()!
+After creating your new credentials, you can either copy the local snippet which you can paste after running:
+
+```bash
+clearml-init
+```
+
+Or you can copy the jupyter snippet if you are in Jupyter or Colab:
+
+```python
+%env CLEARML_WEB_HOST=https://app.clear.ml
+%env CLEARML_API_HOST=https://api.clear.ml
+%env CLEARML_FILES_HOST=https://files.clear.ml
+%env CLEARML_API_ACCESS_KEY=***
+%env CLEARML_API_SECRET_KEY=***
+```
+
+
+To enable logging to ClearML, include `"clearml"` in the `report_to` of your `TrainingArguments` or script. Or just pass along `--report_to all` if you have `clearml` already installed.
+
+Advanced configuration is possible by setting environment variables:
+
+| Environment Variable | Value |
+|---|---|
+| CLEARML_PROJECT    | Name of the project in ClearML. (default: `"HuggingFace Transformers"`) |
+| CLEARML_TASK       | Name of the task in ClearML. (default: `"Trainer"`) |
+
+Additional configuration options are available through generic [clearml environment variables](https://clear.ml/docs/latest/docs/configs/env_vars).
\ No newline at end of file
diff --git a/examples/pytorch/image-classification/run_image_classification_no_trainer.py b/examples/pytorch/image-classification/run_image_classification_no_trainer.py
index 4ce684d981..4f0b8304b7 100644
--- a/examples/pytorch/image-classification/run_image_classification_no_trainer.py
+++ b/examples/pytorch/image-classification/run_image_classification_no_trainer.py
@@ -175,7 +175,7 @@ def parse_args():
         default="all",
         help=(
             'The integration to report the results and logs to. Supported platforms are `"tensorboard"`,'
-            ' `"wandb"` and `"comet_ml"`. Use `"all"` (default) to report to all integrations.'
+            ' `"wandb"`, `"comet_ml"` and `"clearml"`. Use `"all"` (default) to report to all integrations.'
             "Only applicable when `--with_tracking` is passed."
         ),
     )
diff --git a/examples/pytorch/language-modeling/run_clm_no_trainer.py b/examples/pytorch/language-modeling/run_clm_no_trainer.py
index c62b895249..77785b711e 100755
--- a/examples/pytorch/language-modeling/run_clm_no_trainer.py
+++ b/examples/pytorch/language-modeling/run_clm_no_trainer.py
@@ -216,7 +216,7 @@ def parse_args():
         default="all",
         help=(
             'The integration to report the results and logs to. Supported platforms are `"tensorboard"`,'
-            ' `"wandb"` and `"comet_ml"`. Use `"all"` (default) to report to all integrations.'
+            ' `"wandb"`, `"comet_ml"` and `"clearml"`. Use `"all"` (default) to report to all integrations.'
             "Only applicable when `--with_tracking` is passed."
         ),
     )
diff --git a/examples/pytorch/language-modeling/run_mlm_no_trainer.py b/examples/pytorch/language-modeling/run_mlm_no_trainer.py
index 4a2d5490e1..fafa21c4ca 100755
--- a/examples/pytorch/language-modeling/run_mlm_no_trainer.py
+++ b/examples/pytorch/language-modeling/run_mlm_no_trainer.py
@@ -223,7 +223,7 @@ def parse_args():
         default="all",
         help=(
             'The integration to report the results and logs to. Supported platforms are `"tensorboard"`,'
-            ' `"wandb"` and `"comet_ml"`. Use `"all"` (default) to report to all integrations.'
+            ' `"wandb"`, `"comet_ml"` and `"clearml"`. Use `"all"` (default) to report to all integrations.'
             "Only applicable when `--with_tracking` is passed."
         ),
     )
diff --git a/examples/pytorch/multiple-choice/run_swag_no_trainer.py b/examples/pytorch/multiple-choice/run_swag_no_trainer.py
index 607328f99d..1093e92bd5 100755
--- a/examples/pytorch/multiple-choice/run_swag_no_trainer.py
+++ b/examples/pytorch/multiple-choice/run_swag_no_trainer.py
@@ -205,7 +205,7 @@ def parse_args():
         default="all",
         help=(
             'The integration to report the results and logs to. Supported platforms are `"tensorboard"`,'
-            ' `"wandb"` and `"comet_ml"`. Use `"all"` (default) to report to all integrations.'
+            ' `"wandb"`, `"comet_ml"` and `"clearml"`. Use `"all"` (default) to report to all integrations.'
             "Only applicable when `--with_tracking` is passed."
         ),
     )
diff --git a/examples/pytorch/question-answering/run_qa_no_trainer.py b/examples/pytorch/question-answering/run_qa_no_trainer.py
index 85ef6f6a11..e4276faade 100755
--- a/examples/pytorch/question-answering/run_qa_no_trainer.py
+++ b/examples/pytorch/question-answering/run_qa_no_trainer.py
@@ -296,7 +296,7 @@ def parse_args():
         default="all",
         help=(
             'The integration to report the results and logs to. Supported platforms are `"tensorboard"`,'
-            ' `"wandb"` and `"comet_ml"`. Use `"all"` (default) to report to all integrations.'
+            ' `"wandb"`, `"comet_ml"` and `"clearml"`. Use `"all"` (default) to report to all integrations.'
             "Only applicable when `--with_tracking` is passed."
         ),
     )
diff --git a/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py b/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py
index 49d6eac687..ce2e58b85a 100644
--- a/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py
+++ b/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py
@@ -297,7 +297,7 @@ def parse_args():
         default="all",
         help=(
             'The integration to report the results and logs to. Supported platforms are `"tensorboard"`,'
-            ' `"wandb"` and `"comet_ml"`. Use `"all"` (default) to report to all integrations.'
+            ' `"wandb"`, `"comet_ml"` and `"clearml"`. Use `"all"` (default) to report to all integrations.'
             "Only applicable when `--with_tracking` is passed."
         ),
     )
diff --git a/examples/pytorch/summarization/run_summarization_no_trainer.py b/examples/pytorch/summarization/run_summarization_no_trainer.py
index 6e79b2f6c2..786af5d91a 100644
--- a/examples/pytorch/summarization/run_summarization_no_trainer.py
+++ b/examples/pytorch/summarization/run_summarization_no_trainer.py
@@ -298,7 +298,7 @@ def parse_args():
         default="all",
         help=(
             'The integration to report the results and logs to. Supported platforms are `"tensorboard"`,'
-            ' `"wandb"` and `"comet_ml"`. Use `"all"` (default) to report to all integrations.'
+            ' `"wandb"`, `"comet_ml"` and `"clearml"`. Use `"all"` (default) to report to all integrations.'
             "Only applicable when `--with_tracking` is passed."
         ),
     )
diff --git a/examples/pytorch/text-classification/run_glue_no_trainer.py b/examples/pytorch/text-classification/run_glue_no_trainer.py
index 8696b5c956..2aacc4d2de 100644
--- a/examples/pytorch/text-classification/run_glue_no_trainer.py
+++ b/examples/pytorch/text-classification/run_glue_no_trainer.py
@@ -179,7 +179,7 @@ def parse_args():
         default="all",
         help=(
             'The integration to report the results and logs to. Supported platforms are `"tensorboard"`,'
-            ' `"wandb"` and `"comet_ml"`. Use `"all"` (default) to report to all integrations.'
+            ' `"wandb"`, `"comet_ml"` and `"clearml"`. Use `"all"` (default) to report to all integrations.'
             "Only applicable when `--with_tracking` is passed."
         ),
     )
diff --git a/examples/pytorch/token-classification/run_ner_no_trainer.py b/examples/pytorch/token-classification/run_ner_no_trainer.py
index 0f7f9ba291..02bc773fed 100755
--- a/examples/pytorch/token-classification/run_ner_no_trainer.py
+++ b/examples/pytorch/token-classification/run_ner_no_trainer.py
@@ -232,7 +232,7 @@ def parse_args():
         default="all",
         help=(
             'The integration to report the results and logs to. Supported platforms are `"tensorboard"`,'
-            ' `"wandb"` and `"comet_ml"`. Use `"all"` (default) to report to all integrations.'
+            ' `"wandb"`, `"comet_ml"` and `"clearml"`. Use `"all"` (default) to report to all integrations.'
             "Only applicable when `--with_tracking` is passed."
         ),
     )
diff --git a/examples/pytorch/translation/run_translation_no_trainer.py b/examples/pytorch/translation/run_translation_no_trainer.py
index eefd00b686..5d9376fb0d 100644
--- a/examples/pytorch/translation/run_translation_no_trainer.py
+++ b/examples/pytorch/translation/run_translation_no_trainer.py
@@ -281,7 +281,7 @@ def parse_args():
         default="all",
         help=(
             'The integration to report the results and logs to. Supported platforms are `"tensorboard"`,'
-            ' `"wandb"` and `"comet_ml"`. Use `"all"` (default) to report to all integrations.'
+            ' `"wandb"`, `"comet_ml"` and `"clearml"`. Use `"all"` (default) to report to all integrations.'
             "Only applicable when `--with_tracking` is passed."
         ),
     )
diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index b5d9b24761..9fa910fda8 100644
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -99,6 +99,7 @@ _import_structure = {
     "generation": [],
     "hf_argparser": ["HfArgumentParser"],
     "integrations": [
+        "is_clearml_available",
         "is_comet_available",
         "is_neptune_available",
         "is_optuna_available",
@@ -3239,6 +3240,7 @@ if TYPE_CHECKING:
 
     # Integrations
     from .integrations import (
+        is_clearml_available,
         is_comet_available,
         is_neptune_available,
         is_optuna_available,
diff --git a/src/transformers/integrations.py b/src/transformers/integrations.py
index 94815d839e..73abadf11e 100644
--- a/src/transformers/integrations.py
+++ b/src/transformers/integrations.py
@@ -75,6 +75,10 @@ def is_wandb_available():
     return importlib.util.find_spec("wandb") is not None
 
 
+def is_clearml_available():
+    return importlib.util.find_spec("clearml") is not None
+
+
 def is_comet_available():
     return _has_comet
 
@@ -528,6 +532,8 @@ def get_available_reporting_integrations():
         integrations.append("wandb")
     if is_codecarbon_available():
         integrations.append("codecarbon")
+    if is_clearml_available():
+        integrations.append("clearml")
     return integrations
 
 
@@ -1299,6 +1305,112 @@ class CodeCarbonCallback(TrainerCallback):
             self.tracker.stop()
 
 
+class ClearMLCallback(TrainerCallback):
+    """
+    A [`TrainerCallback`] that sends the logs to [ClearML](https://clear.ml/).
+
+    Environment:
+            CLEARML_PROJECT (`str`, *optional*, defaults to `"HuggingFace Transformers"`):
+                ClearML project name.
+            CLEARML_TASK (`str`, *optional* defaults to `"Trainer"`):
+                ClearML task name.
+    """
+
+    def __init__(self):
+        if is_clearml_available():
+            import clearml
+
+            self._clearml = clearml
+        else:
+            raise RuntimeError("ClearMLCallback requires 'clearml' to be installed. Run `pip install clearml`.")
+
+        self._initialized = False
+        self._clearml_task = None
+
+    def setup(self, args, state, model, tokenizer, **kwargs):
+        if self._clearml is None:
+            return
+        if state.is_world_process_zero:
+            logger.info("Automatic ClearML logging enabled.")
+            if self._clearml_task is None:
+                self._clearml_task = self._clearml.Task.init(
+                    project_name=os.getenv("CLEARML_PROJECT", "HuggingFace Transformers"),
+                    task_name=os.getenv("CLEARML_TASK", "Trainer"),
+                    auto_connect_frameworks={"tensorboard": False, "pytorch": False},
+                    output_uri=True,
+                )
+                self._initialized = True
+                logger.info("ClearML Task has been initialized.")
+
+            self._clearml_task.connect(args, "Args")
+            if hasattr(model, "config") and model.config is not None:
+                self._clearml_task.connect(model.config, "Model Configuration")
+
+    def on_train_begin(self, args, state, control, model=None, tokenizer=None, **kwargs):
+        if self._clearml is None:
+            return
+        if state.is_hyper_param_search:
+            self._initialized = False
+        if not self._initialized:
+            self.setup(args, state, model, tokenizer, **kwargs)
+
+    def on_train_end(self, args, state, control, model=None, tokenizer=None, metrics=None, logs=None, **kwargs):
+        if self._clearml is None:
+            return
+        if self._clearml_task and state.is_world_process_zero:
+            # Close ClearML Task at the end end of training
+            self._clearml_task.close()
+
+    def on_log(self, args, state, control, model=None, tokenizer=None, logs=None, **kwargs):
+        if self._clearml is None:
+            return
+        if not self._initialized:
+            self.setup(args, state, model, tokenizer, **kwargs)
+        if state.is_world_process_zero:
+            eval_prefix = "eval_"
+            eval_prefix_len = len(eval_prefix)
+            test_prefix = "test_"
+            test_prefix_len = len(test_prefix)
+            single_value_scalars = [
+                "train_runtime",
+                "train_samples_per_second",
+                "train_steps_per_second",
+                "train_loss",
+                "total_flos",
+                "epoch",
+            ]
+            for k, v in logs.items():
+                if isinstance(v, (int, float)):
+                    if k in single_value_scalars:
+                        self._clearml_task.get_logger().report_single_value(name=k, value=v)
+                    elif k.startswith(eval_prefix):
+                        self._clearml_task.get_logger().report_scalar(
+                            title=k[eval_prefix_len:], series="eval", value=v, iteration=state.global_step
+                        )
+                    elif k.startswith(test_prefix):
+                        self._clearml_task.get_logger().report_scalar(
+                            title=k[test_prefix_len:], series="test", value=v, iteration=state.global_step
+                        )
+                    else:
+                        self._clearml_task.get_logger().report_scalar(
+                            title=k, series="train", value=v, iteration=state.global_step
+                        )
+                else:
+                    logger.warning(
+                        "Trainer is attempting to log a value of "
+                        f'"{v}" of type {type(v)} for key "{k}" as a scalar. '
+                        "This invocation of ClearML logger's  report_scalar() "
+                        "is incorrect so we dropped this attribute."
+                    )
+
+    def on_save(self, args, state, control, **kwargs):
+        if self._clearml_task and state.is_world_process_zero:
+            ckpt_dir = f"checkpoint-{state.global_step}"
+            artifact_path = os.path.join(args.output_dir, ckpt_dir)
+            logger.info(f"Logging checkpoint artifacts in {ckpt_dir}. This may take time.")
+            self._clearml_task.update_output_model(artifact_path, iteration=state.global_step, auto_delete_file=False)
+
+
 INTEGRATION_TO_CALLBACK = {
     "azure_ml": AzureMLCallback,
     "comet_ml": CometCallback,
@@ -1307,6 +1419,7 @@ INTEGRATION_TO_CALLBACK = {
     "tensorboard": TensorBoardCallback,
     "wandb": WandbCallback,
     "codecarbon": CodeCarbonCallback,
+    "clearml": ClearMLCallback,
 }
 
 
@@ -1316,4 +1429,5 @@ def get_reporting_integration_callbacks(report_to):
             raise ValueError(
                 f"{integration} is not supported, only {', '.join(INTEGRATION_TO_CALLBACK.keys())} are supported."
             )
+
     return [INTEGRATION_TO_CALLBACK[integration] for integration in report_to]
diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py
index eb69e7d241..eba7062840 100644
--- a/src/transformers/testing_utils.py
+++ b/src/transformers/testing_utils.py
@@ -39,6 +39,7 @@ from transformers import logging as transformers_logging
 
 from .deepspeed import is_deepspeed_available
 from .integrations import (
+    is_clearml_available,
     is_fairscale_available,
     is_optuna_available,
     is_ray_available,
@@ -579,6 +580,16 @@ def require_wandb(test_case):
     return unittest.skipUnless(is_wandb_available(), "test requires wandb")(test_case)
 
 
+def require_clearml(test_case):
+    """
+    Decorator marking a test requires clearml.
+
+    These tests are skipped when clearml isn't installed.
+
+    """
+    return unittest.skipUnless(is_clearml_available(), "test requires clearml")(test_case)
+
+
 def require_soundfile(test_case):
     """
     Decorator marking a test that requires soundfile
diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py
index fc5ace7526..0c3af0ae6f 100644
--- a/src/transformers/training_args.py
+++ b/src/transformers/training_args.py
@@ -413,8 +413,8 @@ class TrainingArguments:
             instance of `Dataset`.
         report_to (`str` or `List[str]`, *optional*, defaults to `"all"`):
             The list of integrations to report the results and logs to. Supported platforms are `"azure_ml"`,
-            `"comet_ml"`, `"mlflow"`, `"neptune"`, `"tensorboard"` and `"wandb"`. Use `"all"` to report to all
-            integrations installed, `"none"` for no integrations.
+            `"comet_ml"`, `"mlflow"`, `"neptune"`, `"tensorboard"`,`"clearml"` and `"wandb"`. Use `"all"` to report to
+            all integrations installed, `"none"` for no integrations.
         ddp_find_unused_parameters (`bool`, *optional*):
             When using distributed training, the value of the flag `find_unused_parameters` passed to
             `DistributedDataParallel`. Will default to `False` if gradient checkpointing is used, `True` otherwise.