def require_tensorboard(test_case):
    """
    Decorator marking a test that requires `tensorboard`.

    The test is skipped when `is_tensorboard_available()` returns a falsy value.

    Args:
        test_case: The test function or test class to decorate.

    Returns:
        `test_case` wrapped by `unittest.skipUnless`.
    """
    # `unittest.skipUnless(...)` only builds the decorator; it must be applied to `test_case`.
    # Returning the bare decorator would replace the decorated test with the decorator itself,
    # so the test body would never run.
    return unittest.skipUnless(is_tensorboard_available(), "test requires tensorboard")(test_case)
class RegressionTrainingArguments(TrainingArguments):
    a: float = 0.0
    b: float = 0.0
    # When True, `__post_init__` leaves `report_to` as it is instead of clearing it.
    keep_report_to: bool = False

    def __post_init__(self):
        """Run the parent post-init, then clear `report_to` unless `keep_report_to` is set."""
        super().__post_init__()
        if self.keep_report_to:
            return
        # Reporting is switched off by default to save resources (this also avoids the
        # warning emitted when `report_to` is not set).
        self.report_to = []
@require_tensorboard
def test_push_to_hub_with_tensorboard_logs(self):
    """
    Train with the `tensorboard` reporter enabled, push with `push_to_hub()`, and check that
    a tensorboard event file under a `runs` path is listed among the files of the repo.
    """
    with tempfile.TemporaryDirectory() as tmp_dir:
        trainer = get_regression_trainer(
            output_dir=os.path.join(tmp_dir, "test-trainer-tensorboard"),
            hub_token=self._token,
            save_strategy="epoch",
            report_to=["tensorboard"],
            # Without this flag `RegressionTrainingArguments` resets `report_to` to `[]`.
            keep_report_to=True,
        )
        trainer.train()
        # Push the runs via `push_to_hub()`
        trainer.push_to_hub()

    files = list_repo_files(f"{USER}/test-trainer-tensorboard", token=self._token)
    # A tensorboard log is any repo file whose path contains `runs` and the event-file marker.
    found_log = any("runs" in f and "events.out.tfevents" in f for f in files)

    # Use the TestCase assertion (not a bare `assert`) so the check survives `python -O`
    # and matches the other assertions in this test class.
    self.assertTrue(found_log, "No tensorboard log found in repo")