Save TB logs as part of push_to_hub (#27022)
* Support runs/
* Upload runs folder as part of push to hub
* Add a test
* Add to test deps
* Update with proposed solution from Slack
* Ensure that repo gets deleted in tests
This commit is contained in:
2
setup.py
2
setup.py
@@ -167,6 +167,7 @@ _deps = [
|
|||||||
"starlette",
|
"starlette",
|
||||||
"sudachipy>=0.6.6",
|
"sudachipy>=0.6.6",
|
||||||
"sudachidict_core>=20220729",
|
"sudachidict_core>=20220729",
|
||||||
|
"tensorboard",
|
||||||
# TensorFlow pin. When changing this value, update examples/tensorflow/_tests_requirements.txt accordingly
|
# TensorFlow pin. When changing this value, update examples/tensorflow/_tests_requirements.txt accordingly
|
||||||
"tensorflow-cpu>=2.6,<2.15",
|
"tensorflow-cpu>=2.6,<2.15",
|
||||||
"tensorflow>=2.6,<2.15",
|
"tensorflow>=2.6,<2.15",
|
||||||
@@ -319,6 +320,7 @@ extras["testing"] = (
|
|||||||
"sacremoses",
|
"sacremoses",
|
||||||
"rjieba",
|
"rjieba",
|
||||||
"beautifulsoup4",
|
"beautifulsoup4",
|
||||||
|
"tensorboard",
|
||||||
)
|
)
|
||||||
+ extras["retrieval"]
|
+ extras["retrieval"]
|
||||||
+ extras["modelcreation"]
|
+ extras["modelcreation"]
|
||||||
|
|||||||
@@ -73,6 +73,7 @@ deps = {
|
|||||||
"starlette": "starlette",
|
"starlette": "starlette",
|
||||||
"sudachipy": "sudachipy>=0.6.6",
|
"sudachipy": "sudachipy>=0.6.6",
|
||||||
"sudachidict_core": "sudachidict_core>=20220729",
|
"sudachidict_core": "sudachidict_core>=20220729",
|
||||||
|
"tensorboard": "tensorboard",
|
||||||
"tensorflow-cpu": "tensorflow-cpu>=2.6,<2.15",
|
"tensorflow-cpu": "tensorflow-cpu>=2.6,<2.15",
|
||||||
"tensorflow": "tensorflow>=2.6,<2.15",
|
"tensorflow": "tensorflow>=2.6,<2.15",
|
||||||
"tensorflow-text": "tensorflow-text<2.15",
|
"tensorflow-text": "tensorflow-text<2.15",
|
||||||
|
|||||||
@@ -45,6 +45,7 @@ from .integrations import (
|
|||||||
is_optuna_available,
|
is_optuna_available,
|
||||||
is_ray_available,
|
is_ray_available,
|
||||||
is_sigopt_available,
|
is_sigopt_available,
|
||||||
|
is_tensorboard_available,
|
||||||
is_wandb_available,
|
is_wandb_available,
|
||||||
)
|
)
|
||||||
from .integrations.deepspeed import is_deepspeed_available
|
from .integrations.deepspeed import is_deepspeed_available
|
||||||
@@ -911,6 +912,13 @@ def require_optimum(test_case):
|
|||||||
return unittest.skipUnless(is_optimum_available(), "test requires optimum")(test_case)
|
return unittest.skipUnless(is_optimum_available(), "test requires optimum")(test_case)
|
||||||
|
|
||||||
|
|
||||||
|
def require_tensorboard(test_case):
    """
    Decorator marking a test that requires `tensorboard`.

    The test is skipped when `is_tensorboard_available()` returns a falsy value.
    """
    # `unittest.skipUnless(...)` only builds the skip decorator; it must be applied to `test_case`.
    # Returning the bare decorator would replace the test with the decorator itself, so the test
    # body would never run.
    return unittest.skipUnless(is_tensorboard_available(), "test requires tensorboard")(test_case)
|
||||||
|
|
||||||
|
|
||||||
def require_auto_gptq(test_case):
|
def require_auto_gptq(test_case):
|
||||||
"""
|
"""
|
||||||
Decorator for auto_gptq dependency
|
Decorator for auto_gptq dependency
|
||||||
|
|||||||
@@ -3560,7 +3560,7 @@ class Trainer:
|
|||||||
commit_message=commit_message,
|
commit_message=commit_message,
|
||||||
token=self.args.hub_token,
|
token=self.args.hub_token,
|
||||||
run_as_future=True,
|
run_as_future=True,
|
||||||
ignore_patterns=["_*", "**/*"],
|
ignore_patterns=["_*", f"{PREFIX_CHECKPOINT_DIR}-*"],
|
||||||
)
|
)
|
||||||
|
|
||||||
push_jobs = [model_push_job]
|
push_jobs = [model_push_job]
|
||||||
@@ -3630,14 +3630,13 @@ class Trainer:
|
|||||||
|
|
||||||
# Wait for the current upload to be finished.
|
# Wait for the current upload to be finished.
|
||||||
self._finish_current_push()
|
self._finish_current_push()
|
||||||
|
|
||||||
return upload_folder(
|
return upload_folder(
|
||||||
repo_id=self.hub_model_id,
|
repo_id=self.hub_model_id,
|
||||||
folder_path=self.args.output_dir,
|
folder_path=self.args.output_dir,
|
||||||
commit_message=commit_message,
|
commit_message=commit_message,
|
||||||
token=self.args.hub_token,
|
token=self.args.hub_token,
|
||||||
run_as_future=not blocking,
|
run_as_future=not blocking,
|
||||||
ignore_patterns=["_*", "**/*"],
|
ignore_patterns=["_*", f"{PREFIX_CHECKPOINT_DIR}-*"],
|
||||||
)
|
)
|
||||||
|
|
||||||
#
|
#
|
||||||
|
|||||||
@@ -30,7 +30,7 @@ from typing import Dict, List
|
|||||||
from unittest.mock import Mock, patch
|
from unittest.mock import Mock, patch
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from huggingface_hub import HfFolder, delete_repo, list_repo_commits
|
from huggingface_hub import HfFolder, delete_repo, list_repo_commits, list_repo_files
|
||||||
from parameterized import parameterized
|
from parameterized import parameterized
|
||||||
from requests.exceptions import HTTPError
|
from requests.exceptions import HTTPError
|
||||||
|
|
||||||
@@ -60,6 +60,7 @@ from transformers.testing_utils import (
|
|||||||
require_safetensors,
|
require_safetensors,
|
||||||
require_sentencepiece,
|
require_sentencepiece,
|
||||||
require_sigopt,
|
require_sigopt,
|
||||||
|
require_tensorboard,
|
||||||
require_tokenizers,
|
require_tokenizers,
|
||||||
require_torch,
|
require_torch,
|
||||||
require_torch_bf16_cpu,
|
require_torch_bf16_cpu,
|
||||||
@@ -138,11 +139,14 @@ class RegressionDataset:
|
|||||||
class RegressionTrainingArguments(TrainingArguments):
|
class RegressionTrainingArguments(TrainingArguments):
|
||||||
a: float = 0.0
|
a: float = 0.0
|
||||||
b: float = 0.0
|
b: float = 0.0
|
||||||
|
keep_report_to: bool = False
|
||||||
|
|
||||||
def __post_init__(self):
|
def __post_init__(self):
|
||||||
super().__post_init__()
|
super().__post_init__()
|
||||||
# save resources not dealing with reporting (also avoids the warning when it's not set)
|
# save resources not dealing with reporting unless specified (also avoids the warning when it's not set)
|
||||||
self.report_to = []
|
# set `keep_report_to=True` to skip this reset and keep the configured `report_to`
|
||||||
|
if not self.keep_report_to:
|
||||||
|
self.report_to = []
|
||||||
|
|
||||||
|
|
||||||
class RepeatDataset:
|
class RepeatDataset:
|
||||||
@@ -319,7 +323,9 @@ if is_torch_available():
|
|||||||
h = nn.functional.relu(self.linear2(x))
|
h = nn.functional.relu(self.linear2(x))
|
||||||
return self.ln2(x + h + self.bias)
|
return self.ln2(x + h + self.bias)
|
||||||
|
|
||||||
def get_regression_trainer(a=0, b=0, double_output=False, train_len=64, eval_len=64, pretrained=True, **kwargs):
|
def get_regression_trainer(
|
||||||
|
a=0, b=0, double_output=False, train_len=64, eval_len=64, pretrained=True, keep_report_to=False, **kwargs
|
||||||
|
):
|
||||||
label_names = kwargs.get("label_names", None)
|
label_names = kwargs.get("label_names", None)
|
||||||
train_dataset = RegressionDataset(length=train_len, label_names=label_names)
|
train_dataset = RegressionDataset(length=train_len, label_names=label_names)
|
||||||
eval_dataset = RegressionDataset(length=eval_len, label_names=label_names)
|
eval_dataset = RegressionDataset(length=eval_len, label_names=label_names)
|
||||||
@@ -340,7 +346,7 @@ if is_torch_available():
|
|||||||
output_dir = kwargs.pop("output_dir", "./regression")
|
output_dir = kwargs.pop("output_dir", "./regression")
|
||||||
preprocess_logits_for_metrics = kwargs.pop("preprocess_logits_for_metrics", None)
|
preprocess_logits_for_metrics = kwargs.pop("preprocess_logits_for_metrics", None)
|
||||||
|
|
||||||
args = RegressionTrainingArguments(output_dir, a=a, b=b, **kwargs)
|
args = RegressionTrainingArguments(output_dir, a=a, b=b, keep_report_to=keep_report_to, **kwargs)
|
||||||
return Trainer(
|
return Trainer(
|
||||||
model,
|
model,
|
||||||
args,
|
args,
|
||||||
@@ -2155,7 +2161,7 @@ class TrainerIntegrationWithHubTester(unittest.TestCase):
|
|||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def tearDownClass(cls):
|
def tearDownClass(cls):
|
||||||
for model in ["test-trainer", "test-trainer-epoch", "test-trainer-step"]:
|
for model in ["test-trainer", "test-trainer-epoch", "test-trainer-step", "test-trainer-tensorboard"]:
|
||||||
try:
|
try:
|
||||||
delete_repo(token=cls._token, repo_id=model)
|
delete_repo(token=cls._token, repo_id=model)
|
||||||
except HTTPError:
|
except HTTPError:
|
||||||
@@ -2264,6 +2270,28 @@ class TrainerIntegrationWithHubTester(unittest.TestCase):
|
|||||||
for i in range(5, max_steps, 5):
|
for i in range(5, max_steps, 5):
|
||||||
self.assertIn(f"Training in progress, step {i}", commits)
|
self.assertIn(f"Training in progress, step {i}", commits)
|
||||||
|
|
||||||
|
@require_tensorboard
def test_push_to_hub_with_tensorboard_logs(self):
    # Train with tensorboard reporting kept enabled, push to the Hub, then check that at least one
    # event file under a `runs` path is listed in the repo.
    with tempfile.TemporaryDirectory() as tmp_dir:
        trainer = get_regression_trainer(
            output_dir=os.path.join(tmp_dir, "test-trainer-tensorboard"),
            hub_token=self._token,
            save_strategy="epoch",
            report_to=["tensorboard"],
            # without this, `RegressionTrainingArguments` resets `report_to` to an empty list
            keep_report_to=True,
        )
        trainer.train()
        # Push the runs via `push_to_hub()`
        trainer.push_to_hub()

    files = list_repo_files(f"{USER}/test-trainer-tensorboard", token=self._token)
    found_log = any("runs" in f and "events.out.tfevents" in f for f in files)

    # Use the TestCase assertion rather than a bare `assert`, which is stripped under `python -O`.
    self.assertTrue(found_log, "No tensorboard log found in repo")
|
||||||
|
|
||||||
|
|
||||||
@require_torch
|
@require_torch
|
||||||
@require_optuna
|
@require_optuna
|
||||||
|
|||||||
Reference in New Issue
Block a user