Fix flaky Hub CI (test_trainer.py) (#35062)
* fix
* Update src/transformers/testing_utils.py
  Co-authored-by: Lucain <lucainp@gmail.com>
* fix
* fix
* fix
* fix
* fix
* fix
* fix
* fix
* check
* check
* check
* check
* check
* check
* Update src/transformers/testing_utils.py
  Co-authored-by: Lucain <lucainp@gmail.com>
* Update src/transformers/testing_utils.py
  Co-authored-by: Lucain <lucainp@gmail.com>
* check
* check
* check
* Final space
* Final adjustment

---------

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
Co-authored-by: Lucain <lucainp@gmail.com>
This commit is contained in:
@@ -32,10 +32,9 @@ from typing import Dict, List
|
||||
from unittest.mock import Mock, patch
|
||||
|
||||
import numpy as np
|
||||
from huggingface_hub import HfFolder, ModelCard, create_branch, delete_repo, list_repo_commits, list_repo_files
|
||||
from huggingface_hub import HfFolder, ModelCard, create_branch, list_repo_commits, list_repo_files
|
||||
from packaging import version
|
||||
from parameterized import parameterized
|
||||
from requests.exceptions import HTTPError
|
||||
|
||||
from transformers import (
|
||||
AutoFeatureExtractor,
|
||||
@@ -59,6 +58,7 @@ from transformers.testing_utils import (
|
||||
USER,
|
||||
CaptureLogger,
|
||||
LoggingLevel,
|
||||
TemporaryHubRepo,
|
||||
TestCasePlus,
|
||||
backend_device_count,
|
||||
execute_subprocess_async,
|
||||
@@ -4152,64 +4152,49 @@ class TrainerIntegrationWithHubTester(unittest.TestCase):
|
||||
cls._token = TOKEN
|
||||
HfFolder.save_token(TOKEN)
|
||||
|
||||
@classmethod
|
||||
def tearDownClass(cls):
|
||||
for model in [
|
||||
"test-trainer",
|
||||
"test-trainer-epoch",
|
||||
"test-trainer-step",
|
||||
"test-trainer-tensorboard",
|
||||
"test-trainer-tags",
|
||||
]:
|
||||
try:
|
||||
delete_repo(token=cls._token, repo_id=model)
|
||||
except HTTPError:
|
||||
pass
|
||||
|
||||
try:
|
||||
delete_repo(token=cls._token, repo_id="valid_org/test-trainer-org")
|
||||
except HTTPError:
|
||||
pass
|
||||
|
||||
def test_push_to_hub(self):
|
||||
with tempfile.TemporaryDirectory() as tmp_dir:
|
||||
trainer = get_regression_trainer(
|
||||
output_dir=os.path.join(tmp_dir, "test-trainer"),
|
||||
push_to_hub=True,
|
||||
hub_token=self._token,
|
||||
)
|
||||
url = trainer.push_to_hub()
|
||||
with TemporaryHubRepo(token=self._token) as tmp_repo:
|
||||
output_dir_name = tmp_repo.repo_name
|
||||
with tempfile.TemporaryDirectory() as tmp_dir:
|
||||
trainer = get_regression_trainer(
|
||||
output_dir=os.path.join(tmp_dir, output_dir_name),
|
||||
push_to_hub=True,
|
||||
hub_token=self._token,
|
||||
)
|
||||
url = trainer.push_to_hub()
|
||||
|
||||
# Extract repo_name from the url
|
||||
re_search = re.search(ENDPOINT_STAGING + r"/([^/]+/[^/]+)/", url)
|
||||
self.assertTrue(re_search is not None)
|
||||
repo_name = re_search.groups()[0]
|
||||
|
||||
self.assertEqual(repo_name, f"{USER}/test-trainer")
|
||||
self.assertEqual(repo_name, f"{USER}/{output_dir_name}")
|
||||
|
||||
model = RegressionPreTrainedModel.from_pretrained(repo_name)
|
||||
self.assertEqual(model.a.item(), trainer.model.a.item())
|
||||
self.assertEqual(model.b.item(), trainer.model.b.item())
|
||||
|
||||
def test_push_to_hub_in_organization(self):
|
||||
with tempfile.TemporaryDirectory() as tmp_dir:
|
||||
trainer = get_regression_trainer(output_dir=tmp_dir)
|
||||
trainer.save_model()
|
||||
trainer = get_regression_trainer(
|
||||
output_dir=os.path.join(tmp_dir, "test-trainer-org"),
|
||||
push_to_hub=True,
|
||||
hub_model_id="valid_org/test-trainer-org",
|
||||
hub_token=self._token,
|
||||
)
|
||||
url = trainer.push_to_hub()
|
||||
with TemporaryHubRepo(namespace="valid_org", token=self._token) as tmp_repo:
|
||||
with tempfile.TemporaryDirectory() as tmp_dir:
|
||||
trainer = get_regression_trainer(output_dir=tmp_dir)
|
||||
trainer.save_model()
|
||||
output_dir_name = tmp_repo.repo_name
|
||||
trainer = get_regression_trainer(
|
||||
output_dir=os.path.join(tmp_dir, output_dir_name),
|
||||
push_to_hub=True,
|
||||
hub_model_id=f"valid_org/{output_dir_name}",
|
||||
hub_token=self._token,
|
||||
)
|
||||
url = trainer.push_to_hub()
|
||||
|
||||
# Extract repo_name from the url
|
||||
re_search = re.search(ENDPOINT_STAGING + r"/([^/]+/[^/]+)/", url)
|
||||
self.assertTrue(re_search is not None)
|
||||
repo_name = re_search.groups()[0]
|
||||
self.assertEqual(repo_name, "valid_org/test-trainer-org")
|
||||
self.assertEqual(repo_name, f"valid_org/{output_dir_name}")
|
||||
|
||||
model = RegressionPreTrainedModel.from_pretrained("valid_org/test-trainer-org")
|
||||
model = RegressionPreTrainedModel.from_pretrained(f"valid_org/{output_dir_name}")
|
||||
self.assertEqual(model.a.item(), trainer.model.a.item())
|
||||
self.assertEqual(model.b.item(), trainer.model.b.item())
|
||||
|
||||
@@ -4226,120 +4211,130 @@ class TrainerIntegrationWithHubTester(unittest.TestCase):
|
||||
return [commit.strip() for commit in commits]
|
||||
|
||||
def test_push_to_hub_with_saves_each_epoch(self):
|
||||
with tempfile.TemporaryDirectory() as tmp_dir:
|
||||
with self.assertLogs(level="WARNING") as logs:
|
||||
trainer = get_regression_trainer(
|
||||
output_dir=os.path.join(tmp_dir, "test-trainer-epoch"),
|
||||
push_to_hub=True,
|
||||
hub_token=self._token,
|
||||
# To avoid any flakiness if the training goes faster than the uploads.
|
||||
hub_always_push=True,
|
||||
save_strategy="epoch",
|
||||
)
|
||||
trainer.train()
|
||||
with TemporaryHubRepo(token=self._token) as tmp_repo:
|
||||
with tempfile.TemporaryDirectory() as tmp_dir:
|
||||
with self.assertLogs(level="WARNING") as logs:
|
||||
output_dir_name = tmp_repo.repo_name
|
||||
trainer = get_regression_trainer(
|
||||
output_dir=os.path.join(tmp_dir, output_dir_name),
|
||||
push_to_hub=True,
|
||||
hub_token=self._token,
|
||||
# To avoid any flakiness if the training goes faster than the uploads.
|
||||
hub_always_push=True,
|
||||
save_strategy="epoch",
|
||||
)
|
||||
trainer.train()
|
||||
|
||||
commits = list_repo_commits(f"{USER}/test-trainer-epoch", token=self._token)
|
||||
commits = [c.title for c in commits]
|
||||
self.assertIn("initial commit", commits)
|
||||
self.assertIn("Training in progress, epoch 1", commits)
|
||||
self.assertIn("Training in progress, epoch 2", commits)
|
||||
# Epochs 3 and 4 are not guaranteed to be present (empty commits)
|
||||
self.assertTrue(any("Skipping to prevent empty commit." in record.message for record in logs.records))
|
||||
commits = list_repo_commits(f"{USER}/{output_dir_name}", token=self._token)
|
||||
commits = [c.title for c in commits]
|
||||
self.assertIn("initial commit", commits)
|
||||
self.assertIn("Training in progress, epoch 1", commits)
|
||||
self.assertIn("Training in progress, epoch 2", commits)
|
||||
# Epochs 3 and 4 are not guaranteed to be present (empty commits)
|
||||
self.assertTrue(any("Skipping to prevent empty commit." in record.message for record in logs.records))
|
||||
|
||||
def test_push_to_hub_with_saves_each_n_steps(self):
|
||||
num_gpus = max(1, backend_device_count(torch_device))
|
||||
if num_gpus > 2:
|
||||
self.skipTest(reason="More than 2 GPUs available")
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmp_dir:
|
||||
with self.assertLogs(level="WARNING") as logs:
|
||||
trainer = get_regression_trainer(
|
||||
output_dir=os.path.join(tmp_dir, "test-trainer-step"),
|
||||
push_to_hub=True,
|
||||
hub_token=self._token,
|
||||
# To avoid any flakiness if the training goes faster than the uploads.
|
||||
hub_always_push=True,
|
||||
save_strategy="steps",
|
||||
save_steps=5,
|
||||
)
|
||||
trainer.train()
|
||||
with TemporaryHubRepo(token=self._token) as tmp_repo:
|
||||
with tempfile.TemporaryDirectory() as tmp_dir:
|
||||
with self.assertLogs(level="WARNING") as logs:
|
||||
output_dir_name = tmp_repo.repo_name
|
||||
trainer = get_regression_trainer(
|
||||
output_dir=os.path.join(tmp_dir, output_dir_name),
|
||||
push_to_hub=True,
|
||||
hub_token=self._token,
|
||||
# To avoid any flakiness if the training goes faster than the uploads.
|
||||
hub_always_push=True,
|
||||
save_strategy="steps",
|
||||
save_steps=5,
|
||||
)
|
||||
trainer.train()
|
||||
|
||||
commits = list_repo_commits(f"{USER}/test-trainer-step", token=self._token)
|
||||
commits = [c.title for c in commits]
|
||||
self.assertIn("initial commit", commits)
|
||||
commits = list_repo_commits(f"{USER}/{output_dir_name}", token=self._token)
|
||||
commits = [c.title for c in commits]
|
||||
self.assertIn("initial commit", commits)
|
||||
|
||||
# Some commits are skipped if nothing has changed
|
||||
# We expect 1 commit per 5 epochs + 1 commit at the end
|
||||
nb_empty_commits = len(
|
||||
[record for record in logs.records if "Skipping to prevent empty commit." in record.message]
|
||||
)
|
||||
nb_epoch_commits = len([commit for commit in commits if "Training in progress, step" in commit])
|
||||
# Some commits are skipped if nothing has changed
|
||||
# We expect 1 commit per 5 epochs + 1 commit at the end
|
||||
nb_empty_commits = len(
|
||||
[record for record in logs.records if "Skipping to prevent empty commit." in record.message]
|
||||
)
|
||||
nb_epoch_commits = len([commit for commit in commits if "Training in progress, step" in commit])
|
||||
|
||||
# max_steps depend on the number of available GPUs
|
||||
max_steps = math.ceil(trainer.args.num_train_epochs * len(trainer.get_train_dataloader()))
|
||||
nb_expected_commits = len(range(5, max_steps, 5))
|
||||
# max_steps depend on the number of available GPUs
|
||||
max_steps = math.ceil(trainer.args.num_train_epochs * len(trainer.get_train_dataloader()))
|
||||
nb_expected_commits = len(range(5, max_steps, 5))
|
||||
|
||||
# '>=' since final commit might be an empty commit as well (not deterministic)
|
||||
self.assertGreaterEqual(nb_empty_commits + nb_epoch_commits, nb_expected_commits)
|
||||
# '>=' since final commit might be an empty commit as well (not deterministic)
|
||||
self.assertGreaterEqual(nb_empty_commits + nb_epoch_commits, nb_expected_commits)
|
||||
|
||||
@require_tensorboard
|
||||
def test_push_to_hub_with_tensorboard_logs(self):
|
||||
with tempfile.TemporaryDirectory() as tmp_dir:
|
||||
trainer = get_regression_trainer(
|
||||
output_dir=os.path.join(tmp_dir, "test-trainer-tensorboard"),
|
||||
hub_token=self._token,
|
||||
save_strategy="epoch",
|
||||
report_to=["tensorboard"],
|
||||
keep_report_to=True,
|
||||
)
|
||||
trainer.train()
|
||||
# Push the runs via `push_to_hub()`
|
||||
trainer.push_to_hub()
|
||||
with TemporaryHubRepo(token=self._token) as tmp_repo:
|
||||
with tempfile.TemporaryDirectory() as tmp_dir:
|
||||
output_dir_name = tmp_repo.repo_name
|
||||
trainer = get_regression_trainer(
|
||||
output_dir=os.path.join(tmp_dir, output_dir_name),
|
||||
hub_token=self._token,
|
||||
save_strategy="epoch",
|
||||
report_to=["tensorboard"],
|
||||
keep_report_to=True,
|
||||
)
|
||||
trainer.train()
|
||||
# Push the runs via `push_to_hub()`
|
||||
trainer.push_to_hub()
|
||||
|
||||
files = list_repo_files(f"{USER}/test-trainer-tensorboard", token=self._token)
|
||||
found_log = False
|
||||
for f in files:
|
||||
if len(f.split("runs")) > 1 and "events.out.tfevents" in f:
|
||||
found_log = True
|
||||
files = list_repo_files(f"{USER}/{output_dir_name}", token=self._token)
|
||||
found_log = False
|
||||
for f in files:
|
||||
if len(f.split("runs")) > 1 and "events.out.tfevents" in f:
|
||||
found_log = True
|
||||
|
||||
assert found_log is True, "No tensorboard log found in repo"
|
||||
assert found_log is True, "No tensorboard log found in repo"
|
||||
|
||||
def test_push_to_hub_tags(self):
|
||||
# Checks if `trainer.push_to_hub()` works correctly by adding the desired
|
||||
# tag without having to pass `tags` in `push_to_hub`
|
||||
# see:
|
||||
with tempfile.TemporaryDirectory() as tmp_dir:
|
||||
trainer = get_regression_trainer(
|
||||
output_dir=os.path.join(tmp_dir, "test-trainer-tags"),
|
||||
push_to_hub=True,
|
||||
hub_token=self._token,
|
||||
)
|
||||
with TemporaryHubRepo(token=self._token) as tmp_repo:
|
||||
with tempfile.TemporaryDirectory() as tmp_dir:
|
||||
output_dir_name = tmp_repo.repo_name
|
||||
trainer = get_regression_trainer(
|
||||
output_dir=os.path.join(tmp_dir, output_dir_name),
|
||||
push_to_hub=True,
|
||||
hub_token=self._token,
|
||||
)
|
||||
|
||||
trainer.model.add_model_tags(["test-trainer-tags"])
|
||||
trainer.model.add_model_tags(["test-trainer-tags"])
|
||||
|
||||
url = trainer.push_to_hub()
|
||||
url = trainer.push_to_hub()
|
||||
|
||||
# Extract repo_name from the url
|
||||
re_search = re.search(ENDPOINT_STAGING + r"/([^/]+/[^/]+)/", url)
|
||||
self.assertTrue(re_search is not None)
|
||||
repo_name = re_search.groups()[0]
|
||||
|
||||
self.assertEqual(repo_name, f"{USER}/test-trainer-tags")
|
||||
self.assertEqual(repo_name, f"{USER}/{output_dir_name}")
|
||||
|
||||
model_card = ModelCard.load(repo_name)
|
||||
self.assertTrue("test-trainer-tags" in model_card.data.tags)
|
||||
|
||||
def test_push_to_hub_with_revision(self):
|
||||
# Checks if `trainer.push_to_hub()` works correctly by adding revision
|
||||
with tempfile.TemporaryDirectory() as tmp_dir:
|
||||
trainer = get_regression_trainer(
|
||||
output_dir=os.path.join(tmp_dir, "test-trainer-revision"),
|
||||
push_to_hub=True,
|
||||
hub_token=self._token,
|
||||
)
|
||||
branch = "v1.0"
|
||||
create_branch(repo_id=trainer.hub_model_id, branch=branch, token=self._token, exist_ok=True)
|
||||
url = trainer.push_to_hub(revision=branch)
|
||||
with TemporaryHubRepo(token=self._token) as tmp_repo:
|
||||
with tempfile.TemporaryDirectory() as tmp_dir:
|
||||
output_dir_name = tmp_repo.repo_name
|
||||
trainer = get_regression_trainer(
|
||||
output_dir=os.path.join(tmp_dir, output_dir_name),
|
||||
push_to_hub=True,
|
||||
hub_token=self._token,
|
||||
)
|
||||
branch = "v1.0"
|
||||
create_branch(repo_id=trainer.hub_model_id, branch=branch, token=self._token, exist_ok=True)
|
||||
url = trainer.push_to_hub(revision=branch)
|
||||
|
||||
# Extract branch from the url
|
||||
re_search = re.search(r"tree/([^/]+)/", url)
|
||||
|
||||
Reference in New Issue
Block a user