Fix flaky Hub CI (test_trainer.py) (#35062)

* fix

* Update src/transformers/testing_utils.py

Co-authored-by: Lucain <lucainp@gmail.com>

* fix

* fix

* fix

* fix

* fix

* fix

* fix

* fix

* check

* check

* check

* check

* check

* check

* Update src/transformers/testing_utils.py

Co-authored-by: Lucain <lucainp@gmail.com>

* Update src/transformers/testing_utils.py

Co-authored-by: Lucain <lucainp@gmail.com>

* check

* check

* check

* Final space

* Final adjustment

---------

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
Co-authored-by: Lucain <lucainp@gmail.com>
This commit is contained in:
Yih-Dar
2024-12-05 17:02:27 +01:00
committed by GitHub
parent a928d9c128
commit b0a51e5cff
11 changed files with 670 additions and 922 deletions

View File

@@ -32,10 +32,9 @@ from typing import Dict, List
from unittest.mock import Mock, patch
import numpy as np
from huggingface_hub import HfFolder, ModelCard, create_branch, delete_repo, list_repo_commits, list_repo_files
from huggingface_hub import HfFolder, ModelCard, create_branch, list_repo_commits, list_repo_files
from packaging import version
from parameterized import parameterized
from requests.exceptions import HTTPError
from transformers import (
AutoFeatureExtractor,
@@ -59,6 +58,7 @@ from transformers.testing_utils import (
USER,
CaptureLogger,
LoggingLevel,
TemporaryHubRepo,
TestCasePlus,
backend_device_count,
execute_subprocess_async,
@@ -4152,64 +4152,49 @@ class TrainerIntegrationWithHubTester(unittest.TestCase):
cls._token = TOKEN
HfFolder.save_token(TOKEN)
@classmethod
def tearDownClass(cls):
for model in [
"test-trainer",
"test-trainer-epoch",
"test-trainer-step",
"test-trainer-tensorboard",
"test-trainer-tags",
]:
try:
delete_repo(token=cls._token, repo_id=model)
except HTTPError:
pass
try:
delete_repo(token=cls._token, repo_id="valid_org/test-trainer-org")
except HTTPError:
pass
def test_push_to_hub(self):
with tempfile.TemporaryDirectory() as tmp_dir:
trainer = get_regression_trainer(
output_dir=os.path.join(tmp_dir, "test-trainer"),
push_to_hub=True,
hub_token=self._token,
)
url = trainer.push_to_hub()
with TemporaryHubRepo(token=self._token) as tmp_repo:
output_dir_name = tmp_repo.repo_name
with tempfile.TemporaryDirectory() as tmp_dir:
trainer = get_regression_trainer(
output_dir=os.path.join(tmp_dir, output_dir_name),
push_to_hub=True,
hub_token=self._token,
)
url = trainer.push_to_hub()
# Extract repo_name from the url
re_search = re.search(ENDPOINT_STAGING + r"/([^/]+/[^/]+)/", url)
self.assertTrue(re_search is not None)
repo_name = re_search.groups()[0]
self.assertEqual(repo_name, f"{USER}/test-trainer")
self.assertEqual(repo_name, f"{USER}/{output_dir_name}")
model = RegressionPreTrainedModel.from_pretrained(repo_name)
self.assertEqual(model.a.item(), trainer.model.a.item())
self.assertEqual(model.b.item(), trainer.model.b.item())
def test_push_to_hub_in_organization(self):
with tempfile.TemporaryDirectory() as tmp_dir:
trainer = get_regression_trainer(output_dir=tmp_dir)
trainer.save_model()
trainer = get_regression_trainer(
output_dir=os.path.join(tmp_dir, "test-trainer-org"),
push_to_hub=True,
hub_model_id="valid_org/test-trainer-org",
hub_token=self._token,
)
url = trainer.push_to_hub()
with TemporaryHubRepo(namespace="valid_org", token=self._token) as tmp_repo:
with tempfile.TemporaryDirectory() as tmp_dir:
trainer = get_regression_trainer(output_dir=tmp_dir)
trainer.save_model()
output_dir_name = tmp_repo.repo_name
trainer = get_regression_trainer(
output_dir=os.path.join(tmp_dir, output_dir_name),
push_to_hub=True,
hub_model_id=f"valid_org/{output_dir_name}",
hub_token=self._token,
)
url = trainer.push_to_hub()
# Extract repo_name from the url
re_search = re.search(ENDPOINT_STAGING + r"/([^/]+/[^/]+)/", url)
self.assertTrue(re_search is not None)
repo_name = re_search.groups()[0]
self.assertEqual(repo_name, "valid_org/test-trainer-org")
self.assertEqual(repo_name, f"valid_org/{output_dir_name}")
model = RegressionPreTrainedModel.from_pretrained("valid_org/test-trainer-org")
model = RegressionPreTrainedModel.from_pretrained(f"valid_org/{output_dir_name}")
self.assertEqual(model.a.item(), trainer.model.a.item())
self.assertEqual(model.b.item(), trainer.model.b.item())
@@ -4226,120 +4211,130 @@ class TrainerIntegrationWithHubTester(unittest.TestCase):
return [commit.strip() for commit in commits]
def test_push_to_hub_with_saves_each_epoch(self):
with tempfile.TemporaryDirectory() as tmp_dir:
with self.assertLogs(level="WARNING") as logs:
trainer = get_regression_trainer(
output_dir=os.path.join(tmp_dir, "test-trainer-epoch"),
push_to_hub=True,
hub_token=self._token,
# To avoid any flakiness if the training goes faster than the uploads.
hub_always_push=True,
save_strategy="epoch",
)
trainer.train()
with TemporaryHubRepo(token=self._token) as tmp_repo:
with tempfile.TemporaryDirectory() as tmp_dir:
with self.assertLogs(level="WARNING") as logs:
output_dir_name = tmp_repo.repo_name
trainer = get_regression_trainer(
output_dir=os.path.join(tmp_dir, output_dir_name),
push_to_hub=True,
hub_token=self._token,
# To avoid any flakiness if the training goes faster than the uploads.
hub_always_push=True,
save_strategy="epoch",
)
trainer.train()
commits = list_repo_commits(f"{USER}/test-trainer-epoch", token=self._token)
commits = [c.title for c in commits]
self.assertIn("initial commit", commits)
self.assertIn("Training in progress, epoch 1", commits)
self.assertIn("Training in progress, epoch 2", commits)
# Epochs 3 and 4 are not guaranteed to be present (empty commits)
self.assertTrue(any("Skipping to prevent empty commit." in record.message for record in logs.records))
commits = list_repo_commits(f"{USER}/{output_dir_name}", token=self._token)
commits = [c.title for c in commits]
self.assertIn("initial commit", commits)
self.assertIn("Training in progress, epoch 1", commits)
self.assertIn("Training in progress, epoch 2", commits)
# Epochs 3 and 4 are not guaranteed to be present (empty commits)
self.assertTrue(any("Skipping to prevent empty commit." in record.message for record in logs.records))
def test_push_to_hub_with_saves_each_n_steps(self):
num_gpus = max(1, backend_device_count(torch_device))
if num_gpus > 2:
self.skipTest(reason="More than 2 GPUs available")
with tempfile.TemporaryDirectory() as tmp_dir:
with self.assertLogs(level="WARNING") as logs:
trainer = get_regression_trainer(
output_dir=os.path.join(tmp_dir, "test-trainer-step"),
push_to_hub=True,
hub_token=self._token,
# To avoid any flakiness if the training goes faster than the uploads.
hub_always_push=True,
save_strategy="steps",
save_steps=5,
)
trainer.train()
with TemporaryHubRepo(token=self._token) as tmp_repo:
with tempfile.TemporaryDirectory() as tmp_dir:
with self.assertLogs(level="WARNING") as logs:
output_dir_name = tmp_repo.repo_name
trainer = get_regression_trainer(
output_dir=os.path.join(tmp_dir, output_dir_name),
push_to_hub=True,
hub_token=self._token,
# To avoid any flakiness if the training goes faster than the uploads.
hub_always_push=True,
save_strategy="steps",
save_steps=5,
)
trainer.train()
commits = list_repo_commits(f"{USER}/test-trainer-step", token=self._token)
commits = [c.title for c in commits]
self.assertIn("initial commit", commits)
commits = list_repo_commits(f"{USER}/{output_dir_name}", token=self._token)
commits = [c.title for c in commits]
self.assertIn("initial commit", commits)
# Some commits are skipped if nothing has changed
# We expect 1 commit per 5 epochs + 1 commit at the end
nb_empty_commits = len(
[record for record in logs.records if "Skipping to prevent empty commit." in record.message]
)
nb_epoch_commits = len([commit for commit in commits if "Training in progress, step" in commit])
# Some commits are skipped if nothing has changed
# We expect 1 commit per 5 epochs + 1 commit at the end
nb_empty_commits = len(
[record for record in logs.records if "Skipping to prevent empty commit." in record.message]
)
nb_epoch_commits = len([commit for commit in commits if "Training in progress, step" in commit])
# max_steps depend on the number of available GPUs
max_steps = math.ceil(trainer.args.num_train_epochs * len(trainer.get_train_dataloader()))
nb_expected_commits = len(range(5, max_steps, 5))
# max_steps depend on the number of available GPUs
max_steps = math.ceil(trainer.args.num_train_epochs * len(trainer.get_train_dataloader()))
nb_expected_commits = len(range(5, max_steps, 5))
# '>=' since final commit might be an empty commit as well (not deterministic)
self.assertGreaterEqual(nb_empty_commits + nb_epoch_commits, nb_expected_commits)
# '>=' since final commit might be an empty commit as well (not deterministic)
self.assertGreaterEqual(nb_empty_commits + nb_epoch_commits, nb_expected_commits)
@require_tensorboard
def test_push_to_hub_with_tensorboard_logs(self):
with tempfile.TemporaryDirectory() as tmp_dir:
trainer = get_regression_trainer(
output_dir=os.path.join(tmp_dir, "test-trainer-tensorboard"),
hub_token=self._token,
save_strategy="epoch",
report_to=["tensorboard"],
keep_report_to=True,
)
trainer.train()
# Push the runs via `push_to_hub()`
trainer.push_to_hub()
with TemporaryHubRepo(token=self._token) as tmp_repo:
with tempfile.TemporaryDirectory() as tmp_dir:
output_dir_name = tmp_repo.repo_name
trainer = get_regression_trainer(
output_dir=os.path.join(tmp_dir, output_dir_name),
hub_token=self._token,
save_strategy="epoch",
report_to=["tensorboard"],
keep_report_to=True,
)
trainer.train()
# Push the runs via `push_to_hub()`
trainer.push_to_hub()
files = list_repo_files(f"{USER}/test-trainer-tensorboard", token=self._token)
found_log = False
for f in files:
if len(f.split("runs")) > 1 and "events.out.tfevents" in f:
found_log = True
files = list_repo_files(f"{USER}/{output_dir_name}", token=self._token)
found_log = False
for f in files:
if len(f.split("runs")) > 1 and "events.out.tfevents" in f:
found_log = True
assert found_log is True, "No tensorboard log found in repo"
assert found_log is True, "No tensorboard log found in repo"
def test_push_to_hub_tags(self):
# Checks if `trainer.push_to_hub()` works correctly by adding the desired
# tag without having to pass `tags` in `push_to_hub`
# see:
with tempfile.TemporaryDirectory() as tmp_dir:
trainer = get_regression_trainer(
output_dir=os.path.join(tmp_dir, "test-trainer-tags"),
push_to_hub=True,
hub_token=self._token,
)
with TemporaryHubRepo(token=self._token) as tmp_repo:
with tempfile.TemporaryDirectory() as tmp_dir:
output_dir_name = tmp_repo.repo_name
trainer = get_regression_trainer(
output_dir=os.path.join(tmp_dir, output_dir_name),
push_to_hub=True,
hub_token=self._token,
)
trainer.model.add_model_tags(["test-trainer-tags"])
trainer.model.add_model_tags(["test-trainer-tags"])
url = trainer.push_to_hub()
url = trainer.push_to_hub()
# Extract repo_name from the url
re_search = re.search(ENDPOINT_STAGING + r"/([^/]+/[^/]+)/", url)
self.assertTrue(re_search is not None)
repo_name = re_search.groups()[0]
self.assertEqual(repo_name, f"{USER}/test-trainer-tags")
self.assertEqual(repo_name, f"{USER}/{output_dir_name}")
model_card = ModelCard.load(repo_name)
self.assertTrue("test-trainer-tags" in model_card.data.tags)
def test_push_to_hub_with_revision(self):
# Checks if `trainer.push_to_hub()` works correctly by adding revision
with tempfile.TemporaryDirectory() as tmp_dir:
trainer = get_regression_trainer(
output_dir=os.path.join(tmp_dir, "test-trainer-revision"),
push_to_hub=True,
hub_token=self._token,
)
branch = "v1.0"
create_branch(repo_id=trainer.hub_model_id, branch=branch, token=self._token, exist_ok=True)
url = trainer.push_to_hub(revision=branch)
with TemporaryHubRepo(token=self._token) as tmp_repo:
with tempfile.TemporaryDirectory() as tmp_dir:
output_dir_name = tmp_repo.repo_name
trainer = get_regression_trainer(
output_dir=os.path.join(tmp_dir, output_dir_name),
push_to_hub=True,
hub_token=self._token,
)
branch = "v1.0"
create_branch(repo_id=trainer.hub_model_id, branch=branch, token=self._token, exist_ok=True)
url = trainer.push_to_hub(revision=branch)
# Extract branch from the url
re_search = re.search(r"tree/([^/]+)/", url)