Fix flaky Hub CI (test_trainer.py) (#35062)

* fix

* Update src/transformers/testing_utils.py

Co-authored-by: Lucain <lucainp@gmail.com>

* fix

* fix

* fix

* fix

* fix

* fix

* fix

* fix

* check

* check

* check

* check

* check

* check

* Update src/transformers/testing_utils.py

Co-authored-by: Lucain <lucainp@gmail.com>

* Update src/transformers/testing_utils.py

Co-authored-by: Lucain <lucainp@gmail.com>

* check

* check

* check

* Final space

* Final adjustment

---------

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
Co-authored-by: Lucain <lucainp@gmail.com>
This commit is contained in:
Yih-Dar
2024-12-05 17:02:27 +01:00
committed by GitHub
parent a928d9c128
commit b0a51e5cff
11 changed files with 670 additions and 922 deletions

View File

@@ -40,7 +40,9 @@ from typing import Callable, Dict, Iterable, Iterator, List, Optional, Union
from unittest import mock from unittest import mock
from unittest.mock import patch from unittest.mock import patch
import huggingface_hub.utils
import urllib3 import urllib3
from huggingface_hub import delete_repo
from transformers import logging as transformers_logging from transformers import logging as transformers_logging
@@ -1570,6 +1572,38 @@ def LoggingLevel(level):
transformers_logging.set_verbosity(orig_level) transformers_logging.set_verbosity(orig_level)
class TemporaryHubRepo:
    """Create a temporary Hub repository and expose its `RepoUrl` object.

    This is similar to `tempfile.TemporaryDirectory` and is meant to be used as a context manager. The
    repository is created as soon as the object is instantiated; entering the context returns the object
    produced by `huggingface_hub.create_repo`. Upon exiting the context, the repository and everything
    contained in it are removed.

    Args:
        namespace (`str`, *optional*):
            User or organization prefix for the repository id. When `None`, the generated name is passed to
            `create_repo` without any prefix.
        token (`str`, *optional*):
            Token forwarded to both `create_repo` and `delete_repo`.

    Example:

    ```python
    with TemporaryHubRepo(token=self._token) as temp_repo:
        model.push_to_hub(temp_repo.repo_id, token=self._token)
    ```
    """

    def __init__(self, namespace: Optional[str] = None, token: Optional[str] = None) -> None:
        self.token = token
        # The temporary directory is only used as a source of a unique name: its basename becomes the
        # repository name, and the directory itself is discarded when this `with` block ends.
        with tempfile.TemporaryDirectory() as tmp_dir:
            repo_id = Path(tmp_dir).name
            if namespace is not None:
                repo_id = f"{namespace}/{repo_id}"
            self.repo_url = huggingface_hub.create_repo(repo_id, token=self.token)

    def __enter__(self):
        # Hand back the object returned by `create_repo`; callers read `.repo_id` / `.repo_name` from it.
        return self.repo_url

    def __exit__(self, exc, value, tb):
        # `missing_ok=True` is passed so that cleanup is attempted unconditionally, including when the
        # repository was already removed inside the context.
        # Nothing is returned, so an exception raised inside the `with` body keeps propagating.
        delete_repo(repo_id=self.repo_url.repo_id, token=self.token, missing_ok=True)
@contextlib.contextmanager @contextlib.contextmanager
# adapted from https://stackoverflow.com/a/64789046/9201239 # adapted from https://stackoverflow.com/a/64789046/9201239
def ExtendSysPath(path: Union[str, os.PathLike]) -> Iterator[None]: def ExtendSysPath(path: Union[str, os.PathLike]) -> Iterator[None]:

View File

@@ -18,9 +18,8 @@ import os
import tempfile import tempfile
import unittest import unittest
import warnings import warnings
from pathlib import Path
from huggingface_hub import HfFolder, create_pull_request, create_repo, delete_repo from huggingface_hub import HfFolder, create_pull_request
from parameterized import parameterized from parameterized import parameterized
from transformers import AutoConfig, GenerationConfig, WatermarkingConfig, is_torch_available from transformers import AutoConfig, GenerationConfig, WatermarkingConfig, is_torch_available
@@ -57,7 +56,7 @@ from transformers.generation import (
UnbatchedClassifierFreeGuidanceLogitsProcessor, UnbatchedClassifierFreeGuidanceLogitsProcessor,
WatermarkLogitsProcessor, WatermarkLogitsProcessor,
) )
from transformers.testing_utils import TOKEN, USER, is_staging_test, torch_device from transformers.testing_utils import TOKEN, TemporaryHubRepo, is_staging_test, torch_device
class GenerationConfigTest(unittest.TestCase): class GenerationConfigTest(unittest.TestCase):
@@ -679,114 +678,82 @@ class ConfigPushToHubTester(unittest.TestCase):
cls._token = TOKEN cls._token = TOKEN
HfFolder.save_token(TOKEN) HfFolder.save_token(TOKEN)
@staticmethod
def _try_delete_repo(repo_id, token):
try:
# Reset repo
delete_repo(repo_id=repo_id, token=token)
except: # noqa E722
pass
def test_push_to_hub(self): def test_push_to_hub(self):
with tempfile.TemporaryDirectory() as tmp_dir: with TemporaryHubRepo(token=self._token) as tmp_repo:
try: config = GenerationConfig(
tmp_repo = f"{USER}/test-generation-config-{Path(tmp_dir).name}" do_sample=True,
config = GenerationConfig( temperature=0.7,
do_sample=True, length_penalty=1.0,
temperature=0.7, )
length_penalty=1.0, config.push_to_hub(tmp_repo.repo_id, token=self._token)
)
config.push_to_hub(tmp_repo, token=self._token)
new_config = GenerationConfig.from_pretrained(tmp_repo) new_config = GenerationConfig.from_pretrained(tmp_repo.repo_id)
for k, v in config.to_dict().items(): for k, v in config.to_dict().items():
if k != "transformers_version": if k != "transformers_version":
self.assertEqual(v, getattr(new_config, k)) self.assertEqual(v, getattr(new_config, k))
finally:
# Always (try to) delete the repo.
self._try_delete_repo(repo_id=tmp_repo, token=self._token)
def test_push_to_hub_via_save_pretrained(self): def test_push_to_hub_via_save_pretrained(self):
with tempfile.TemporaryDirectory() as tmp_dir: with TemporaryHubRepo(token=self._token) as tmp_repo:
try: config = GenerationConfig(
tmp_repo = f"{USER}/test-generation-config-{Path(tmp_dir).name}" do_sample=True,
config = GenerationConfig( temperature=0.7,
do_sample=True, length_penalty=1.0,
temperature=0.7, )
length_penalty=1.0, # Push to hub via save_pretrained
) with tempfile.TemporaryDirectory() as tmp_dir:
# Push to hub via save_pretrained config.save_pretrained(tmp_dir, repo_id=tmp_repo.repo_id, push_to_hub=True, token=self._token)
config.save_pretrained(tmp_dir, repo_id=tmp_repo, push_to_hub=True, token=self._token)
new_config = GenerationConfig.from_pretrained(tmp_repo) new_config = GenerationConfig.from_pretrained(tmp_repo.repo_id)
for k, v in config.to_dict().items(): for k, v in config.to_dict().items():
if k != "transformers_version": if k != "transformers_version":
self.assertEqual(v, getattr(new_config, k)) self.assertEqual(v, getattr(new_config, k))
finally:
# Always (try to) delete the repo.
self._try_delete_repo(repo_id=tmp_repo, token=self._token)
def test_push_to_hub_in_organization(self): def test_push_to_hub_in_organization(self):
with tempfile.TemporaryDirectory() as tmp_dir: with TemporaryHubRepo(namespace="valid_org", token=self._token) as tmp_repo:
try: config = GenerationConfig(
tmp_repo = f"valid_org/test-generation-config-org-{Path(tmp_dir).name}" do_sample=True,
config = GenerationConfig( temperature=0.7,
do_sample=True, length_penalty=1.0,
temperature=0.7, )
length_penalty=1.0, config.push_to_hub(tmp_repo.repo_id, token=self._token)
)
config.push_to_hub(tmp_repo, token=self._token)
new_config = GenerationConfig.from_pretrained(tmp_repo) new_config = GenerationConfig.from_pretrained(tmp_repo.repo_id)
for k, v in config.to_dict().items(): for k, v in config.to_dict().items():
if k != "transformers_version": if k != "transformers_version":
self.assertEqual(v, getattr(new_config, k)) self.assertEqual(v, getattr(new_config, k))
finally:
# Always (try to) delete the repo.
self._try_delete_repo(repo_id=tmp_repo, token=self._token)
def test_push_to_hub_in_organization_via_save_pretrained(self): def test_push_to_hub_in_organization_via_save_pretrained(self):
with tempfile.TemporaryDirectory() as tmp_dir: with TemporaryHubRepo(namespace="valid_org", token=self._token) as tmp_repo:
try: config = GenerationConfig(
tmp_repo = f"valid_org/test-generation-config-org-{Path(tmp_dir).name}" do_sample=True,
config = GenerationConfig( temperature=0.7,
do_sample=True, length_penalty=1.0,
temperature=0.7, )
length_penalty=1.0, # Push to hub via save_pretrained
) with tempfile.TemporaryDirectory() as tmp_dir:
# Push to hub via save_pretrained config.save_pretrained(tmp_dir, repo_id=tmp_repo.repo_id, push_to_hub=True, token=self._token)
config.save_pretrained(tmp_dir, repo_id=tmp_repo, push_to_hub=True, token=self._token)
new_config = GenerationConfig.from_pretrained(tmp_repo) new_config = GenerationConfig.from_pretrained(tmp_repo.repo_id)
for k, v in config.to_dict().items(): for k, v in config.to_dict().items():
if k != "transformers_version": if k != "transformers_version":
self.assertEqual(v, getattr(new_config, k)) self.assertEqual(v, getattr(new_config, k))
finally:
# Always (try to) delete the repo.
self._try_delete_repo(repo_id=tmp_repo, token=self._token)
def test_push_to_hub_on_pr_revision(self): def test_push_to_hub_on_pr_revision(self):
with tempfile.TemporaryDirectory() as tmp_dir: with TemporaryHubRepo(token=self._token) as tmp_repo:
try: # create a PR
# create a repo and a PR pr = create_pull_request(repo_id=tmp_repo.repo_id, title="Test PR", token=self._token)
repo_id = f"{USER}/test-generation-config-{Path(tmp_dir).name}" revision = f"refs/pr/{pr.num}"
create_repo(repo_id=repo_id, token=self._token)
pr = create_pull_request(repo_id=repo_id, title="Test PR", token=self._token)
revision = f"refs/pr/{pr.num}"
# push to PR ref # push to PR ref
config = GenerationConfig( config = GenerationConfig(
do_sample=True, do_sample=True,
temperature=0.7, temperature=0.7,
length_penalty=1.0, length_penalty=1.0,
) )
config.push_to_hub(repo_id, token=self._token, revision=revision) config.push_to_hub(tmp_repo.repo_id, token=self._token, revision=revision)
# load from PR ref # load from PR ref
new_config = GenerationConfig.from_pretrained(repo_id, revision=revision) new_config = GenerationConfig.from_pretrained(tmp_repo.repo_id, revision=revision)
for k, v in config.to_dict().items(): for k, v in config.to_dict().items():
if k != "transformers_version": if k != "transformers_version":
self.assertEqual(v, getattr(new_config, k)) self.assertEqual(v, getattr(new_config, k))
finally:
# Always (try to) delete the repo.
self._try_delete_repo(repo_id=repo_id, token=self._token)

View File

@@ -21,7 +21,7 @@ import unittest
from pathlib import Path from pathlib import Path
from shutil import copyfile from shutil import copyfile
from huggingface_hub import HfFolder, Repository, create_repo, delete_repo from huggingface_hub import HfFolder, Repository
import transformers import transformers
from transformers import ( from transformers import (
@@ -39,7 +39,7 @@ from transformers import (
Wav2Vec2FeatureExtractor, Wav2Vec2FeatureExtractor,
Wav2Vec2Processor, Wav2Vec2Processor,
) )
from transformers.testing_utils import TOKEN, USER, get_tests_dir, is_staging_test from transformers.testing_utils import TOKEN, TemporaryHubRepo, get_tests_dir, is_staging_test
from transformers.tokenization_utils import TOKENIZER_CONFIG_FILE from transformers.tokenization_utils import TOKENIZER_CONFIG_FILE
from transformers.utils import FEATURE_EXTRACTOR_NAME, PROCESSOR_NAME, is_tokenizers_available from transformers.utils import FEATURE_EXTRACTOR_NAME, PROCESSOR_NAME, is_tokenizers_available
@@ -372,72 +372,52 @@ class ProcessorPushToHubTester(unittest.TestCase):
cls._token = TOKEN cls._token = TOKEN
HfFolder.save_token(TOKEN) HfFolder.save_token(TOKEN)
@staticmethod
def _try_delete_repo(repo_id, token):
try:
# Reset repo
delete_repo(repo_id=repo_id, token=token)
except: # noqa E722
pass
def test_push_to_hub_via_save_pretrained(self): def test_push_to_hub_via_save_pretrained(self):
with tempfile.TemporaryDirectory() as tmp_dir: with TemporaryHubRepo(token=self._token) as tmp_repo:
try: processor = Wav2Vec2Processor.from_pretrained(SAMPLE_PROCESSOR_CONFIG_DIR)
tmp_repo = f"{USER}/test-processor-{Path(tmp_dir).name}" # Push to hub via save_pretrained
processor = Wav2Vec2Processor.from_pretrained(SAMPLE_PROCESSOR_CONFIG_DIR) with tempfile.TemporaryDirectory() as tmp_dir:
# Push to hub via save_pretrained processor.save_pretrained(tmp_dir, repo_id=tmp_repo.repo_id, push_to_hub=True, token=self._token)
processor.save_pretrained(tmp_repo, repo_id=tmp_repo, push_to_hub=True, token=self._token)
new_processor = Wav2Vec2Processor.from_pretrained(tmp_repo) new_processor = Wav2Vec2Processor.from_pretrained(tmp_repo.repo_id)
for k, v in processor.feature_extractor.__dict__.items(): for k, v in processor.feature_extractor.__dict__.items():
self.assertEqual(v, getattr(new_processor.feature_extractor, k)) self.assertEqual(v, getattr(new_processor.feature_extractor, k))
self.assertDictEqual(new_processor.tokenizer.get_vocab(), processor.tokenizer.get_vocab()) self.assertDictEqual(new_processor.tokenizer.get_vocab(), processor.tokenizer.get_vocab())
finally:
# Always (try to) delete the repo.
self._try_delete_repo(repo_id=tmp_repo, token=self._token)
def test_push_to_hub_in_organization_via_save_pretrained(self): def test_push_to_hub_in_organization_via_save_pretrained(self):
with tempfile.TemporaryDirectory() as tmp_dir: with TemporaryHubRepo(namespace="valid_org", token=self._token) as tmp_repo:
try: processor = Wav2Vec2Processor.from_pretrained(SAMPLE_PROCESSOR_CONFIG_DIR)
tmp_repo = f"valid_org/test-processor-org-{Path(tmp_dir).name}" # Push to hub via save_pretrained
processor = Wav2Vec2Processor.from_pretrained(SAMPLE_PROCESSOR_CONFIG_DIR) with tempfile.TemporaryDirectory() as tmp_dir:
# Push to hub via save_pretrained
processor.save_pretrained( processor.save_pretrained(
tmp_dir, tmp_dir,
repo_id=tmp_repo, repo_id=tmp_repo.repo_id,
push_to_hub=True, push_to_hub=True,
token=self._token, token=self._token,
) )
new_processor = Wav2Vec2Processor.from_pretrained(tmp_repo) new_processor = Wav2Vec2Processor.from_pretrained(tmp_repo.repo_id)
for k, v in processor.feature_extractor.__dict__.items(): for k, v in processor.feature_extractor.__dict__.items():
self.assertEqual(v, getattr(new_processor.feature_extractor, k)) self.assertEqual(v, getattr(new_processor.feature_extractor, k))
self.assertDictEqual(new_processor.tokenizer.get_vocab(), processor.tokenizer.get_vocab()) self.assertDictEqual(new_processor.tokenizer.get_vocab(), processor.tokenizer.get_vocab())
finally:
# Always (try to) delete the repo.
self._try_delete_repo(repo_id=tmp_repo, token=self._token)
def test_push_to_hub_dynamic_processor(self): def test_push_to_hub_dynamic_processor(self):
with tempfile.TemporaryDirectory() as tmp_dir: with TemporaryHubRepo(token=self._token) as tmp_repo:
try: CustomFeatureExtractor.register_for_auto_class()
tmp_repo = f"{USER}/test-dynamic-processor-{Path(tmp_dir).name}" CustomTokenizer.register_for_auto_class()
CustomProcessor.register_for_auto_class()
CustomFeatureExtractor.register_for_auto_class() feature_extractor = CustomFeatureExtractor.from_pretrained(SAMPLE_PROCESSOR_CONFIG_DIR)
CustomTokenizer.register_for_auto_class()
CustomProcessor.register_for_auto_class()
feature_extractor = CustomFeatureExtractor.from_pretrained(SAMPLE_PROCESSOR_CONFIG_DIR) with tempfile.TemporaryDirectory() as tmp_dir:
vocab_file = os.path.join(tmp_dir, "vocab.txt")
with open(vocab_file, "w", encoding="utf-8") as vocab_writer:
vocab_writer.write("".join([x + "\n" for x in self.vocab_tokens]))
tokenizer = CustomTokenizer(vocab_file)
with tempfile.TemporaryDirectory() as tmp_dir: processor = CustomProcessor(feature_extractor, tokenizer)
vocab_file = os.path.join(tmp_dir, "vocab.txt")
with open(vocab_file, "w", encoding="utf-8") as vocab_writer:
vocab_writer.write("".join([x + "\n" for x in self.vocab_tokens]))
tokenizer = CustomTokenizer(vocab_file)
processor = CustomProcessor(feature_extractor, tokenizer) with tempfile.TemporaryDirectory() as tmp_dir:
create_repo(tmp_repo, token=self._token)
repo = Repository(tmp_dir, clone_from=tmp_repo, token=self._token) repo = Repository(tmp_dir, clone_from=tmp_repo, token=self._token)
processor.save_pretrained(tmp_dir) processor.save_pretrained(tmp_dir)
@@ -468,10 +448,6 @@ class ProcessorPushToHubTester(unittest.TestCase):
repo.push_to_hub() repo.push_to_hub()
new_processor = AutoProcessor.from_pretrained(tmp_repo, trust_remote_code=True) new_processor = AutoProcessor.from_pretrained(tmp_repo.repo_id, trust_remote_code=True)
# Can't make an isinstance check because the new_processor is from the CustomProcessor class of a dynamic module # Can't make an isinstance check because the new_processor is from the CustomProcessor class of a dynamic module
self.assertEqual(new_processor.__class__.__name__, "CustomProcessor") self.assertEqual(new_processor.__class__.__name__, "CustomProcessor")
finally:
# Always (try to) delete the repo.
self._try_delete_repo(repo_id=tmp_repo, token=self._token)

View File

@@ -32,10 +32,9 @@ from typing import Dict, List
from unittest.mock import Mock, patch from unittest.mock import Mock, patch
import numpy as np import numpy as np
from huggingface_hub import HfFolder, ModelCard, create_branch, delete_repo, list_repo_commits, list_repo_files from huggingface_hub import HfFolder, ModelCard, create_branch, list_repo_commits, list_repo_files
from packaging import version from packaging import version
from parameterized import parameterized from parameterized import parameterized
from requests.exceptions import HTTPError
from transformers import ( from transformers import (
AutoFeatureExtractor, AutoFeatureExtractor,
@@ -59,6 +58,7 @@ from transformers.testing_utils import (
USER, USER,
CaptureLogger, CaptureLogger,
LoggingLevel, LoggingLevel,
TemporaryHubRepo,
TestCasePlus, TestCasePlus,
backend_device_count, backend_device_count,
execute_subprocess_async, execute_subprocess_async,
@@ -4152,64 +4152,49 @@ class TrainerIntegrationWithHubTester(unittest.TestCase):
cls._token = TOKEN cls._token = TOKEN
HfFolder.save_token(TOKEN) HfFolder.save_token(TOKEN)
@classmethod
def tearDownClass(cls):
for model in [
"test-trainer",
"test-trainer-epoch",
"test-trainer-step",
"test-trainer-tensorboard",
"test-trainer-tags",
]:
try:
delete_repo(token=cls._token, repo_id=model)
except HTTPError:
pass
try:
delete_repo(token=cls._token, repo_id="valid_org/test-trainer-org")
except HTTPError:
pass
def test_push_to_hub(self): def test_push_to_hub(self):
with tempfile.TemporaryDirectory() as tmp_dir: with TemporaryHubRepo(token=self._token) as tmp_repo:
trainer = get_regression_trainer( output_dir_name = tmp_repo.repo_name
output_dir=os.path.join(tmp_dir, "test-trainer"), with tempfile.TemporaryDirectory() as tmp_dir:
push_to_hub=True, trainer = get_regression_trainer(
hub_token=self._token, output_dir=os.path.join(tmp_dir, output_dir_name),
) push_to_hub=True,
url = trainer.push_to_hub() hub_token=self._token,
)
url = trainer.push_to_hub()
# Extract repo_name from the url # Extract repo_name from the url
re_search = re.search(ENDPOINT_STAGING + r"/([^/]+/[^/]+)/", url) re_search = re.search(ENDPOINT_STAGING + r"/([^/]+/[^/]+)/", url)
self.assertTrue(re_search is not None) self.assertTrue(re_search is not None)
repo_name = re_search.groups()[0] repo_name = re_search.groups()[0]
self.assertEqual(repo_name, f"{USER}/test-trainer") self.assertEqual(repo_name, f"{USER}/{output_dir_name}")
model = RegressionPreTrainedModel.from_pretrained(repo_name) model = RegressionPreTrainedModel.from_pretrained(repo_name)
self.assertEqual(model.a.item(), trainer.model.a.item()) self.assertEqual(model.a.item(), trainer.model.a.item())
self.assertEqual(model.b.item(), trainer.model.b.item()) self.assertEqual(model.b.item(), trainer.model.b.item())
def test_push_to_hub_in_organization(self): def test_push_to_hub_in_organization(self):
with tempfile.TemporaryDirectory() as tmp_dir: with TemporaryHubRepo(namespace="valid_org", token=self._token) as tmp_repo:
trainer = get_regression_trainer(output_dir=tmp_dir) with tempfile.TemporaryDirectory() as tmp_dir:
trainer.save_model() trainer = get_regression_trainer(output_dir=tmp_dir)
trainer = get_regression_trainer( trainer.save_model()
output_dir=os.path.join(tmp_dir, "test-trainer-org"), output_dir_name = tmp_repo.repo_name
push_to_hub=True, trainer = get_regression_trainer(
hub_model_id="valid_org/test-trainer-org", output_dir=os.path.join(tmp_dir, output_dir_name),
hub_token=self._token, push_to_hub=True,
) hub_model_id=f"valid_org/{output_dir_name}",
url = trainer.push_to_hub() hub_token=self._token,
)
url = trainer.push_to_hub()
# Extract repo_name from the url # Extract repo_name from the url
re_search = re.search(ENDPOINT_STAGING + r"/([^/]+/[^/]+)/", url) re_search = re.search(ENDPOINT_STAGING + r"/([^/]+/[^/]+)/", url)
self.assertTrue(re_search is not None) self.assertTrue(re_search is not None)
repo_name = re_search.groups()[0] repo_name = re_search.groups()[0]
self.assertEqual(repo_name, "valid_org/test-trainer-org") self.assertEqual(repo_name, f"valid_org/{output_dir_name}")
model = RegressionPreTrainedModel.from_pretrained("valid_org/test-trainer-org") model = RegressionPreTrainedModel.from_pretrained(f"valid_org/{output_dir_name}")
self.assertEqual(model.a.item(), trainer.model.a.item()) self.assertEqual(model.a.item(), trainer.model.a.item())
self.assertEqual(model.b.item(), trainer.model.b.item()) self.assertEqual(model.b.item(), trainer.model.b.item())
@@ -4226,120 +4211,130 @@ class TrainerIntegrationWithHubTester(unittest.TestCase):
return [commit.strip() for commit in commits] return [commit.strip() for commit in commits]
def test_push_to_hub_with_saves_each_epoch(self): def test_push_to_hub_with_saves_each_epoch(self):
with tempfile.TemporaryDirectory() as tmp_dir: with TemporaryHubRepo(token=self._token) as tmp_repo:
with self.assertLogs(level="WARNING") as logs: with tempfile.TemporaryDirectory() as tmp_dir:
trainer = get_regression_trainer( with self.assertLogs(level="WARNING") as logs:
output_dir=os.path.join(tmp_dir, "test-trainer-epoch"), output_dir_name = tmp_repo.repo_name
push_to_hub=True, trainer = get_regression_trainer(
hub_token=self._token, output_dir=os.path.join(tmp_dir, output_dir_name),
# To avoid any flakiness if the training goes faster than the uploads. push_to_hub=True,
hub_always_push=True, hub_token=self._token,
save_strategy="epoch", # To avoid any flakiness if the training goes faster than the uploads.
) hub_always_push=True,
trainer.train() save_strategy="epoch",
)
trainer.train()
commits = list_repo_commits(f"{USER}/test-trainer-epoch", token=self._token) commits = list_repo_commits(f"{USER}/{output_dir_name}", token=self._token)
commits = [c.title for c in commits] commits = [c.title for c in commits]
self.assertIn("initial commit", commits) self.assertIn("initial commit", commits)
self.assertIn("Training in progress, epoch 1", commits) self.assertIn("Training in progress, epoch 1", commits)
self.assertIn("Training in progress, epoch 2", commits) self.assertIn("Training in progress, epoch 2", commits)
# Epochs 3 and 4 are not guaranteed to be present (empty commits) # Epochs 3 and 4 are not guaranteed to be present (empty commits)
self.assertTrue(any("Skipping to prevent empty commit." in record.message for record in logs.records)) self.assertTrue(any("Skipping to prevent empty commit." in record.message for record in logs.records))
def test_push_to_hub_with_saves_each_n_steps(self): def test_push_to_hub_with_saves_each_n_steps(self):
num_gpus = max(1, backend_device_count(torch_device)) num_gpus = max(1, backend_device_count(torch_device))
if num_gpus > 2: if num_gpus > 2:
self.skipTest(reason="More than 2 GPUs available") self.skipTest(reason="More than 2 GPUs available")
with tempfile.TemporaryDirectory() as tmp_dir: with TemporaryHubRepo(token=self._token) as tmp_repo:
with self.assertLogs(level="WARNING") as logs: with tempfile.TemporaryDirectory() as tmp_dir:
trainer = get_regression_trainer( with self.assertLogs(level="WARNING") as logs:
output_dir=os.path.join(tmp_dir, "test-trainer-step"), output_dir_name = tmp_repo.repo_name
push_to_hub=True, trainer = get_regression_trainer(
hub_token=self._token, output_dir=os.path.join(tmp_dir, output_dir_name),
# To avoid any flakiness if the training goes faster than the uploads. push_to_hub=True,
hub_always_push=True, hub_token=self._token,
save_strategy="steps", # To avoid any flakiness if the training goes faster than the uploads.
save_steps=5, hub_always_push=True,
) save_strategy="steps",
trainer.train() save_steps=5,
)
trainer.train()
commits = list_repo_commits(f"{USER}/test-trainer-step", token=self._token) commits = list_repo_commits(f"{USER}/{output_dir_name}", token=self._token)
commits = [c.title for c in commits] commits = [c.title for c in commits]
self.assertIn("initial commit", commits) self.assertIn("initial commit", commits)
# Some commits are skipped if nothing has changed # Some commits are skipped if nothing has changed
# We expect 1 commit per 5 steps + 1 commit at the end # We expect 1 commit per 5 steps + 1 commit at the end
nb_empty_commits = len( nb_empty_commits = len(
[record for record in logs.records if "Skipping to prevent empty commit." in record.message] [record for record in logs.records if "Skipping to prevent empty commit." in record.message]
) )
nb_epoch_commits = len([commit for commit in commits if "Training in progress, step" in commit]) nb_epoch_commits = len([commit for commit in commits if "Training in progress, step" in commit])
# max_steps depends on the number of available GPUs # max_steps depends on the number of available GPUs
max_steps = math.ceil(trainer.args.num_train_epochs * len(trainer.get_train_dataloader())) max_steps = math.ceil(trainer.args.num_train_epochs * len(trainer.get_train_dataloader()))
nb_expected_commits = len(range(5, max_steps, 5)) nb_expected_commits = len(range(5, max_steps, 5))
# '>=' since final commit might be an empty commit as well (not deterministic) # '>=' since final commit might be an empty commit as well (not deterministic)
self.assertGreaterEqual(nb_empty_commits + nb_epoch_commits, nb_expected_commits) self.assertGreaterEqual(nb_empty_commits + nb_epoch_commits, nb_expected_commits)
@require_tensorboard @require_tensorboard
def test_push_to_hub_with_tensorboard_logs(self): def test_push_to_hub_with_tensorboard_logs(self):
with tempfile.TemporaryDirectory() as tmp_dir: with TemporaryHubRepo(token=self._token) as tmp_repo:
trainer = get_regression_trainer( with tempfile.TemporaryDirectory() as tmp_dir:
output_dir=os.path.join(tmp_dir, "test-trainer-tensorboard"), output_dir_name = tmp_repo.repo_name
hub_token=self._token, trainer = get_regression_trainer(
save_strategy="epoch", output_dir=os.path.join(tmp_dir, output_dir_name),
report_to=["tensorboard"], hub_token=self._token,
keep_report_to=True, save_strategy="epoch",
) report_to=["tensorboard"],
trainer.train() keep_report_to=True,
# Push the runs via `push_to_hub()` )
trainer.push_to_hub() trainer.train()
# Push the runs via `push_to_hub()`
trainer.push_to_hub()
files = list_repo_files(f"{USER}/test-trainer-tensorboard", token=self._token) files = list_repo_files(f"{USER}/{output_dir_name}", token=self._token)
found_log = False found_log = False
for f in files: for f in files:
if len(f.split("runs")) > 1 and "events.out.tfevents" in f: if len(f.split("runs")) > 1 and "events.out.tfevents" in f:
found_log = True found_log = True
assert found_log is True, "No tensorboard log found in repo" assert found_log is True, "No tensorboard log found in repo"
def test_push_to_hub_tags(self): def test_push_to_hub_tags(self):
# Checks if `trainer.push_to_hub()` works correctly by adding the desired # Checks if `trainer.push_to_hub()` works correctly by adding the desired
# tag without having to pass `tags` in `push_to_hub` # tag without having to pass `tags` in `push_to_hub`
# see: # see:
with tempfile.TemporaryDirectory() as tmp_dir: with TemporaryHubRepo(token=self._token) as tmp_repo:
trainer = get_regression_trainer( with tempfile.TemporaryDirectory() as tmp_dir:
output_dir=os.path.join(tmp_dir, "test-trainer-tags"), output_dir_name = tmp_repo.repo_name
push_to_hub=True, trainer = get_regression_trainer(
hub_token=self._token, output_dir=os.path.join(tmp_dir, output_dir_name),
) push_to_hub=True,
hub_token=self._token,
)
trainer.model.add_model_tags(["test-trainer-tags"]) trainer.model.add_model_tags(["test-trainer-tags"])
url = trainer.push_to_hub() url = trainer.push_to_hub()
# Extract repo_name from the url # Extract repo_name from the url
re_search = re.search(ENDPOINT_STAGING + r"/([^/]+/[^/]+)/", url) re_search = re.search(ENDPOINT_STAGING + r"/([^/]+/[^/]+)/", url)
self.assertTrue(re_search is not None) self.assertTrue(re_search is not None)
repo_name = re_search.groups()[0] repo_name = re_search.groups()[0]
self.assertEqual(repo_name, f"{USER}/test-trainer-tags") self.assertEqual(repo_name, f"{USER}/{output_dir_name}")
model_card = ModelCard.load(repo_name) model_card = ModelCard.load(repo_name)
self.assertTrue("test-trainer-tags" in model_card.data.tags) self.assertTrue("test-trainer-tags" in model_card.data.tags)
def test_push_to_hub_with_revision(self): def test_push_to_hub_with_revision(self):
# Checks if `trainer.push_to_hub()` works correctly by adding revision # Checks if `trainer.push_to_hub()` works correctly by adding revision
with tempfile.TemporaryDirectory() as tmp_dir: with TemporaryHubRepo(token=self._token) as tmp_repo:
trainer = get_regression_trainer( with tempfile.TemporaryDirectory() as tmp_dir:
output_dir=os.path.join(tmp_dir, "test-trainer-revision"), output_dir_name = tmp_repo.repo_name
push_to_hub=True, trainer = get_regression_trainer(
hub_token=self._token, output_dir=os.path.join(tmp_dir, output_dir_name),
) push_to_hub=True,
branch = "v1.0" hub_token=self._token,
create_branch(repo_id=trainer.hub_model_id, branch=branch, token=self._token, exist_ok=True) )
url = trainer.push_to_hub(revision=branch) branch = "v1.0"
create_branch(repo_id=trainer.hub_model_id, branch=branch, token=self._token, exist_ok=True)
url = trainer.push_to_hub(revision=branch)
# Extract branch from the url # Extract branch from the url
re_search = re.search(r"tree/([^/]+)/", url) re_search = re.search(r"tree/([^/]+)/", url)

View File

@@ -22,12 +22,12 @@ import unittest.mock as mock
import warnings import warnings
from pathlib import Path from pathlib import Path
from huggingface_hub import HfFolder, delete_repo from huggingface_hub import HfFolder
from requests.exceptions import HTTPError from requests.exceptions import HTTPError
from transformers import AutoConfig, BertConfig, GPT2Config from transformers import AutoConfig, BertConfig, GPT2Config
from transformers.configuration_utils import PretrainedConfig from transformers.configuration_utils import PretrainedConfig
from transformers.testing_utils import TOKEN, USER, is_staging_test from transformers.testing_utils import TOKEN, TemporaryHubRepo, is_staging_test
sys.path.append(str(Path(__file__).parent.parent.parent / "utils")) sys.path.append(str(Path(__file__).parent.parent.parent / "utils"))
@@ -98,106 +98,72 @@ class ConfigPushToHubTester(unittest.TestCase):
cls._token = TOKEN cls._token = TOKEN
HfFolder.save_token(TOKEN) HfFolder.save_token(TOKEN)
@staticmethod
def _try_delete_repo(repo_id, token):
try:
# Reset repo
delete_repo(repo_id=repo_id, token=token)
except: # noqa E722
pass
def test_push_to_hub(self): def test_push_to_hub(self):
with tempfile.TemporaryDirectory() as tmp_dir: with TemporaryHubRepo(token=self._token) as tmp_repo:
try: config = BertConfig(
tmp_repo = f"{USER}/test-config-{Path(tmp_dir).name}" vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37
)
config.push_to_hub(tmp_repo.repo_id, token=self._token)
config = BertConfig( new_config = BertConfig.from_pretrained(tmp_repo.repo_id)
vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37 for k, v in config.to_dict().items():
) if k != "transformers_version":
config.push_to_hub(tmp_repo, token=self._token) self.assertEqual(v, getattr(new_config, k))
new_config = BertConfig.from_pretrained(tmp_repo)
for k, v in config.to_dict().items():
if k != "transformers_version":
self.assertEqual(v, getattr(new_config, k))
finally:
# Always (try to) delete the repo.
self._try_delete_repo(repo_id=tmp_repo, token=self._token)
def test_push_to_hub_via_save_pretrained(self): def test_push_to_hub_via_save_pretrained(self):
with tempfile.TemporaryDirectory() as tmp_dir: with TemporaryHubRepo(token=self._token) as tmp_repo:
try: config = BertConfig(
tmp_repo = f"{USER}/test-config-{Path(tmp_dir).name}" vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37
)
# Push to hub via save_pretrained
with tempfile.TemporaryDirectory() as tmp_dir:
config.save_pretrained(tmp_dir, repo_id=tmp_repo.repo_id, push_to_hub=True, token=self._token)
config = BertConfig( new_config = BertConfig.from_pretrained(tmp_repo.repo_id)
vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37 for k, v in config.to_dict().items():
) if k != "transformers_version":
# Push to hub via save_pretrained self.assertEqual(v, getattr(new_config, k))
config.save_pretrained(tmp_dir, repo_id=tmp_repo, push_to_hub=True, token=self._token)
new_config = BertConfig.from_pretrained(tmp_repo)
for k, v in config.to_dict().items():
if k != "transformers_version":
self.assertEqual(v, getattr(new_config, k))
finally:
# Always (try to) delete the repo.
self._try_delete_repo(repo_id=tmp_repo, token=self._token)
def test_push_to_hub_in_organization(self): def test_push_to_hub_in_organization(self):
with tempfile.TemporaryDirectory() as tmp_dir: with TemporaryHubRepo(namespace="valid_org", token=self._token) as tmp_repo:
try: config = BertConfig(
tmp_repo = f"valid_org/test-config-org-{Path(tmp_dir).name}" vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37
config = BertConfig( )
vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37 config.push_to_hub(tmp_repo.repo_id, token=self._token)
)
config.push_to_hub(tmp_repo, token=self._token)
new_config = BertConfig.from_pretrained(tmp_repo) new_config = BertConfig.from_pretrained(tmp_repo.repo_id)
for k, v in config.to_dict().items(): for k, v in config.to_dict().items():
if k != "transformers_version": if k != "transformers_version":
self.assertEqual(v, getattr(new_config, k)) self.assertEqual(v, getattr(new_config, k))
finally:
# Always (try to) delete the repo.
self._try_delete_repo(repo_id=tmp_repo, token=self._token)
def test_push_to_hub_in_organization_via_save_pretrained(self): def test_push_to_hub_in_organization_via_save_pretrained(self):
with tempfile.TemporaryDirectory() as tmp_dir: with TemporaryHubRepo(namespace="valid_org", token=self._token) as tmp_repo:
try: config = BertConfig(
tmp_repo = f"valid_org/test-config-org-{Path(tmp_dir).name}" vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37
config = BertConfig( )
vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37 # Push to hub via save_pretrained
) with tempfile.TemporaryDirectory() as tmp_dir:
# Push to hub via save_pretrained config.save_pretrained(tmp_dir, repo_id=tmp_repo.repo_id, push_to_hub=True, token=self._token)
config.save_pretrained(tmp_dir, repo_id=tmp_repo, push_to_hub=True, token=self._token)
new_config = BertConfig.from_pretrained(tmp_repo) new_config = BertConfig.from_pretrained(tmp_repo.repo_id)
for k, v in config.to_dict().items(): for k, v in config.to_dict().items():
if k != "transformers_version": if k != "transformers_version":
self.assertEqual(v, getattr(new_config, k)) self.assertEqual(v, getattr(new_config, k))
finally:
# Always (try to) delete the repo.
self._try_delete_repo(repo_id=tmp_repo, token=self._token)
def test_push_to_hub_dynamic_config(self): def test_push_to_hub_dynamic_config(self):
with tempfile.TemporaryDirectory() as tmp_dir: with TemporaryHubRepo(token=self._token) as tmp_repo:
try: CustomConfig.register_for_auto_class()
tmp_repo = f"{USER}/test-dynamic-config-{Path(tmp_dir).name}" config = CustomConfig(attribute=42)
CustomConfig.register_for_auto_class() config.push_to_hub(tmp_repo.repo_id, token=self._token)
config = CustomConfig(attribute=42)
config.push_to_hub(tmp_repo, token=self._token) # This has added the proper auto_map field to the config
self.assertDictEqual(config.auto_map, {"AutoConfig": "custom_configuration.CustomConfig"})
# This has added the proper auto_map field to the config new_config = AutoConfig.from_pretrained(tmp_repo.repo_id, trust_remote_code=True)
self.assertDictEqual(config.auto_map, {"AutoConfig": "custom_configuration.CustomConfig"}) # Can't make an isinstance check because the new_config is from the FakeConfig class of a dynamic module
self.assertEqual(new_config.__class__.__name__, "CustomConfig")
new_config = AutoConfig.from_pretrained(tmp_repo, trust_remote_code=True) self.assertEqual(new_config.attribute, 42)
# Can't make an isinstance check because the new_config is from the FakeConfig class of a dynamic module
self.assertEqual(new_config.__class__.__name__, "CustomConfig")
self.assertEqual(new_config.attribute, 42)
finally:
# Always (try to) delete the repo.
self._try_delete_repo(repo_id=tmp_repo, token=self._token)
class ConfigTestUtils(unittest.TestCase): class ConfigTestUtils(unittest.TestCase):

View File

@@ -20,11 +20,11 @@ import unittest
import unittest.mock as mock import unittest.mock as mock
from pathlib import Path from pathlib import Path
from huggingface_hub import HfFolder, delete_repo from huggingface_hub import HfFolder
from requests.exceptions import HTTPError from requests.exceptions import HTTPError
from transformers import AutoFeatureExtractor, Wav2Vec2FeatureExtractor from transformers import AutoFeatureExtractor, Wav2Vec2FeatureExtractor
from transformers.testing_utils import TOKEN, USER, get_tests_dir, is_staging_test from transformers.testing_utils import TOKEN, TemporaryHubRepo, get_tests_dir, is_staging_test
sys.path.append(str(Path(__file__).parent.parent.parent / "utils")) sys.path.append(str(Path(__file__).parent.parent.parent / "utils"))
@@ -60,91 +60,63 @@ class FeatureExtractorPushToHubTester(unittest.TestCase):
cls._token = TOKEN cls._token = TOKEN
HfFolder.save_token(TOKEN) HfFolder.save_token(TOKEN)
@staticmethod
def _try_delete_repo(repo_id, token):
try:
# Reset repo
delete_repo(repo_id=repo_id, token=token)
except: # noqa E722
pass
def test_push_to_hub(self): def test_push_to_hub(self):
with tempfile.TemporaryDirectory() as tmp_dir: with TemporaryHubRepo(token=self._token) as tmp_repo:
try: feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(SAMPLE_FEATURE_EXTRACTION_CONFIG_DIR)
tmp_repo = f"{USER}/test-feature-extractor-{Path(tmp_dir).name}" feature_extractor.push_to_hub(tmp_repo.repo_id, token=self._token)
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(SAMPLE_FEATURE_EXTRACTION_CONFIG_DIR) new_feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(tmp_repo.repo_id)
feature_extractor.push_to_hub(tmp_repo, token=self._token) for k, v in feature_extractor.__dict__.items():
self.assertEqual(v, getattr(new_feature_extractor, k))
new_feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(tmp_repo)
for k, v in feature_extractor.__dict__.items():
self.assertEqual(v, getattr(new_feature_extractor, k))
finally:
# Always (try to) delete the repo.
self._try_delete_repo(repo_id=tmp_repo, token=self._token)
def test_push_to_hub_via_save_pretrained(self): def test_push_to_hub_via_save_pretrained(self):
with tempfile.TemporaryDirectory() as tmp_dir: with TemporaryHubRepo(token=self._token) as tmp_repo:
try: feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(SAMPLE_FEATURE_EXTRACTION_CONFIG_DIR)
tmp_repo = f"{USER}/test-feature-extractor-{Path(tmp_dir).name}" # Push to hub via save_pretrained
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(SAMPLE_FEATURE_EXTRACTION_CONFIG_DIR) with tempfile.TemporaryDirectory() as tmp_dir:
# Push to hub via save_pretrained feature_extractor.save_pretrained(
feature_extractor.save_pretrained(tmp_dir, repo_id=tmp_repo, push_to_hub=True, token=self._token) tmp_dir, repo_id=tmp_repo.repo_id, push_to_hub=True, token=self._token
new_feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(tmp_repo)
for k, v in feature_extractor.__dict__.items():
self.assertEqual(v, getattr(new_feature_extractor, k))
finally:
# Always (try to) delete the repo.
self._try_delete_repo(repo_id=tmp_repo, token=self._token)
def test_push_to_hub_in_organization(self):
with tempfile.TemporaryDirectory() as tmp_dir:
try:
tmp_repo = f"valid_org/test-feature-extractor-{Path(tmp_dir).name}"
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(SAMPLE_FEATURE_EXTRACTION_CONFIG_DIR)
feature_extractor.push_to_hub(tmp_repo, token=self._token)
new_feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(tmp_repo)
for k, v in feature_extractor.__dict__.items():
self.assertEqual(v, getattr(new_feature_extractor, k))
finally:
# Always (try to) delete the repo.
self._try_delete_repo(repo_id=tmp_repo, token=self._token)
def test_push_to_hub_in_organization_via_save_pretrained(self):
with tempfile.TemporaryDirectory() as tmp_dir:
try:
tmp_repo = f"valid_org/test-feature-extractor-{Path(tmp_dir).name}"
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(SAMPLE_FEATURE_EXTRACTION_CONFIG_DIR)
# Push to hub via save_pretrained
feature_extractor.save_pretrained(tmp_dir, repo_id=tmp_repo, push_to_hub=True, token=self._token)
new_feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(tmp_repo)
for k, v in feature_extractor.__dict__.items():
self.assertEqual(v, getattr(new_feature_extractor, k))
finally:
# Always (try to) delete the repo.
self._try_delete_repo(repo_id=tmp_repo, token=self._token)
def test_push_to_hub_dynamic_feature_extractor(self):
with tempfile.TemporaryDirectory() as tmp_dir:
try:
tmp_repo = f"{USER}/test-dynamic-feature-extractor-{Path(tmp_dir).name}"
CustomFeatureExtractor.register_for_auto_class()
feature_extractor = CustomFeatureExtractor.from_pretrained(SAMPLE_FEATURE_EXTRACTION_CONFIG_DIR)
feature_extractor.push_to_hub(tmp_repo, token=self._token)
# This has added the proper auto_map field to the config
self.assertDictEqual(
feature_extractor.auto_map,
{"AutoFeatureExtractor": "custom_feature_extraction.CustomFeatureExtractor"},
) )
new_feature_extractor = AutoFeatureExtractor.from_pretrained(tmp_repo, trust_remote_code=True) new_feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(tmp_repo.repo_id)
# Can't make an isinstance check because the new_feature_extractor is from the CustomFeatureExtractor class of a dynamic module for k, v in feature_extractor.__dict__.items():
self.assertEqual(new_feature_extractor.__class__.__name__, "CustomFeatureExtractor") self.assertEqual(v, getattr(new_feature_extractor, k))
finally:
# Always (try to) delete the repo. def test_push_to_hub_in_organization(self):
self._try_delete_repo(repo_id=tmp_repo, token=self._token) with TemporaryHubRepo(namespace="valid_org", token=self._token) as tmp_repo:
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(SAMPLE_FEATURE_EXTRACTION_CONFIG_DIR)
feature_extractor.push_to_hub(tmp_repo.repo_id, token=self._token)
new_feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(tmp_repo.repo_id)
for k, v in feature_extractor.__dict__.items():
self.assertEqual(v, getattr(new_feature_extractor, k))
def test_push_to_hub_in_organization_via_save_pretrained(self):
with TemporaryHubRepo(namespace="valid_org", token=self._token) as tmp_repo:
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(SAMPLE_FEATURE_EXTRACTION_CONFIG_DIR)
# Push to hub via save_pretrained
with tempfile.TemporaryDirectory() as tmp_dir:
feature_extractor.save_pretrained(
tmp_dir, repo_id=tmp_repo.repo_id, push_to_hub=True, token=self._token
)
new_feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(tmp_repo.repo_id)
for k, v in feature_extractor.__dict__.items():
self.assertEqual(v, getattr(new_feature_extractor, k))
def test_push_to_hub_dynamic_feature_extractor(self):
with TemporaryHubRepo(token=self._token) as tmp_repo:
CustomFeatureExtractor.register_for_auto_class()
feature_extractor = CustomFeatureExtractor.from_pretrained(SAMPLE_FEATURE_EXTRACTION_CONFIG_DIR)
feature_extractor.push_to_hub(tmp_repo.repo_id, token=self._token)
# This has added the proper auto_map field to the config
self.assertDictEqual(
feature_extractor.auto_map,
{"AutoFeatureExtractor": "custom_feature_extraction.CustomFeatureExtractor"},
)
new_feature_extractor = AutoFeatureExtractor.from_pretrained(tmp_repo.repo_id, trust_remote_code=True)
# Can't make an isinstance check because the new_feature_extractor is from the CustomFeatureExtractor class of a dynamic module
self.assertEqual(new_feature_extractor.__class__.__name__, "CustomFeatureExtractor")

View File

@@ -19,12 +19,12 @@ import unittest
import unittest.mock as mock import unittest.mock as mock
from pathlib import Path from pathlib import Path
from huggingface_hub import HfFolder, delete_repo from huggingface_hub import HfFolder
from requests.exceptions import HTTPError from requests.exceptions import HTTPError
from transformers import AutoImageProcessor, ViTImageProcessor from transformers import AutoImageProcessor, ViTImageProcessor
from transformers.image_processing_utils import get_size_dict from transformers.image_processing_utils import get_size_dict
from transformers.testing_utils import TOKEN, USER, get_tests_dir, is_staging_test from transformers.testing_utils import TOKEN, TemporaryHubRepo, get_tests_dir, is_staging_test
sys.path.append(str(Path(__file__).parent.parent.parent / "utils")) sys.path.append(str(Path(__file__).parent.parent.parent / "utils"))
@@ -71,93 +71,62 @@ class ImageProcessorPushToHubTester(unittest.TestCase):
cls._token = TOKEN cls._token = TOKEN
HfFolder.save_token(TOKEN) HfFolder.save_token(TOKEN)
@staticmethod
def _try_delete_repo(repo_id, token):
try:
# Reset repo
delete_repo(repo_id=repo_id, token=token)
except: # noqa E722
pass
def test_push_to_hub(self): def test_push_to_hub(self):
with tempfile.TemporaryDirectory() as tmp_dir: with TemporaryHubRepo(token=self._token) as tmp_repo:
try: image_processor = ViTImageProcessor.from_pretrained(SAMPLE_IMAGE_PROCESSING_CONFIG_DIR)
tmp_repo = f"{USER}/test-image-processor-{Path(tmp_dir).name}" image_processor.push_to_hub(tmp_repo.repo_id, token=self._token)
image_processor = ViTImageProcessor.from_pretrained(SAMPLE_IMAGE_PROCESSING_CONFIG_DIR)
image_processor.push_to_hub(tmp_repo, token=self._token)
new_image_processor = ViTImageProcessor.from_pretrained(tmp_repo) new_image_processor = ViTImageProcessor.from_pretrained(tmp_repo.repo_id)
for k, v in image_processor.__dict__.items(): for k, v in image_processor.__dict__.items():
self.assertEqual(v, getattr(new_image_processor, k)) self.assertEqual(v, getattr(new_image_processor, k))
finally:
# Always (try to) delete the repo.
self._try_delete_repo(repo_id=tmp_repo, token=self._token)
def test_push_to_hub_via_save_pretrained(self): def test_push_to_hub_via_save_pretrained(self):
with tempfile.TemporaryDirectory() as tmp_dir: with TemporaryHubRepo(token=self._token) as tmp_repo:
try: image_processor = ViTImageProcessor.from_pretrained(SAMPLE_IMAGE_PROCESSING_CONFIG_DIR)
tmp_repo = f"{USER}/test-image-processor-{Path(tmp_dir).name}" # Push to hub via save_pretrained
image_processor = ViTImageProcessor.from_pretrained(SAMPLE_IMAGE_PROCESSING_CONFIG_DIR) with tempfile.TemporaryDirectory() as tmp_dir:
# Push to hub via save_pretrained image_processor.save_pretrained(tmp_dir, repo_id=tmp_repo.repo_id, push_to_hub=True, token=self._token)
image_processor.save_pretrained(tmp_dir, repo_id=tmp_repo, push_to_hub=True, token=self._token)
new_image_processor = ViTImageProcessor.from_pretrained(tmp_repo) new_image_processor = ViTImageProcessor.from_pretrained(tmp_repo.repo_id)
for k, v in image_processor.__dict__.items(): for k, v in image_processor.__dict__.items():
self.assertEqual(v, getattr(new_image_processor, k)) self.assertEqual(v, getattr(new_image_processor, k))
finally:
# Always (try to) delete the repo.
self._try_delete_repo(repo_id=tmp_repo, token=self._token)
def test_push_to_hub_in_organization(self): def test_push_to_hub_in_organization(self):
with tempfile.TemporaryDirectory() as tmp_dir: with TemporaryHubRepo(namespace="valid_org", token=self._token) as tmp_repo:
try: image_processor = ViTImageProcessor.from_pretrained(SAMPLE_IMAGE_PROCESSING_CONFIG_DIR)
tmp_repo = f"valid_org/test-image-processor-{Path(tmp_dir).name}" image_processor.push_to_hub(tmp_repo.repo_id, token=self._token)
image_processor = ViTImageProcessor.from_pretrained(SAMPLE_IMAGE_PROCESSING_CONFIG_DIR)
image_processor.push_to_hub(tmp_repo, token=self._token)
new_image_processor = ViTImageProcessor.from_pretrained(tmp_repo) new_image_processor = ViTImageProcessor.from_pretrained(tmp_repo.repo_id)
for k, v in image_processor.__dict__.items(): for k, v in image_processor.__dict__.items():
self.assertEqual(v, getattr(new_image_processor, k)) self.assertEqual(v, getattr(new_image_processor, k))
finally:
# Always (try to) delete the repo.
self._try_delete_repo(repo_id=tmp_repo, token=self._token)
def test_push_to_hub_in_organization_via_save_pretrained(self): def test_push_to_hub_in_organization_via_save_pretrained(self):
with tempfile.TemporaryDirectory() as tmp_dir: with TemporaryHubRepo(namespace="valid_org", token=self._token) as tmp_repo:
try: image_processor = ViTImageProcessor.from_pretrained(SAMPLE_IMAGE_PROCESSING_CONFIG_DIR)
tmp_repo = f"valid_org/test-image-processor-{Path(tmp_dir).name}" # Push to hub via save_pretrained
image_processor = ViTImageProcessor.from_pretrained(SAMPLE_IMAGE_PROCESSING_CONFIG_DIR) with tempfile.TemporaryDirectory() as tmp_dir:
# Push to hub via save_pretrained image_processor.save_pretrained(tmp_dir, repo_id=tmp_repo.repo_id, push_to_hub=True, token=self._token)
image_processor.save_pretrained(tmp_dir, repo_id=tmp_repo, push_to_hub=True, token=self._token)
new_image_processor = ViTImageProcessor.from_pretrained(tmp_repo) new_image_processor = ViTImageProcessor.from_pretrained(tmp_repo.repo_id)
for k, v in image_processor.__dict__.items(): for k, v in image_processor.__dict__.items():
self.assertEqual(v, getattr(new_image_processor, k)) self.assertEqual(v, getattr(new_image_processor, k))
finally:
# Always (try to) delete the repo.
self._try_delete_repo(repo_id=tmp_repo, token=self._token)
def test_push_to_hub_dynamic_image_processor(self): def test_push_to_hub_dynamic_image_processor(self):
with tempfile.TemporaryDirectory() as tmp_dir: with TemporaryHubRepo(token=self._token) as tmp_repo:
try: CustomImageProcessor.register_for_auto_class()
tmp_repo = f"{USER}/test-dynamic-image-processor-{Path(tmp_dir).name}" image_processor = CustomImageProcessor.from_pretrained(SAMPLE_IMAGE_PROCESSING_CONFIG_DIR)
CustomImageProcessor.register_for_auto_class()
image_processor = CustomImageProcessor.from_pretrained(SAMPLE_IMAGE_PROCESSING_CONFIG_DIR)
image_processor.push_to_hub(tmp_repo, token=self._token) image_processor.push_to_hub(tmp_repo.repo_id, token=self._token)
# This has added the proper auto_map field to the config # This has added the proper auto_map field to the config
self.assertDictEqual( self.assertDictEqual(
image_processor.auto_map, image_processor.auto_map,
{"AutoImageProcessor": "custom_image_processing.CustomImageProcessor"}, {"AutoImageProcessor": "custom_image_processing.CustomImageProcessor"},
) )
new_image_processor = AutoImageProcessor.from_pretrained(tmp_repo, trust_remote_code=True) new_image_processor = AutoImageProcessor.from_pretrained(tmp_repo.repo_id, trust_remote_code=True)
# Can't make an isinstance check because the new_image_processor is from the CustomImageProcessor class of a dynamic module # Can't make an isinstance check because the new_image_processor is from the CustomImageProcessor class of a dynamic module
self.assertEqual(new_image_processor.__class__.__name__, "CustomImageProcessor") self.assertEqual(new_image_processor.__class__.__name__, "CustomImageProcessor")
finally:
# Always (try to) delete the repo.
self._try_delete_repo(repo_id=tmp_repo, token=self._token)
class ImageProcessingUtilsTester(unittest.TestCase): class ImageProcessingUtilsTester(unittest.TestCase):

View File

@@ -14,16 +14,15 @@
import tempfile import tempfile
import unittest import unittest
from pathlib import Path
import numpy as np import numpy as np
from huggingface_hub import HfFolder, delete_repo, snapshot_download from huggingface_hub import HfFolder, snapshot_download
from transformers import BertConfig, BertModel, is_flax_available, is_torch_available from transformers import BertConfig, BertModel, is_flax_available, is_torch_available
from transformers.testing_utils import ( from transformers.testing_utils import (
TOKEN, TOKEN,
USER,
CaptureLogger, CaptureLogger,
TemporaryHubRepo,
is_pt_flax_cross_test, is_pt_flax_cross_test,
is_staging_test, is_staging_test,
require_flax, require_flax,
@@ -55,103 +54,77 @@ class FlaxModelPushToHubTester(unittest.TestCase):
cls._token = TOKEN cls._token = TOKEN
HfFolder.save_token(TOKEN) HfFolder.save_token(TOKEN)
@staticmethod
def _try_delete_repo(repo_id, token):
try:
# Reset repo
delete_repo(repo_id=repo_id, token=token)
except: # noqa E722
pass
def test_push_to_hub(self): def test_push_to_hub(self):
with tempfile.TemporaryDirectory() as tmp_dir: with TemporaryHubRepo(token=self._token) as tmp_repo:
try: config = BertConfig(
tmp_repo = f"{USER}/test-model-flax-{Path(tmp_dir).name}" vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37
config = BertConfig( )
vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37 model = FlaxBertModel(config)
) model.push_to_hub(tmp_repo.repo_id, token=self._token)
model = FlaxBertModel(config)
model.push_to_hub(tmp_repo, token=self._token)
new_model = FlaxBertModel.from_pretrained(tmp_repo) new_model = FlaxBertModel.from_pretrained(tmp_repo.repo_id)
base_params = flatten_dict(unfreeze(model.params)) base_params = flatten_dict(unfreeze(model.params))
new_params = flatten_dict(unfreeze(new_model.params)) new_params = flatten_dict(unfreeze(new_model.params))
for key in base_params.keys(): for key in base_params.keys():
max_diff = (base_params[key] - new_params[key]).sum().item() max_diff = (base_params[key] - new_params[key]).sum().item()
self.assertLessEqual(max_diff, 1e-3, msg=f"{key} not identical") self.assertLessEqual(max_diff, 1e-3, msg=f"{key} not identical")
finally:
# Always (try to) delete the repo.
self._try_delete_repo(repo_id=tmp_repo, token=self._token)
def test_push_to_hub_via_save_pretrained(self): def test_push_to_hub_via_save_pretrained(self):
with tempfile.TemporaryDirectory() as tmp_dir: with TemporaryHubRepo(token=self._token) as tmp_repo:
try: config = BertConfig(
tmp_repo = f"{USER}/test-model-flax-{Path(tmp_dir).name}" vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37
config = BertConfig( )
vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37 model = FlaxBertModel(config)
) # Push to hub via save_pretrained
model = FlaxBertModel(config) with tempfile.TemporaryDirectory() as tmp_dir:
# Push to hub via save_pretrained model.save_pretrained(tmp_dir, repo_id=tmp_repo.repo_id, push_to_hub=True, token=self._token)
model.save_pretrained(tmp_dir, repo_id=tmp_repo, push_to_hub=True, token=self._token)
new_model = FlaxBertModel.from_pretrained(tmp_repo) new_model = FlaxBertModel.from_pretrained(tmp_repo.repo_id)
base_params = flatten_dict(unfreeze(model.params)) base_params = flatten_dict(unfreeze(model.params))
new_params = flatten_dict(unfreeze(new_model.params)) new_params = flatten_dict(unfreeze(new_model.params))
for key in base_params.keys(): for key in base_params.keys():
max_diff = (base_params[key] - new_params[key]).sum().item() max_diff = (base_params[key] - new_params[key]).sum().item()
self.assertLessEqual(max_diff, 1e-3, msg=f"{key} not identical") self.assertLessEqual(max_diff, 1e-3, msg=f"{key} not identical")
finally:
# Always (try to) delete the repo.
self._try_delete_repo(repo_id=tmp_repo, token=self._token)
def test_push_to_hub_in_organization(self): def test_push_to_hub_in_organization(self):
with tempfile.TemporaryDirectory() as tmp_dir: with TemporaryHubRepo(namespace="valid_org", token=self._token) as tmp_repo:
try: config = BertConfig(
tmp_repo = f"valid_org/test-model-flax-org-{Path(tmp_dir).name}" vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37
config = BertConfig( )
vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37 model = FlaxBertModel(config)
) model.push_to_hub(tmp_repo.repo_id, token=self._token)
model = FlaxBertModel(config)
model.push_to_hub(tmp_repo, token=self._token)
new_model = FlaxBertModel.from_pretrained(tmp_repo) new_model = FlaxBertModel.from_pretrained(tmp_repo.repo_id)
base_params = flatten_dict(unfreeze(model.params)) base_params = flatten_dict(unfreeze(model.params))
new_params = flatten_dict(unfreeze(new_model.params)) new_params = flatten_dict(unfreeze(new_model.params))
for key in base_params.keys(): for key in base_params.keys():
max_diff = (base_params[key] - new_params[key]).sum().item() max_diff = (base_params[key] - new_params[key]).sum().item()
self.assertLessEqual(max_diff, 1e-3, msg=f"{key} not identical") self.assertLessEqual(max_diff, 1e-3, msg=f"{key} not identical")
finally:
# Always (try to) delete the repo.
self._try_delete_repo(repo_id=tmp_repo, token=self._token)
def test_push_to_hub_in_organization_via_save_pretrained(self): def test_push_to_hub_in_organization_via_save_pretrained(self):
with tempfile.TemporaryDirectory() as tmp_dir: with TemporaryHubRepo(namespace="valid_org", token=self._token) as tmp_repo:
try: config = BertConfig(
tmp_repo = f"valid_org/test-model-flax-org-{Path(tmp_dir).name}" vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37
config = BertConfig( )
vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37 model = FlaxBertModel(config)
) # Push to hub via save_pretrained
model = FlaxBertModel(config) with tempfile.TemporaryDirectory() as tmp_dir:
# Push to hub via save_pretrained model.save_pretrained(tmp_dir, repo_id=tmp_repo.repo_id, push_to_hub=True, token=self._token)
model.save_pretrained(tmp_dir, repo_id=tmp_repo, push_to_hub=True, token=self._token)
new_model = FlaxBertModel.from_pretrained(tmp_repo) new_model = FlaxBertModel.from_pretrained(tmp_repo.repo_id)
base_params = flatten_dict(unfreeze(model.params)) base_params = flatten_dict(unfreeze(model.params))
new_params = flatten_dict(unfreeze(new_model.params)) new_params = flatten_dict(unfreeze(new_model.params))
for key in base_params.keys(): for key in base_params.keys():
max_diff = (base_params[key] - new_params[key]).sum().item() max_diff = (base_params[key] - new_params[key]).sum().item()
self.assertLessEqual(max_diff, 1e-3, msg=f"{key} not identical") self.assertLessEqual(max_diff, 1e-3, msg=f"{key} not identical")
finally:
# Always (try to) delete the repo.
self._try_delete_repo(repo_id=tmp_repo, token=self._token)
def check_models_equal(model1, model2): def check_models_equal(model1, model2):

View File

@@ -23,9 +23,8 @@ import random
import tempfile import tempfile
import unittest import unittest
import unittest.mock as mock import unittest.mock as mock
from pathlib import Path
from huggingface_hub import HfFolder, Repository, delete_repo, snapshot_download from huggingface_hub import HfFolder, Repository, snapshot_download
from requests.exceptions import HTTPError from requests.exceptions import HTTPError
from transformers import is_tf_available, is_torch_available from transformers import is_tf_available, is_torch_available
@@ -34,6 +33,7 @@ from transformers.testing_utils import ( # noqa: F401
TOKEN, TOKEN,
USER, USER,
CaptureLogger, CaptureLogger,
TemporaryHubRepo,
_tf_gpu_memory_limit, _tf_gpu_memory_limit,
is_pt_tf_cross_test, is_pt_tf_cross_test,
is_staging_test, is_staging_test,
@@ -683,149 +683,119 @@ class TFModelPushToHubTester(unittest.TestCase):
cls._token = TOKEN cls._token = TOKEN
HfFolder.save_token(TOKEN) HfFolder.save_token(TOKEN)
@staticmethod
def _try_delete_repo(repo_id, token):
try:
# Reset repo
delete_repo(repo_id=repo_id, token=token)
except: # noqa E722
pass
def test_push_to_hub(self): def test_push_to_hub(self):
with tempfile.TemporaryDirectory() as tmp_dir: with TemporaryHubRepo(token=self._token) as tmp_repo:
try: config = BertConfig(
tmp_repo = f"{USER}/test-model-tf-{Path(tmp_dir).name}" vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37
config = BertConfig( )
vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37 model = TFBertModel(config)
) # Make sure model is properly initialized
model = TFBertModel(config) model.build_in_name_scope()
# Make sure model is properly initialized
model.build_in_name_scope()
logging.set_verbosity_info() logging.set_verbosity_info()
logger = logging.get_logger("transformers.utils.hub") logger = logging.get_logger("transformers.utils.hub")
with CaptureLogger(logger) as cl: with CaptureLogger(logger) as cl:
model.push_to_hub(tmp_repo, token=self._token) model.push_to_hub(tmp_repo.repo_id, token=self._token)
logging.set_verbosity_warning() logging.set_verbosity_warning()
# Check the model card was created and uploaded. # Check the model card was created and uploaded.
self.assertIn("Uploading the following files to __DUMMY_TRANSFORMERS_USER__/test-model-tf", cl.out) self.assertIn("Uploading the following files to __DUMMY_TRANSFORMERS_USER__/test-model-tf", cl.out)
new_model = TFBertModel.from_pretrained(tmp_repo) new_model = TFBertModel.from_pretrained(tmp_repo.repo_id)
models_equal = True models_equal = True
for p1, p2 in zip(model.weights, new_model.weights): for p1, p2 in zip(model.weights, new_model.weights):
if not tf.math.reduce_all(p1 == p2): if not tf.math.reduce_all(p1 == p2):
models_equal = False models_equal = False
break break
self.assertTrue(models_equal) self.assertTrue(models_equal)
finally:
# Always (try to) delete the repo.
self._try_delete_repo(repo_id=tmp_repo, token=self._token)
def test_push_to_hub_via_save_pretrained(self): def test_push_to_hub_via_save_pretrained(self):
with tempfile.TemporaryDirectory() as tmp_dir: with TemporaryHubRepo(token=self._token) as tmp_repo:
try: config = BertConfig(
tmp_repo = f"{USER}/test-model-tf-{Path(tmp_dir).name}" vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37
config = BertConfig( )
vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37 model = TFBertModel(config)
) # Make sure model is properly initialized
model = TFBertModel(config) model.build_in_name_scope()
# Make sure model is properly initialized
model.build_in_name_scope()
# Push to hub via save_pretrained # Push to hub via save_pretrained
model.save_pretrained(tmp_dir, repo_id=tmp_repo, push_to_hub=True, token=self._token) with tempfile.TemporaryDirectory() as tmp_dir:
model.save_pretrained(tmp_dir, repo_id=tmp_repo.repo_id, push_to_hub=True, token=self._token)
new_model = TFBertModel.from_pretrained(tmp_repo) new_model = TFBertModel.from_pretrained(tmp_repo.repo_id)
models_equal = True models_equal = True
for p1, p2 in zip(model.weights, new_model.weights): for p1, p2 in zip(model.weights, new_model.weights):
if not tf.math.reduce_all(p1 == p2): if not tf.math.reduce_all(p1 == p2):
models_equal = False models_equal = False
break break
self.assertTrue(models_equal) self.assertTrue(models_equal)
finally:
# Always (try to) delete the repo.
self._try_delete_repo(repo_id=tmp_repo, token=self._token)
@is_pt_tf_cross_test @is_pt_tf_cross_test
def test_push_to_hub_callback(self): def test_push_to_hub_callback(self):
with tempfile.TemporaryDirectory() as tmp_dir: with TemporaryHubRepo(token=self._token) as tmp_repo:
try: config = BertConfig(
tmp_repo = f"{USER}/test-model-tf-callback-{Path(tmp_dir).name}" vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37
config = BertConfig( )
vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37 model = TFBertForMaskedLM(config)
) model.compile()
model = TFBertForMaskedLM(config)
model.compile()
with tempfile.TemporaryDirectory() as tmp_dir:
push_to_hub_callback = PushToHubCallback( push_to_hub_callback = PushToHubCallback(
output_dir=tmp_dir, output_dir=tmp_dir,
hub_model_id=tmp_repo, hub_model_id=tmp_repo.repo_id,
hub_token=self._token, hub_token=self._token,
) )
model.fit(model.dummy_inputs, model.dummy_inputs, epochs=1, callbacks=[push_to_hub_callback]) model.fit(model.dummy_inputs, model.dummy_inputs, epochs=1, callbacks=[push_to_hub_callback])
new_model = TFBertForMaskedLM.from_pretrained(tmp_repo) new_model = TFBertForMaskedLM.from_pretrained(tmp_repo.repo_id)
models_equal = True models_equal = True
for p1, p2 in zip(model.weights, new_model.weights): for p1, p2 in zip(model.weights, new_model.weights):
if not tf.math.reduce_all(p1 == p2): if not tf.math.reduce_all(p1 == p2):
models_equal = False models_equal = False
break break
self.assertTrue(models_equal) self.assertTrue(models_equal)
tf_push_to_hub_params = dict(inspect.signature(TFPreTrainedModel.push_to_hub).parameters) tf_push_to_hub_params = dict(inspect.signature(TFPreTrainedModel.push_to_hub).parameters)
tf_push_to_hub_params.pop("base_model_card_args") tf_push_to_hub_params.pop("base_model_card_args")
pt_push_to_hub_params = dict(inspect.signature(PreTrainedModel.push_to_hub).parameters) pt_push_to_hub_params = dict(inspect.signature(PreTrainedModel.push_to_hub).parameters)
pt_push_to_hub_params.pop("deprecated_kwargs") pt_push_to_hub_params.pop("deprecated_kwargs")
self.assertDictEaual(tf_push_to_hub_params, pt_push_to_hub_params) self.assertDictEaual(tf_push_to_hub_params, pt_push_to_hub_params)
finally:
# Always (try to) delete the repo.
self._try_delete_repo(repo_id=tmp_repo, token=self._token)
def test_push_to_hub_in_organization(self): def test_push_to_hub_in_organization(self):
with tempfile.TemporaryDirectory() as tmp_dir: with TemporaryHubRepo(namespace="valid_org", token=self._token) as tmp_repo:
try: config = BertConfig(
tmp_repo = f"valid_org/test-model-tf-org-{Path(tmp_dir).name}" vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37
config = BertConfig( )
vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37 model = TFBertModel(config)
) # Make sure model is properly initialized
model = TFBertModel(config) model.build_in_name_scope()
# Make sure model is properly initialized
model.build_in_name_scope()
model.push_to_hub(tmp_repo, token=self._token) model.push_to_hub(tmp_repo.repo_id, token=self._token)
new_model = TFBertModel.from_pretrained(tmp_repo) new_model = TFBertModel.from_pretrained(tmp_repo.repo_id)
models_equal = True models_equal = True
for p1, p2 in zip(model.weights, new_model.weights): for p1, p2 in zip(model.weights, new_model.weights):
if not tf.math.reduce_all(p1 == p2): if not tf.math.reduce_all(p1 == p2):
models_equal = False models_equal = False
break break
self.assertTrue(models_equal) self.assertTrue(models_equal)
finally:
# Always (try to) delete the repo.
self._try_delete_repo(repo_id=tmp_repo, token=self._token)
def test_push_to_hub_in_organization_via_save_pretrained(self): def test_push_to_hub_in_organization_via_save_pretrained(self):
with tempfile.TemporaryDirectory() as tmp_dir: with TemporaryHubRepo(namespace="valid_org", token=self._token) as tmp_repo:
try: config = BertConfig(
tmp_repo = f"valid_org/test-model-tf-org-{Path(tmp_dir).name}" vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37
config = BertConfig( )
vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37 model = TFBertModel(config)
) # Make sure model is properly initialized
model = TFBertModel(config) model.build_in_name_scope()
# Make sure model is properly initialized
model.build_in_name_scope()
# Push to hub via save_pretrained # Push to hub via save_pretrained
model.save_pretrained(tmp_dir, push_to_hub=True, token=self._token, repo_id=tmp_repo) with tempfile.TemporaryDirectory() as tmp_dir:
model.save_pretrained(tmp_dir, push_to_hub=True, token=self._token, repo_id=tmp_repo.repo_id)
new_model = TFBertModel.from_pretrained(tmp_repo) new_model = TFBertModel.from_pretrained(tmp_repo.repo_id)
models_equal = True models_equal = True
for p1, p2 in zip(model.weights, new_model.weights): for p1, p2 in zip(model.weights, new_model.weights):
if not tf.math.reduce_all(p1 == p2): if not tf.math.reduce_all(p1 == p2):
models_equal = False models_equal = False
break break
self.assertTrue(models_equal) self.assertTrue(models_equal)
finally:
# Always (try to) delete the repo.
self._try_delete_repo(repo_id=tmp_repo, token=self._token)

View File

@@ -28,7 +28,7 @@ import warnings
from pathlib import Path from pathlib import Path
import requests import requests
from huggingface_hub import HfApi, HfFolder, delete_repo from huggingface_hub import HfApi, HfFolder
from pytest import mark from pytest import mark
from requests.exceptions import HTTPError from requests.exceptions import HTTPError
@@ -44,9 +44,9 @@ from transformers import (
) )
from transformers.testing_utils import ( from transformers.testing_utils import (
TOKEN, TOKEN,
USER,
CaptureLogger, CaptureLogger,
LoggingLevel, LoggingLevel,
TemporaryHubRepo,
TestCasePlus, TestCasePlus,
is_staging_test, is_staging_test,
require_accelerate, require_accelerate,
@@ -2000,168 +2000,127 @@ class ModelPushToHubTester(unittest.TestCase):
cls._token = TOKEN cls._token = TOKEN
HfFolder.save_token(TOKEN) HfFolder.save_token(TOKEN)
@staticmethod
def _try_delete_repo(repo_id, token):
try:
# Reset repo
delete_repo(repo_id=repo_id, token=token)
except: # noqa E722
pass
@unittest.skip(reason="This test is flaky") @unittest.skip(reason="This test is flaky")
def test_push_to_hub(self): def test_push_to_hub(self):
with tempfile.TemporaryDirectory() as tmp_dir: with TemporaryHubRepo(token=self._token) as tmp_repo:
try: config = BertConfig(
tmp_repo = f"{USER}/test-model-{Path(tmp_dir).name}" vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37
config = BertConfig( )
vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37 model = BertModel(config)
) model.push_to_hub(tmp_repo.repo_id, token=self._token)
model = BertModel(config)
model.push_to_hub(tmp_repo, token=self._token)
new_model = BertModel.from_pretrained(tmp_repo) new_model = BertModel.from_pretrained(tmp_repo.repo_id)
for p1, p2 in zip(model.parameters(), new_model.parameters()): for p1, p2 in zip(model.parameters(), new_model.parameters()):
self.assertTrue(torch.equal(p1, p2)) self.assertTrue(torch.equal(p1, p2))
finally:
# Always (try to) delete the repo.
self._try_delete_repo(repo_id=tmp_repo, token=self._token)
@unittest.skip(reason="This test is flaky") @unittest.skip(reason="This test is flaky")
def test_push_to_hub_via_save_pretrained(self): def test_push_to_hub_via_save_pretrained(self):
with tempfile.TemporaryDirectory() as tmp_dir: with TemporaryHubRepo(token=self._token) as tmp_repo:
try: config = BertConfig(
tmp_repo = f"{USER}/test-model-{Path(tmp_dir).name}" vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37
config = BertConfig( )
vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37 model = BertModel(config)
) # Push to hub via save_pretrained
model = BertModel(config) with tempfile.TemporaryDirectory() as tmp_dir:
# Push to hub via save_pretrained model.save_pretrained(tmp_dir, repo_id=tmp_repo.repo_id, push_to_hub=True, token=self._token)
model.save_pretrained(tmp_dir, repo_id=tmp_repo, push_to_hub=True, token=self._token)
new_model = BertModel.from_pretrained(tmp_repo) new_model = BertModel.from_pretrained(tmp_repo.repo_id)
for p1, p2 in zip(model.parameters(), new_model.parameters()): for p1, p2 in zip(model.parameters(), new_model.parameters()):
self.assertTrue(torch.equal(p1, p2)) self.assertTrue(torch.equal(p1, p2))
finally:
# Always (try to) delete the repo.
self._try_delete_repo(repo_id=tmp_repo, token=self._token)
def test_push_to_hub_with_description(self): def test_push_to_hub_with_description(self):
with tempfile.TemporaryDirectory() as tmp_dir: with TemporaryHubRepo(token=self._token) as tmp_repo:
try: config = BertConfig(
tmp_repo = f"{USER}/test-model-{Path(tmp_dir).name}" vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37
config = BertConfig( )
vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37 model = BertModel(config)
) COMMIT_DESCRIPTION = """
model = BertModel(config)
COMMIT_DESCRIPTION = """
The commit description supports markdown synthax see: The commit description supports markdown synthax see:
```python ```python
>>> form transformers import AutoConfig >>> form transformers import AutoConfig
>>> config = AutoConfig.from_pretrained("google-bert/bert-base-uncased") >>> config = AutoConfig.from_pretrained("google-bert/bert-base-uncased")
``` ```
""" """
commit_details = model.push_to_hub( commit_details = model.push_to_hub(
tmp_repo, use_auth_token=self._token, create_pr=True, commit_description=COMMIT_DESCRIPTION tmp_repo.repo_id, use_auth_token=self._token, create_pr=True, commit_description=COMMIT_DESCRIPTION
) )
self.assertEqual(commit_details.commit_description, COMMIT_DESCRIPTION) self.assertEqual(commit_details.commit_description, COMMIT_DESCRIPTION)
finally:
# Always (try to) delete the repo.
self._try_delete_repo(repo_id=tmp_repo, token=self._token)
@unittest.skip(reason="This test is flaky") @unittest.skip(reason="This test is flaky")
def test_push_to_hub_in_organization(self): def test_push_to_hub_in_organization(self):
with tempfile.TemporaryDirectory() as tmp_dir: with TemporaryHubRepo(namespace="valid_org", token=self._token) as tmp_repo:
try: config = BertConfig(
tmp_repo = f"valid_org/test-model-org-{Path(tmp_dir).name}" vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37
config = BertConfig( )
vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37 model = BertModel(config)
) model.push_to_hub(tmp_repo.repo_id, token=self._token)
model = BertModel(config)
model.push_to_hub(tmp_repo, token=self._token)
new_model = BertModel.from_pretrained(tmp_repo) new_model = BertModel.from_pretrained(tmp_repo.repo_id)
for p1, p2 in zip(model.parameters(), new_model.parameters()): for p1, p2 in zip(model.parameters(), new_model.parameters()):
self.assertTrue(torch.equal(p1, p2)) self.assertTrue(torch.equal(p1, p2))
finally:
# Always (try to) delete the repo.
self._try_delete_repo(repo_id=tmp_repo, token=self._token)
@unittest.skip(reason="This test is flaky") @unittest.skip(reason="This test is flaky")
def test_push_to_hub_in_organization_via_save_pretrained(self): def test_push_to_hub_in_organization_via_save_pretrained(self):
with tempfile.TemporaryDirectory() as tmp_dir: with TemporaryHubRepo(namespace="valid_org", token=self._token) as tmp_repo:
try: config = BertConfig(
tmp_repo = f"valid_org/test-model-org-{Path(tmp_dir).name}" vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37
config = BertConfig( )
vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37 model = BertModel(config)
) # Push to hub via save_pretrained
model = BertModel(config) with tempfile.TemporaryDirectory() as tmp_dir:
# Push to hub via save_pretrained model.save_pretrained(tmp_dir, push_to_hub=True, token=self._token, repo_id=tmp_repo.repo_id)
model.save_pretrained(tmp_dir, push_to_hub=True, token=self._token, repo_id=tmp_repo)
new_model = BertModel.from_pretrained(tmp_repo) new_model = BertModel.from_pretrained(tmp_repo.repo_id)
for p1, p2 in zip(model.parameters(), new_model.parameters()): for p1, p2 in zip(model.parameters(), new_model.parameters()):
self.assertTrue(torch.equal(p1, p2)) self.assertTrue(torch.equal(p1, p2))
finally:
# Always (try to) delete the repo.
self._try_delete_repo(repo_id=tmp_repo, token=self._token)
def test_push_to_hub_dynamic_model(self): def test_push_to_hub_dynamic_model(self):
with tempfile.TemporaryDirectory() as tmp_dir: with TemporaryHubRepo(token=self._token) as tmp_repo:
try: CustomConfig.register_for_auto_class()
tmp_repo = f"{USER}/test-dynamic-model-{Path(tmp_dir).name}" CustomModel.register_for_auto_class()
CustomConfig.register_for_auto_class()
CustomModel.register_for_auto_class()
config = CustomConfig(hidden_size=32) config = CustomConfig(hidden_size=32)
model = CustomModel(config) model = CustomModel(config)
model.push_to_hub(tmp_repo, token=self._token) model.push_to_hub(tmp_repo.repo_id, token=self._token)
# checks # checks
self.assertDictEqual( self.assertDictEqual(
config.auto_map, config.auto_map,
{"AutoConfig": "custom_configuration.CustomConfig", "AutoModel": "custom_modeling.CustomModel"}, {"AutoConfig": "custom_configuration.CustomConfig", "AutoModel": "custom_modeling.CustomModel"},
) )
new_model = AutoModel.from_pretrained(tmp_repo, trust_remote_code=True) new_model = AutoModel.from_pretrained(tmp_repo.repo_id, trust_remote_code=True)
# Can't make an isinstance check because the new_model is from the CustomModel class of a dynamic module # Can't make an isinstance check because the new_model is from the CustomModel class of a dynamic module
self.assertEqual(new_model.__class__.__name__, "CustomModel") self.assertEqual(new_model.__class__.__name__, "CustomModel")
for p1, p2 in zip(model.parameters(), new_model.parameters()): for p1, p2 in zip(model.parameters(), new_model.parameters()):
self.assertTrue(torch.equal(p1, p2)) self.assertTrue(torch.equal(p1, p2))
config = AutoConfig.from_pretrained(tmp_repo, trust_remote_code=True) config = AutoConfig.from_pretrained(tmp_repo.repo_id, trust_remote_code=True)
new_model = AutoModel.from_config(config, trust_remote_code=True) new_model = AutoModel.from_config(config, trust_remote_code=True)
self.assertEqual(new_model.__class__.__name__, "CustomModel") self.assertEqual(new_model.__class__.__name__, "CustomModel")
finally:
# Always (try to) delete the repo.
self._try_delete_repo(repo_id=tmp_repo, token=self._token)
def test_push_to_hub_with_tags(self): def test_push_to_hub_with_tags(self):
with tempfile.TemporaryDirectory() as tmp_dir: with TemporaryHubRepo(token=self._token) as tmp_repo:
try: from huggingface_hub import ModelCard
tmp_repo = f"{USER}/test-dynamic-model-with-tags-{Path(tmp_dir).name}"
from huggingface_hub import ModelCard
new_tags = ["tag-1", "tag-2"] new_tags = ["tag-1", "tag-2"]
CustomConfig.register_for_auto_class() CustomConfig.register_for_auto_class()
CustomModel.register_for_auto_class() CustomModel.register_for_auto_class()
config = CustomConfig(hidden_size=32) config = CustomConfig(hidden_size=32)
model = CustomModel(config) model = CustomModel(config)
self.assertTrue(model.model_tags is None) self.assertTrue(model.model_tags is None)
model.add_model_tags(new_tags) model.add_model_tags(new_tags)
self.assertTrue(model.model_tags == new_tags) self.assertTrue(model.model_tags == new_tags)
model.push_to_hub(tmp_repo, token=self._token) model.push_to_hub(tmp_repo.repo_id, token=self._token)
loaded_model_card = ModelCard.load(tmp_repo) loaded_model_card = ModelCard.load(tmp_repo.repo_id)
self.assertEqual(loaded_model_card.data.tags, new_tags) self.assertEqual(loaded_model_card.data.tags, new_tags)
finally:
# Always (try to) delete the repo.
self._try_delete_repo(repo_id=tmp_repo, token=self._token)
@require_torch @require_torch

View File

@@ -20,7 +20,7 @@ import unittest
import unittest.mock as mock import unittest.mock as mock
from pathlib import Path from pathlib import Path
from huggingface_hub import HfFolder, delete_repo from huggingface_hub import HfFolder
from huggingface_hub.file_download import http_get from huggingface_hub.file_download import http_get
from requests.exceptions import HTTPError from requests.exceptions import HTTPError
@@ -32,7 +32,7 @@ from transformers import (
GPT2TokenizerFast, GPT2TokenizerFast,
is_tokenizers_available, is_tokenizers_available,
) )
from transformers.testing_utils import TOKEN, USER, is_staging_test, require_tokenizers from transformers.testing_utils import TOKEN, TemporaryHubRepo, is_staging_test, require_tokenizers
from transformers.tokenization_utils import ExtensionsTrie, Trie from transformers.tokenization_utils import ExtensionsTrie, Trie
@@ -118,114 +118,84 @@ class TokenizerPushToHubTester(unittest.TestCase):
cls._token = TOKEN cls._token = TOKEN
HfFolder.save_token(TOKEN) HfFolder.save_token(TOKEN)
@staticmethod
def _try_delete_repo(repo_id, token):
try:
# Reset repo
delete_repo(repo_id=repo_id, token=token)
except: # noqa E722
pass
def test_push_to_hub(self): def test_push_to_hub(self):
with tempfile.TemporaryDirectory() as tmp_dir: with TemporaryHubRepo(token=self._token) as tmp_repo:
try: with tempfile.TemporaryDirectory() as tmp_dir:
tmp_repo = f"{USER}/test-tokenizer-{Path(tmp_dir).name}"
vocab_file = os.path.join(tmp_dir, "vocab.txt") vocab_file = os.path.join(tmp_dir, "vocab.txt")
with open(vocab_file, "w", encoding="utf-8") as vocab_writer: with open(vocab_file, "w", encoding="utf-8") as vocab_writer:
vocab_writer.write("".join([x + "\n" for x in self.vocab_tokens])) vocab_writer.write("".join([x + "\n" for x in self.vocab_tokens]))
tokenizer = BertTokenizer(vocab_file) tokenizer = BertTokenizer(vocab_file)
tokenizer.push_to_hub(tmp_repo, token=self._token) tokenizer.push_to_hub(tmp_repo.repo_id, token=self._token)
new_tokenizer = BertTokenizer.from_pretrained(tmp_repo) new_tokenizer = BertTokenizer.from_pretrained(tmp_repo.repo_id)
self.assertDictEqual(new_tokenizer.vocab, tokenizer.vocab) self.assertDictEqual(new_tokenizer.vocab, tokenizer.vocab)
finally:
# Always (try to) delete the repo.
self._try_delete_repo(repo_id=tmp_repo, token=self._token)
def test_push_to_hub_via_save_pretrained(self): def test_push_to_hub_via_save_pretrained(self):
with tempfile.TemporaryDirectory() as tmp_dir: with TemporaryHubRepo(token=self._token) as tmp_repo:
try: with tempfile.TemporaryDirectory() as tmp_dir:
tmp_repo = f"{USER}/test-tokenizer-{Path(tmp_dir).name}"
vocab_file = os.path.join(tmp_dir, "vocab.txt") vocab_file = os.path.join(tmp_dir, "vocab.txt")
with open(vocab_file, "w", encoding="utf-8") as vocab_writer: with open(vocab_file, "w", encoding="utf-8") as vocab_writer:
vocab_writer.write("".join([x + "\n" for x in self.vocab_tokens])) vocab_writer.write("".join([x + "\n" for x in self.vocab_tokens]))
tokenizer = BertTokenizer(vocab_file) tokenizer = BertTokenizer(vocab_file)
# Push to hub via save_pretrained # Push to hub via save_pretrained
tokenizer.save_pretrained(tmp_dir, repo_id=tmp_repo, push_to_hub=True, token=self._token) tokenizer.save_pretrained(tmp_dir, repo_id=tmp_repo.repo_id, push_to_hub=True, token=self._token)
new_tokenizer = BertTokenizer.from_pretrained(tmp_repo) new_tokenizer = BertTokenizer.from_pretrained(tmp_repo.repo_id)
self.assertDictEqual(new_tokenizer.vocab, tokenizer.vocab) self.assertDictEqual(new_tokenizer.vocab, tokenizer.vocab)
finally:
# Always (try to) delete the repo.
self._try_delete_repo(repo_id=tmp_repo, token=self._token)
def test_push_to_hub_in_organization(self): def test_push_to_hub_in_organization(self):
with tempfile.TemporaryDirectory() as tmp_dir: with TemporaryHubRepo(namespace="valid_org", token=self._token) as tmp_repo:
try: with tempfile.TemporaryDirectory() as tmp_dir:
tmp_repo = f"valid_org/test-tokenizer-{Path(tmp_dir).name}"
vocab_file = os.path.join(tmp_dir, "vocab.txt") vocab_file = os.path.join(tmp_dir, "vocab.txt")
with open(vocab_file, "w", encoding="utf-8") as vocab_writer: with open(vocab_file, "w", encoding="utf-8") as vocab_writer:
vocab_writer.write("".join([x + "\n" for x in self.vocab_tokens])) vocab_writer.write("".join([x + "\n" for x in self.vocab_tokens]))
tokenizer = BertTokenizer(vocab_file) tokenizer = BertTokenizer(vocab_file)
tokenizer.push_to_hub(tmp_repo, token=self._token) tokenizer.push_to_hub(tmp_repo.repo_id, token=self._token)
new_tokenizer = BertTokenizer.from_pretrained(tmp_repo) new_tokenizer = BertTokenizer.from_pretrained(tmp_repo.repo_id)
self.assertDictEqual(new_tokenizer.vocab, tokenizer.vocab) self.assertDictEqual(new_tokenizer.vocab, tokenizer.vocab)
finally:
# Always (try to) delete the repo.
self._try_delete_repo(repo_id=tmp_repo, token=self._token)
def test_push_to_hub_in_organization_via_save_pretrained(self): def test_push_to_hub_in_organization_via_save_pretrained(self):
with tempfile.TemporaryDirectory() as tmp_dir: with TemporaryHubRepo(namespace="valid_org", token=self._token) as tmp_repo:
try: with tempfile.TemporaryDirectory() as tmp_dir:
tmp_repo = f"valid_org/test-tokenizer-{Path(tmp_dir).name}"
vocab_file = os.path.join(tmp_dir, "vocab.txt") vocab_file = os.path.join(tmp_dir, "vocab.txt")
with open(vocab_file, "w", encoding="utf-8") as vocab_writer: with open(vocab_file, "w", encoding="utf-8") as vocab_writer:
vocab_writer.write("".join([x + "\n" for x in self.vocab_tokens])) vocab_writer.write("".join([x + "\n" for x in self.vocab_tokens]))
tokenizer = BertTokenizer(vocab_file) tokenizer = BertTokenizer(vocab_file)
# Push to hub via save_pretrained # Push to hub via save_pretrained
tokenizer.save_pretrained(tmp_dir, repo_id=tmp_repo, push_to_hub=True, token=self._token) tokenizer.save_pretrained(tmp_dir, repo_id=tmp_repo.repo_id, push_to_hub=True, token=self._token)
new_tokenizer = BertTokenizer.from_pretrained(tmp_repo) new_tokenizer = BertTokenizer.from_pretrained(tmp_repo.repo_id)
self.assertDictEqual(new_tokenizer.vocab, tokenizer.vocab) self.assertDictEqual(new_tokenizer.vocab, tokenizer.vocab)
finally:
# Always (try to) delete the repo.
self._try_delete_repo(repo_id=tmp_repo, token=self._token)
@require_tokenizers @require_tokenizers
def test_push_to_hub_dynamic_tokenizer(self): def test_push_to_hub_dynamic_tokenizer(self):
with tempfile.TemporaryDirectory() as tmp_dir: with TemporaryHubRepo(token=self._token) as tmp_repo:
try: CustomTokenizer.register_for_auto_class()
tmp_repo = f"{USER}/test-dynamic-tokenizer-{Path(tmp_dir).name}" with tempfile.TemporaryDirectory() as tmp_dir:
CustomTokenizer.register_for_auto_class()
vocab_file = os.path.join(tmp_dir, "vocab.txt") vocab_file = os.path.join(tmp_dir, "vocab.txt")
with open(vocab_file, "w", encoding="utf-8") as vocab_writer: with open(vocab_file, "w", encoding="utf-8") as vocab_writer:
vocab_writer.write("".join([x + "\n" for x in self.vocab_tokens])) vocab_writer.write("".join([x + "\n" for x in self.vocab_tokens]))
tokenizer = CustomTokenizer(vocab_file) tokenizer = CustomTokenizer(vocab_file)
# No fast custom tokenizer # No fast custom tokenizer
tokenizer.push_to_hub(tmp_repo, token=self._token) tokenizer.push_to_hub(tmp_repo.repo_id, token=self._token)
tokenizer = AutoTokenizer.from_pretrained(tmp_repo, trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(tmp_repo.repo_id, trust_remote_code=True)
# Can't make an isinstance check because the new_model.config is from the CustomTokenizer class of a dynamic module # Can't make an isinstance check because the new_model.config is from the CustomTokenizer class of a dynamic module
self.assertEqual(tokenizer.__class__.__name__, "CustomTokenizer") self.assertEqual(tokenizer.__class__.__name__, "CustomTokenizer")
finally:
# Always (try to) delete the repo.
self._try_delete_repo(repo_id=tmp_repo, token=self._token)
@require_tokenizers @require_tokenizers
def test_push_to_hub_dynamic_tokenizer_with_both_slow_and_fast_classes(self): def test_push_to_hub_dynamic_tokenizer_with_both_slow_and_fast_classes(self):
with tempfile.TemporaryDirectory() as tmp_dir: with TemporaryHubRepo(token=self._token) as tmp_repo:
try: CustomTokenizer.register_for_auto_class()
tmp_repo = f"{USER}/test-dynamic-tokenizer-{Path(tmp_dir).name}"
CustomTokenizer.register_for_auto_class()
# Fast and slow custom tokenizer # Fast and slow custom tokenizer
CustomTokenizerFast.register_for_auto_class() CustomTokenizerFast.register_for_auto_class()
with tempfile.TemporaryDirectory() as tmp_dir:
vocab_file = os.path.join(tmp_dir, "vocab.txt") vocab_file = os.path.join(tmp_dir, "vocab.txt")
with open(vocab_file, "w", encoding="utf-8") as vocab_writer: with open(vocab_file, "w", encoding="utf-8") as vocab_writer:
vocab_writer.write("".join([x + "\n" for x in self.vocab_tokens])) vocab_writer.write("".join([x + "\n" for x in self.vocab_tokens]))
@@ -234,17 +204,14 @@ class TokenizerPushToHubTester(unittest.TestCase):
bert_tokenizer.save_pretrained(tmp_dir) bert_tokenizer.save_pretrained(tmp_dir)
tokenizer = CustomTokenizerFast.from_pretrained(tmp_dir) tokenizer = CustomTokenizerFast.from_pretrained(tmp_dir)
tokenizer.push_to_hub(tmp_repo, token=self._token) tokenizer.push_to_hub(tmp_repo.repo_id, token=self._token)
tokenizer = AutoTokenizer.from_pretrained(tmp_repo, trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(tmp_repo.repo_id, trust_remote_code=True)
# Can't make an isinstance check because the new_model.config is from the FakeConfig class of a dynamic module # Can't make an isinstance check because the new_model.config is from the FakeConfig class of a dynamic module
self.assertEqual(tokenizer.__class__.__name__, "CustomTokenizerFast") self.assertEqual(tokenizer.__class__.__name__, "CustomTokenizerFast")
tokenizer = AutoTokenizer.from_pretrained(tmp_repo, use_fast=False, trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(tmp_repo.repo_id, use_fast=False, trust_remote_code=True)
# Can't make an isinstance check because the new_model.config is from the FakeConfig class of a dynamic module # Can't make an isinstance check because the new_model.config is from the FakeConfig class of a dynamic module
self.assertEqual(tokenizer.__class__.__name__, "CustomTokenizer") self.assertEqual(tokenizer.__class__.__name__, "CustomTokenizer")
finally:
# Always (try to) delete the repo.
self._try_delete_repo(repo_id=tmp_repo, token=self._token)
class TrieTest(unittest.TestCase): class TrieTest(unittest.TestCase):