Trainer push to hub (#11328)

* Initial support for upload to hub

* push -> upload

* Fixes + examples

* Fix torchhub test

* Torchhub test I hate you

* push_model_to_hub -> push_to_hub

* Apply mixin to other pretrained models

* Remove ABC inheritance

* Add tests

* Typo

* Run tests

* Install git-lfs

* Change approach

* Add push_to_hub to all

* Staging test suite

* Typo

* Maybe like this?

* More deps

* Cache

* Adapt name

* Quality

* MOAR tests

* Put it in testing_utils

* Docs + torchhub last hope

* Styling

* Wrong method

* Typos

* Update src/transformers/file_utils.py

Co-authored-by: Julien Chaumond <julien@huggingface.co>

* Address review comments

* Apply suggestions from code review

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

Co-authored-by: Julien Chaumond <julien@huggingface.co>
Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
This commit is contained in:
Sylvain Gugger
2021-04-23 09:17:37 -04:00
committed by GitHub
parent 7bc86bea68
commit bf2e0cf70b
31 changed files with 766 additions and 31 deletions

View File

@@ -38,6 +38,7 @@ def pytest_configure(config):
config.addinivalue_line(
"markers", "is_pt_flax_cross_test: mark test to run only when PT and FLAX interactions are tested"
)
config.addinivalue_line("markers", "is_staging_test: mark test to run only in the staging environment")
def pytest_addoption(parser):

View File

@@ -17,6 +17,12 @@
import json
import os
import tempfile
import unittest
from huggingface_hub import HfApi
from requests.exceptions import HTTPError
from transformers import BertConfig
from transformers.testing_utils import ENDPOINT_STAGING, PASS, USER, is_staging_test
class ConfigTester(object):
@@ -81,3 +87,54 @@ class ConfigTester(object):
self.create_and_test_config_from_and_save_pretrained()
self.create_and_test_config_with_num_labels()
self.check_config_can_be_init_without_params()
@is_staging_test
class ConfigPushToHubTester(unittest.TestCase):
    """Round-trips a small ``BertConfig`` through the staging hub endpoint."""

    @classmethod
    def setUpClass(cls):
        """Log in to the staging endpoint once and keep the API handle and token on the class."""
        api = HfApi(endpoint=ENDPOINT_STAGING)
        cls._api = api
        cls._token = api.login(username=USER, password=PASS)

    @classmethod
    def tearDownClass(cls):
        """Delete both test repos; HTTP errors raised by a deletion are deliberately ignored."""
        repos_to_delete = (
            {"name": "test-model"},
            {"name": "test-model-org", "organization": "valid_org"},
        )
        for repo in repos_to_delete:
            try:
                cls._api.delete_repo(token=cls._token, **repo)
            except HTTPError:
                pass

    def test_push_to_hub(self):
        """Push a config to the user namespace and compare it with the copy loaded back from the hub."""
        original = BertConfig(
            vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37
        )
        with tempfile.TemporaryDirectory() as workdir:
            original.save_pretrained(workdir, push_to_hub=True, repo_name="test-model", use_auth_token=self._token)

            reloaded = BertConfig.from_pretrained(f"{USER}/test-model")
            for attr_name, expected in original.__dict__.items():
                # `transformers_version` is the one attribute left out of the comparison.
                if attr_name == "transformers_version":
                    continue
                self.assertEqual(expected, getattr(reloaded, attr_name))

    def test_push_to_hub_in_organization(self):
        """Push a config to the ``valid_org`` organization and compare it with the copy loaded back."""
        original = BertConfig(
            vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37
        )
        with tempfile.TemporaryDirectory() as workdir:
            push_kwargs = {
                "push_to_hub": True,
                "repo_name": "test-model-org",
                "use_auth_token": self._token,
                "organization": "valid_org",
            }
            original.save_pretrained(workdir, **push_kwargs)

            reloaded = BertConfig.from_pretrained("valid_org/test-model-org")
            for attr_name, expected in original.__dict__.items():
                # `transformers_version` is the one attribute left out of the comparison.
                if attr_name == "transformers_version":
                    continue
                self.assertEqual(expected, getattr(reloaded, attr_name))

View File

@@ -22,13 +22,9 @@ import unittest
from requests.exceptions import HTTPError
from transformers.hf_api import HfApi, HfFolder, ModelInfo, RepoObj
from transformers.testing_utils import require_git_lfs
from transformers.testing_utils import ENDPOINT_STAGING, PASS, USER, is_staging_test, require_git_lfs
USER = "__DUMMY_TRANSFORMERS_USER__"
PASS = "__DUMMY_TRANSFORMERS_PASS__"
ENDPOINT_STAGING = "https://moon-staging.huggingface.co"
ENDPOINT_STAGING_BASIC_AUTH = f"https://{USER}:{PASS}@moon-staging.huggingface.co"
REPO_NAME = f"my-model-{int(time.time())}"
@@ -106,6 +102,7 @@ class HfFolderTest(unittest.TestCase):
@require_git_lfs
@is_staging_test
class HfLargefilesTest(HfApiCommonTest):
@classmethod
def setUpClass(cls):

View File

@@ -22,10 +22,22 @@ import tempfile
import unittest
from typing import List, Tuple
from huggingface_hub import HfApi
from requests.exceptions import HTTPError
from transformers import is_torch_available, logging
from transformers.file_utils import WEIGHTS_NAME
from transformers.models.auto import get_values
from transformers.testing_utils import CaptureLogger, require_torch, require_torch_multi_gpu, slow, torch_device
from transformers.testing_utils import (
ENDPOINT_STAGING,
PASS,
USER,
CaptureLogger,
is_staging_test,
require_torch,
require_torch_multi_gpu,
slow,
torch_device,
)
if is_torch_available():
@@ -1300,3 +1312,54 @@ class ModelUtilsTest(unittest.TestCase):
with CaptureLogger(logger) as cl:
BertModel.from_pretrained(TINY_T5)
self.assertTrue("You are using a model of type t5 to instantiate a model of type bert" in cl.out)
@require_torch
@is_staging_test
class ModelPushToHubTester(unittest.TestCase):
    """Round-trips a small ``BertModel`` through the staging hub endpoint."""

    @classmethod
    def setUpClass(cls):
        """Log in to the staging endpoint once and keep the API handle and token on the class."""
        api = HfApi(endpoint=ENDPOINT_STAGING)
        cls._api = api
        cls._token = api.login(username=USER, password=PASS)

    @classmethod
    def tearDownClass(cls):
        """Delete both test repos; HTTP errors raised by a deletion are deliberately ignored."""
        repos_to_delete = (
            {"name": "test-model"},
            {"name": "test-model-org", "organization": "valid_org"},
        )
        for repo in repos_to_delete:
            try:
                cls._api.delete_repo(token=cls._token, **repo)
            except HTTPError:
                pass

    def test_push_to_hub(self):
        """Push a model to the user namespace and check every parameter of the reloaded copy is equal."""
        tiny_config = BertConfig(
            vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37
        )
        original = BertModel(tiny_config)
        with tempfile.TemporaryDirectory() as workdir:
            original.save_pretrained(workdir, push_to_hub=True, repo_name="test-model", use_auth_token=self._token)

            reloaded = BertModel.from_pretrained(f"{USER}/test-model")
            for expected, actual in zip(original.parameters(), reloaded.parameters()):
                self.assertTrue(torch.equal(expected, actual))

    def test_push_to_hub_in_organization(self):
        """Push a model to the ``valid_org`` organization and check every reloaded parameter is equal."""
        tiny_config = BertConfig(
            vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37
        )
        original = BertModel(tiny_config)
        with tempfile.TemporaryDirectory() as workdir:
            push_kwargs = {
                "push_to_hub": True,
                "repo_name": "test-model-org",
                "use_auth_token": self._token,
                "organization": "valid_org",
            }
            original.save_pretrained(workdir, **push_kwargs)

            reloaded = BertModel.from_pretrained("valid_org/test-model-org")
            for expected, actual in zip(original.parameters(), reloaded.parameters()):
                self.assertTrue(torch.equal(expected, actual))

View File

@@ -24,11 +24,17 @@ import unittest
from importlib import import_module
from typing import List, Tuple
from huggingface_hub import HfApi
from requests.exceptions import HTTPError
from transformers import is_tf_available
from transformers.models.auto import get_values
from transformers.testing_utils import (
ENDPOINT_STAGING,
PASS,
USER,
_tf_gpu_memory_limit,
is_pt_tf_cross_test,
is_staging_test,
require_onnx,
require_tf,
slow,
@@ -50,6 +56,8 @@ if is_tf_available():
TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING,
TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
BertConfig,
TFBertModel,
TFSharedEmbeddings,
tf_top_k_top_p_filtering,
)
@@ -1326,3 +1334,62 @@ class UtilsFunctionsTest(unittest.TestCase):
tf.debugging.assert_near(non_inf_output, non_inf_expected_output, rtol=1e-12)
tf.debugging.assert_equal(non_inf_idx, non_inf_expected_idx)
@require_tf
@is_staging_test
class TFModelPushToHubTester(unittest.TestCase):
    """Round-trips a small ``TFBertModel`` through the staging hub endpoint."""

    @classmethod
    def setUpClass(cls):
        """Log in to the staging endpoint once and keep the API handle and token on the class."""
        cls._api = HfApi(endpoint=ENDPOINT_STAGING)
        cls._token = cls._api.login(username=USER, password=PASS)

    @classmethod
    def tearDownClass(cls):
        """Delete both test repos; HTTP errors raised by a deletion are deliberately ignored."""
        try:
            cls._api.delete_repo(token=cls._token, name="test-model")
        except HTTPError:
            pass

        try:
            cls._api.delete_repo(token=cls._token, name="test-model-org", organization="valid_org")
        except HTTPError:
            pass

    def test_push_to_hub(self):
        """Push a built model to the user namespace and check the reloaded weights are identical."""
        config = BertConfig(
            vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37
        )
        model = TFBertModel(config)
        # Make sure model is properly initialized
        _ = model(model.dummy_inputs)
        with tempfile.TemporaryDirectory() as tmp_dir:
            model.save_pretrained(tmp_dir, push_to_hub=True, repo_name="test-model", use_auth_token=self._token)

            new_model = TFBertModel.from_pretrained(f"{USER}/test-model")
            models_equal = True
            for p1, p2 in zip(model.weights, new_model.weights):
                if tf.math.reduce_sum(tf.math.abs(p1 - p2)) > 0:
                    models_equal = False
            self.assertTrue(models_equal)

    def test_push_to_hub_in_organization(self):
        """Push a built model to the ``valid_org`` organization and check the reloaded weights are identical."""
        config = BertConfig(
            vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37
        )
        model = TFBertModel(config)
        # Make sure model is properly initialized, exactly as in `test_push_to_hub`. Without this forward
        # pass the model has not been called yet, so the weight comparison below could iterate over nothing
        # and the test would pass without checking anything.
        _ = model(model.dummy_inputs)
        with tempfile.TemporaryDirectory() as tmp_dir:
            model.save_pretrained(
                tmp_dir,
                push_to_hub=True,
                repo_name="test-model-org",
                use_auth_token=self._token,
                organization="valid_org",
            )

            new_model = TFBertModel.from_pretrained("valid_org/test-model-org")
            models_equal = True
            for p1, p2 in zip(model.weights, new_model.weights):
                if tf.math.reduce_sum(tf.math.abs(p1 - p2)) > 0:
                    models_equal = False
            self.assertTrue(models_equal)

View File

@@ -20,11 +20,15 @@ import pickle
import re
import shutil
import tempfile
import unittest
from collections import OrderedDict
from itertools import takewhile
from typing import TYPE_CHECKING, Dict, List, Tuple, Union
from huggingface_hub import HfApi
from requests.exceptions import HTTPError
from transformers import (
BertTokenizer,
PreTrainedTokenizer,
PreTrainedTokenizerBase,
PreTrainedTokenizerFast,
@@ -32,8 +36,12 @@ from transformers import (
is_torch_available,
)
from transformers.testing_utils import (
ENDPOINT_STAGING,
PASS,
USER,
get_tests_dir,
is_pt_tf_cross_test,
is_staging_test,
require_tf,
require_tokenizers,
require_torch,
@@ -2863,3 +2871,53 @@ class TokenizerTesterMixin:
)
for key in python_output:
self.assertEqual(python_output[key], rust_output[key])
@is_staging_test
class TokenzierPushToHubTester(unittest.TestCase):
    """Round-trips a ``BertTokenizer`` built from a tiny vocabulary through the staging hub endpoint."""

    # Vocabulary written to `vocab.txt`, one token per line, to build the tokenizer under test.
    vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]", "bla", "blou"]

    @classmethod
    def setUpClass(cls):
        """Log in to the staging endpoint once and keep the API handle and token on the class."""
        api = HfApi(endpoint=ENDPOINT_STAGING)
        cls._api = api
        cls._token = api.login(username=USER, password=PASS)

    @classmethod
    def tearDownClass(cls):
        """Delete both test repos; HTTP errors raised by a deletion are deliberately ignored."""
        repos_to_delete = (
            {"name": "test-model"},
            {"name": "test-model-org", "organization": "valid_org"},
        )
        for repo in repos_to_delete:
            try:
                cls._api.delete_repo(token=cls._token, **repo)
            except HTTPError:
                pass

    def test_push_to_hub(self):
        """Push a tokenizer to the user namespace and check the reloaded vocabulary is the same."""
        with tempfile.TemporaryDirectory() as workdir:
            vocab_path = os.path.join(workdir, "vocab.txt")
            vocab_text = "".join(f"{token}\n" for token in self.vocab_tokens)
            with open(vocab_path, "w", encoding="utf-8") as handle:
                handle.write(vocab_text)
            original = BertTokenizer(vocab_path)
            original.save_pretrained(workdir, push_to_hub=True, repo_name="test-model", use_auth_token=self._token)

            reloaded = BertTokenizer.from_pretrained(f"{USER}/test-model")
            self.assertDictEqual(reloaded.vocab, original.vocab)

    def test_push_to_hub_in_organization(self):
        """Push a tokenizer to the ``valid_org`` organization and check the reloaded vocabulary is the same."""
        with tempfile.TemporaryDirectory() as workdir:
            vocab_path = os.path.join(workdir, "vocab.txt")
            vocab_text = "".join(f"{token}\n" for token in self.vocab_tokens)
            with open(vocab_path, "w", encoding="utf-8") as handle:
                handle.write(vocab_text)
            original = BertTokenizer(vocab_path)
            push_kwargs = {
                "push_to_hub": True,
                "repo_name": "test-model-org",
                "use_auth_token": self._token,
                "organization": "valid_org",
            }
            original.save_pretrained(workdir, **push_kwargs)

            reloaded = BertTokenizer.from_pretrained("valid_org/test-model-org")
            self.assertDictEqual(reloaded.vocab, original.vocab)

View File

@@ -16,16 +16,23 @@
import dataclasses
import gc
import os
import re
import tempfile
import unittest
import numpy as np
from huggingface_hub import HfApi
from requests.exceptions import HTTPError
from transformers import AutoTokenizer, IntervalStrategy, PretrainedConfig, TrainingArguments, is_torch_available
from transformers.file_utils import WEIGHTS_NAME
from transformers.testing_utils import (
ENDPOINT_STAGING,
PASS,
USER,
TestCasePlus,
get_tests_dir,
is_staging_test,
require_datasets,
require_optuna,
require_ray,
@@ -1081,6 +1088,60 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
self.assertListEqual(trainer.optimizer.param_groups[1]["params"], no_wd_params)
@require_torch
@is_staging_test
class TrainerIntegrationWithHubTester(unittest.TestCase):
    """Checks that ``Trainer.push_to_hub`` uploads a model that can be loaded back from the staging hub."""

    @classmethod
    def setUpClass(cls):
        """Log in to the staging endpoint once and keep the API handle and token on the class."""
        cls._api = HfApi(endpoint=ENDPOINT_STAGING)
        cls._token = cls._api.login(username=USER, password=PASS)

    @classmethod
    def tearDownClass(cls):
        """Delete both test repos; HTTP errors raised by a deletion are deliberately ignored."""
        try:
            cls._api.delete_repo(token=cls._token, name="test-model")
        except HTTPError:
            pass

        try:
            cls._api.delete_repo(token=cls._token, name="test-model-org", organization="valid_org")
        except HTTPError:
            pass

    def test_push_to_hub(self):
        """Push from a regression trainer to the user namespace, then reload and compare ``a`` and ``b``."""
        with tempfile.TemporaryDirectory() as tmp_dir:
            trainer = get_regression_trainer(output_dir=tmp_dir)
            trainer.save_model()
            url = trainer.push_to_hub(repo_name="test-model", use_auth_token=self._token)

            # Extract repo_name from the url. The endpoint is escaped so that the dots in its host name
            # are matched literally instead of acting as regex wildcards.
            re_search = re.search(re.escape(ENDPOINT_STAGING) + r"/([^/]+/[^/]+)/", url)
            self.assertIsNotNone(re_search)
            repo_name = re_search.groups()[0]

            self.assertEqual(repo_name, f"{USER}/test-model")

            model = RegressionPreTrainedModel.from_pretrained(repo_name)
            self.assertEqual(model.a.item(), trainer.model.a.item())
            self.assertEqual(model.b.item(), trainer.model.b.item())

    def test_push_to_hub_in_organization(self):
        """Push from a regression trainer to ``valid_org``, then reload and compare ``a`` and ``b``."""
        with tempfile.TemporaryDirectory() as tmp_dir:
            trainer = get_regression_trainer(output_dir=tmp_dir)
            trainer.save_model()
            url = trainer.push_to_hub(repo_name="test-model-org", organization="valid_org", use_auth_token=self._token)

            # Extract repo_name from the url. The endpoint is escaped so that the dots in its host name
            # are matched literally instead of acting as regex wildcards.
            re_search = re.search(re.escape(ENDPOINT_STAGING) + r"/([^/]+/[^/]+)/", url)
            self.assertIsNotNone(re_search)
            repo_name = re_search.groups()[0]
            self.assertEqual(repo_name, "valid_org/test-model-org")

            model = RegressionPreTrainedModel.from_pretrained("valid_org/test-model-org")
            self.assertEqual(model.a.item(), trainer.model.a.item())
            self.assertEqual(model.b.item(), trainer.model.b.item())
@require_torch
@require_optuna
class TrainerHyperParameterOptunaIntegrationTest(unittest.TestCase):