transformers-cli: LFS multipart uploads (> 5GB) (#8663)

* initial commit

* [cli] lfs commands

* Fix FileSlice

* Tweak to FileSlice

* [hf_api] Backport filetype arg from `datasets`

cc @lhoestq

* Silm down the CI while i'm working

* Ok let's try this in CI

* Update config.yml

* Do not try this at home

* one more try

* Update lfs.py

* Revert "Tweak to FileSlice"

This reverts commit d7e32c4b3500400486411e85a2b74e57fb6b52f5.

* Update test_hf_api.py

* Update test_hf_api.py

* Update test_hf_api.py

* CI still green?

* make CI green again?

* Update test_hf_api.py

* make CI red again?

* Update test_hf_api.py

* add CI style back

* Fix CI?

* oh my

* doc + switch back to real staging endpoint

* Apply suggestions from code review

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Co-authored-by: Pierric Cistac <Pierrci@users.noreply.github.com>

* Fix docblock + f-strings

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Co-authored-by: Pierric Cistac <Pierrci@users.noreply.github.com>
This commit is contained in:
Julien Chaumond
2020-12-07 22:38:39 +01:00
committed by GitHub
parent 0bce7c5508
commit 28fa014a1f
6 changed files with 372 additions and 18 deletions

View File

@@ -15,12 +15,15 @@
import os
import shutil
import subprocess
import time
import unittest
import requests
from requests.exceptions import HTTPError
from transformers.hf_api import HfApi, HfFolder, ModelInfo, PresignedUrl, RepoObj, S3Obj
from transformers.testing_utils import require_git_lfs
USER = "__DUMMY_TRANSFORMERS_USER__"
@@ -35,8 +38,14 @@ FILES = [
os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/empty.txt"),
),
]
REPO_NAME = "my-model-{}".format(int(time.time()))
ENDPOINT_STAGING = "https://moon-staging.huggingface.co"
ENDPOINT_STAGING_BASIC_AUTH = f"https://{USER}:{PASS}@moon-staging.huggingface.co"
REPO_NAME = "my-model-{}".format(int(time.time()))
REPO_NAME_LARGE_FILE = "my-model-largefiles-{}".format(int(time.time()))
WORKING_REPO_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/working_repo")
LARGE_FILE_14MB = "https://cdn-media.huggingface.co/lfs-largefiles/progit.epub"
LARGE_FILE_18MB = "https://cdn-media.huggingface.co/lfs-largefiles/progit.pdf"
class HfApiCommonTest(unittest.TestCase):
@@ -64,7 +73,7 @@ class HfApiEndpointsTest(HfApiCommonTest):
@classmethod
def tearDownClass(cls):
for FILE_KEY, FILE_PATH in FILES:
cls._api.delete_obj(token=cls._token, filename=FILE_KEY)
cls._api.delete_obj(token=cls._token, filetype="datasets", filename=FILE_KEY)
def test_whoami(self):
user, orgs = self._api.whoami(token=self._token)
@@ -73,21 +82,27 @@ class HfApiEndpointsTest(HfApiCommonTest):
def test_presign_invalid_org(self):
with self.assertRaises(HTTPError):
_ = self._api.presign(token=self._token, filename="nested/fake_org.txt", organization="fake")
_ = self._api.presign(
token=self._token, filetype="datasets", filename="nested/fake_org.txt", organization="fake"
)
def test_presign_valid_org(self):
urls = self._api.presign(token=self._token, filename="nested/valid_org.txt", organization="valid_org")
urls = self._api.presign(
token=self._token, filetype="datasets", filename="nested/valid_org.txt", organization="valid_org"
)
self.assertIsInstance(urls, PresignedUrl)
def test_presign(self):
for FILE_KEY, FILE_PATH in FILES:
urls = self._api.presign(token=self._token, filename=FILE_KEY)
urls = self._api.presign(token=self._token, filetype="datasets", filename=FILE_KEY)
self.assertIsInstance(urls, PresignedUrl)
self.assertEqual(urls.type, "text/plain")
def test_presign_and_upload(self):
for FILE_KEY, FILE_PATH in FILES:
access_url = self._api.presign_and_upload(token=self._token, filename=FILE_KEY, filepath=FILE_PATH)
access_url = self._api.presign_and_upload(
token=self._token, filetype="datasets", filename=FILE_KEY, filepath=FILE_PATH
)
self.assertIsInstance(access_url, str)
with open(FILE_PATH, "r") as f:
body = f.read()
@@ -95,7 +110,7 @@ class HfApiEndpointsTest(HfApiCommonTest):
self.assertEqual(r.text, body)
def test_list_objs(self):
objs = self._api.list_objs(token=self._token)
objs = self._api.list_objs(token=self._token, filetype="datasets")
self.assertIsInstance(objs, list)
if len(objs) > 0:
o = objs[-1]
@@ -108,7 +123,6 @@ class HfApiEndpointsTest(HfApiCommonTest):
o = objs[-1]
self.assertIsInstance(o, RepoObj)
@unittest.skip("Until @julien-c or @pierrci debugs")
def test_create_and_delete_repo(self):
self._api.create_repo(token=self._token, name=REPO_NAME)
self._api.delete_repo(token=self._token, name=REPO_NAME)
@@ -140,3 +154,75 @@ class HfFolderTest(unittest.TestCase):
# ^^ not an error, we test that the
# second call does not fail.
self.assertEqual(HfFolder.get_token(), None)
@require_git_lfs
class HfLargefilesTest(HfApiCommonTest):
@classmethod
def setUpClass(cls):
"""
Share this valid token in all tests below.
"""
cls._token = cls._api.login(username=USER, password=PASS)
def setUp(self):
try:
shutil.rmtree(WORKING_REPO_DIR)
except FileNotFoundError:
pass
def tearDown(self):
self._api.delete_repo(token=self._token, name=REPO_NAME_LARGE_FILE)
def setup_local_clone(self, REMOTE_URL):
REMOTE_URL_AUTH = REMOTE_URL.replace(ENDPOINT_STAGING, ENDPOINT_STAGING_BASIC_AUTH)
subprocess.run(["git", "clone", REMOTE_URL_AUTH, WORKING_REPO_DIR], check=True, capture_output=True)
subprocess.run(["git", "lfs", "track", "*.pdf"], check=True, cwd=WORKING_REPO_DIR)
subprocess.run(["git", "lfs", "track", "*.epub"], check=True, cwd=WORKING_REPO_DIR)
def test_end_to_end_thresh_6M(self):
REMOTE_URL = self._api.create_repo(
token=self._token, name=REPO_NAME_LARGE_FILE, lfsmultipartthresh=6 * 10 ** 6
)
self.setup_local_clone(REMOTE_URL)
subprocess.run(["wget", LARGE_FILE_18MB], check=True, capture_output=True, cwd=WORKING_REPO_DIR)
subprocess.run(["git", "add", "*"], check=True, cwd=WORKING_REPO_DIR)
subprocess.run(["git", "commit", "-m", "commit message"], check=True, cwd=WORKING_REPO_DIR)
# This will fail as we haven't set up our custom transfer agent yet.
failed_process = subprocess.run(["git", "push"], capture_output=True, cwd=WORKING_REPO_DIR)
self.assertEqual(failed_process.returncode, 1)
self.assertIn("transformers-cli lfs-enable-largefiles", failed_process.stderr.decode())
# ^ Instructions on how to fix this are included in the error message.
subprocess.run(["transformers-cli", "lfs-enable-largefiles", WORKING_REPO_DIR], check=True)
start_time = time.time()
subprocess.run(["git", "push"], check=True, cwd=WORKING_REPO_DIR)
print("took", time.time() - start_time)
# To be 100% sure, let's download the resolved file
pdf_url = f"{REMOTE_URL}/resolve/main/progit.pdf"
DEST_FILENAME = "uploaded.pdf"
subprocess.run(["wget", pdf_url, "-O", DEST_FILENAME], check=True, capture_output=True, cwd=WORKING_REPO_DIR)
dest_filesize = os.stat(os.path.join(WORKING_REPO_DIR, DEST_FILENAME)).st_size
self.assertEqual(dest_filesize, 18685041)
def test_end_to_end_thresh_16M(self):
# Here we'll push one multipart and one non-multipart file in the same commit, and see what happens
REMOTE_URL = self._api.create_repo(
token=self._token, name=REPO_NAME_LARGE_FILE, lfsmultipartthresh=16 * 10 ** 6
)
self.setup_local_clone(REMOTE_URL)
subprocess.run(["wget", LARGE_FILE_18MB], check=True, capture_output=True, cwd=WORKING_REPO_DIR)
subprocess.run(["wget", LARGE_FILE_14MB], check=True, capture_output=True, cwd=WORKING_REPO_DIR)
subprocess.run(["git", "add", "*"], check=True, cwd=WORKING_REPO_DIR)
subprocess.run(["git", "commit", "-m", "both files in same commit"], check=True, cwd=WORKING_REPO_DIR)
subprocess.run(["transformers-cli", "lfs-enable-largefiles", WORKING_REPO_DIR], check=True)
start_time = time.time()
subprocess.run(["git", "push"], check=True, cwd=WORKING_REPO_DIR)
print("took", time.time() - start_time)