transformers-cli: LFS multipart uploads (> 5GB) (#8663)

* initial commit

* [cli] lfs commands

* Fix FileSlice

* Tweak to FileSlice

* [hf_api] Backport filetype arg from `datasets`

cc @lhoestq

* Silm down the CI while i'm working

* Ok let's try this in CI

* Update config.yml

* Do not try this at home

* one more try

* Update lfs.py

* Revert "Tweak to FileSlice"

This reverts commit d7e32c4b3500400486411e85a2b74e57fb6b52f5.

* Update test_hf_api.py

* Update test_hf_api.py

* Update test_hf_api.py

* CI still green?

* make CI green again?

* Update test_hf_api.py

* make CI red again?

* Update test_hf_api.py

* add CI style back

* Fix CI?

* oh my

* doc + switch back to real staging endpoint

* Apply suggestions from code review

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Co-authored-by: Pierric Cistac <Pierrci@users.noreply.github.com>

* Fix docblock + f-strings

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Co-authored-by: Pierric Cistac <Pierrci@users.noreply.github.com>
This commit is contained in:
Julien Chaumond
2020-12-07 22:38:39 +01:00
committed by GitHub
parent 0bce7c5508
commit 28fa014a1f
6 changed files with 372 additions and 18 deletions

View File

@@ -95,6 +95,8 @@ class ModelInfo:
class HfApi:
ALLOWED_S3_FILE_TYPES = ["datasets", "metrics"]
def __init__(self, endpoint=None):
self.endpoint = endpoint if endpoint is not None else ENDPOINT
@@ -130,13 +132,14 @@ class HfApi:
r = requests.post(path, headers={"authorization": "Bearer {}".format(token)})
r.raise_for_status()
def presign(self, token: str, filename: str, organization: Optional[str] = None) -> PresignedUrl:
def presign(self, token: str, filetype: str, filename: str, organization: Optional[str] = None) -> PresignedUrl:
"""
HuggingFace S3-based system, used for datasets and metrics.
Call HF API to get a presigned url to upload `filename` to S3.
"""
path = "{}/api/datasets/presign".format(self.endpoint)
assert filetype in self.ALLOWED_S3_FILE_TYPES, f"Please specify filetype from {self.ALLOWED_S3_FILE_TYPES}"
path = f"{self.endpoint}/api/{filetype}/presign"
r = requests.post(
path,
headers={"authorization": "Bearer {}".format(token)},
@@ -146,7 +149,9 @@ class HfApi:
d = r.json()
return PresignedUrl(**d)
def presign_and_upload(self, token: str, filename: str, filepath: str, organization: Optional[str] = None) -> str:
def presign_and_upload(
self, token: str, filetype: str, filename: str, filepath: str, organization: Optional[str] = None
) -> str:
"""
HuggingFace S3-based system, used for datasets and metrics.
@@ -154,7 +159,8 @@ class HfApi:
Outputs: url: Read-only url for the stored file on S3.
"""
urls = self.presign(token, filename=filename, organization=organization)
assert filetype in self.ALLOWED_S3_FILE_TYPES, f"Please specify filetype from {self.ALLOWED_S3_FILE_TYPES}"
urls = self.presign(token, filetype=filetype, filename=filename, organization=organization)
# streaming upload:
# https://2.python-requests.org/en/master/user/advanced/#streaming-uploads
#
@@ -169,26 +175,28 @@ class HfApi:
pf.close()
return urls.access
def list_objs(self, token: str, organization: Optional[str] = None) -> List[S3Obj]:
def list_objs(self, token: str, filetype: str, organization: Optional[str] = None) -> List[S3Obj]:
"""
HuggingFace S3-based system, used for datasets and metrics.
Call HF API to list all stored files for user (or one of their organizations).
"""
path = "{}/api/datasets/listObjs".format(self.endpoint)
assert filetype in self.ALLOWED_S3_FILE_TYPES, f"Please specify filetype from {self.ALLOWED_S3_FILE_TYPES}"
path = "{}/api/{}/listObjs".format(self.endpoint, filetype)
params = {"organization": organization} if organization is not None else None
r = requests.get(path, params=params, headers={"authorization": "Bearer {}".format(token)})
r.raise_for_status()
d = r.json()
return [S3Obj(**x) for x in d]
def delete_obj(self, token: str, filename: str, organization: Optional[str] = None):
def delete_obj(self, token: str, filetype: str, filename: str, organization: Optional[str] = None):
"""
HuggingFace S3-based system, used for datasets and metrics.
Call HF API to delete a file stored by user
"""
path = "{}/api/datasets/deleteObj".format(self.endpoint)
assert filetype in self.ALLOWED_S3_FILE_TYPES, f"Please specify filetype from {self.ALLOWED_S3_FILE_TYPES}"
path = "{}/api/{}/deleteObj".format(self.endpoint, filetype)
r = requests.delete(
path,
headers={"authorization": "Bearer {}".format(token)},
@@ -219,17 +227,25 @@ class HfApi:
d = r.json()
return [RepoObj(**x) for x in d]
def create_repo(self, token: str, name: str, organization: Optional[str] = None) -> str:
def create_repo(
self, token: str, name: str, organization: Optional[str] = None, lfsmultipartthresh: Optional[int] = None
) -> str:
"""
HuggingFace git-based system, used for models.
Call HF API to create a whole repo.
Params:
lfsmultipartthresh: Optional: internal param for testing purposes.
"""
path = "{}/api/repos/create".format(self.endpoint)
json = {"name": name, "organization": organization}
if lfsmultipartthresh is not None:
json["lfsmultipartthresh"] = lfsmultipartthresh
r = requests.post(
path,
headers={"authorization": "Bearer {}".format(token)},
json={"name": name, "organization": organization},
json=json,
)
r.raise_for_status()
d = r.json()