* [file_utils] use_cdn + documentation

* Move to cdn. urls for weights

* [urls] Hotfix for bert-base-japanese
This commit is contained in:
Julien Chaumond
2020-04-28 20:27:14 -04:00
committed by GitHub
parent 8ba4c5885f
commit 455c639093
39 changed files with 209 additions and 189 deletions

View File

@@ -94,7 +94,7 @@ DUMMY_INPUTS = [[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]]
DUMMY_MASK = [[1, 1, 1, 1, 1], [1, 1, 1, 0, 0], [0, 0, 0, 1, 1]]
S3_BUCKET_PREFIX = "https://s3.amazonaws.com/models.huggingface.co/bert"
CLOUDFRONT_DISTRIB_PREFIX = "https://d2ws9o8vfrpkyk.cloudfront.net"
CLOUDFRONT_DISTRIB_PREFIX = "https://cdn.huggingface.co"
def is_torch_available():
@@ -144,12 +144,28 @@ def is_remote_url(url_or_filename):
return parsed.scheme in ("http", "https")
def hf_bucket_url(identifier, postfix=None, cdn=False) -> str:
endpoint = CLOUDFRONT_DISTRIB_PREFIX if cdn else S3_BUCKET_PREFIX
if postfix is None:
return "/".join((endpoint, identifier))
def hf_bucket_url(model_id: str, filename: str, use_cdn=True) -> str:
"""
Resolve a model identifier, and a file name, to a HF-hosted url
on either S3 or Cloudfront (a Content Delivery Network, or CDN).
Cloudfront is replicated over the globe so downloads are way faster
for the end user (and it also lowers our bandwidth costs). However, it
is more aggressively cached by default, so may not always reflect the
latest changes to the underlying file (default TTL is 24 hours).
In terms of client-side caching from this library, even though
Cloudfront relays the ETags from S3, using one or the other
(or switching from one to the other) will affect caching: cached files
are not shared between the two because the cached file's name contains
a hash of the url.
"""
endpoint = CLOUDFRONT_DISTRIB_PREFIX if use_cdn else S3_BUCKET_PREFIX
legacy_format = "/" not in model_id
if legacy_format:
return f"{endpoint}/{model_id}-{filename}"
else:
return "/".join((endpoint, identifier, postfix))
return f"{endpoint}/{model_id}/{filename}"
def url_to_filename(url, etag=None):