Add a `proxies` option to the from_pretrained methods
This commit is contained in:
@@ -17,8 +17,9 @@ from hashlib import sha256
|
||||
from io import open
|
||||
|
||||
import boto3
|
||||
import requests
|
||||
from botocore.config import Config
|
||||
from botocore.exceptions import ClientError
|
||||
import requests
|
||||
from tqdm import tqdm
|
||||
|
||||
try:
|
||||
@@ -93,7 +94,7 @@ def filename_to_url(filename, cache_dir=None):
|
||||
return url, etag
|
||||
|
||||
|
||||
def cached_path(url_or_filename, cache_dir=None, force_download=False):
|
||||
def cached_path(url_or_filename, cache_dir=None, force_download=False, proxies=None):
|
||||
"""
|
||||
Given something that might be a URL (or might be a local path),
|
||||
determine which. If it's a URL, download the file and cache it, and
|
||||
@@ -114,7 +115,7 @@ def cached_path(url_or_filename, cache_dir=None, force_download=False):
|
||||
|
||||
if parsed.scheme in ('http', 'https', 's3'):
|
||||
# URL, so get it from the cache (downloading if necessary)
|
||||
return get_from_cache(url_or_filename, cache_dir=cache_dir, force_download=force_download)
|
||||
return get_from_cache(url_or_filename, cache_dir=cache_dir, force_download=force_download, proxies=proxies)
|
||||
elif os.path.exists(url_or_filename):
|
||||
# File, and it exists.
|
||||
return url_or_filename
|
||||
@@ -159,24 +160,24 @@ def s3_request(func):
|
||||
|
||||
|
||||
@s3_request
|
||||
def s3_etag(url):
|
||||
def s3_etag(url, proxies=None):
|
||||
"""Check ETag on S3 object."""
|
||||
s3_resource = boto3.resource("s3")
|
||||
s3_resource = boto3.resource("s3", config=Config(proxies=proxies))
|
||||
bucket_name, s3_path = split_s3_path(url)
|
||||
s3_object = s3_resource.Object(bucket_name, s3_path)
|
||||
return s3_object.e_tag
|
||||
|
||||
|
||||
@s3_request
|
||||
def s3_get(url, temp_file):
|
||||
def s3_get(url, temp_file, proxies=None):
|
||||
"""Pull a file directly from S3."""
|
||||
s3_resource = boto3.resource("s3")
|
||||
s3_resource = boto3.resource("s3", config=Config(proxies=proxies))
|
||||
bucket_name, s3_path = split_s3_path(url)
|
||||
s3_resource.Bucket(bucket_name).download_fileobj(s3_path, temp_file)
|
||||
|
||||
|
||||
def http_get(url, temp_file):
|
||||
req = requests.get(url, stream=True)
|
||||
def http_get(url, temp_file, proxies=None):
|
||||
req = requests.get(url, stream=True, proxies=proxies)
|
||||
content_length = req.headers.get('Content-Length')
|
||||
total = int(content_length) if content_length is not None else None
|
||||
progress = tqdm(unit="B", total=total)
|
||||
@@ -187,7 +188,7 @@ def http_get(url, temp_file):
|
||||
progress.close()
|
||||
|
||||
|
||||
def get_from_cache(url, cache_dir=None, force_download=False):
|
||||
def get_from_cache(url, cache_dir=None, force_download=False, proxies=None):
|
||||
"""
|
||||
Given a URL, look for the corresponding dataset in the local cache.
|
||||
If it's not there, download it. Then return the path to the cached file.
|
||||
@@ -204,10 +205,10 @@ def get_from_cache(url, cache_dir=None, force_download=False):
|
||||
|
||||
# Get eTag to add to filename, if it exists.
|
||||
if url.startswith("s3://"):
|
||||
etag = s3_etag(url)
|
||||
etag = s3_etag(url, proxies=proxies)
|
||||
else:
|
||||
try:
|
||||
response = requests.head(url, allow_redirects=True)
|
||||
response = requests.head(url, allow_redirects=True, proxies=proxies)
|
||||
if response.status_code != 200:
|
||||
etag = None
|
||||
else:
|
||||
@@ -238,9 +239,9 @@ def get_from_cache(url, cache_dir=None, force_download=False):
|
||||
|
||||
# GET file object
|
||||
if url.startswith("s3://"):
|
||||
s3_get(url, temp_file)
|
||||
s3_get(url, temp_file, proxies=proxies)
|
||||
else:
|
||||
http_get(url, temp_file)
|
||||
http_get(url, temp_file, proxies=proxies)
|
||||
|
||||
# we are copying the file before closing it, so flush to avoid truncation
|
||||
temp_file.flush()
|
||||
|
||||
Reference in New Issue
Block a user