Add a `proxies` option to the `from_pretrained` methods

2019-08-20 16:59:11 +02:00
parent 6d0aa73981
commit 43489756ad
4 changed files with 36 additions and 18 deletions
--- a/pytorch_transformers/file_utils.py
+++ b/pytorch_transformers/file_utils.py
@@ -17,8 +17,9 @@ from hashlib import sha256
 from io import open

 import boto3
-import requests
+from botocore.config import Config
 from botocore.exceptions import ClientError
+import requests
 from tqdm import tqdm

 try:
@@ -93,7 +94,7 @@ def filename_to_url(filename, cache_dir=None):
    return url, etag


-def cached_path(url_or_filename, cache_dir=None, force_download=False):
+def cached_path(url_or_filename, cache_dir=None, force_download=False, proxies=None):
    """
    Given something that might be a URL (or might be a local path),
    determine which. If it's a URL, download the file and cache it, and
@@ -114,7 +115,7 @@ def cached_path(url_or_filename, cache_dir=None, force_download=False):

    if parsed.scheme in ('http', 'https', 's3'):
        # URL, so get it from the cache (downloading if necessary)
-        return get_from_cache(url_or_filename, cache_dir=cache_dir, force_download=force_download)
+        return get_from_cache(url_or_filename, cache_dir=cache_dir, force_download=force_download, proxies=proxies)
    elif os.path.exists(url_or_filename):
        # File, and it exists.
        return url_or_filename
@@ -159,24 +160,24 @@ def s3_request(func):


@s3_request
-def s3_etag(url):
+def s3_etag(url, proxies=None):
    """Check ETag on S3 object."""
-    s3_resource = boto3.resource("s3")
+    s3_resource = boto3.resource("s3", config=Config(proxies=proxies))
    bucket_name, s3_path = split_s3_path(url)
    s3_object = s3_resource.Object(bucket_name, s3_path)
    return s3_object.e_tag


@s3_request
-def s3_get(url, temp_file):
+def s3_get(url, temp_file, proxies=None):
    """Pull a file directly from S3."""
-    s3_resource = boto3.resource("s3")
+    s3_resource = boto3.resource("s3", config=Config(proxies=proxies))
    bucket_name, s3_path = split_s3_path(url)
    s3_resource.Bucket(bucket_name).download_fileobj(s3_path, temp_file)


-def http_get(url, temp_file):
-    req = requests.get(url, stream=True)
+def http_get(url, temp_file, proxies=None):
+    req = requests.get(url, stream=True, proxies=proxies)
    content_length = req.headers.get('Content-Length')
    total = int(content_length) if content_length is not None else None
    progress = tqdm(unit="B", total=total)
@@ -187,7 +188,7 @@ def http_get(url, temp_file):
    progress.close()


-def get_from_cache(url, cache_dir=None, force_download=False):
+def get_from_cache(url, cache_dir=None, force_download=False, proxies=None):
    """
    Given a URL, look for the corresponding dataset in the local cache.
    If it's not there, download it. Then return the path to the cached file.
@@ -204,10 +205,10 @@ def get_from_cache(url, cache_dir=None, force_download=False):

    # Get eTag to add to filename, if it exists.
    if url.startswith("s3://"):
-        etag = s3_etag(url)
+        etag = s3_etag(url, proxies=proxies)
    else:
        try:
-            response = requests.head(url, allow_redirects=True)
+            response = requests.head(url, allow_redirects=True, proxies=proxies)
            if response.status_code != 200:
                etag = None
            else:
@@ -238,9 +239,9 @@ def get_from_cache(url, cache_dir=None, force_download=False):

            # GET file object
            if url.startswith("s3://"):
-                s3_get(url, temp_file)
+                s3_get(url, temp_file, proxies=proxies)
            else:
-                http_get(url, temp_file)
+                http_get(url, temp_file, proxies=proxies)

            # we are copying the file before closing it, so flush to avoid truncation
            temp_file.flush()