From d114a6b71f243054db333dc5a3f55816161eb7ea Mon Sep 17 00:00:00 2001
From: Roland Szabo <roland@rolisz.ro>
Date: Thu, 3 Aug 2023 14:51:54 +0000
Subject: [PATCH] Add timeout parameter to load_image function (#25184)

* Add timeout parameter to load_image function.

* Remove line.

* Reformat code

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Add parameter to docs.

---------

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
---
 src/transformers/image_utils.py                    |  8 +++++---
 src/transformers/pipelines/depth_estimation.py     | 14 ++++++++++----
 .../pipelines/document_question_answering.py       |  9 ++++++++-
 src/transformers/pipelines/image_classification.py | 14 ++++++++++----
 src/transformers/pipelines/image_segmentation.py   |  9 +++++++--
 src/transformers/pipelines/image_to_text.py        | 11 ++++++++---
 src/transformers/pipelines/mask_generation.py      |  8 +++++++-
 src/transformers/pipelines/object_detection.py     | 12 +++++++++---
 .../pipelines/visual_question_answering.py         | 11 ++++++++---
 .../pipelines/zero_shot_image_classification.py    | 10 ++++++++--
 .../pipelines/zero_shot_object_detection.py        | 13 ++++++++++---
 tests/utils/test_image_utils.py                    | 12 ++++++++++++
 12 files changed, 102 insertions(+), 29 deletions(-)

diff --git a/src/transformers/image_utils.py b/src/transformers/image_utils.py
index 109b148f34..f06fdb0e6d 100644
--- a/src/transformers/image_utils.py
+++ b/src/transformers/image_utils.py
@@ -14,7 +14,7 @@
 # limitations under the License.
 
 import os
-from typing import TYPE_CHECKING, Dict, Iterable, List, Tuple, Union
+from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple, Union
 
 import numpy as np
 import requests
@@ -253,13 +253,15 @@ def valid_coco_panoptic_annotations(annotations: Iterable[Dict[str, Union[List,
     return all(is_valid_annotation_coco_panoptic(ann) for ann in annotations)
 
 
-def load_image(image: Union[str, "PIL.Image.Image"]) -> "PIL.Image.Image":
+def load_image(image: Union[str, "PIL.Image.Image"], timeout: Optional[float] = None) -> "PIL.Image.Image":
     """
     Loads `image` to a PIL Image.
 
     Args:
         image (`str` or `PIL.Image.Image`):
             The image to convert to the PIL Image format.
+        timeout (`float`, *optional*):
+            The timeout value in seconds for the URL request.
 
     Returns:
         `PIL.Image.Image`: A PIL Image.
@@ -269,7 +271,7 @@ def load_image(image: Union[str, "PIL.Image.Image"]) -> "PIL.Image.Image":
         if image.startswith("http://") or image.startswith("https://"):
             # We need to actually check for a real protocol, otherwise it's impossible to use a local file
             # like http_huggingface_co.png
-            image = PIL.Image.open(requests.get(image, stream=True).raw)
+            image = PIL.Image.open(requests.get(image, stream=True, timeout=timeout).raw)
         elif os.path.isfile(image):
             image = PIL.Image.open(image)
         else:
diff --git a/src/transformers/pipelines/depth_estimation.py b/src/transformers/pipelines/depth_estimation.py
index 89b29e8dca..c8d0cad6fc 100644
--- a/src/transformers/pipelines/depth_estimation.py
+++ b/src/transformers/pipelines/depth_estimation.py
@@ -68,6 +68,9 @@ class DepthEstimationPipeline(Pipeline):
             top_k (`int`, *optional*, defaults to 5):
                 The number of top labels that will be returned by the pipeline. If the provided number is higher than
                 the number of labels available in the model configuration, it will default to the number of labels.
+            timeout (`float`, *optional*, defaults to None):
+                The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and
+                the call may block forever.
 
         Return:
             A dictionary or a list of dictionaries containing result. If the input is a single image, will return a
@@ -81,11 +84,14 @@ class DepthEstimationPipeline(Pipeline):
         """
         return super().__call__(images, **kwargs)
 
-    def _sanitize_parameters(self, **kwargs):
-        return {}, {}, {}
+    def _sanitize_parameters(self, timeout=None, **kwargs):
+        preprocess_params = {}
+        if timeout is not None:
+            preprocess_params["timeout"] = timeout
+        return preprocess_params, {}, {}
 
-    def preprocess(self, image):
-        image = load_image(image)
+    def preprocess(self, image, timeout=None):
+        image = load_image(image, timeout)
         self.image_size = image.size
         model_inputs = self.image_processor(images=image, return_tensors=self.framework)
         return model_inputs
diff --git a/src/transformers/pipelines/document_question_answering.py b/src/transformers/pipelines/document_question_answering.py
index e4b96348e7..3c107d650c 100644
--- a/src/transformers/pipelines/document_question_answering.py
+++ b/src/transformers/pipelines/document_question_answering.py
@@ -159,6 +159,7 @@ class DocumentQuestionAnsweringPipeline(ChunkPipeline):
         max_seq_len=None,
         top_k=None,
         handle_impossible_answer=None,
+        timeout=None,
         **kwargs,
     ):
         preprocess_params, postprocess_params = {}, {}
@@ -174,6 +175,8 @@ class DocumentQuestionAnsweringPipeline(ChunkPipeline):
             preprocess_params["lang"] = lang
         if tesseract_config is not None:
             preprocess_params["tesseract_config"] = tesseract_config
+        if timeout is not None:
+            preprocess_params["timeout"] = timeout
 
         if top_k is not None:
             if top_k < 1:
@@ -244,6 +247,9 @@ class DocumentQuestionAnsweringPipeline(ChunkPipeline):
                 Language to use while running OCR. Defaults to english.
             tesseract_config (`str`, *optional*):
                 Additional flags to pass to tesseract while running OCR.
+            timeout (`float`, *optional*, defaults to None):
+                The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and
+                the call may block forever.
 
         Return:
             A `dict` or a list of `dict`: Each result comes as a dictionary with the following keys:
@@ -273,6 +279,7 @@ class DocumentQuestionAnsweringPipeline(ChunkPipeline):
         word_boxes: Tuple[str, List[float]] = None,
         lang=None,
         tesseract_config="",
+        timeout=None,
     ):
         # NOTE: This code mirrors the code in question answering and will be implemented in a follow up PR
         # to support documents with enough tokens that overflow the model's window
@@ -285,7 +292,7 @@ class DocumentQuestionAnsweringPipeline(ChunkPipeline):
         image = None
         image_features = {}
         if input.get("image", None) is not None:
-            image = load_image(input["image"])
+            image = load_image(input["image"], timeout=timeout)
             if self.image_processor is not None:
                 image_features.update(self.image_processor(images=image, return_tensors=self.framework))
             elif self.feature_extractor is not None:
diff --git a/src/transformers/pipelines/image_classification.py b/src/transformers/pipelines/image_classification.py
index bdda574303..59ebabbd20 100644
--- a/src/transformers/pipelines/image_classification.py
+++ b/src/transformers/pipelines/image_classification.py
@@ -62,11 +62,14 @@ class ImageClassificationPipeline(Pipeline):
             else MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES
         )
 
-    def _sanitize_parameters(self, top_k=None):
+    def _sanitize_parameters(self, top_k=None, timeout=None):
+        preprocess_params = {}
+        if timeout is not None:
+            preprocess_params["timeout"] = timeout
         postprocess_params = {}
         if top_k is not None:
             postprocess_params["top_k"] = top_k
-        return {}, {}, postprocess_params
+        return preprocess_params, {}, postprocess_params
 
     def __call__(self, images: Union[str, List[str], "Image.Image", List["Image.Image"]], **kwargs):
         """
@@ -86,6 +89,9 @@ class ImageClassificationPipeline(Pipeline):
             top_k (`int`, *optional*, defaults to 5):
                 The number of top labels that will be returned by the pipeline. If the provided number is higher than
                 the number of labels available in the model configuration, it will default to the number of labels.
+            timeout (`float`, *optional*, defaults to None):
+                The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and
+                the call may block forever.
 
         Return:
             A dictionary or a list of dictionaries containing result. If the input is a single image, will return a
@@ -99,8 +105,8 @@ class ImageClassificationPipeline(Pipeline):
         """
         return super().__call__(images, **kwargs)
 
-    def preprocess(self, image):
-        image = load_image(image)
+    def preprocess(self, image, timeout=None):
+        image = load_image(image, timeout=timeout)
         model_inputs = self.image_processor(images=image, return_tensors=self.framework)
         return model_inputs
 
diff --git a/src/transformers/pipelines/image_segmentation.py b/src/transformers/pipelines/image_segmentation.py
index c28946c77c..01540729e5 100644
--- a/src/transformers/pipelines/image_segmentation.py
+++ b/src/transformers/pipelines/image_segmentation.py
@@ -89,6 +89,8 @@ class ImageSegmentationPipeline(Pipeline):
             postprocess_kwargs["mask_threshold"] = kwargs["mask_threshold"]
         if "overlap_mask_area_threshold" in kwargs:
             postprocess_kwargs["overlap_mask_area_threshold"] = kwargs["overlap_mask_area_threshold"]
+        if "timeout" in kwargs:
+            preprocess_kwargs["timeout"] = kwargs["timeout"]
 
         return preprocess_kwargs, {}, postprocess_kwargs
 
@@ -116,6 +118,9 @@ class ImageSegmentationPipeline(Pipeline):
                 Threshold to use when turning the predicted masks into binary values.
             overlap_mask_area_threshold (`float`, *optional*, defaults to 0.5):
                 Mask overlap threshold to eliminate small, disconnected segments.
+            timeout (`float`, *optional*, defaults to None):
+                The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and
+                the call may block forever.
 
         Return:
             A dictionary or a list of dictionaries containing the result. If the input is a single image, will return a
@@ -133,8 +138,8 @@ class ImageSegmentationPipeline(Pipeline):
         """
         return super().__call__(images, **kwargs)
 
-    def preprocess(self, image, subtask=None):
-        image = load_image(image)
+    def preprocess(self, image, subtask=None, timeout=None):
+        image = load_image(image, timeout=timeout)
         target_size = [(image.height, image.width)]
         if self.model.config.__class__.__name__ == "OneFormerConfig":
             if subtask is None:
diff --git a/src/transformers/pipelines/image_to_text.py b/src/transformers/pipelines/image_to_text.py
index e9f802ab7b..e5cbb36ea5 100644
--- a/src/transformers/pipelines/image_to_text.py
+++ b/src/transformers/pipelines/image_to_text.py
@@ -58,12 +58,14 @@ class ImageToTextPipeline(Pipeline):
             TF_MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES if self.framework == "tf" else MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES
         )
 
-    def _sanitize_parameters(self, max_new_tokens=None, generate_kwargs=None, prompt=None):
+    def _sanitize_parameters(self, max_new_tokens=None, generate_kwargs=None, prompt=None, timeout=None):
         forward_kwargs = {}
         preprocess_params = {}
 
         if prompt is not None:
             preprocess_params["prompt"] = prompt
+        if timeout is not None:
+            preprocess_params["timeout"] = timeout
 
         if generate_kwargs is not None:
             forward_kwargs["generate_kwargs"] = generate_kwargs
@@ -97,6 +99,9 @@ class ImageToTextPipeline(Pipeline):
 
             generate_kwargs (`Dict`, *optional*):
                 Pass it to send all of these arguments directly to `generate` allowing full control of this function.
+            timeout (`float`, *optional*, defaults to None):
+                The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and
+                the call may block forever.
 
         Return:
             A list or a list of list of `dict`: Each result comes as a dictionary with the following key:
@@ -105,8 +110,8 @@ class ImageToTextPipeline(Pipeline):
         """
         return super().__call__(images, **kwargs)
 
-    def preprocess(self, image, prompt=None):
-        image = load_image(image)
+    def preprocess(self, image, prompt=None, timeout=None):
+        image = load_image(image, timeout=timeout)
 
         if prompt is not None:
             if not isinstance(prompt, str):
diff --git a/src/transformers/pipelines/mask_generation.py b/src/transformers/pipelines/mask_generation.py
index 26c5f7101e..bc2c719084 100644
--- a/src/transformers/pipelines/mask_generation.py
+++ b/src/transformers/pipelines/mask_generation.py
@@ -113,6 +113,8 @@ class MaskGenerationPipeline(ChunkPipeline):
             preprocess_kwargs["crop_overlap_ratio"] = kwargs["crop_overlap_ratio"]
         if "crop_n_points_downscale_factor" in kwargs:
             preprocess_kwargs["crop_n_points_downscale_factor"] = kwargs["crop_n_points_downscale_factor"]
+        if "timeout" in kwargs:
+            preprocess_kwargs["timeout"] = kwargs["timeout"]
         # postprocess args
         if "pred_iou_thresh" in kwargs:
             forward_params["pred_iou_thresh"] = kwargs["pred_iou_thresh"]
@@ -156,6 +158,9 @@ class MaskGenerationPipeline(ChunkPipeline):
                 the image length. Later layers with more crops scale down this overlap.
             crop_n_points_downscale_factor (`int`, *optional*, defaults to `1`):
                 The number of points-per-side sampled in layer n is scaled down by crop_n_points_downscale_factor**n.
+            timeout (`float`, *optional*, defaults to None):
+                The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and
+                the call may block forever.
 
         Return:
             `Dict`: A dictionary with the following keys:
@@ -175,8 +180,9 @@ class MaskGenerationPipeline(ChunkPipeline):
         crop_overlap_ratio: float = 512 / 1500,
         points_per_crop: Optional[int] = 32,
         crop_n_points_downscale_factor: Optional[int] = 1,
+        timeout: Optional[float] = None,
     ):
-        image = load_image(image)
+        image = load_image(image, timeout=timeout)
         target_size = self.image_processor.size["longest_edge"]
         crop_boxes, grid_points, cropped_images, input_labels = self.image_processor.generate_crop_boxes(
             image, target_size, crops_n_layers, crop_overlap_ratio, points_per_crop, crop_n_points_downscale_factor
diff --git a/src/transformers/pipelines/object_detection.py b/src/transformers/pipelines/object_detection.py
index e7e26465ab..636a1b6a06 100644
--- a/src/transformers/pipelines/object_detection.py
+++ b/src/transformers/pipelines/object_detection.py
@@ -61,10 +61,13 @@ class ObjectDetectionPipeline(Pipeline):
         self.check_model_type(mapping)
 
     def _sanitize_parameters(self, **kwargs):
+        preprocess_params = {}
+        if "timeout" in kwargs:
+            preprocess_params["timeout"] = kwargs["timeout"]
         postprocess_kwargs = {}
         if "threshold" in kwargs:
             postprocess_kwargs["threshold"] = kwargs["threshold"]
-        return {}, {}, postprocess_kwargs
+        return preprocess_params, {}, postprocess_kwargs
 
     def __call__(self, *args, **kwargs) -> Union[Predictions, List[Prediction]]:
         """
@@ -82,6 +85,9 @@ class ObjectDetectionPipeline(Pipeline):
                 same format: all as HTTP(S) links, all as local paths, or all as PIL images.
             threshold (`float`, *optional*, defaults to 0.9):
                 The probability necessary to make a prediction.
+            timeout (`float`, *optional*, defaults to None):
+                The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and
+                the call may block forever.
 
         Return:
             A list of dictionaries or a list of list of dictionaries containing the result. If the input is a single
@@ -97,8 +103,8 @@ class ObjectDetectionPipeline(Pipeline):
 
         return super().__call__(*args, **kwargs)
 
-    def preprocess(self, image):
-        image = load_image(image)
+    def preprocess(self, image, timeout=None):
+        image = load_image(image, timeout=timeout)
         target_size = torch.IntTensor([[image.height, image.width]])
         inputs = self.image_processor(images=[image], return_tensors="pt")
         if self.tokenizer is not None:
diff --git a/src/transformers/pipelines/visual_question_answering.py b/src/transformers/pipelines/visual_question_answering.py
index ed36972d83..339a907cbb 100644
--- a/src/transformers/pipelines/visual_question_answering.py
+++ b/src/transformers/pipelines/visual_question_answering.py
@@ -55,12 +55,14 @@ class VisualQuestionAnsweringPipeline(Pipeline):
         super().__init__(*args, **kwargs)
         self.check_model_type(MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING_NAMES)
 
-    def _sanitize_parameters(self, top_k=None, padding=None, truncation=None, **kwargs):
+    def _sanitize_parameters(self, top_k=None, padding=None, truncation=None, timeout=None, **kwargs):
         preprocess_params, postprocess_params = {}, {}
         if padding is not None:
             preprocess_params["padding"] = padding
         if truncation is not None:
             preprocess_params["truncation"] = truncation
+        if timeout is not None:
+            preprocess_params["timeout"] = timeout
         if top_k is not None:
             postprocess_params["top_k"] = top_k
         return preprocess_params, {}, postprocess_params
@@ -90,6 +92,9 @@ class VisualQuestionAnsweringPipeline(Pipeline):
             top_k (`int`, *optional*, defaults to 5):
                 The number of top labels that will be returned by the pipeline. If the provided number is higher than
                 the number of labels available in the model configuration, it will default to the number of labels.
+            timeout (`float`, *optional*, defaults to None):
+                The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and
+                the call may block forever.
         Return:
             A dictionary or a list of dictionaries containing the result. The dictionaries contain the following keys:
 
@@ -109,8 +114,8 @@ class VisualQuestionAnsweringPipeline(Pipeline):
         results = super().__call__(inputs, **kwargs)
         return results
 
-    def preprocess(self, inputs, padding=False, truncation=False):
-        image = load_image(inputs["image"])
+    def preprocess(self, inputs, padding=False, truncation=False, timeout=None):
+        image = load_image(inputs["image"], timeout=timeout)
         model_inputs = self.tokenizer(
             inputs["question"], return_tensors=self.framework, padding=padding, truncation=truncation
         )
diff --git a/src/transformers/pipelines/zero_shot_image_classification.py b/src/transformers/pipelines/zero_shot_image_classification.py
index a6e90dd522..b16d191754 100644
--- a/src/transformers/pipelines/zero_shot_image_classification.py
+++ b/src/transformers/pipelines/zero_shot_image_classification.py
@@ -91,6 +91,10 @@ class ZeroShotImageClassificationPipeline(Pipeline):
                 replacing the placeholder with the candidate_labels. Then likelihood is estimated by using
                 logits_per_image
 
+            timeout (`float`, *optional*, defaults to None):
+                The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and
+                the call may block forever.
+
         Return:
             A list of dictionaries containing result, one dictionary per proposed label. The dictionaries contain the
             following keys:
@@ -104,13 +108,15 @@ class ZeroShotImageClassificationPipeline(Pipeline):
         preprocess_params = {}
         if "candidate_labels" in kwargs:
             preprocess_params["candidate_labels"] = kwargs["candidate_labels"]
+        if "timeout" in kwargs:
+            preprocess_params["timeout"] = kwargs["timeout"]
         if "hypothesis_template" in kwargs:
             preprocess_params["hypothesis_template"] = kwargs["hypothesis_template"]
 
         return preprocess_params, {}, {}
 
-    def preprocess(self, image, candidate_labels=None, hypothesis_template="This is a photo of {}."):
-        image = load_image(image)
+    def preprocess(self, image, candidate_labels=None, hypothesis_template="This is a photo of {}.", timeout=None):
+        image = load_image(image, timeout=timeout)
         inputs = self.image_processor(images=[image], return_tensors=self.framework)
         inputs["candidate_labels"] = candidate_labels
         sequences = [hypothesis_template.format(x) for x in candidate_labels]
diff --git a/src/transformers/pipelines/zero_shot_object_detection.py b/src/transformers/pipelines/zero_shot_object_detection.py
index cd7009abfb..a7181d9540 100644
--- a/src/transformers/pipelines/zero_shot_object_detection.py
+++ b/src/transformers/pipelines/zero_shot_object_detection.py
@@ -111,6 +111,10 @@ class ZeroShotObjectDetectionPipeline(ChunkPipeline):
                 The number of top predictions that will be returned by the pipeline. If the provided number is `None`
                 or higher than the number of predictions available, it will default to the number of predictions.
 
+            timeout (`float`, *optional*, defaults to None):
+                The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and
+                the call may block forever.
+
 
         Return:
             A list of lists containing prediction results, one list per input image. Each list contains dictionaries
@@ -132,15 +136,18 @@ class ZeroShotObjectDetectionPipeline(ChunkPipeline):
         return results
 
     def _sanitize_parameters(self, **kwargs):
+        preprocess_params = {}
+        if "timeout" in kwargs:
+            preprocess_params["timeout"] = kwargs["timeout"]
         postprocess_params = {}
         if "threshold" in kwargs:
             postprocess_params["threshold"] = kwargs["threshold"]
         if "top_k" in kwargs:
             postprocess_params["top_k"] = kwargs["top_k"]
-        return {}, {}, postprocess_params
+        return preprocess_params, {}, postprocess_params
 
-    def preprocess(self, inputs):
-        image = load_image(inputs["image"])
+    def preprocess(self, inputs, timeout=None):
+        image = load_image(inputs["image"], timeout=timeout)
         candidate_labels = inputs["candidate_labels"]
         if isinstance(candidate_labels, str):
             candidate_labels = candidate_labels.split(",")
diff --git a/tests/utils/test_image_utils.py b/tests/utils/test_image_utils.py
index dd816f5931..6d6c0f5d9b 100644
--- a/tests/utils/test_image_utils.py
+++ b/tests/utils/test_image_utils.py
@@ -18,7 +18,9 @@ import unittest
 import datasets
 import numpy as np
 import pytest
+from requests import ReadTimeout
 
+from tests.pipelines.test_pipelines_document_question_answering import INVOICE_URL
 from transformers import is_torch_available, is_vision_available
 from transformers.image_utils import ChannelDimension, get_channel_dimension_axis, make_list_of_images
 from transformers.testing_utils import require_torch, require_vision
@@ -478,6 +480,16 @@ class ImageFeatureExtractionTester(unittest.TestCase):
 
 @require_vision
 class LoadImageTester(unittest.TestCase):
+    def test_load_img_url(self):
+        img = load_image(INVOICE_URL)
+        img_arr = np.array(img)
+
+        self.assertEqual(img_arr.shape, (1061, 750, 3))
+
+    def test_load_img_url_timeout(self):
+        with self.assertRaises(ReadTimeout):
+            load_image(INVOICE_URL, timeout=0.001)
+
     def test_load_img_local(self):
         img = load_image("./tests/fixtures/tests_samples/COCO/000000039769.png")
         img_arr = np.array(img)