Object detection pipeline (#12886)

* Implement object-detection pipeline * Define threshold const * Add `threshold` argument * Refactor * Uncomment test inputs * `rm Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> * Fix typo Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> * Fix typo Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> * Chore better doc Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> * Rm unnecessary lines Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> * Chore better naming Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> * Update src/transformers/pipelines/object_detection.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> * Update src/transformers/pipelines/object_detection.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> * Fix typo * Add `detr-tiny` for tests * Add `ObjectDetectionPipeline` to `trnsfrmrs/init` * Implement new bbox format * Update detr post_process * Update `load_img` method obj det pipeline * make style * Implement new testing format for obj det pipeln * Add guard pytorch specific code in pipeline * Add doc * Make pipeline_obj_tet tests deterministic * Revert some changes to `post_process` COCO api * Chore * Update src/transformers/pipelines/object_detection.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> * Update src/transformers/pipelines/object_detection.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> * Update src/transformers/pipelines/object_detection.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> * Update src/transformers/pipelines/object_detection.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> * Update src/transformers/pipelines/object_detection.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> * Update src/transformers/pipelines/object_detection.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> * Rm timm requirement * make fixup * Add timm requirement to test * Make fixup * Guard torch.Tensor * Chore * Delete unnecessary comment Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>
2021-09-08 17:17:32 +02:00
parent 707105290b
commit 2a15e8ccfb
10 changed files with 475 additions and 0 deletions
--- a/docs/source/main_classes/pipelines.rst
+++ b/docs/source/main_classes/pipelines.rst
@@ -29,6 +29,7 @@ There are two categories of pipeline abstractions to be aware about:
    - :class:`~transformers.FeatureExtractionPipeline`
    - :class:`~transformers.FillMaskPipeline`
    - :class:`~transformers.ImageClassificationPipeline`
    - :class:`~transformers.ObjectDetectionPipeline`
    - :class:`~transformers.QuestionAnsweringPipeline`
    - :class:`~transformers.SummarizationPipeline`
    - :class:`~transformers.TableQuestionAnsweringPipeline`
@@ -102,6 +103,13 @@ NerPipeline
 See :class:`~transformers.TokenClassificationPipeline` for all details.
 ObjectDetectionPipeline
 =======================================================================================================================
 .. autoclass:: transformers.ObjectDetectionPipeline
    :special-members: __call__
    :members:
 QuestionAnsweringPipeline
 =======================================================================================================================
--- a/docs/source/model_doc/auto.rst
+++ b/docs/source/model_doc/auto.rst
@@ -142,6 +142,13 @@ AutoModelForAudioClassification
    :members:
 AutoModelForObjectDetection
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 .. autoclass:: transformers.AutoModelForObjectDetection
    :members:
 TFAutoModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
--- a/src/transformers/init.py
+++ b/src/transformers/init.py
@@ -294,6 +294,7 @@ _import_structure = {
        "ImageClassificationPipeline",
        "JsonPipelineDataFormat",
        "NerPipeline",
        "ObjectDetectionPipeline",
        "PipedPipelineDataFormat",
        "Pipeline",
        "PipelineDataFormat",
@@ -558,6 +559,7 @@ if is_torch_available():
            "AutoModelForMaskedLM",
            "AutoModelForMultipleChoice",
            "AutoModelForNextSentencePrediction",
            "AutoModelForObjectDetection",
            "AutoModelForPreTraining",
            "AutoModelForQuestionAnswering",
            "AutoModelForSeq2SeqLM",
@@ -2074,6 +2076,7 @@ if TYPE_CHECKING:
        ImageClassificationPipeline,
        JsonPipelineDataFormat,
        NerPipeline,
        ObjectDetectionPipeline,
        PipedPipelineDataFormat,
        Pipeline,
        PipelineDataFormat,
@@ -2295,6 +2298,7 @@ if TYPE_CHECKING:
            AutoModelForMaskedLM,
            AutoModelForMultipleChoice,
            AutoModelForNextSentencePrediction,
            AutoModelForObjectDetection,
            AutoModelForPreTraining,
            AutoModelForQuestionAnswering,
            AutoModelForSeq2SeqLM,
--- a/src/transformers/models/auto/init.py
+++ b/src/transformers/models/auto/init.py
@@ -52,6 +52,7 @@ if is_torch_available():
        "AutoModelForMaskedLM",
        "AutoModelForMultipleChoice",
        "AutoModelForNextSentencePrediction",
        "AutoModelForObjectDetection",
        "AutoModelForPreTraining",
        "AutoModelForQuestionAnswering",
        "AutoModelForSeq2SeqLM",
@@ -143,6 +144,7 @@ if TYPE_CHECKING:
            AutoModelForMaskedLM,
            AutoModelForMultipleChoice,
            AutoModelForNextSentencePrediction,
            AutoModelForObjectDetection,
            AutoModelForPreTraining,
            AutoModelForQuestionAnswering,
            AutoModelForSeq2SeqLM,
--- a/src/transformers/models/auto/feature_extraction_auto.py
+++ b/src/transformers/models/auto/feature_extraction_auto.py
@@ -34,6 +34,7 @@ from .configuration_auto import (
 FEATURE_EXTRACTOR_MAPPING_NAMES = OrderedDict(
    [
        ("beit", "BeitFeatureExtractor"),
        ("detr", "DetrFeatureExtractor"),
        ("deit", "DeiTFeatureExtractor"),
        ("hubert", "Wav2Vec2FeatureExtractor"),
        ("speech_to_text", "Speech2TextFeatureExtractor"),
--- a/src/transformers/models/auto/modeling_auto.py
+++ b/src/transformers/models/auto/modeling_auto.py
@@ -588,6 +588,13 @@ class AutoModelForImageClassification(_BaseAutoModelClass):
 AutoModelForImageClassification = auto_class_update(AutoModelForImageClassification, head_doc="image classification")
 class AutoModelForObjectDetection(_BaseAutoModelClass):
    _model_mapping = MODEL_FOR_OBJECT_DETECTION_MAPPING
 AutoModelForObjectDetection = auto_class_update(AutoModelForObjectDetection, head_doc="object detection")
 class AutoModelForAudioClassification(_BaseAutoModelClass):
    _model_mapping = MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING
--- a/src/transformers/pipelines/init.py
+++ b/src/transformers/pipelines/init.py
@@ -44,6 +44,7 @@ from .conversational import Conversation, ConversationalPipeline
 from .feature_extraction import FeatureExtractionPipeline
 from .fill_mask import FillMaskPipeline
 from .image_classification import ImageClassificationPipeline
 from .object_detection import ObjectDetectionPipeline
 from .question_answering import QuestionAnsweringArgumentHandler, QuestionAnsweringPipeline
 from .table_question_answering import TableQuestionAnsweringArgumentHandler, TableQuestionAnsweringPipeline
 from .text2text_generation import SummarizationPipeline, Text2TextGenerationPipeline, TranslationPipeline
@@ -91,6 +92,7 @@ if is_torch_available():
        AutoModelForCausalLM,
        AutoModelForImageClassification,
        AutoModelForMaskedLM,
        AutoModelForObjectDetection,
        AutoModelForQuestionAnswering,
        AutoModelForSeq2SeqLM,
        AutoModelForSequenceClassification,
@@ -229,6 +231,12 @@ SUPPORTED_TASKS = {
        "pt": (AutoModelForImageClassification,) if is_torch_available() else (),
        "default": {"model": {"pt": "google/vit-base-patch16-224"}},
    },
    "object-detection": {
        "impl": ObjectDetectionPipeline,
        "tf": (),
        "pt": (AutoModelForObjectDetection,) if is_torch_available() else (),
        "default": {"model": {"pt": "facebook/detr-resnet-50"}},
    },
 }
--- a/src/transformers/pipelines/object_detection.py
+++ b/src/transformers/pipelines/object_detection.py
@@ -0,0 +1,176 @@
 import os
 from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
 import requests
 from ..feature_extraction_utils import PreTrainedFeatureExtractor
 from ..file_utils import add_end_docstrings, is_torch_available, is_vision_available, requires_backends
 from ..utils import logging
 from .base import PIPELINE_INIT_ARGS, Pipeline
 if TYPE_CHECKING:
    from ..modeling_utils import PreTrainedModel
 if is_vision_available():
    from PIL import Image
 if is_torch_available():
    import torch
    from ..models.auto.modeling_auto import MODEL_FOR_OBJECT_DETECTION_MAPPING
 logger = logging.get_logger(__name__)
 Prediction = Dict[str, Any]
 Predictions = List[Prediction]
@add_end_docstrings(PIPELINE_INIT_ARGS)
 class ObjectDetectionPipeline(Pipeline):
    """
    Object detection pipeline using any :obj:`AutoModelForObjectDetection`. This pipeline predicts bounding boxes of
    objects and their classes.
    This object detection pipeline can currently be loaded from :func:`~transformers.pipeline` using the following task
    identifier: :obj:`"object-detection"`.
    See the list of available models on `huggingface.co/models
    <https://huggingface.co/models?filter=object-detection>`__.
    """
    def __init__(
        self,
        model: "PreTrainedModel",
        feature_extractor: PreTrainedFeatureExtractor,
        framework: Optional[str] = None,
        **kwargs
    ):
        super().__init__(model, feature_extractor=feature_extractor, framework=framework, **kwargs)
        if self.framework == "tf":
            raise ValueError(f"The {self.__class__} is only available in PyTorch.")
        requires_backends(self, "vision")
        self.check_model_type(MODEL_FOR_OBJECT_DETECTION_MAPPING)
        self.feature_extractor = feature_extractor
    @staticmethod
    def load_image(image: Union[str, "Image.Image"]):
        if isinstance(image, str):
            if image.startswith("http://") or image.startswith("https://"):
                # We need to actually check for a real protocol, otherwise it's impossible to use a local file
                # like http_huggingface_co.png
                image = Image.open(requests.get(image, stream=True).raw)
            elif os.path.isfile(image):
                image = Image.open(image)
            else:
                raise ValueError(
                    f"Incorrect path or url, URLs must start with `http://` or `https://`, and {image} is not a valid path"
                )
        elif isinstance(image, Image.Image):
            pass
        else:
            raise ValueError(
                "Incorrect format used for image. Should be a URL linking to an image, a local path, or a PIL image."
            )
        image = image.convert("RGB")
        return image
    def __call__(
        self,
        images: Union[str, List[str], "Image", List["Image"]],
        threshold: Optional[float] = 0.9,
    ) -> Union[Predictions, List[Prediction]]:
        """
        Detect objects (bounding boxes & classes) in the image(s) passed as inputs.
        Args:
            images (:obj:`str`, :obj:`List[str]`, :obj:`PIL.Image` or :obj:`List[PIL.Image]`):
                The pipeline handles three types of images:
                - A string containing an HTTP(S) link pointing to an image
                - A string containing a local path to an image
                - An image loaded in PIL directly
                The pipeline accepts either a single image or a batch of images. Images in a batch must all be in the
                same format: all as HTTP(S) links, all as local paths, or all as PIL images.
            threshold (:obj:`float`, `optional`, defaults to 0.9):
                The probability necessary to make a prediction.
        Return:
            A list of dictionaries or a list of list of dictionaries containing the result. If the input is a single
            image, will return a list of dictionaries, if the input is a list of several images, will return a list of
            list of dictionaries corresponding to each image.
            The dictionaries contain the following keys:
            - **label** (:obj:`str`) -- The class label identified by the model.
            - **score** (:obj:`float`) -- The score attributed by the model for that label.
            - **box** (:obj:`List[Dict[str, int]]`) -- The bounding box of detected object in image's original size.
        """
        is_batched = isinstance(images, list)
        if not is_batched:
            images = [images]
        images = [self.load_image(image) for image in images]
        with torch.no_grad():
            inputs = self.feature_extractor(images=images, return_tensors="pt")
            outputs = self.model(**inputs)
            if self.framework == "pt":
                target_sizes = torch.IntTensor([[im.height, im.width] for im in images])
            else:
                raise ValueError("The ObjectDetectionPipeline is only available in PyTorch.")
            raw_annotations = self.feature_extractor.post_process(outputs, target_sizes)
            annotations = []
            for annotation in raw_annotations:
                keep = annotation["scores"] > threshold
                scores = annotation["scores"][keep]
                labels = annotation["labels"][keep]
                boxes = annotation["boxes"][keep]
                annotation["scores"] = scores.tolist()
                annotation["labels"] = [self.model.config.id2label[label.item()] for label in labels]
                annotation["boxes"] = [self._get_bounding_box(box) for box in boxes]
                # {"scores": [...], ...} --> [{"score":x, ...}, ...]
                keys = ["score", "label", "box"]
                annotation = [
                    dict(zip(keys, vals))
                    for vals in zip(annotation["scores"], annotation["labels"], annotation["boxes"])
                ]
                annotations.append(annotation)
        if not is_batched:
            return annotations[0]
        return annotations
    def _get_bounding_box(self, box: "torch.Tensor") -> Dict[str, int]:
        """
        Turns list [xmin, xmax, ymin, ymax] into dict { "xmin": xmin, ... }
        Args:
            box (torch.Tensor): Tensor containing the coordinates in corners format.
        Returns:
            bbox (Dict[str, int]): Dict containing the coordinates in corners format.
        """
        if self.framework != "pt":
            raise ValueError("The ObjectDetectionPipeline is only available in PyTorch.")
        xmin, ymin, xmax, ymax = box.int().tolist()
        bbox = {
            "xmin": xmin,
            "ymin": ymin,
            "xmax": xmax,
            "ymax": ymax,
        }
        return bbox
--- a/src/transformers/utils/dummy_pt_objects.py
+++ b/src/transformers/utils/dummy_pt_objects.py
@@ -415,6 +415,15 @@ class AutoModelForNextSentencePrediction:
        requires_backends(cls, ["torch"])
 class AutoModelForObjectDetection:
    def __init__(self, *args, **kwargs):
        requires_backends(self, ["torch"])
    @classmethod
    def from_pretrained(cls, *args, **kwargs):
        requires_backends(cls, ["torch"])
 class AutoModelForPreTraining:
    def __init__(self, *args, **kwargs):
        requires_backends(self, ["torch"])
--- a/tests/test_pipelines_object_detection.py
+++ b/tests/test_pipelines_object_detection.py
@@ -0,0 +1,253 @@
 # Copyright 2021 The HuggingFace Team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import unittest
 from transformers import (
    MODEL_FOR_OBJECT_DETECTION_MAPPING,
    AutoFeatureExtractor,
    AutoModelForObjectDetection,
    ObjectDetectionPipeline,
    is_vision_available,
    pipeline,
 )
 from transformers.testing_utils import (
    is_pipeline_test,
    nested_simplify,
    require_datasets,
    require_tf,
    require_timm,
    require_torch,
    require_vision,
    slow,
 )
 from .test_pipelines_common import ANY, PipelineTestCaseMeta
 if is_vision_available():
    from PIL import Image
 else:
    class Image:
        @staticmethod
        def open(*args, **kwargs):
            pass
@require_vision
@require_timm
@require_torch
@is_pipeline_test
 class ObjectDetectionPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):
    model_mapping = MODEL_FOR_OBJECT_DETECTION_MAPPING
    @require_datasets
    def run_pipeline_test(self, model, tokenizer, feature_extractor):
        object_detector = ObjectDetectionPipeline(model=model, feature_extractor=feature_extractor)
        outputs = object_detector("./tests/fixtures/tests_samples/COCO/000000039769.png", threshold=0.0)
        self.assertGreater(len(outputs), 0)
        for detected_object in outputs:
            self.assertEqual(
                detected_object,
                {
                    "score": ANY(float),
                    "label": ANY(str),
                    "box": {"xmin": ANY(int), "ymin": ANY(int), "xmax": ANY(int), "ymax": ANY(int)},
                },
            )
        import datasets
        dataset = datasets.load_dataset("Narsil/image_dummy", "image", split="test")
        batch = [
            Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png"),
            "http://images.cocodataset.org/val2017/000000039769.jpg",
            # RGBA
            dataset[0]["file"],
            # LA
            dataset[1]["file"],
            # L
            dataset[2]["file"],
        ]
        batch_outputs = object_detector(batch, threshold=0.0)
        self.assertEqual(len(batch), len(batch_outputs))
        for outputs in batch_outputs:
            self.assertGreater(len(outputs), 0)
            for detected_object in outputs:
                self.assertEqual(
                    detected_object,
                    {
                        "score": ANY(float),
                        "label": ANY(str),
                        "box": {"xmin": ANY(int), "ymin": ANY(int), "xmax": ANY(int), "ymax": ANY(int)},
                    },
                )
    @require_tf
    @unittest.skip("Object detection not implemented in TF")
    def test_small_model_tf(self):
        pass
    @require_torch
    def test_small_model_pt(self):
        model_id = "mishig/tiny-detr-mobilenetsv3"
        model = AutoModelForObjectDetection.from_pretrained(model_id)
        feature_extractor = AutoFeatureExtractor.from_pretrained(model_id)
        object_detector = ObjectDetectionPipeline(model=model, feature_extractor=feature_extractor)
        outputs = object_detector("http://images.cocodataset.org/val2017/000000039769.jpg", threshold=0.0)
        self.assertEqual(
            nested_simplify(outputs, decimals=4),
            [
                {"score": 0.3432, "label": "LABEL_0", "box": {"xmin": 266, "ymin": 200, "xmax": 799, "ymax": 599}},
                {"score": 0.3432, "label": "LABEL_0", "box": {"xmin": 266, "ymin": 200, "xmax": 799, "ymax": 599}},
            ],
        )
        outputs = object_detector(
            [
                "http://images.cocodataset.org/val2017/000000039769.jpg",
                "http://images.cocodataset.org/val2017/000000039769.jpg",
            ],
            threshold=0.0,
        )
        self.assertEqual(
            nested_simplify(outputs, decimals=4),
            [
                [
                    {"score": 0.3432, "label": "LABEL_0", "box": {"xmin": 266, "ymin": 200, "xmax": 799, "ymax": 599}},
                    {"score": 0.3432, "label": "LABEL_0", "box": {"xmin": 266, "ymin": 200, "xmax": 799, "ymax": 599}},
                ],
                [
                    {"score": 0.3432, "label": "LABEL_0", "box": {"xmin": 266, "ymin": 200, "xmax": 799, "ymax": 599}},
                    {"score": 0.3432, "label": "LABEL_0", "box": {"xmin": 266, "ymin": 200, "xmax": 799, "ymax": 599}},
                ],
            ],
        )
    @require_torch
    @slow
    def test_large_model_pt(self):
        model_id = "facebook/detr-resnet-50"
        model = AutoModelForObjectDetection.from_pretrained(model_id)
        feature_extractor = AutoFeatureExtractor.from_pretrained(model_id)
        object_detector = ObjectDetectionPipeline(model=model, feature_extractor=feature_extractor)
        outputs = object_detector("http://images.cocodataset.org/val2017/000000039769.jpg")
        self.assertEqual(
            nested_simplify(outputs, decimals=4),
            [
                {"score": 0.9982, "label": "remote", "box": {"xmin": 66, "ymin": 118, "xmax": 292, "ymax": 196}},
                {"score": 0.9960, "label": "remote", "box": {"xmin": 555, "ymin": 120, "xmax": 613, "ymax": 312}},
                {"score": 0.9955, "label": "couch", "box": {"xmin": 0, "ymin": 1, "xmax": 1065, "ymax": 789}},
                {"score": 0.9988, "label": "cat", "box": {"xmin": 22, "ymin": 86, "xmax": 523, "ymax": 784}},
                {"score": 0.9987, "label": "cat", "box": {"xmin": 575, "ymin": 39, "xmax": 1066, "ymax": 614}},
            ],
        )
        outputs = object_detector(
            [
                "http://images.cocodataset.org/val2017/000000039769.jpg",
                "http://images.cocodataset.org/val2017/000000039769.jpg",
            ]
        )
        self.assertEqual(
            nested_simplify(outputs, decimals=4),
            [
                [
                    {"score": 0.9982, "label": "remote", "box": {"xmin": 66, "ymin": 118, "xmax": 292, "ymax": 196}},
                    {"score": 0.9960, "label": "remote", "box": {"xmin": 555, "ymin": 120, "xmax": 613, "ymax": 312}},
                    {"score": 0.9955, "label": "couch", "box": {"xmin": 0, "ymin": 1, "xmax": 1065, "ymax": 789}},
                    {"score": 0.9988, "label": "cat", "box": {"xmin": 22, "ymin": 86, "xmax": 523, "ymax": 784}},
                    {"score": 0.9987, "label": "cat", "box": {"xmin": 575, "ymin": 39, "xmax": 1066, "ymax": 614}},
                ],
                [
                    {"score": 0.9982, "label": "remote", "box": {"xmin": 66, "ymin": 118, "xmax": 292, "ymax": 196}},
                    {"score": 0.9960, "label": "remote", "box": {"xmin": 555, "ymin": 120, "xmax": 613, "ymax": 312}},
                    {"score": 0.9955, "label": "couch", "box": {"xmin": 0, "ymin": 1, "xmax": 1065, "ymax": 789}},
                    {"score": 0.9988, "label": "cat", "box": {"xmin": 22, "ymin": 86, "xmax": 523, "ymax": 784}},
                    {"score": 0.9987, "label": "cat", "box": {"xmin": 575, "ymin": 39, "xmax": 1066, "ymax": 614}},
                ],
            ],
        )
    @require_torch
    @slow
    def test_integration_torch_object_detection(self):
        model_id = "facebook/detr-resnet-50"
        object_detector = pipeline("object-detection", model=model_id)
        outputs = object_detector("http://images.cocodataset.org/val2017/000000039769.jpg")
        self.assertEqual(
            nested_simplify(outputs, decimals=4),
            [
                {"score": 0.9982, "label": "remote", "box": {"xmin": 66, "ymin": 118, "xmax": 292, "ymax": 196}},
                {"score": 0.9960, "label": "remote", "box": {"xmin": 555, "ymin": 120, "xmax": 613, "ymax": 312}},
                {"score": 0.9955, "label": "couch", "box": {"xmin": 0, "ymin": 1, "xmax": 1065, "ymax": 789}},
                {"score": 0.9988, "label": "cat", "box": {"xmin": 22, "ymin": 86, "xmax": 523, "ymax": 784}},
                {"score": 0.9987, "label": "cat", "box": {"xmin": 575, "ymin": 39, "xmax": 1066, "ymax": 614}},
            ],
        )
        outputs = object_detector(
            [
                "http://images.cocodataset.org/val2017/000000039769.jpg",
                "http://images.cocodataset.org/val2017/000000039769.jpg",
            ]
        )
        self.assertEqual(
            nested_simplify(outputs, decimals=4),
            [
                [
                    {"score": 0.9982, "label": "remote", "box": {"xmin": 66, "ymin": 118, "xmax": 292, "ymax": 196}},
                    {"score": 0.9960, "label": "remote", "box": {"xmin": 555, "ymin": 120, "xmax": 613, "ymax": 312}},
                    {"score": 0.9955, "label": "couch", "box": {"xmin": 0, "ymin": 1, "xmax": 1065, "ymax": 789}},
                    {"score": 0.9988, "label": "cat", "box": {"xmin": 22, "ymin": 86, "xmax": 523, "ymax": 784}},
                    {"score": 0.9987, "label": "cat", "box": {"xmin": 575, "ymin": 39, "xmax": 1066, "ymax": 614}},
                ],
                [
                    {"score": 0.9982, "label": "remote", "box": {"xmin": 66, "ymin": 118, "xmax": 292, "ymax": 196}},
                    {"score": 0.9960, "label": "remote", "box": {"xmin": 555, "ymin": 120, "xmax": 613, "ymax": 312}},
                    {"score": 0.9955, "label": "couch", "box": {"xmin": 0, "ymin": 1, "xmax": 1065, "ymax": 789}},
                    {"score": 0.9988, "label": "cat", "box": {"xmin": 22, "ymin": 86, "xmax": 523, "ymax": 784}},
                    {"score": 0.9987, "label": "cat", "box": {"xmin": 575, "ymin": 39, "xmax": 1066, "ymax": 614}},
                ],
            ],
        )
    @require_torch
    @slow
    def test_threshold(self):
        threshold = 0.9985
        model_id = "facebook/detr-resnet-50"
        object_detector = pipeline("object-detection", model=model_id)
        outputs = object_detector("http://images.cocodataset.org/val2017/000000039769.jpg", threshold=threshold)
        self.assertEqual(
            nested_simplify(outputs, decimals=4),
            [
                {"score": 0.9988, "label": "cat", "box": {"xmin": 22, "ymin": 86, "xmax": 523, "ymax": 784}},
                {"score": 0.9987, "label": "cat", "box": {"xmin": 575, "ymin": 39, "xmax": 1066, "ymax": 614}},
            ],
        )