Fix some pipeline tests (#21401)

* fix Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2023-02-02 19:03:31 +01:00
parent 145bf41c13
commit a6d8a149a8
20 changed files with 69 additions and 37 deletions
--- a/src/transformers/pipelines/__init__.py
+++ b/src/transformers/pipelines/__init__.py
@@ -387,8 +387,11 @@ for task, values in SUPPORTED_TASKS.items():
    if values["type"] == "text":
        NO_FEATURE_EXTRACTOR_TASKS.add(task)
        NO_IMAGE_PROCESSOR_TASKS.add(task)
-    elif values["type"] in {"audio", "image", "video"}:
+    elif values["type"] in {"image", "video"}:
        NO_TOKENIZER_TASKS.add(task)
+    elif values["type"] in {"audio"}:
+        NO_TOKENIZER_TASKS.add(task)
+        NO_IMAGE_PROCESSOR_TASKS.add(task)
    elif values["type"] != "multimodal":
        raise ValueError(f"SUPPORTED_TASK {task} contains invalid type {values['type']}")

@@ -773,6 +776,14 @@ def pipeline(
    load_feature_extractor = type(model_config) in FEATURE_EXTRACTOR_MAPPING or feature_extractor is not None
    load_image_processor = type(model_config) in IMAGE_PROCESSOR_MAPPING or image_processor is not None

+    # If `model` (instance of `PreTrainedModel` instead of `str`) is passed (and/or same for config), while
+    # `image_processor` or `feature_extractor` is `None`, the loading will fail. This happens particularly for some
+    # vision tasks when calling `pipeline()` with `model` and only one of the `image_processor` and `feature_extractor`.
+    # TODO: we need to make `NO_IMAGE_PROCESSOR_TASKS` and `NO_FEATURE_EXTRACTOR_TASKS` more robust to avoid this.
+    # This block is only a temporary workaround to make CI green.
+    if load_image_processor and load_feature_extractor:
+        load_feature_extractor = False
+
    if (
        tokenizer is None
        and not load_tokenizer
@@ -784,6 +795,18 @@ def pipeline(
        # so the model_config might not define a tokenizer, but it seems to be
        # necessary for the task, so we're force-trying to load it.
        load_tokenizer = True
+    if (
+        image_processor is None
+        and not load_image_processor
+        and normalized_task not in NO_IMAGE_PROCESSOR_TASKS
+        # Using class name to avoid importing the real class.
+        and model_config.__class__.__name__ in MULTI_MODEL_CONFIGS
+        and normalized_task != "automatic-speech-recognition"
+    ):
+        # This is a special category of models, that are fusions of multiple models
+        # so the model_config might not define an image processor, but it seems to be
+        # necessary for the task, so we're force-trying to load it.
+        load_image_processor = True
    if (
        feature_extractor is None
        and not load_feature_extractor
--- a/src/transformers/pipelines/base.py
+++ b/src/transformers/pipelines/base.py
@@ -77,7 +77,7 @@ def _pad(items, key, padding_value, padding_side):
        # Others include `attention_mask` etc...
        shape = items[0][key].shape
        dim = len(shape)
-        if key == "pixel_values":
+        if key in ["pixel_values", "image"]:
            # This is probable image so padding shouldn't be necessary
            # B, C, H, W
            return torch.cat([item[key] for item in items], dim=0)
@@ -792,6 +792,13 @@ class Pipeline(_ScikitCompat):
        self._num_workers = kwargs.pop("num_workers", None)
        self._preprocess_params, self._forward_params, self._postprocess_params = self._sanitize_parameters(**kwargs)

+        if self.image_processor is None and self.feature_extractor is not None:
+            if isinstance(self.feature_extractor, BaseImageProcessor):
+                # Backward compatible change, if users called
+                # ImageSegmentationPipeline(.., feature_extractor=MyFeatureExtractor())
+                # then we should keep working
+                self.image_processor = self.feature_extractor
+
    def save_pretrained(self, save_directory: str):
        """
        Save the pipeline's model and tokenizer.
--- a/src/transformers/pipelines/depth_estimation.py
+++ b/src/transformers/pipelines/depth_estimation.py
@@ -87,7 +87,7 @@ class DepthEstimationPipeline(Pipeline):
    def preprocess(self, image):
        image = load_image(image)
        self.image_size = image.size
-        model_inputs = self.feature_extractor(images=image, return_tensors=self.framework)
+        model_inputs = self.image_processor(images=image, return_tensors=self.framework)
        return model_inputs

    def _forward(self, model_inputs):
--- a/src/transformers/pipelines/document_question_answering.py
+++ b/src/transformers/pipelines/document_question_answering.py
@@ -281,7 +281,9 @@ class DocumentQuestionAnsweringPipeline(ChunkPipeline):
        image_features = {}
        if input.get("image", None) is not None:
            image = load_image(input["image"])
-            if self.feature_extractor is not None:
+            if self.image_processor is not None:
+                image_features.update(self.image_processor(images=image, return_tensors=self.framework))
+            elif self.feature_extractor is not None:
                image_features.update(self.feature_extractor(images=image, return_tensors=self.framework))
            elif self.model_type == ModelType.VisionEncoderDecoder:
                raise ValueError("If you are using a VisionEncoderDecoderModel, you must provide a feature extractor")
@@ -352,7 +354,9 @@ class DocumentQuestionAnsweringPipeline(ChunkPipeline):
                return_overflowing_tokens=True,
                **tokenizer_kwargs,
            )
-            encoding.pop("overflow_to_sample_mapping")  # We do not use this
+            # TODO: check why the slow `LayoutLMTokenizer` and `LayoutLMv2Tokenizer` don't have this key in outputs
+            # FIXME: ydshieh and/or Narsil
+            encoding.pop("overflow_to_sample_mapping", None)  # We do not use this

            num_spans = len(encoding["input_ids"])

--- a/src/transformers/pipelines/image_classification.py
+++ b/src/transformers/pipelines/image_classification.py
@@ -101,7 +101,7 @@ class ImageClassificationPipeline(Pipeline):

    def preprocess(self, image):
        image = load_image(image)
-        model_inputs = self.feature_extractor(images=image, return_tensors=self.framework)
+        model_inputs = self.image_processor(images=image, return_tensors=self.framework)
        return model_inputs

    def _forward(self, model_inputs):
--- a/src/transformers/pipelines/image_segmentation.py
+++ b/src/transformers/pipelines/image_segmentation.py
@@ -67,12 +67,6 @@ class ImageSegmentationPipeline(Pipeline):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

-        if self.image_processor is None and self.feature_extractor is not None:
-            # Backward compatible change, if users called
-            # ImageSegmentationPipeline(.., feature_extractor=MyFeatureExtractor())
-            # then we should keep working
-            self.image_processor = self.feature_extractor
-
        if self.framework == "tf":
            raise ValueError(f"The {self.__class__} is only available in PyTorch.")

--- a/src/transformers/pipelines/image_to_text.py
+++ b/src/transformers/pipelines/image_to_text.py
@@ -100,7 +100,7 @@ class ImageToTextPipeline(Pipeline):

    def preprocess(self, image):
        image = load_image(image)
-        model_inputs = self.feature_extractor(images=image, return_tensors=self.framework)
+        model_inputs = self.image_processor(images=image, return_tensors=self.framework)
        return model_inputs

    def _forward(self, model_inputs, generate_kwargs=None):
--- a/src/transformers/pipelines/object_detection.py
+++ b/src/transformers/pipelines/object_detection.py
@@ -97,7 +97,7 @@ class ObjectDetectionPipeline(Pipeline):
    def preprocess(self, image):
        image = load_image(image)
        target_size = torch.IntTensor([[image.height, image.width]])
-        inputs = self.feature_extractor(images=[image], return_tensors="pt")
+        inputs = self.image_processor(images=[image], return_tensors="pt")
        if self.tokenizer is not None:
            inputs = self.tokenizer(text=inputs["words"], boxes=inputs["boxes"], return_tensors="pt")
        inputs["target_size"] = target_size
@@ -137,9 +137,7 @@ class ObjectDetectionPipeline(Pipeline):
            annotation = [dict(zip(keys, vals)) for vals in zip(scores.tolist(), labels, boxes) if vals[0] > threshold]
        else:
            # This is a regular ForObjectDetectionModel
-            raw_annotations = self.feature_extractor.post_process_object_detection(
-                model_outputs, threshold, target_size
-            )
+            raw_annotations = self.image_processor.post_process_object_detection(model_outputs, threshold, target_size)
            raw_annotation = raw_annotations[0]
            scores = raw_annotation["scores"]
            labels = raw_annotation["labels"]
--- a/src/transformers/pipelines/video_classification.py
+++ b/src/transformers/pipelines/video_classification.py
@@ -102,7 +102,7 @@ class VideoClassificationPipeline(Pipeline):
        video = videoreader.get_batch(indices).asnumpy()
        video = list(video)

-        model_inputs = self.feature_extractor(video, return_tensors=self.framework)
+        model_inputs = self.image_processor(video, return_tensors=self.framework)
        return model_inputs

    def _forward(self, model_inputs):
--- a/src/transformers/pipelines/visual_question_answering.py
+++ b/src/transformers/pipelines/visual_question_answering.py
@@ -114,7 +114,7 @@ class VisualQuestionAnsweringPipeline(Pipeline):
        model_inputs = self.tokenizer(
            inputs["question"], return_tensors=self.framework, padding=padding, truncation=truncation
        )
-        image_features = self.feature_extractor(images=image, return_tensors=self.framework)
+        image_features = self.image_processor(images=image, return_tensors=self.framework)
        model_inputs.update(image_features)
        return model_inputs

--- a/src/transformers/pipelines/zero_shot_image_classification.py
+++ b/src/transformers/pipelines/zero_shot_image_classification.py
@@ -110,7 +110,7 @@ class ZeroShotImageClassificationPipeline(ChunkPipeline):
        n = len(candidate_labels)
        for i, candidate_label in enumerate(candidate_labels):
            image = load_image(image)
-            images = self.feature_extractor(images=[image], return_tensors=self.framework)
+            images = self.image_processor(images=[image], return_tensors=self.framework)
            sequence = hypothesis_template.format(candidate_label)
            inputs = self.tokenizer(sequence, return_tensors=self.framework)
            inputs["pixel_values"] = images.pixel_values
--- a/src/transformers/pipelines/zero_shot_object_detection.py
+++ b/src/transformers/pipelines/zero_shot_object_detection.py
@@ -148,7 +148,7 @@ class ZeroShotObjectDetectionPipeline(ChunkPipeline):
        target_size = torch.tensor([[image.height, image.width]], dtype=torch.int32)
        for i, candidate_label in enumerate(candidate_labels):
            text_inputs = self.tokenizer(candidate_label, return_tensors=self.framework)
-            image_features = self.feature_extractor(image, return_tensors=self.framework)
+            image_features = self.image_processor(image, return_tensors=self.framework)
            yield {
                "is_last": i == len(candidate_labels) - 1,
                "target_size": target_size,
@@ -173,7 +173,7 @@ class ZeroShotObjectDetectionPipeline(ChunkPipeline):
        for model_output in model_outputs:
            label = model_output["candidate_label"]
            model_output = BaseModelOutput(model_output)
-            outputs = self.feature_extractor.post_process_object_detection(
+            outputs = self.image_processor.post_process_object_detection(
                outputs=model_output, threshold=threshold, target_sizes=model_output["target_size"]
            )[0]