Fix some pipeline tests (#21401)

* fix

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
This commit is contained in:
Yih-Dar
2023-02-02 19:03:31 +01:00
committed by GitHub
parent 145bf41c13
commit a6d8a149a8
20 changed files with 69 additions and 37 deletions

View File

@@ -387,8 +387,11 @@ for task, values in SUPPORTED_TASKS.items():
if values["type"] == "text":
NO_FEATURE_EXTRACTOR_TASKS.add(task)
NO_IMAGE_PROCESSOR_TASKS.add(task)
elif values["type"] in {"audio", "image", "video"}:
elif values["type"] in {"image", "video"}:
NO_TOKENIZER_TASKS.add(task)
elif values["type"] in {"audio"}:
NO_TOKENIZER_TASKS.add(task)
NO_IMAGE_PROCESSOR_TASKS.add(task)
elif values["type"] != "multimodal":
raise ValueError(f"SUPPORTED_TASK {task} contains invalid type {values['type']}")
@@ -773,6 +776,14 @@ def pipeline(
load_feature_extractor = type(model_config) in FEATURE_EXTRACTOR_MAPPING or feature_extractor is not None
load_image_processor = type(model_config) in IMAGE_PROCESSOR_MAPPING or image_processor is not None
# If `model` (instance of `PretrainedModel` instead of `str`) is passed (and/or same for config), while
# `image_processor` or `feature_extractor` is `None`, the loading will fail. This happens particularly for some
# vision tasks when calling `pipeline()` with `model` and only one of the `image_processor` and `feature_extractor`.
# TODO: we need to make `NO_IMAGE_PROCESSOR_TASKS` and `NO_FEATURE_EXTRACTOR_TASKS` more robust to avoid such issue.
# This block is only temporarily to make CI green.
if load_image_processor and load_feature_extractor:
load_feature_extractor = False
if (
tokenizer is None
and not load_tokenizer
@@ -784,6 +795,18 @@ def pipeline(
# so the model_config might not define a tokenizer, but it seems to be
# necessary for the task, so we're force-trying to load it.
load_tokenizer = True
if (
image_processor is None
and not load_image_processor
and normalized_task not in NO_IMAGE_PROCESSOR_TASKS
# Using class name to avoid importing the real class.
and model_config.__class__.__name__ in MULTI_MODEL_CONFIGS
and normalized_task != "automatic-speech-recognition"
):
# This is a special category of models, that are fusions of multiple models
# so the model_config might not define a tokenizer, but it seems to be
# necessary for the task, so we're force-trying to load it.
load_image_processor = True
if (
feature_extractor is None
and not load_feature_extractor

View File

@@ -77,7 +77,7 @@ def _pad(items, key, padding_value, padding_side):
# Others include `attention_mask` etc...
shape = items[0][key].shape
dim = len(shape)
if key == "pixel_values":
if key in ["pixel_values", "image"]:
# This is probable image so padding shouldn't be necessary
# B, C, H, W
return torch.cat([item[key] for item in items], dim=0)
@@ -792,6 +792,13 @@ class Pipeline(_ScikitCompat):
self._num_workers = kwargs.pop("num_workers", None)
self._preprocess_params, self._forward_params, self._postprocess_params = self._sanitize_parameters(**kwargs)
if self.image_processor is None and self.feature_extractor is not None:
if isinstance(self.feature_extractor, BaseImageProcessor):
# Backward compatible change, if users called
# ImageSegmentationPipeline(.., feature_extractor=MyFeatureExtractor())
# then we should keep working
self.image_processor = self.feature_extractor
def save_pretrained(self, save_directory: str):
"""
Save the pipeline's model and tokenizer.

View File

@@ -87,7 +87,7 @@ class DepthEstimationPipeline(Pipeline):
def preprocess(self, image):
image = load_image(image)
self.image_size = image.size
model_inputs = self.feature_extractor(images=image, return_tensors=self.framework)
model_inputs = self.image_processor(images=image, return_tensors=self.framework)
return model_inputs
def _forward(self, model_inputs):

View File

@@ -281,7 +281,9 @@ class DocumentQuestionAnsweringPipeline(ChunkPipeline):
image_features = {}
if input.get("image", None) is not None:
image = load_image(input["image"])
if self.feature_extractor is not None:
if self.image_processor is not None:
image_features.update(self.image_processor(images=image, return_tensors=self.framework))
elif self.feature_extractor is not None:
image_features.update(self.feature_extractor(images=image, return_tensors=self.framework))
elif self.model_type == ModelType.VisionEncoderDecoder:
raise ValueError("If you are using a VisionEncoderDecoderModel, you must provide a feature extractor")
@@ -352,7 +354,9 @@ class DocumentQuestionAnsweringPipeline(ChunkPipeline):
return_overflowing_tokens=True,
**tokenizer_kwargs,
)
encoding.pop("overflow_to_sample_mapping") # We do not use this
# TODO: check why slower `LayoutLMTokenizer` and `LayoutLMv2Tokenizer` don't have this key in outputs
# FIXME: ydshieh and/or Narsil
encoding.pop("overflow_to_sample_mapping", None) # We do not use this
num_spans = len(encoding["input_ids"])

View File

@@ -101,7 +101,7 @@ class ImageClassificationPipeline(Pipeline):
def preprocess(self, image):
image = load_image(image)
model_inputs = self.feature_extractor(images=image, return_tensors=self.framework)
model_inputs = self.image_processor(images=image, return_tensors=self.framework)
return model_inputs
def _forward(self, model_inputs):

View File

@@ -67,12 +67,6 @@ class ImageSegmentationPipeline(Pipeline):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
if self.image_processor is None and self.feature_extractor is not None:
# Backward compatible change, if users called
# ImageSegmentationPipeline(.., feature_extractor=MyFeatureExtractor())
# then we should keep working
self.image_processor = self.feature_extractor
if self.framework == "tf":
raise ValueError(f"The {self.__class__} is only available in PyTorch.")

View File

@@ -100,7 +100,7 @@ class ImageToTextPipeline(Pipeline):
def preprocess(self, image):
image = load_image(image)
model_inputs = self.feature_extractor(images=image, return_tensors=self.framework)
model_inputs = self.image_processor(images=image, return_tensors=self.framework)
return model_inputs
def _forward(self, model_inputs, generate_kwargs=None):

View File

@@ -97,7 +97,7 @@ class ObjectDetectionPipeline(Pipeline):
def preprocess(self, image):
image = load_image(image)
target_size = torch.IntTensor([[image.height, image.width]])
inputs = self.feature_extractor(images=[image], return_tensors="pt")
inputs = self.image_processor(images=[image], return_tensors="pt")
if self.tokenizer is not None:
inputs = self.tokenizer(text=inputs["words"], boxes=inputs["boxes"], return_tensors="pt")
inputs["target_size"] = target_size
@@ -137,9 +137,7 @@ class ObjectDetectionPipeline(Pipeline):
annotation = [dict(zip(keys, vals)) for vals in zip(scores.tolist(), labels, boxes) if vals[0] > threshold]
else:
# This is a regular ForObjectDetectionModel
raw_annotations = self.feature_extractor.post_process_object_detection(
model_outputs, threshold, target_size
)
raw_annotations = self.image_processor.post_process_object_detection(model_outputs, threshold, target_size)
raw_annotation = raw_annotations[0]
scores = raw_annotation["scores"]
labels = raw_annotation["labels"]

View File

@@ -102,7 +102,7 @@ class VideoClassificationPipeline(Pipeline):
video = videoreader.get_batch(indices).asnumpy()
video = list(video)
model_inputs = self.feature_extractor(video, return_tensors=self.framework)
model_inputs = self.image_processor(video, return_tensors=self.framework)
return model_inputs
def _forward(self, model_inputs):

View File

@@ -114,7 +114,7 @@ class VisualQuestionAnsweringPipeline(Pipeline):
model_inputs = self.tokenizer(
inputs["question"], return_tensors=self.framework, padding=padding, truncation=truncation
)
image_features = self.feature_extractor(images=image, return_tensors=self.framework)
image_features = self.image_processor(images=image, return_tensors=self.framework)
model_inputs.update(image_features)
return model_inputs

View File

@@ -110,7 +110,7 @@ class ZeroShotImageClassificationPipeline(ChunkPipeline):
n = len(candidate_labels)
for i, candidate_label in enumerate(candidate_labels):
image = load_image(image)
images = self.feature_extractor(images=[image], return_tensors=self.framework)
images = self.image_processor(images=[image], return_tensors=self.framework)
sequence = hypothesis_template.format(candidate_label)
inputs = self.tokenizer(sequence, return_tensors=self.framework)
inputs["pixel_values"] = images.pixel_values

View File

@@ -148,7 +148,7 @@ class ZeroShotObjectDetectionPipeline(ChunkPipeline):
target_size = torch.tensor([[image.height, image.width]], dtype=torch.int32)
for i, candidate_label in enumerate(candidate_labels):
text_inputs = self.tokenizer(candidate_label, return_tensors=self.framework)
image_features = self.feature_extractor(image, return_tensors=self.framework)
image_features = self.image_processor(image, return_tensors=self.framework)
yield {
"is_last": i == len(candidate_labels) - 1,
"target_size": target_size,
@@ -173,7 +173,7 @@ class ZeroShotObjectDetectionPipeline(ChunkPipeline):
for model_output in model_outputs:
label = model_output["candidate_label"]
model_output = BaseModelOutput(model_output)
outputs = self.feature_extractor.post_process_object_detection(
outputs = self.image_processor.post_process_object_detection(
outputs=model_output, threshold=threshold, target_sizes=model_output["target_size"]
)[0]