Fix some pipeline tests (#21401)
* fix Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
This commit is contained in:
@@ -387,8 +387,11 @@ for task, values in SUPPORTED_TASKS.items():
|
||||
if values["type"] == "text":
|
||||
NO_FEATURE_EXTRACTOR_TASKS.add(task)
|
||||
NO_IMAGE_PROCESSOR_TASKS.add(task)
|
||||
elif values["type"] in {"audio", "image", "video"}:
|
||||
elif values["type"] in {"image", "video"}:
|
||||
NO_TOKENIZER_TASKS.add(task)
|
||||
elif values["type"] in {"audio"}:
|
||||
NO_TOKENIZER_TASKS.add(task)
|
||||
NO_IMAGE_PROCESSOR_TASKS.add(task)
|
||||
elif values["type"] != "multimodal":
|
||||
raise ValueError(f"SUPPORTED_TASK {task} contains invalid type {values['type']}")
|
||||
|
||||
@@ -773,6 +776,14 @@ def pipeline(
|
||||
load_feature_extractor = type(model_config) in FEATURE_EXTRACTOR_MAPPING or feature_extractor is not None
|
||||
load_image_processor = type(model_config) in IMAGE_PROCESSOR_MAPPING or image_processor is not None
|
||||
|
||||
# If `model` (instance of `PretrainedModel` instead of `str`) is passed (and/or same for config), while
|
||||
# `image_processor` or `feature_extractor` is `None`, the loading will fail. This happens particularly for some
|
||||
# vision tasks when calling `pipeline()` with `model` and only one of the `image_processor` and `feature_extractor`.
|
||||
# TODO: we need to make `NO_IMAGE_PROCESSOR_TASKS` and `NO_FEATURE_EXTRACTOR_TASKS` more robust to avoid such issues.
|
||||
# This block is only a temporary workaround to make CI green.
|
||||
if load_image_processor and load_feature_extractor:
|
||||
load_feature_extractor = False
|
||||
|
||||
if (
|
||||
tokenizer is None
|
||||
and not load_tokenizer
|
||||
@@ -784,6 +795,18 @@ def pipeline(
|
||||
# so the model_config might not define a tokenizer, but it seems to be
|
||||
# necessary for the task, so we're force-trying to load it.
|
||||
load_tokenizer = True
|
||||
if (
|
||||
image_processor is None
|
||||
and not load_image_processor
|
||||
and normalized_task not in NO_IMAGE_PROCESSOR_TASKS
|
||||
# Using class name to avoid importing the real class.
|
||||
and model_config.__class__.__name__ in MULTI_MODEL_CONFIGS
|
||||
and normalized_task != "automatic-speech-recognition"
|
||||
):
|
||||
# This is a special category of models, that are fusions of multiple models
|
||||
# so the model_config might not define an image processor, but it seems to be
|
||||
# necessary for the task, so we're force-trying to load it.
|
||||
load_image_processor = True
|
||||
if (
|
||||
feature_extractor is None
|
||||
and not load_feature_extractor
|
||||
|
||||
@@ -77,7 +77,7 @@ def _pad(items, key, padding_value, padding_side):
|
||||
# Others include `attention_mask` etc...
|
||||
shape = items[0][key].shape
|
||||
dim = len(shape)
|
||||
if key == "pixel_values":
|
||||
if key in ["pixel_values", "image"]:
|
||||
# This is probably an image, so padding shouldn't be necessary
|
||||
# B, C, H, W
|
||||
return torch.cat([item[key] for item in items], dim=0)
|
||||
@@ -792,6 +792,13 @@ class Pipeline(_ScikitCompat):
|
||||
self._num_workers = kwargs.pop("num_workers", None)
|
||||
self._preprocess_params, self._forward_params, self._postprocess_params = self._sanitize_parameters(**kwargs)
|
||||
|
||||
if self.image_processor is None and self.feature_extractor is not None:
|
||||
if isinstance(self.feature_extractor, BaseImageProcessor):
|
||||
# Backward compatible change, if users called
|
||||
# ImageSegmentationPipeline(.., feature_extractor=MyFeatureExtractor())
|
||||
# then we should keep working
|
||||
self.image_processor = self.feature_extractor
|
||||
|
||||
def save_pretrained(self, save_directory: str):
|
||||
"""
|
||||
Save the pipeline's model and tokenizer.
|
||||
|
||||
@@ -87,7 +87,7 @@ class DepthEstimationPipeline(Pipeline):
|
||||
def preprocess(self, image):
|
||||
image = load_image(image)
|
||||
self.image_size = image.size
|
||||
model_inputs = self.feature_extractor(images=image, return_tensors=self.framework)
|
||||
model_inputs = self.image_processor(images=image, return_tensors=self.framework)
|
||||
return model_inputs
|
||||
|
||||
def _forward(self, model_inputs):
|
||||
|
||||
@@ -281,7 +281,9 @@ class DocumentQuestionAnsweringPipeline(ChunkPipeline):
|
||||
image_features = {}
|
||||
if input.get("image", None) is not None:
|
||||
image = load_image(input["image"])
|
||||
if self.feature_extractor is not None:
|
||||
if self.image_processor is not None:
|
||||
image_features.update(self.image_processor(images=image, return_tensors=self.framework))
|
||||
elif self.feature_extractor is not None:
|
||||
image_features.update(self.feature_extractor(images=image, return_tensors=self.framework))
|
||||
elif self.model_type == ModelType.VisionEncoderDecoder:
|
||||
raise ValueError("If you are using a VisionEncoderDecoderModel, you must provide a feature extractor")
|
||||
@@ -352,7 +354,9 @@ class DocumentQuestionAnsweringPipeline(ChunkPipeline):
|
||||
return_overflowing_tokens=True,
|
||||
**tokenizer_kwargs,
|
||||
)
|
||||
encoding.pop("overflow_to_sample_mapping") # We do not use this
|
||||
# TODO: check why the slow tokenizers `LayoutLMTokenizer` and `LayoutLMv2Tokenizer` don't have this key in their outputs
|
||||
# FIXME: ydshieh and/or Narsil
|
||||
encoding.pop("overflow_to_sample_mapping", None) # We do not use this
|
||||
|
||||
num_spans = len(encoding["input_ids"])
|
||||
|
||||
|
||||
@@ -101,7 +101,7 @@ class ImageClassificationPipeline(Pipeline):
|
||||
|
||||
def preprocess(self, image):
|
||||
image = load_image(image)
|
||||
model_inputs = self.feature_extractor(images=image, return_tensors=self.framework)
|
||||
model_inputs = self.image_processor(images=image, return_tensors=self.framework)
|
||||
return model_inputs
|
||||
|
||||
def _forward(self, model_inputs):
|
||||
|
||||
@@ -67,12 +67,6 @@ class ImageSegmentationPipeline(Pipeline):
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
if self.image_processor is None and self.feature_extractor is not None:
|
||||
# Backward compatible change, if users called
|
||||
# ImageSegmentationPipeline(.., feature_extractor=MyFeatureExtractor())
|
||||
# then we should keep working
|
||||
self.image_processor = self.feature_extractor
|
||||
|
||||
if self.framework == "tf":
|
||||
raise ValueError(f"The {self.__class__} is only available in PyTorch.")
|
||||
|
||||
|
||||
@@ -100,7 +100,7 @@ class ImageToTextPipeline(Pipeline):
|
||||
|
||||
def preprocess(self, image):
|
||||
image = load_image(image)
|
||||
model_inputs = self.feature_extractor(images=image, return_tensors=self.framework)
|
||||
model_inputs = self.image_processor(images=image, return_tensors=self.framework)
|
||||
return model_inputs
|
||||
|
||||
def _forward(self, model_inputs, generate_kwargs=None):
|
||||
|
||||
@@ -97,7 +97,7 @@ class ObjectDetectionPipeline(Pipeline):
|
||||
def preprocess(self, image):
|
||||
image = load_image(image)
|
||||
target_size = torch.IntTensor([[image.height, image.width]])
|
||||
inputs = self.feature_extractor(images=[image], return_tensors="pt")
|
||||
inputs = self.image_processor(images=[image], return_tensors="pt")
|
||||
if self.tokenizer is not None:
|
||||
inputs = self.tokenizer(text=inputs["words"], boxes=inputs["boxes"], return_tensors="pt")
|
||||
inputs["target_size"] = target_size
|
||||
@@ -137,9 +137,7 @@ class ObjectDetectionPipeline(Pipeline):
|
||||
annotation = [dict(zip(keys, vals)) for vals in zip(scores.tolist(), labels, boxes) if vals[0] > threshold]
|
||||
else:
|
||||
# This is a regular ForObjectDetectionModel
|
||||
raw_annotations = self.feature_extractor.post_process_object_detection(
|
||||
model_outputs, threshold, target_size
|
||||
)
|
||||
raw_annotations = self.image_processor.post_process_object_detection(model_outputs, threshold, target_size)
|
||||
raw_annotation = raw_annotations[0]
|
||||
scores = raw_annotation["scores"]
|
||||
labels = raw_annotation["labels"]
|
||||
|
||||
@@ -102,7 +102,7 @@ class VideoClassificationPipeline(Pipeline):
|
||||
video = videoreader.get_batch(indices).asnumpy()
|
||||
video = list(video)
|
||||
|
||||
model_inputs = self.feature_extractor(video, return_tensors=self.framework)
|
||||
model_inputs = self.image_processor(video, return_tensors=self.framework)
|
||||
return model_inputs
|
||||
|
||||
def _forward(self, model_inputs):
|
||||
|
||||
@@ -114,7 +114,7 @@ class VisualQuestionAnsweringPipeline(Pipeline):
|
||||
model_inputs = self.tokenizer(
|
||||
inputs["question"], return_tensors=self.framework, padding=padding, truncation=truncation
|
||||
)
|
||||
image_features = self.feature_extractor(images=image, return_tensors=self.framework)
|
||||
image_features = self.image_processor(images=image, return_tensors=self.framework)
|
||||
model_inputs.update(image_features)
|
||||
return model_inputs
|
||||
|
||||
|
||||
@@ -110,7 +110,7 @@ class ZeroShotImageClassificationPipeline(ChunkPipeline):
|
||||
n = len(candidate_labels)
|
||||
for i, candidate_label in enumerate(candidate_labels):
|
||||
image = load_image(image)
|
||||
images = self.feature_extractor(images=[image], return_tensors=self.framework)
|
||||
images = self.image_processor(images=[image], return_tensors=self.framework)
|
||||
sequence = hypothesis_template.format(candidate_label)
|
||||
inputs = self.tokenizer(sequence, return_tensors=self.framework)
|
||||
inputs["pixel_values"] = images.pixel_values
|
||||
|
||||
@@ -148,7 +148,7 @@ class ZeroShotObjectDetectionPipeline(ChunkPipeline):
|
||||
target_size = torch.tensor([[image.height, image.width]], dtype=torch.int32)
|
||||
for i, candidate_label in enumerate(candidate_labels):
|
||||
text_inputs = self.tokenizer(candidate_label, return_tensors=self.framework)
|
||||
image_features = self.feature_extractor(image, return_tensors=self.framework)
|
||||
image_features = self.image_processor(image, return_tensors=self.framework)
|
||||
yield {
|
||||
"is_last": i == len(candidate_labels) - 1,
|
||||
"target_size": target_size,
|
||||
@@ -173,7 +173,7 @@ class ZeroShotObjectDetectionPipeline(ChunkPipeline):
|
||||
for model_output in model_outputs:
|
||||
label = model_output["candidate_label"]
|
||||
model_output = BaseModelOutput(model_output)
|
||||
outputs = self.feature_extractor.post_process_object_detection(
|
||||
outputs = self.image_processor.post_process_object_detection(
|
||||
outputs=model_output, threshold=threshold, target_sizes=model_output["target_size"]
|
||||
)[0]
|
||||
|
||||
|
||||
Reference in New Issue
Block a user