Fast image processor (#28847)
* Draft fast image processors * Draft working fast version * py3.8 compatible cache * Enable loading fast image processors through auto * Tidy up; rescale behaviour based on input type * Enable tests for fast image processors * Smarter rescaling * Don't default to Fast * Safer imports * Add necessary Pillow requirement * Woops * Add AutoImageProcessor test * Fix up * Fix test for imagegpt * Fix test * Review comments * Add warning for TF and JAX input types * Rearrange * Return transforms * NumpyToTensor transformation * Rebase - include changes from upstream in ImageProcessingMixin * Safe typing * Fix up * convert mean/std to tesnor to rescale * Don't store transforms in state * Fix up * Update src/transformers/image_processing_utils_fast.py Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> * Update src/transformers/models/auto/image_processing_auto.py Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> * Update src/transformers/models/auto/image_processing_auto.py Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> * Update src/transformers/models/auto/image_processing_auto.py Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> * Warn if fast image processor available * Update src/transformers/models/vit/image_processing_vit_fast.py * Transpose incoming numpy images to be in CHW format * Update mapping names based on packages, auto set fast to None * Fix up * Fix * Add AutoImageProcessor.from_pretrained(checkpoint, use_fast=True) test * Update src/transformers/models/vit/image_processing_vit_fast.py Co-authored-by: Pavel Iakubovskii <qubvel@gmail.com> * Add equivalence and speed tests * Fix up --------- Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> Co-authored-by: Pavel Iakubovskii <qubvel@gmail.com>
This commit is contained in:
@@ -32,3 +32,8 @@ An image processor is in charge of preparing input features for vision models an
|
|||||||
## BaseImageProcessor
|
## BaseImageProcessor
|
||||||
|
|
||||||
[[autodoc]] image_processing_utils.BaseImageProcessor
|
[[autodoc]] image_processing_utils.BaseImageProcessor
|
||||||
|
|
||||||
|
|
||||||
|
## BaseImageProcessorFast
|
||||||
|
|
||||||
|
[[autodoc]] image_processing_utils_fast.BaseImageProcessorFast
|
||||||
|
|||||||
@@ -62,7 +62,7 @@ Following the original Vision Transformer, some follow-up works have been made:
|
|||||||
This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code (written in JAX) can be
|
This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code (written in JAX) can be
|
||||||
found [here](https://github.com/google-research/vision_transformer).
|
found [here](https://github.com/google-research/vision_transformer).
|
||||||
|
|
||||||
Note that we converted the weights from Ross Wightman's [timm library](https://github.com/rwightman/pytorch-image-models),
|
Note that we converted the weights from Ross Wightman's [timm library](https://github.com/rwightman/pytorch-image-models),
|
||||||
who already converted the weights from JAX to PyTorch. Credits go to him!
|
who already converted the weights from JAX to PyTorch. Credits go to him!
|
||||||
|
|
||||||
## Usage tips
|
## Usage tips
|
||||||
@@ -158,6 +158,11 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h
|
|||||||
[[autodoc]] ViTImageProcessor
|
[[autodoc]] ViTImageProcessor
|
||||||
- preprocess
|
- preprocess
|
||||||
|
|
||||||
|
## ViTImageProcessorFast
|
||||||
|
|
||||||
|
[[autodoc]] ViTImageProcessorFast
|
||||||
|
- preprocess
|
||||||
|
|
||||||
<frameworkcontent>
|
<frameworkcontent>
|
||||||
<pt>
|
<pt>
|
||||||
|
|
||||||
|
|||||||
@@ -29,3 +29,4 @@ timm
|
|||||||
albumentations >= 1.4.5
|
albumentations >= 1.4.5
|
||||||
torchmetrics
|
torchmetrics
|
||||||
pycocotools
|
pycocotools
|
||||||
|
Pillow>=10.0.1,<=15.0
|
||||||
|
|||||||
@@ -1104,7 +1104,8 @@ except OptionalDependencyNotAvailable:
|
|||||||
name for name in dir(dummy_vision_objects) if not name.startswith("_")
|
name for name in dir(dummy_vision_objects) if not name.startswith("_")
|
||||||
]
|
]
|
||||||
else:
|
else:
|
||||||
_import_structure["image_processing_utils"] = ["ImageProcessingMixin"]
|
_import_structure["image_processing_base"] = ["ImageProcessingMixin"]
|
||||||
|
_import_structure["image_processing_utils"] = ["BaseImageProcessor"]
|
||||||
_import_structure["image_utils"] = ["ImageFeatureExtractionMixin"]
|
_import_structure["image_utils"] = ["ImageFeatureExtractionMixin"]
|
||||||
_import_structure["models.beit"].extend(["BeitFeatureExtractor", "BeitImageProcessor"])
|
_import_structure["models.beit"].extend(["BeitFeatureExtractor", "BeitImageProcessor"])
|
||||||
_import_structure["models.bit"].extend(["BitImageProcessor"])
|
_import_structure["models.bit"].extend(["BitImageProcessor"])
|
||||||
@@ -1167,6 +1168,18 @@ else:
|
|||||||
_import_structure["models.vivit"].append("VivitImageProcessor")
|
_import_structure["models.vivit"].append("VivitImageProcessor")
|
||||||
_import_structure["models.yolos"].extend(["YolosFeatureExtractor", "YolosImageProcessor"])
|
_import_structure["models.yolos"].extend(["YolosFeatureExtractor", "YolosImageProcessor"])
|
||||||
|
|
||||||
|
try:
|
||||||
|
if not is_torchvision_available():
|
||||||
|
raise OptionalDependencyNotAvailable()
|
||||||
|
except OptionalDependencyNotAvailable:
|
||||||
|
from .utils import dummy_torchvision_objects
|
||||||
|
|
||||||
|
_import_structure["utils.dummy_torchvision_objects"] = [
|
||||||
|
name for name in dir(dummy_torchvision_objects) if not name.startswith("_")
|
||||||
|
]
|
||||||
|
else:
|
||||||
|
_import_structure["image_processing_utils_fast"] = ["BaseImageProcessorFast"]
|
||||||
|
_import_structure["models.vit"].append("ViTImageProcessorFast")
|
||||||
|
|
||||||
# PyTorch-backed objects
|
# PyTorch-backed objects
|
||||||
try:
|
try:
|
||||||
@@ -5703,7 +5716,8 @@ if TYPE_CHECKING:
|
|||||||
except OptionalDependencyNotAvailable:
|
except OptionalDependencyNotAvailable:
|
||||||
from .utils.dummy_vision_objects import *
|
from .utils.dummy_vision_objects import *
|
||||||
else:
|
else:
|
||||||
from .image_processing_utils import ImageProcessingMixin
|
from .image_processing_base import ImageProcessingMixin
|
||||||
|
from .image_processing_utils import BaseImageProcessor
|
||||||
from .image_utils import ImageFeatureExtractionMixin
|
from .image_utils import ImageFeatureExtractionMixin
|
||||||
from .models.beit import BeitFeatureExtractor, BeitImageProcessor
|
from .models.beit import BeitFeatureExtractor, BeitImageProcessor
|
||||||
from .models.bit import BitImageProcessor
|
from .models.bit import BitImageProcessor
|
||||||
@@ -5793,6 +5807,15 @@ if TYPE_CHECKING:
|
|||||||
from .models.vivit import VivitImageProcessor
|
from .models.vivit import VivitImageProcessor
|
||||||
from .models.yolos import YolosFeatureExtractor, YolosImageProcessor
|
from .models.yolos import YolosFeatureExtractor, YolosImageProcessor
|
||||||
|
|
||||||
|
try:
|
||||||
|
if not is_torchvision_available():
|
||||||
|
raise OptionalDependencyNotAvailable()
|
||||||
|
except OptionalDependencyNotAvailable:
|
||||||
|
from .utils.dummy_torchvision_objects import *
|
||||||
|
else:
|
||||||
|
from .image_processing_utils_fast import BaseImageProcessorFast
|
||||||
|
from .models.vit import ViTImageProcessorFast
|
||||||
|
|
||||||
# Modeling
|
# Modeling
|
||||||
try:
|
try:
|
||||||
if not is_torch_available():
|
if not is_torch_available():
|
||||||
|
|||||||
554
src/transformers/image_processing_base.py
Normal file
554
src/transformers/image_processing_base.py
Normal file
@@ -0,0 +1,554 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
# Copyright 2020 The HuggingFace Inc. team.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
|
||||||
|
import copy
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import warnings
|
||||||
|
from io import BytesIO
|
||||||
|
from typing import Any, Dict, List, Optional, Tuple, Union
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import requests
|
||||||
|
|
||||||
|
from .dynamic_module_utils import custom_object_save
|
||||||
|
from .feature_extraction_utils import BatchFeature as BaseBatchFeature
|
||||||
|
from .utils import (
|
||||||
|
IMAGE_PROCESSOR_NAME,
|
||||||
|
PushToHubMixin,
|
||||||
|
add_model_info_to_auto_map,
|
||||||
|
add_model_info_to_custom_pipelines,
|
||||||
|
cached_file,
|
||||||
|
copy_func,
|
||||||
|
download_url,
|
||||||
|
is_offline_mode,
|
||||||
|
is_remote_url,
|
||||||
|
is_vision_available,
|
||||||
|
logging,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
if is_vision_available():
|
||||||
|
from PIL import Image
|
||||||
|
|
||||||
|
|
||||||
|
logger = logging.get_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
# TODO: Move BatchFeature to be imported by both image_processing_utils and image_processing_utils_fast
|
||||||
|
# We override the class string here, but logic is the same.
|
||||||
|
class BatchFeature(BaseBatchFeature):
    r"""
    Container for the output of an image processor's `__call__` method.

    The class is derived from a python dictionary, so it can be used wherever a dictionary is expected.

    Args:
        data (`dict`):
            Dictionary of lists/arrays/tensors returned by the `__call__` method ('pixel_values', etc.).
        tensor_type (`Union[None, str, TensorType]`, *optional*):
            Pass a tensor type here to convert the lists of integers into PyTorch/TensorFlow/Numpy tensors at
            initialization.
    """
|
||||||
|
|
||||||
|
|
||||||
|
# TODO: (Amy) - factor out the common parts of this and the feature extractor
|
||||||
|
class ImageProcessingMixin(PushToHubMixin):
|
||||||
|
"""
|
||||||
|
This is an image processor mixin used to provide saving/loading functionality for sequential and image feature
|
||||||
|
extractors.
|
||||||
|
"""
|
||||||
|
|
||||||
|
_auto_class = None
|
||||||
|
|
||||||
|
def __init__(self, **kwargs):
|
||||||
|
"""Set elements of `kwargs` as attributes."""
|
||||||
|
# This key was saved while we still used `XXXFeatureExtractor` for image processing. Now we use
|
||||||
|
# `XXXImageProcessor`, this attribute and its value are misleading.
|
||||||
|
kwargs.pop("feature_extractor_type", None)
|
||||||
|
# Pop "processor_class" as it should be saved as private attribute
|
||||||
|
self._processor_class = kwargs.pop("processor_class", None)
|
||||||
|
# Additional attributes without default values
|
||||||
|
for key, value in kwargs.items():
|
||||||
|
try:
|
||||||
|
setattr(self, key, value)
|
||||||
|
except AttributeError as err:
|
||||||
|
logger.error(f"Can't set {key} with value {value} for {self}")
|
||||||
|
raise err
|
||||||
|
|
||||||
|
def _set_processor_class(self, processor_class: str):
|
||||||
|
"""Sets processor class as an attribute."""
|
||||||
|
self._processor_class = processor_class
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def from_pretrained(
|
||||||
|
cls,
|
||||||
|
pretrained_model_name_or_path: Union[str, os.PathLike],
|
||||||
|
cache_dir: Optional[Union[str, os.PathLike]] = None,
|
||||||
|
force_download: bool = False,
|
||||||
|
local_files_only: bool = False,
|
||||||
|
token: Optional[Union[str, bool]] = None,
|
||||||
|
revision: str = "main",
|
||||||
|
**kwargs,
|
||||||
|
):
|
||||||
|
r"""
|
||||||
|
Instantiate a type of [`~image_processing_utils.ImageProcessingMixin`] from an image processor.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
pretrained_model_name_or_path (`str` or `os.PathLike`):
|
||||||
|
This can be either:
|
||||||
|
|
||||||
|
- a string, the *model id* of a pretrained image_processor hosted inside a model repo on
|
||||||
|
huggingface.co.
|
||||||
|
- a path to a *directory* containing a image processor file saved using the
|
||||||
|
[`~image_processing_utils.ImageProcessingMixin.save_pretrained`] method, e.g.,
|
||||||
|
`./my_model_directory/`.
|
||||||
|
- a path or url to a saved image processor JSON *file*, e.g.,
|
||||||
|
`./my_model_directory/preprocessor_config.json`.
|
||||||
|
cache_dir (`str` or `os.PathLike`, *optional*):
|
||||||
|
Path to a directory in which a downloaded pretrained model image processor should be cached if the
|
||||||
|
standard cache should not be used.
|
||||||
|
force_download (`bool`, *optional*, defaults to `False`):
|
||||||
|
Whether or not to force to (re-)download the image processor files and override the cached versions if
|
||||||
|
they exist.
|
||||||
|
resume_download:
|
||||||
|
Deprecated and ignored. All downloads are now resumed by default when possible.
|
||||||
|
Will be removed in v5 of Transformers.
|
||||||
|
proxies (`Dict[str, str]`, *optional*):
|
||||||
|
A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
|
||||||
|
'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request.
|
||||||
|
token (`str` or `bool`, *optional*):
|
||||||
|
The token to use as HTTP bearer authorization for remote files. If `True`, or not specified, will use
|
||||||
|
the token generated when running `huggingface-cli login` (stored in `~/.huggingface`).
|
||||||
|
revision (`str`, *optional*, defaults to `"main"`):
|
||||||
|
The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
|
||||||
|
git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
|
||||||
|
identifier allowed by git.
|
||||||
|
|
||||||
|
|
||||||
|
<Tip>
|
||||||
|
|
||||||
|
To test a pull request you made on the Hub, you can pass `revision="refs/pr/<pr_number>".
|
||||||
|
|
||||||
|
</Tip>
|
||||||
|
|
||||||
|
return_unused_kwargs (`bool`, *optional*, defaults to `False`):
|
||||||
|
If `False`, then this function returns just the final image processor object. If `True`, then this
|
||||||
|
functions returns a `Tuple(image_processor, unused_kwargs)` where *unused_kwargs* is a dictionary
|
||||||
|
consisting of the key/value pairs whose keys are not image processor attributes: i.e., the part of
|
||||||
|
`kwargs` which has not been used to update `image_processor` and is otherwise ignored.
|
||||||
|
subfolder (`str`, *optional*, defaults to `""`):
|
||||||
|
In case the relevant files are located inside a subfolder of the model repo on huggingface.co, you can
|
||||||
|
specify the folder name here.
|
||||||
|
kwargs (`Dict[str, Any]`, *optional*):
|
||||||
|
The values in kwargs of any keys which are image processor attributes will be used to override the
|
||||||
|
loaded values. Behavior concerning key/value pairs whose keys are *not* image processor attributes is
|
||||||
|
controlled by the `return_unused_kwargs` keyword parameter.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
A image processor of type [`~image_processing_utils.ImageProcessingMixin`].
|
||||||
|
|
||||||
|
Examples:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# We can't instantiate directly the base class *ImageProcessingMixin* so let's show the examples on a
|
||||||
|
# derived class: *CLIPImageProcessor*
|
||||||
|
image_processor = CLIPImageProcessor.from_pretrained(
|
||||||
|
"openai/clip-vit-base-patch32"
|
||||||
|
) # Download image_processing_config from huggingface.co and cache.
|
||||||
|
image_processor = CLIPImageProcessor.from_pretrained(
|
||||||
|
"./test/saved_model/"
|
||||||
|
) # E.g. image processor (or model) was saved using *save_pretrained('./test/saved_model/')*
|
||||||
|
image_processor = CLIPImageProcessor.from_pretrained("./test/saved_model/preprocessor_config.json")
|
||||||
|
image_processor = CLIPImageProcessor.from_pretrained(
|
||||||
|
"openai/clip-vit-base-patch32", do_normalize=False, foo=False
|
||||||
|
)
|
||||||
|
assert image_processor.do_normalize is False
|
||||||
|
image_processor, unused_kwargs = CLIPImageProcessor.from_pretrained(
|
||||||
|
"openai/clip-vit-base-patch32", do_normalize=False, foo=False, return_unused_kwargs=True
|
||||||
|
)
|
||||||
|
assert image_processor.do_normalize is False
|
||||||
|
assert unused_kwargs == {"foo": False}
|
||||||
|
```"""
|
||||||
|
kwargs["cache_dir"] = cache_dir
|
||||||
|
kwargs["force_download"] = force_download
|
||||||
|
kwargs["local_files_only"] = local_files_only
|
||||||
|
kwargs["revision"] = revision
|
||||||
|
|
||||||
|
use_auth_token = kwargs.pop("use_auth_token", None)
|
||||||
|
if use_auth_token is not None:
|
||||||
|
warnings.warn(
|
||||||
|
"The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.",
|
||||||
|
FutureWarning,
|
||||||
|
)
|
||||||
|
if token is not None:
|
||||||
|
raise ValueError(
|
||||||
|
"`token` and `use_auth_token` are both specified. Please set only the argument `token`."
|
||||||
|
)
|
||||||
|
token = use_auth_token
|
||||||
|
|
||||||
|
if token is not None:
|
||||||
|
kwargs["token"] = token
|
||||||
|
|
||||||
|
image_processor_dict, kwargs = cls.get_image_processor_dict(pretrained_model_name_or_path, **kwargs)
|
||||||
|
|
||||||
|
return cls.from_dict(image_processor_dict, **kwargs)
|
||||||
|
|
||||||
|
def save_pretrained(self, save_directory: Union[str, os.PathLike], push_to_hub: bool = False, **kwargs):
|
||||||
|
"""
|
||||||
|
Save an image processor object to the directory `save_directory`, so that it can be re-loaded using the
|
||||||
|
[`~image_processing_utils.ImageProcessingMixin.from_pretrained`] class method.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
save_directory (`str` or `os.PathLike`):
|
||||||
|
Directory where the image processor JSON file will be saved (will be created if it does not exist).
|
||||||
|
push_to_hub (`bool`, *optional*, defaults to `False`):
|
||||||
|
Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the
|
||||||
|
repository you want to push to with `repo_id` (will default to the name of `save_directory` in your
|
||||||
|
namespace).
|
||||||
|
kwargs (`Dict[str, Any]`, *optional*):
|
||||||
|
Additional key word arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method.
|
||||||
|
"""
|
||||||
|
use_auth_token = kwargs.pop("use_auth_token", None)
|
||||||
|
|
||||||
|
if use_auth_token is not None:
|
||||||
|
warnings.warn(
|
||||||
|
"The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.",
|
||||||
|
FutureWarning,
|
||||||
|
)
|
||||||
|
if kwargs.get("token", None) is not None:
|
||||||
|
raise ValueError(
|
||||||
|
"`token` and `use_auth_token` are both specified. Please set only the argument `token`."
|
||||||
|
)
|
||||||
|
kwargs["token"] = use_auth_token
|
||||||
|
|
||||||
|
if os.path.isfile(save_directory):
|
||||||
|
raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file")
|
||||||
|
|
||||||
|
os.makedirs(save_directory, exist_ok=True)
|
||||||
|
|
||||||
|
if push_to_hub:
|
||||||
|
commit_message = kwargs.pop("commit_message", None)
|
||||||
|
repo_id = kwargs.pop("repo_id", save_directory.split(os.path.sep)[-1])
|
||||||
|
repo_id = self._create_repo(repo_id, **kwargs)
|
||||||
|
files_timestamps = self._get_files_timestamps(save_directory)
|
||||||
|
|
||||||
|
# If we have a custom config, we copy the file defining it in the folder and set the attributes so it can be
|
||||||
|
# loaded from the Hub.
|
||||||
|
if self._auto_class is not None:
|
||||||
|
custom_object_save(self, save_directory, config=self)
|
||||||
|
|
||||||
|
# If we save using the predefined names, we can load using `from_pretrained`
|
||||||
|
output_image_processor_file = os.path.join(save_directory, IMAGE_PROCESSOR_NAME)
|
||||||
|
|
||||||
|
self.to_json_file(output_image_processor_file)
|
||||||
|
logger.info(f"Image processor saved in {output_image_processor_file}")
|
||||||
|
|
||||||
|
if push_to_hub:
|
||||||
|
self._upload_modified_files(
|
||||||
|
save_directory,
|
||||||
|
repo_id,
|
||||||
|
files_timestamps,
|
||||||
|
commit_message=commit_message,
|
||||||
|
token=kwargs.get("token"),
|
||||||
|
)
|
||||||
|
|
||||||
|
return [output_image_processor_file]
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def get_image_processor_dict(
|
||||||
|
cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs
|
||||||
|
) -> Tuple[Dict[str, Any], Dict[str, Any]]:
|
||||||
|
"""
|
||||||
|
From a `pretrained_model_name_or_path`, resolve to a dictionary of parameters, to be used for instantiating a
|
||||||
|
image processor of type [`~image_processor_utils.ImageProcessingMixin`] using `from_dict`.
|
||||||
|
|
||||||
|
Parameters:
|
||||||
|
pretrained_model_name_or_path (`str` or `os.PathLike`):
|
||||||
|
The identifier of the pre-trained checkpoint from which we want the dictionary of parameters.
|
||||||
|
subfolder (`str`, *optional*, defaults to `""`):
|
||||||
|
In case the relevant files are located inside a subfolder of the model repo on huggingface.co, you can
|
||||||
|
specify the folder name here.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
`Tuple[Dict, Dict]`: The dictionary(ies) that will be used to instantiate the image processor object.
|
||||||
|
"""
|
||||||
|
cache_dir = kwargs.pop("cache_dir", None)
|
||||||
|
force_download = kwargs.pop("force_download", False)
|
||||||
|
resume_download = kwargs.pop("resume_download", None)
|
||||||
|
proxies = kwargs.pop("proxies", None)
|
||||||
|
token = kwargs.pop("token", None)
|
||||||
|
use_auth_token = kwargs.pop("use_auth_token", None)
|
||||||
|
local_files_only = kwargs.pop("local_files_only", False)
|
||||||
|
revision = kwargs.pop("revision", None)
|
||||||
|
subfolder = kwargs.pop("subfolder", "")
|
||||||
|
|
||||||
|
from_pipeline = kwargs.pop("_from_pipeline", None)
|
||||||
|
from_auto_class = kwargs.pop("_from_auto", False)
|
||||||
|
|
||||||
|
if use_auth_token is not None:
|
||||||
|
warnings.warn(
|
||||||
|
"The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.",
|
||||||
|
FutureWarning,
|
||||||
|
)
|
||||||
|
if token is not None:
|
||||||
|
raise ValueError(
|
||||||
|
"`token` and `use_auth_token` are both specified. Please set only the argument `token`."
|
||||||
|
)
|
||||||
|
token = use_auth_token
|
||||||
|
|
||||||
|
user_agent = {"file_type": "image processor", "from_auto_class": from_auto_class}
|
||||||
|
if from_pipeline is not None:
|
||||||
|
user_agent["using_pipeline"] = from_pipeline
|
||||||
|
|
||||||
|
if is_offline_mode() and not local_files_only:
|
||||||
|
logger.info("Offline mode: forcing local_files_only=True")
|
||||||
|
local_files_only = True
|
||||||
|
|
||||||
|
pretrained_model_name_or_path = str(pretrained_model_name_or_path)
|
||||||
|
is_local = os.path.isdir(pretrained_model_name_or_path)
|
||||||
|
if os.path.isdir(pretrained_model_name_or_path):
|
||||||
|
image_processor_file = os.path.join(pretrained_model_name_or_path, IMAGE_PROCESSOR_NAME)
|
||||||
|
if os.path.isfile(pretrained_model_name_or_path):
|
||||||
|
resolved_image_processor_file = pretrained_model_name_or_path
|
||||||
|
is_local = True
|
||||||
|
elif is_remote_url(pretrained_model_name_or_path):
|
||||||
|
image_processor_file = pretrained_model_name_or_path
|
||||||
|
resolved_image_processor_file = download_url(pretrained_model_name_or_path)
|
||||||
|
else:
|
||||||
|
image_processor_file = IMAGE_PROCESSOR_NAME
|
||||||
|
try:
|
||||||
|
# Load from local folder or from cache or download from model Hub and cache
|
||||||
|
resolved_image_processor_file = cached_file(
|
||||||
|
pretrained_model_name_or_path,
|
||||||
|
image_processor_file,
|
||||||
|
cache_dir=cache_dir,
|
||||||
|
force_download=force_download,
|
||||||
|
proxies=proxies,
|
||||||
|
resume_download=resume_download,
|
||||||
|
local_files_only=local_files_only,
|
||||||
|
token=token,
|
||||||
|
user_agent=user_agent,
|
||||||
|
revision=revision,
|
||||||
|
subfolder=subfolder,
|
||||||
|
)
|
||||||
|
except EnvironmentError:
|
||||||
|
# Raise any environment error raise by `cached_file`. It will have a helpful error message adapted to
|
||||||
|
# the original exception.
|
||||||
|
raise
|
||||||
|
except Exception:
|
||||||
|
# For any other exception, we throw a generic error.
|
||||||
|
raise EnvironmentError(
|
||||||
|
f"Can't load image processor for '{pretrained_model_name_or_path}'. If you were trying to load"
|
||||||
|
" it from 'https://huggingface.co/models', make sure you don't have a local directory with the"
|
||||||
|
f" same name. Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a"
|
||||||
|
f" directory containing a {IMAGE_PROCESSOR_NAME} file"
|
||||||
|
)
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Load image_processor dict
|
||||||
|
with open(resolved_image_processor_file, "r", encoding="utf-8") as reader:
|
||||||
|
text = reader.read()
|
||||||
|
image_processor_dict = json.loads(text)
|
||||||
|
|
||||||
|
except json.JSONDecodeError:
|
||||||
|
raise EnvironmentError(
|
||||||
|
f"It looks like the config file at '{resolved_image_processor_file}' is not a valid JSON file."
|
||||||
|
)
|
||||||
|
|
||||||
|
if is_local:
|
||||||
|
logger.info(f"loading configuration file {resolved_image_processor_file}")
|
||||||
|
else:
|
||||||
|
logger.info(
|
||||||
|
f"loading configuration file {image_processor_file} from cache at {resolved_image_processor_file}"
|
||||||
|
)
|
||||||
|
|
||||||
|
if not is_local:
|
||||||
|
if "auto_map" in image_processor_dict:
|
||||||
|
image_processor_dict["auto_map"] = add_model_info_to_auto_map(
|
||||||
|
image_processor_dict["auto_map"], pretrained_model_name_or_path
|
||||||
|
)
|
||||||
|
if "custom_pipelines" in image_processor_dict:
|
||||||
|
image_processor_dict["custom_pipelines"] = add_model_info_to_custom_pipelines(
|
||||||
|
image_processor_dict["custom_pipelines"], pretrained_model_name_or_path
|
||||||
|
)
|
||||||
|
return image_processor_dict, kwargs
|
||||||
|
|
||||||
|
@classmethod
def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs):
    """
    Create an instance of [`~image_processing_utils.ImageProcessingMixin`] from a Python dictionary of parameters.

    Args:
        image_processor_dict (`Dict[str, Any]`):
            Dictionary that will be used to instantiate the image processor object. Such a dictionary can be
            retrieved from a pretrained checkpoint by leveraging the
            [`~image_processing_utils.ImageProcessingMixin.to_dict`] method.
        kwargs (`Dict[str, Any]`):
            Additional parameters from which to initialize the image processor object.

    Returns:
        [`~image_processing_utils.ImageProcessingMixin`]: The image processor object instantiated from those
        parameters. When `return_unused_kwargs=True` is passed, a tuple of that object and the dictionary of
        keyword arguments that were not consumed.
    """
    # Work on a shallow copy so the caller's dictionary is left alone.
    image_processor_dict = image_processor_dict.copy()
    return_unused_kwargs = kwargs.pop("return_unused_kwargs", False)

    # `size` (and `crop_size`) is a dict and was previously an int or tuple in feature extractors. An override
    # passed as a kwarg is written into the dict up front, so that the image processor converts it to the
    # appropriate dict itself and it is not overwritten afterwards.
    for size_key in ("size", "crop_size"):
        if size_key in kwargs and size_key in image_processor_dict:
            image_processor_dict[size_key] = kwargs.pop(size_key)

    image_processor = cls(**image_processor_dict)

    # Remaining kwargs that name an existing attribute override it and are consumed.
    for attr_name in list(kwargs):
        if hasattr(image_processor, attr_name):
            setattr(image_processor, attr_name, kwargs.pop(attr_name))

    logger.info(f"Image processor {image_processor}")
    return (image_processor, kwargs) if return_unused_kwargs else image_processor
|
||||||
|
|
||||||
|
def to_dict(self) -> Dict[str, Any]:
|
||||||
|
"""
|
||||||
|
Serializes this instance to a Python dictionary.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
`Dict[str, Any]`: Dictionary of all the attributes that make up this image processor instance.
|
||||||
|
"""
|
||||||
|
output = copy.deepcopy(self.__dict__)
|
||||||
|
output["image_processor_type"] = self.__class__.__name__
|
||||||
|
|
||||||
|
return output
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def from_json_file(cls, json_file: Union[str, os.PathLike]):
|
||||||
|
"""
|
||||||
|
Instantiates a image processor of type [`~image_processing_utils.ImageProcessingMixin`] from the path to a JSON
|
||||||
|
file of parameters.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
json_file (`str` or `os.PathLike`):
|
||||||
|
Path to the JSON file containing the parameters.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
A image processor of type [`~image_processing_utils.ImageProcessingMixin`]: The image_processor object
|
||||||
|
instantiated from that JSON file.
|
||||||
|
"""
|
||||||
|
with open(json_file, "r", encoding="utf-8") as reader:
|
||||||
|
text = reader.read()
|
||||||
|
image_processor_dict = json.loads(text)
|
||||||
|
return cls(**image_processor_dict)
|
||||||
|
|
||||||
|
def to_json_string(self) -> str:
|
||||||
|
"""
|
||||||
|
Serializes this instance to a JSON string.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
`str`: String containing all the attributes that make up this feature_extractor instance in JSON format.
|
||||||
|
"""
|
||||||
|
dictionary = self.to_dict()
|
||||||
|
|
||||||
|
for key, value in dictionary.items():
|
||||||
|
if isinstance(value, np.ndarray):
|
||||||
|
dictionary[key] = value.tolist()
|
||||||
|
|
||||||
|
# make sure private name "_processor_class" is correctly
|
||||||
|
# saved as "processor_class"
|
||||||
|
_processor_class = dictionary.pop("_processor_class", None)
|
||||||
|
if _processor_class is not None:
|
||||||
|
dictionary["processor_class"] = _processor_class
|
||||||
|
|
||||||
|
return json.dumps(dictionary, indent=2, sort_keys=True) + "\n"
|
||||||
|
|
||||||
|
def to_json_file(self, json_file_path: Union[str, os.PathLike]):
|
||||||
|
"""
|
||||||
|
Save this instance to a JSON file.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
json_file_path (`str` or `os.PathLike`):
|
||||||
|
Path to the JSON file in which this image_processor instance's parameters will be saved.
|
||||||
|
"""
|
||||||
|
with open(json_file_path, "w", encoding="utf-8") as writer:
|
||||||
|
writer.write(self.to_json_string())
|
||||||
|
|
||||||
|
def __repr__(self):
|
||||||
|
return f"{self.__class__.__name__} {self.to_json_string()}"
|
||||||
|
|
||||||
|
@classmethod
def register_for_auto_class(cls, auto_class="AutoImageProcessor"):
    """
    Register this class with a given auto class. This should only be used for custom image processors as the ones
    in the library are already mapped with `AutoImageProcessor`.

    <Tip warning={true}>

    This API is experimental and may have some slight breaking changes in the next releases.

    </Tip>

    Args:
        auto_class (`str` or `type`, *optional*, defaults to `"AutoImageProcessor"`):
            The auto class to register this new image processor with.

    Raises:
        ValueError: If `transformers.models.auto` has no attribute with that name.
    """
    # Accept either the class itself or its name; only the name is stored.
    auto_class = auto_class if isinstance(auto_class, str) else auto_class.__name__

    import transformers.models.auto as auto_module

    if not hasattr(auto_module, auto_class):
        raise ValueError(f"{auto_class} is not a valid auto class.")

    cls._auto_class = auto_class
|
||||||
|
|
||||||
|
def fetch_images(self, image_url_or_urls: Union[str, List[str]], timeout=None):
    """
    Convert a single url or a list of urls into the corresponding `PIL.Image` objects.

    If a single url is passed, the return value will be a single object. If a list is passed, a list of objects
    is returned (nested lists are handled recursively and keep their structure).

    Args:
        image_url_or_urls (`str` or `List[str]`):
            The url, or list of urls, to download.
        timeout (`float` or `Tuple[float, float]`, *optional*):
            Timeout in seconds forwarded to `requests.get`. Defaults to `None`, which keeps the previous
            behaviour of waiting indefinitely; pass a value to avoid hanging on an unresponsive server.

    Returns:
        A `PIL.Image` for a single url, or a list of them for a list of urls.

    Raises:
        ValueError: If the input is neither a `str` nor a `list`.
        requests.HTTPError: If the server answers with an error status (via `raise_for_status`).
    """
    if isinstance(image_url_or_urls, list):
        # Only forward `timeout` when it was set, so subclasses overriding `fetch_images` with the previous
        # single-argument signature keep working.
        extra_kwargs = {} if timeout is None else {"timeout": timeout}
        return [self.fetch_images(url, **extra_kwargs) for url in image_url_or_urls]

    if not isinstance(image_url_or_urls, str):
        raise ValueError(f"only a single or a list of entries is supported but got type={type(image_url_or_urls)}")

    # Browser-like User-Agent sent with the request.
    # NOTE(review): presumably needed because some hosts reject the default `requests` agent - confirm.
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0"
            " Safari/537.36"
        )
    }
    response = requests.get(image_url_or_urls, stream=True, headers=headers, timeout=timeout)
    response.raise_for_status()
    return Image.open(BytesIO(response.content))
|
||||||
|
|
||||||
|
|
||||||
|
# Give `ImageProcessingMixin` its own copy of `push_to_hub`, then fill the placeholders of the copied docstring
# with image-processor wording.
ImageProcessingMixin.push_to_hub = copy_func(ImageProcessingMixin.push_to_hub)
# `__doc__` can be `None`, in which case there is nothing to format.
if ImageProcessingMixin.push_to_hub.__doc__ is not None:
    ImageProcessingMixin.push_to_hub.__doc__ = ImageProcessingMixin.push_to_hub.__doc__.format(
        object="image processor",
        object_class="AutoImageProcessor",
        object_files="image processor file",
    )
|
||||||
@@ -13,38 +13,16 @@
|
|||||||
# See the License for the specific language governing permissions and
|
# See the License for the specific language governing permissions and
|
||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
|
|
||||||
import copy
|
from typing import Dict, Iterable, Optional, Union
|
||||||
import json
|
|
||||||
import os
|
|
||||||
import warnings
|
|
||||||
from io import BytesIO
|
|
||||||
from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
|
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import requests
|
|
||||||
|
|
||||||
from .dynamic_module_utils import custom_object_save
|
from .image_processing_base import BatchFeature, ImageProcessingMixin
|
||||||
from .feature_extraction_utils import BatchFeature as BaseBatchFeature
|
|
||||||
from .image_transforms import center_crop, normalize, rescale
|
from .image_transforms import center_crop, normalize, rescale
|
||||||
from .image_utils import ChannelDimension
|
from .image_utils import ChannelDimension
|
||||||
from .utils import (
|
from .utils import logging
|
||||||
IMAGE_PROCESSOR_NAME,
|
|
||||||
PushToHubMixin,
|
|
||||||
add_model_info_to_auto_map,
|
|
||||||
add_model_info_to_custom_pipelines,
|
|
||||||
cached_file,
|
|
||||||
copy_func,
|
|
||||||
download_url,
|
|
||||||
is_offline_mode,
|
|
||||||
is_remote_url,
|
|
||||||
is_vision_available,
|
|
||||||
logging,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
if is_vision_available():
|
|
||||||
from PIL import Image
|
|
||||||
|
|
||||||
logger = logging.get_logger(__name__)
|
logger = logging.get_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
@@ -54,505 +32,6 @@ INIT_SERVICE_KWARGS = [
|
|||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
# TODO: Move BatchFeature to be imported by both image_processing_utils and feature_extraction_utils
|
|
||||||
# We override the class string here, but logic is the same.
|
|
||||||
class BatchFeature(BaseBatchFeature):
|
|
||||||
r"""
|
|
||||||
Holds the output of the image processor specific `__call__` methods.
|
|
||||||
|
|
||||||
This class is derived from a python dictionary and can be used as a dictionary.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
data (`dict`):
|
|
||||||
Dictionary of lists/arrays/tensors returned by the __call__ method ('pixel_values', etc.).
|
|
||||||
tensor_type (`Union[None, str, TensorType]`, *optional*):
|
|
||||||
You can give a tensor_type here to convert the lists of integers in PyTorch/TensorFlow/Numpy Tensors at
|
|
||||||
initialization.
|
|
||||||
"""
|
|
||||||
|
|
||||||
|
|
||||||
# TODO: (Amy) - factor out the common parts of this and the feature extractor
|
|
||||||
class ImageProcessingMixin(PushToHubMixin):
|
|
||||||
"""
|
|
||||||
This is an image processor mixin used to provide saving/loading functionality for sequential and image feature
|
|
||||||
extractors.
|
|
||||||
"""
|
|
||||||
|
|
||||||
_auto_class = None
|
|
||||||
|
|
||||||
def __init__(self, **kwargs):
|
|
||||||
"""Set elements of `kwargs` as attributes."""
|
|
||||||
# This key was saved while we still used `XXXFeatureExtractor` for image processing. Now we use
|
|
||||||
# `XXXImageProcessor`, this attribute and its value are misleading.
|
|
||||||
kwargs.pop("feature_extractor_type", None)
|
|
||||||
# Pop "processor_class" as it should be saved as private attribute
|
|
||||||
self._processor_class = kwargs.pop("processor_class", None)
|
|
||||||
# Additional attributes without default values
|
|
||||||
for key, value in kwargs.items():
|
|
||||||
try:
|
|
||||||
setattr(self, key, value)
|
|
||||||
except AttributeError as err:
|
|
||||||
logger.error(f"Can't set {key} with value {value} for {self}")
|
|
||||||
raise err
|
|
||||||
|
|
||||||
def _set_processor_class(self, processor_class: str):
|
|
||||||
"""Sets processor class as an attribute."""
|
|
||||||
self._processor_class = processor_class
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_pretrained(
|
|
||||||
cls,
|
|
||||||
pretrained_model_name_or_path: Union[str, os.PathLike],
|
|
||||||
cache_dir: Optional[Union[str, os.PathLike]] = None,
|
|
||||||
force_download: bool = False,
|
|
||||||
local_files_only: bool = False,
|
|
||||||
token: Optional[Union[str, bool]] = None,
|
|
||||||
revision: str = "main",
|
|
||||||
**kwargs,
|
|
||||||
):
|
|
||||||
r"""
|
|
||||||
Instantiate a type of [`~image_processing_utils.ImageProcessingMixin`] from an image processor.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
pretrained_model_name_or_path (`str` or `os.PathLike`):
|
|
||||||
This can be either:
|
|
||||||
|
|
||||||
- a string, the *model id* of a pretrained image_processor hosted inside a model repo on
|
|
||||||
huggingface.co.
|
|
||||||
- a path to a *directory* containing a image processor file saved using the
|
|
||||||
[`~image_processing_utils.ImageProcessingMixin.save_pretrained`] method, e.g.,
|
|
||||||
`./my_model_directory/`.
|
|
||||||
- a path or url to a saved image processor JSON *file*, e.g.,
|
|
||||||
`./my_model_directory/preprocessor_config.json`.
|
|
||||||
cache_dir (`str` or `os.PathLike`, *optional*):
|
|
||||||
Path to a directory in which a downloaded pretrained model image processor should be cached if the
|
|
||||||
standard cache should not be used.
|
|
||||||
force_download (`bool`, *optional*, defaults to `False`):
|
|
||||||
Whether or not to force to (re-)download the image processor files and override the cached versions if
|
|
||||||
they exist.
|
|
||||||
resume_download:
|
|
||||||
Deprecated and ignored. All downloads are now resumed by default when possible.
|
|
||||||
Will be removed in v5 of Transformers.
|
|
||||||
proxies (`Dict[str, str]`, *optional*):
|
|
||||||
A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
|
|
||||||
'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request.
|
|
||||||
token (`str` or `bool`, *optional*):
|
|
||||||
The token to use as HTTP bearer authorization for remote files. If `True`, or not specified, will use
|
|
||||||
the token generated when running `huggingface-cli login` (stored in `~/.huggingface`).
|
|
||||||
revision (`str`, *optional*, defaults to `"main"`):
|
|
||||||
The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
|
|
||||||
git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
|
|
||||||
identifier allowed by git.
|
|
||||||
|
|
||||||
|
|
||||||
<Tip>
|
|
||||||
|
|
||||||
To test a pull request you made on the Hub, you can pass `revision="refs/pr/<pr_number>".
|
|
||||||
|
|
||||||
</Tip>
|
|
||||||
|
|
||||||
return_unused_kwargs (`bool`, *optional*, defaults to `False`):
|
|
||||||
If `False`, then this function returns just the final image processor object. If `True`, then this
|
|
||||||
functions returns a `Tuple(image_processor, unused_kwargs)` where *unused_kwargs* is a dictionary
|
|
||||||
consisting of the key/value pairs whose keys are not image processor attributes: i.e., the part of
|
|
||||||
`kwargs` which has not been used to update `image_processor` and is otherwise ignored.
|
|
||||||
subfolder (`str`, *optional*, defaults to `""`):
|
|
||||||
In case the relevant files are located inside a subfolder of the model repo on huggingface.co, you can
|
|
||||||
specify the folder name here.
|
|
||||||
kwargs (`Dict[str, Any]`, *optional*):
|
|
||||||
The values in kwargs of any keys which are image processor attributes will be used to override the
|
|
||||||
loaded values. Behavior concerning key/value pairs whose keys are *not* image processor attributes is
|
|
||||||
controlled by the `return_unused_kwargs` keyword parameter.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
A image processor of type [`~image_processing_utils.ImageProcessingMixin`].
|
|
||||||
|
|
||||||
Examples:
|
|
||||||
|
|
||||||
```python
|
|
||||||
# We can't instantiate directly the base class *ImageProcessingMixin* so let's show the examples on a
|
|
||||||
# derived class: *CLIPImageProcessor*
|
|
||||||
image_processor = CLIPImageProcessor.from_pretrained(
|
|
||||||
"openai/clip-vit-base-patch32"
|
|
||||||
) # Download image_processing_config from huggingface.co and cache.
|
|
||||||
image_processor = CLIPImageProcessor.from_pretrained(
|
|
||||||
"./test/saved_model/"
|
|
||||||
) # E.g. image processor (or model) was saved using *save_pretrained('./test/saved_model/')*
|
|
||||||
image_processor = CLIPImageProcessor.from_pretrained("./test/saved_model/preprocessor_config.json")
|
|
||||||
image_processor = CLIPImageProcessor.from_pretrained(
|
|
||||||
"openai/clip-vit-base-patch32", do_normalize=False, foo=False
|
|
||||||
)
|
|
||||||
assert image_processor.do_normalize is False
|
|
||||||
image_processor, unused_kwargs = CLIPImageProcessor.from_pretrained(
|
|
||||||
"openai/clip-vit-base-patch32", do_normalize=False, foo=False, return_unused_kwargs=True
|
|
||||||
)
|
|
||||||
assert image_processor.do_normalize is False
|
|
||||||
assert unused_kwargs == {"foo": False}
|
|
||||||
```"""
|
|
||||||
kwargs["cache_dir"] = cache_dir
|
|
||||||
kwargs["force_download"] = force_download
|
|
||||||
kwargs["local_files_only"] = local_files_only
|
|
||||||
kwargs["revision"] = revision
|
|
||||||
|
|
||||||
use_auth_token = kwargs.pop("use_auth_token", None)
|
|
||||||
if use_auth_token is not None:
|
|
||||||
warnings.warn(
|
|
||||||
"The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.",
|
|
||||||
FutureWarning,
|
|
||||||
)
|
|
||||||
if token is not None:
|
|
||||||
raise ValueError(
|
|
||||||
"`token` and `use_auth_token` are both specified. Please set only the argument `token`."
|
|
||||||
)
|
|
||||||
token = use_auth_token
|
|
||||||
|
|
||||||
if token is not None:
|
|
||||||
kwargs["token"] = token
|
|
||||||
|
|
||||||
image_processor_dict, kwargs = cls.get_image_processor_dict(pretrained_model_name_or_path, **kwargs)
|
|
||||||
|
|
||||||
return cls.from_dict(image_processor_dict, **kwargs)
|
|
||||||
|
|
||||||
def save_pretrained(self, save_directory: Union[str, os.PathLike], push_to_hub: bool = False, **kwargs):
|
|
||||||
"""
|
|
||||||
Save an image processor object to the directory `save_directory`, so that it can be re-loaded using the
|
|
||||||
[`~image_processing_utils.ImageProcessingMixin.from_pretrained`] class method.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
save_directory (`str` or `os.PathLike`):
|
|
||||||
Directory where the image processor JSON file will be saved (will be created if it does not exist).
|
|
||||||
push_to_hub (`bool`, *optional*, defaults to `False`):
|
|
||||||
Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the
|
|
||||||
repository you want to push to with `repo_id` (will default to the name of `save_directory` in your
|
|
||||||
namespace).
|
|
||||||
kwargs (`Dict[str, Any]`, *optional*):
|
|
||||||
Additional key word arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method.
|
|
||||||
"""
|
|
||||||
use_auth_token = kwargs.pop("use_auth_token", None)
|
|
||||||
|
|
||||||
if use_auth_token is not None:
|
|
||||||
warnings.warn(
|
|
||||||
"The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.",
|
|
||||||
FutureWarning,
|
|
||||||
)
|
|
||||||
if kwargs.get("token", None) is not None:
|
|
||||||
raise ValueError(
|
|
||||||
"`token` and `use_auth_token` are both specified. Please set only the argument `token`."
|
|
||||||
)
|
|
||||||
kwargs["token"] = use_auth_token
|
|
||||||
|
|
||||||
if os.path.isfile(save_directory):
|
|
||||||
raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file")
|
|
||||||
|
|
||||||
os.makedirs(save_directory, exist_ok=True)
|
|
||||||
|
|
||||||
if push_to_hub:
|
|
||||||
commit_message = kwargs.pop("commit_message", None)
|
|
||||||
repo_id = kwargs.pop("repo_id", save_directory.split(os.path.sep)[-1])
|
|
||||||
repo_id = self._create_repo(repo_id, **kwargs)
|
|
||||||
files_timestamps = self._get_files_timestamps(save_directory)
|
|
||||||
|
|
||||||
# If we have a custom config, we copy the file defining it in the folder and set the attributes so it can be
|
|
||||||
# loaded from the Hub.
|
|
||||||
if self._auto_class is not None:
|
|
||||||
custom_object_save(self, save_directory, config=self)
|
|
||||||
|
|
||||||
# If we save using the predefined names, we can load using `from_pretrained`
|
|
||||||
output_image_processor_file = os.path.join(save_directory, IMAGE_PROCESSOR_NAME)
|
|
||||||
|
|
||||||
self.to_json_file(output_image_processor_file)
|
|
||||||
logger.info(f"Image processor saved in {output_image_processor_file}")
|
|
||||||
|
|
||||||
if push_to_hub:
|
|
||||||
self._upload_modified_files(
|
|
||||||
save_directory,
|
|
||||||
repo_id,
|
|
||||||
files_timestamps,
|
|
||||||
commit_message=commit_message,
|
|
||||||
token=kwargs.get("token"),
|
|
||||||
)
|
|
||||||
|
|
||||||
return [output_image_processor_file]
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def get_image_processor_dict(
|
|
||||||
cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs
|
|
||||||
) -> Tuple[Dict[str, Any], Dict[str, Any]]:
|
|
||||||
"""
|
|
||||||
From a `pretrained_model_name_or_path`, resolve to a dictionary of parameters, to be used for instantiating a
|
|
||||||
image processor of type [`~image_processor_utils.ImageProcessingMixin`] using `from_dict`.
|
|
||||||
|
|
||||||
Parameters:
|
|
||||||
pretrained_model_name_or_path (`str` or `os.PathLike`):
|
|
||||||
The identifier of the pre-trained checkpoint from which we want the dictionary of parameters.
|
|
||||||
subfolder (`str`, *optional*, defaults to `""`):
|
|
||||||
In case the relevant files are located inside a subfolder of the model repo on huggingface.co, you can
|
|
||||||
specify the folder name here.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
`Tuple[Dict, Dict]`: The dictionary(ies) that will be used to instantiate the image processor object.
|
|
||||||
"""
|
|
||||||
cache_dir = kwargs.pop("cache_dir", None)
|
|
||||||
force_download = kwargs.pop("force_download", False)
|
|
||||||
resume_download = kwargs.pop("resume_download", None)
|
|
||||||
proxies = kwargs.pop("proxies", None)
|
|
||||||
token = kwargs.pop("token", None)
|
|
||||||
use_auth_token = kwargs.pop("use_auth_token", None)
|
|
||||||
local_files_only = kwargs.pop("local_files_only", False)
|
|
||||||
revision = kwargs.pop("revision", None)
|
|
||||||
subfolder = kwargs.pop("subfolder", "")
|
|
||||||
|
|
||||||
from_pipeline = kwargs.pop("_from_pipeline", None)
|
|
||||||
from_auto_class = kwargs.pop("_from_auto", False)
|
|
||||||
|
|
||||||
if use_auth_token is not None:
|
|
||||||
warnings.warn(
|
|
||||||
"The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.",
|
|
||||||
FutureWarning,
|
|
||||||
)
|
|
||||||
if token is not None:
|
|
||||||
raise ValueError(
|
|
||||||
"`token` and `use_auth_token` are both specified. Please set only the argument `token`."
|
|
||||||
)
|
|
||||||
token = use_auth_token
|
|
||||||
|
|
||||||
user_agent = {"file_type": "image processor", "from_auto_class": from_auto_class}
|
|
||||||
if from_pipeline is not None:
|
|
||||||
user_agent["using_pipeline"] = from_pipeline
|
|
||||||
|
|
||||||
if is_offline_mode() and not local_files_only:
|
|
||||||
logger.info("Offline mode: forcing local_files_only=True")
|
|
||||||
local_files_only = True
|
|
||||||
|
|
||||||
pretrained_model_name_or_path = str(pretrained_model_name_or_path)
|
|
||||||
is_local = os.path.isdir(pretrained_model_name_or_path)
|
|
||||||
if os.path.isdir(pretrained_model_name_or_path):
|
|
||||||
image_processor_file = os.path.join(pretrained_model_name_or_path, IMAGE_PROCESSOR_NAME)
|
|
||||||
if os.path.isfile(pretrained_model_name_or_path):
|
|
||||||
resolved_image_processor_file = pretrained_model_name_or_path
|
|
||||||
is_local = True
|
|
||||||
elif is_remote_url(pretrained_model_name_or_path):
|
|
||||||
image_processor_file = pretrained_model_name_or_path
|
|
||||||
resolved_image_processor_file = download_url(pretrained_model_name_or_path)
|
|
||||||
else:
|
|
||||||
image_processor_file = IMAGE_PROCESSOR_NAME
|
|
||||||
try:
|
|
||||||
# Load from local folder or from cache or download from model Hub and cache
|
|
||||||
resolved_image_processor_file = cached_file(
|
|
||||||
pretrained_model_name_or_path,
|
|
||||||
image_processor_file,
|
|
||||||
cache_dir=cache_dir,
|
|
||||||
force_download=force_download,
|
|
||||||
proxies=proxies,
|
|
||||||
resume_download=resume_download,
|
|
||||||
local_files_only=local_files_only,
|
|
||||||
token=token,
|
|
||||||
user_agent=user_agent,
|
|
||||||
revision=revision,
|
|
||||||
subfolder=subfolder,
|
|
||||||
)
|
|
||||||
except EnvironmentError:
|
|
||||||
# Raise any environment error raise by `cached_file`. It will have a helpful error message adapted to
|
|
||||||
# the original exception.
|
|
||||||
raise
|
|
||||||
except Exception:
|
|
||||||
# For any other exception, we throw a generic error.
|
|
||||||
raise EnvironmentError(
|
|
||||||
f"Can't load image processor for '{pretrained_model_name_or_path}'. If you were trying to load"
|
|
||||||
" it from 'https://huggingface.co/models', make sure you don't have a local directory with the"
|
|
||||||
f" same name. Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a"
|
|
||||||
f" directory containing a {IMAGE_PROCESSOR_NAME} file"
|
|
||||||
)
|
|
||||||
|
|
||||||
try:
|
|
||||||
# Load image_processor dict
|
|
||||||
with open(resolved_image_processor_file, "r", encoding="utf-8") as reader:
|
|
||||||
text = reader.read()
|
|
||||||
image_processor_dict = json.loads(text)
|
|
||||||
|
|
||||||
except json.JSONDecodeError:
|
|
||||||
raise EnvironmentError(
|
|
||||||
f"It looks like the config file at '{resolved_image_processor_file}' is not a valid JSON file."
|
|
||||||
)
|
|
||||||
|
|
||||||
if is_local:
|
|
||||||
logger.info(f"loading configuration file {resolved_image_processor_file}")
|
|
||||||
else:
|
|
||||||
logger.info(
|
|
||||||
f"loading configuration file {image_processor_file} from cache at {resolved_image_processor_file}"
|
|
||||||
)
|
|
||||||
|
|
||||||
if not is_local:
|
|
||||||
if "auto_map" in image_processor_dict:
|
|
||||||
image_processor_dict["auto_map"] = add_model_info_to_auto_map(
|
|
||||||
image_processor_dict["auto_map"], pretrained_model_name_or_path
|
|
||||||
)
|
|
||||||
if "custom_pipelines" in image_processor_dict:
|
|
||||||
image_processor_dict["custom_pipelines"] = add_model_info_to_custom_pipelines(
|
|
||||||
image_processor_dict["custom_pipelines"], pretrained_model_name_or_path
|
|
||||||
)
|
|
||||||
return image_processor_dict, kwargs
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs):
|
|
||||||
"""
|
|
||||||
Instantiates a type of [`~image_processing_utils.ImageProcessingMixin`] from a Python dictionary of parameters.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
image_processor_dict (`Dict[str, Any]`):
|
|
||||||
Dictionary that will be used to instantiate the image processor object. Such a dictionary can be
|
|
||||||
retrieved from a pretrained checkpoint by leveraging the
|
|
||||||
[`~image_processing_utils.ImageProcessingMixin.to_dict`] method.
|
|
||||||
kwargs (`Dict[str, Any]`):
|
|
||||||
Additional parameters from which to initialize the image processor object.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
[`~image_processing_utils.ImageProcessingMixin`]: The image processor object instantiated from those
|
|
||||||
parameters.
|
|
||||||
"""
|
|
||||||
image_processor_dict = image_processor_dict.copy()
|
|
||||||
return_unused_kwargs = kwargs.pop("return_unused_kwargs", False)
|
|
||||||
|
|
||||||
# The `size` parameter is a dict and was previously an int or tuple in feature extractors.
|
|
||||||
# We set `size` here directly to the `image_processor_dict` so that it is converted to the appropriate
|
|
||||||
# dict within the image processor and isn't overwritten if `size` is passed in as a kwarg.
|
|
||||||
if "size" in kwargs and "size" in image_processor_dict:
|
|
||||||
image_processor_dict["size"] = kwargs.pop("size")
|
|
||||||
if "crop_size" in kwargs and "crop_size" in image_processor_dict:
|
|
||||||
image_processor_dict["crop_size"] = kwargs.pop("crop_size")
|
|
||||||
|
|
||||||
image_processor = cls(**image_processor_dict)
|
|
||||||
|
|
||||||
# Update image_processor with kwargs if needed
|
|
||||||
to_remove = []
|
|
||||||
for key, value in kwargs.items():
|
|
||||||
if hasattr(image_processor, key):
|
|
||||||
setattr(image_processor, key, value)
|
|
||||||
to_remove.append(key)
|
|
||||||
for key in to_remove:
|
|
||||||
kwargs.pop(key, None)
|
|
||||||
|
|
||||||
logger.info(f"Image processor {image_processor}")
|
|
||||||
if return_unused_kwargs:
|
|
||||||
return image_processor, kwargs
|
|
||||||
else:
|
|
||||||
return image_processor
|
|
||||||
|
|
||||||
def to_dict(self) -> Dict[str, Any]:
|
|
||||||
"""
|
|
||||||
Serializes this instance to a Python dictionary.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
`Dict[str, Any]`: Dictionary of all the attributes that make up this image processor instance.
|
|
||||||
"""
|
|
||||||
output = copy.deepcopy(self.__dict__)
|
|
||||||
output["image_processor_type"] = self.__class__.__name__
|
|
||||||
|
|
||||||
return output
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_json_file(cls, json_file: Union[str, os.PathLike]):
|
|
||||||
"""
|
|
||||||
Instantiates a image processor of type [`~image_processing_utils.ImageProcessingMixin`] from the path to a JSON
|
|
||||||
file of parameters.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
json_file (`str` or `os.PathLike`):
|
|
||||||
Path to the JSON file containing the parameters.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
A image processor of type [`~image_processing_utils.ImageProcessingMixin`]: The image_processor object
|
|
||||||
instantiated from that JSON file.
|
|
||||||
"""
|
|
||||||
with open(json_file, "r", encoding="utf-8") as reader:
|
|
||||||
text = reader.read()
|
|
||||||
image_processor_dict = json.loads(text)
|
|
||||||
return cls(**image_processor_dict)
|
|
||||||
|
|
||||||
def to_json_string(self) -> str:
|
|
||||||
"""
|
|
||||||
Serializes this instance to a JSON string.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
`str`: String containing all the attributes that make up this feature_extractor instance in JSON format.
|
|
||||||
"""
|
|
||||||
dictionary = self.to_dict()
|
|
||||||
|
|
||||||
for key, value in dictionary.items():
|
|
||||||
if isinstance(value, np.ndarray):
|
|
||||||
dictionary[key] = value.tolist()
|
|
||||||
|
|
||||||
# make sure private name "_processor_class" is correctly
|
|
||||||
# saved as "processor_class"
|
|
||||||
_processor_class = dictionary.pop("_processor_class", None)
|
|
||||||
if _processor_class is not None:
|
|
||||||
dictionary["processor_class"] = _processor_class
|
|
||||||
|
|
||||||
return json.dumps(dictionary, indent=2, sort_keys=True) + "\n"
|
|
||||||
|
|
||||||
def to_json_file(self, json_file_path: Union[str, os.PathLike]):
|
|
||||||
"""
|
|
||||||
Save this instance to a JSON file.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
json_file_path (`str` or `os.PathLike`):
|
|
||||||
Path to the JSON file in which this image_processor instance's parameters will be saved.
|
|
||||||
"""
|
|
||||||
with open(json_file_path, "w", encoding="utf-8") as writer:
|
|
||||||
writer.write(self.to_json_string())
|
|
||||||
|
|
||||||
def __repr__(self):
|
|
||||||
return f"{self.__class__.__name__} {self.to_json_string()}"
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def register_for_auto_class(cls, auto_class="AutoImageProcessor"):
|
|
||||||
"""
|
|
||||||
Register this class with a given auto class. This should only be used for custom image processors as the ones
|
|
||||||
in the library are already mapped with `AutoImageProcessor `.
|
|
||||||
|
|
||||||
<Tip warning={true}>
|
|
||||||
|
|
||||||
This API is experimental and may have some slight breaking changes in the next releases.
|
|
||||||
|
|
||||||
</Tip>
|
|
||||||
|
|
||||||
Args:
|
|
||||||
auto_class (`str` or `type`, *optional*, defaults to `"AutoImageProcessor "`):
|
|
||||||
The auto class to register this new image processor with.
|
|
||||||
"""
|
|
||||||
if not isinstance(auto_class, str):
|
|
||||||
auto_class = auto_class.__name__
|
|
||||||
|
|
||||||
import transformers.models.auto as auto_module
|
|
||||||
|
|
||||||
if not hasattr(auto_module, auto_class):
|
|
||||||
raise ValueError(f"{auto_class} is not a valid auto class.")
|
|
||||||
|
|
||||||
cls._auto_class = auto_class
|
|
||||||
|
|
||||||
def fetch_images(self, image_url_or_urls: Union[str, List[str]]):
|
|
||||||
"""
|
|
||||||
Convert a single or a list of urls into the corresponding `PIL.Image` objects.
|
|
||||||
|
|
||||||
If a single url is passed, the return value will be a single object. If a list is passed a list of objects is
|
|
||||||
returned.
|
|
||||||
"""
|
|
||||||
headers = {
|
|
||||||
"User-Agent": (
|
|
||||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0"
|
|
||||||
" Safari/537.36"
|
|
||||||
)
|
|
||||||
}
|
|
||||||
if isinstance(image_url_or_urls, list):
|
|
||||||
return [self.fetch_images(x) for x in image_url_or_urls]
|
|
||||||
elif isinstance(image_url_or_urls, str):
|
|
||||||
response = requests.get(image_url_or_urls, stream=True, headers=headers)
|
|
||||||
response.raise_for_status()
|
|
||||||
return Image.open(BytesIO(response.content))
|
|
||||||
else:
|
|
||||||
raise ValueError(f"only a single or a list of entries is supported but got type={type(image_url_or_urls)}")
|
|
||||||
|
|
||||||
|
|
||||||
class BaseImageProcessor(ImageProcessingMixin):
|
class BaseImageProcessor(ImageProcessingMixin):
|
||||||
def __init__(self, **kwargs):
|
def __init__(self, **kwargs):
|
||||||
super().__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
@@ -801,10 +280,3 @@ def select_best_resolution(original_size: tuple, possible_resolutions: list) ->
|
|||||||
best_fit = (height, width)
|
best_fit = (height, width)
|
||||||
|
|
||||||
return best_fit
|
return best_fit
|
||||||
|
|
||||||
|
|
||||||
ImageProcessingMixin.push_to_hub = copy_func(ImageProcessingMixin.push_to_hub)
|
|
||||||
if ImageProcessingMixin.push_to_hub.__doc__ is not None:
|
|
||||||
ImageProcessingMixin.push_to_hub.__doc__ = ImageProcessingMixin.push_to_hub.__doc__.format(
|
|
||||||
object="image processor", object_class="AutoImageProcessor", object_files="image processor file"
|
|
||||||
)
|
|
||||||
|
|||||||
63
src/transformers/image_processing_utils_fast.py
Normal file
63
src/transformers/image_processing_utils_fast.py
Normal file
@@ -0,0 +1,63 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
# Copyright 2024 The HuggingFace Inc. team.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
import functools
|
||||||
|
from dataclasses import dataclass
|
||||||
|
|
||||||
|
from .image_processing_utils import BaseImageProcessor
|
||||||
|
from .utils.import_utils import is_torchvision_available
|
||||||
|
|
||||||
|
|
||||||
|
if is_torchvision_available():
|
||||||
|
from torchvision.transforms import Compose
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class SizeDict:
    """
    Hashable dictionary to store image size information.

    The dataclass is frozen, so instances are immutable and hashable. Every field defaults to `None`, and fields
    can be read either as attributes (`size.height`) or with subscript syntax (`size["height"]`).
    """

    # All fields default to `None`; callers set only the ones relevant to them.
    height: int = None
    width: int = None
    longest_edge: int = None
    shortest_edge: int = None
    max_height: int = None
    max_width: int = None

    def __getitem__(self, key):
        """
        Return the value of the size field named `key`.

        Raises:
            KeyError: If `key` is not one of the declared size fields.
        """
        # Look the key up among the declared dataclass fields only. A plain `hasattr` check would also match
        # methods and dunder attributes (e.g. "__class__") and would raise `TypeError` for non-string keys.
        if isinstance(key, str) and key in self.__dataclass_fields__:
            return getattr(self, key)
        raise KeyError(f"Key {key} not found in SizeDict.")
|
||||||
|
|
||||||
|
|
||||||
|
class BaseImageProcessorFast(BaseImageProcessor):
    """
    Base class for image processors that build their preprocessing pipeline as a transform composition.

    Subclasses are expected to set `_transform_params` to the names of the keyword arguments accepted by
    `get_transforms` and to implement `_build_transforms`.
    """

    # Names of the keyword arguments accepted by `get_transforms`. `None` (the default) means none are accepted.
    _transform_params = None

    def _build_transforms(self, **kwargs) -> "Compose":
        """
        Given the input settings e.g. do_resize, build the image transforms.

        Must be overridden by subclasses; the base implementation raises `NotImplementedError`.
        """
        raise NotImplementedError

    def _validate_params(self, **kwargs) -> None:
        """
        Check that every keyword argument is a known transform parameter.

        Raises:
            ValueError: If a keyword is not listed in `_transform_params`, including when `_transform_params` is
                still `None`.
        """
        # Treat an unset `_transform_params` as "no parameter allowed" so the caller gets the intended
        # `ValueError` instead of a `TypeError` from evaluating `k not in None`.
        allowed_params = self._transform_params or ()
        for name, value in kwargs.items():
            if name not in allowed_params:
                raise ValueError(f"Invalid transform parameter {name}={value}.")

    # Only the most recent call is cached (`maxsize=1`). The cache key is built from `self` and the keyword
    # arguments, so every value passed here must be hashable.
    @functools.lru_cache(maxsize=1)
    def get_transforms(self, **kwargs) -> "Compose":
        """
        Validate `kwargs` and return the transforms built from them, reusing the previous result when called
        again with the same arguments on the same instance.
        """
        self._validate_params(**kwargs)
        return self._build_transforms(**kwargs)
|
||||||
@@ -31,6 +31,7 @@ from .utils.import_utils import (
|
|||||||
is_flax_available,
|
is_flax_available,
|
||||||
is_tf_available,
|
is_tf_available,
|
||||||
is_torch_available,
|
is_torch_available,
|
||||||
|
is_torchvision_available,
|
||||||
is_vision_available,
|
is_vision_available,
|
||||||
requires_backends,
|
requires_backends,
|
||||||
)
|
)
|
||||||
@@ -50,6 +51,9 @@ if is_tf_available():
|
|||||||
if is_flax_available():
|
if is_flax_available():
|
||||||
import jax.numpy as jnp
|
import jax.numpy as jnp
|
||||||
|
|
||||||
|
if is_torchvision_available():
|
||||||
|
from torchvision.transforms import functional as F
|
||||||
|
|
||||||
|
|
||||||
def to_channel_dimension_format(
|
def to_channel_dimension_format(
|
||||||
image: np.ndarray,
|
image: np.ndarray,
|
||||||
@@ -374,6 +378,7 @@ def normalize(
|
|||||||
|
|
||||||
if input_data_format is None:
|
if input_data_format is None:
|
||||||
input_data_format = infer_channel_dimension_format(image)
|
input_data_format = infer_channel_dimension_format(image)
|
||||||
|
|
||||||
channel_axis = get_channel_dimension_axis(image, input_data_format=input_data_format)
|
channel_axis = get_channel_dimension_axis(image, input_data_format=input_data_format)
|
||||||
num_channels = image.shape[channel_axis]
|
num_channels = image.shape[channel_axis]
|
||||||
|
|
||||||
@@ -802,3 +807,48 @@ def flip_channel_order(
|
|||||||
if data_format is not None:
|
if data_format is not None:
|
||||||
image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
|
image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
|
||||||
return image
|
return image
|
||||||
|
|
||||||
|
|
||||||
|
def _cast_tensor_to_float(x):
|
||||||
|
if x.is_floating_point():
|
||||||
|
return x
|
||||||
|
return x.float()
|
||||||
|
|
||||||
|
|
||||||
|
class FusedRescaleNormalize:
    """
    Rescale and normalize the input image in one step.

    The rescale factor is folded into the statistics when the transform is created: `mean` and `std` are both
    multiplied by `1 / rescale_factor`. Normalizing the unscaled image with these adjusted statistics is
    mathematically equivalent to rescaling first and normalizing afterwards.
    """

    def __init__(self, mean, std, rescale_factor: float = 1.0, inplace: bool = False):
        inverse_factor = 1.0 / rescale_factor
        self.mean = torch.tensor(mean) * inverse_factor
        self.std = torch.tensor(std) * inverse_factor
        self.inplace = inplace

    def __call__(self, image: "torch.Tensor"):
        # The image is cast to floating point before normalization; float inputs pass through untouched.
        float_image = _cast_tensor_to_float(image)
        return F.normalize(float_image, self.mean, self.std, inplace=self.inplace)
|
||||||
|
|
||||||
|
|
||||||
|
class Rescale:
    """
    Rescale the input image by rescale factor: image *= rescale_factor.
    """

    def __init__(self, rescale_factor: float = 1.0):
        self.rescale_factor = rescale_factor

    def __call__(self, image: "torch.Tensor"):
        # The product is a new object; the input is not multiplied in place.
        return image * self.rescale_factor
|
||||||
|
|
||||||
|
|
||||||
|
class NumpyToTensor:
    """
    Convert a numpy array to a PyTorch tensor.

    The array axes are reordered from (0, 1, 2) to (2, 0, 1) and the resulting tensor is made contiguous.
    """

    def __call__(self, image: np.ndarray):
        # Same as in PyTorch, we assume incoming numpy images are in HWC format
        # c.f. https://github.com/pytorch/vision/blob/61d97f41bc209e1407dcfbd685d2ee2da9c1cdad/torchvision/transforms/functional.py#L154
        channels_first = image.transpose(2, 0, 1)
        return torch.from_numpy(channels_first).contiguous()
|
||||||
|
|||||||
@@ -25,9 +25,11 @@ from packaging import version
|
|||||||
from .utils import (
|
from .utils import (
|
||||||
ExplicitEnum,
|
ExplicitEnum,
|
||||||
is_jax_tensor,
|
is_jax_tensor,
|
||||||
|
is_numpy_array,
|
||||||
is_tf_tensor,
|
is_tf_tensor,
|
||||||
is_torch_available,
|
is_torch_available,
|
||||||
is_torch_tensor,
|
is_torch_tensor,
|
||||||
|
is_torchvision_available,
|
||||||
is_vision_available,
|
is_vision_available,
|
||||||
logging,
|
logging,
|
||||||
requires_backends,
|
requires_backends,
|
||||||
@@ -52,6 +54,20 @@ if is_vision_available():
|
|||||||
else:
|
else:
|
||||||
PILImageResampling = PIL.Image
|
PILImageResampling = PIL.Image
|
||||||
|
|
||||||
|
if is_torchvision_available():
    from torchvision.transforms import InterpolationMode

    # Maps each PIL resampling filter to the torchvision interpolation mode of the same name.
    # Each filter is listed exactly once: a repeated key in a dict literal silently overwrites the earlier entry.
    pil_torch_interpolation_mapping = {
        PILImageResampling.NEAREST: InterpolationMode.NEAREST,
        PILImageResampling.BOX: InterpolationMode.BOX,
        PILImageResampling.BILINEAR: InterpolationMode.BILINEAR,
        PILImageResampling.HAMMING: InterpolationMode.HAMMING,
        PILImageResampling.BICUBIC: InterpolationMode.BICUBIC,
        PILImageResampling.LANCZOS: InterpolationMode.LANCZOS,
    }
|
||||||
|
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
if is_torch_available():
|
if is_torch_available():
|
||||||
import torch
|
import torch
|
||||||
@@ -90,14 +106,30 @@ def is_pil_image(img):
|
|||||||
return is_vision_available() and isinstance(img, PIL.Image.Image)
|
return is_vision_available() and isinstance(img, PIL.Image.Image)
|
||||||
|
|
||||||
|
|
||||||
|
class ImageType(ExplicitEnum):
    """
    The kinds of image object that `get_image_type` can report.
    """

    # PIL image
    PIL = "pillow"
    # PyTorch tensor
    TORCH = "torch"
    # NumPy array
    NUMPY = "numpy"
    # TensorFlow tensor
    TENSORFLOW = "tensorflow"
    # JAX tensor
    JAX = "jax"
|
||||||
|
|
||||||
|
|
||||||
|
def get_image_type(image):
    """
    Identify which supported kind of image object `image` is.

    The checks run in a fixed order (PIL, torch, numpy, TensorFlow, JAX) and the first one that matches
    decides the result.

    Returns:
        `ImageType`: The member corresponding to the first matching check.

    Raises:
        ValueError: If none of the checks recognises `image`.
    """
    ordered_checks = (
        (is_pil_image, ImageType.PIL),
        (is_torch_tensor, ImageType.TORCH),
        (is_numpy_array, ImageType.NUMPY),
        (is_tf_tensor, ImageType.TENSORFLOW),
        (is_jax_tensor, ImageType.JAX),
    )
    for is_match, image_type in ordered_checks:
        if is_match(image):
            return image_type
    raise ValueError(f"Unrecognised image type {type(image)}")
|
||||||
|
|
||||||
|
|
||||||
def is_valid_image(img):
|
def is_valid_image(img):
|
||||||
return (
|
return is_pil_image(img) or is_numpy_array(img) or is_torch_tensor(img) or is_tf_tensor(img) or is_jax_tensor(img)
|
||||||
(is_vision_available() and isinstance(img, PIL.Image.Image))
|
|
||||||
or isinstance(img, np.ndarray)
|
|
||||||
or is_torch_tensor(img)
|
|
||||||
or is_tf_tensor(img)
|
|
||||||
or is_jax_tensor(img)
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def valid_images(imgs):
|
def valid_images(imgs):
|
||||||
|
|||||||
@@ -19,13 +19,21 @@ import json
|
|||||||
import os
|
import os
|
||||||
import warnings
|
import warnings
|
||||||
from collections import OrderedDict
|
from collections import OrderedDict
|
||||||
from typing import Dict, Optional, Union
|
from typing import TYPE_CHECKING, Dict, Optional, Tuple, Union
|
||||||
|
|
||||||
# Build the list of all image processors
|
# Build the list of all image processors
|
||||||
from ...configuration_utils import PretrainedConfig
|
from ...configuration_utils import PretrainedConfig
|
||||||
from ...dynamic_module_utils import get_class_from_dynamic_module, resolve_trust_remote_code
|
from ...dynamic_module_utils import get_class_from_dynamic_module, resolve_trust_remote_code
|
||||||
from ...image_processing_utils import ImageProcessingMixin
|
from ...image_processing_utils import BaseImageProcessor, ImageProcessingMixin
|
||||||
from ...utils import CONFIG_NAME, IMAGE_PROCESSOR_NAME, get_file_from_repo, logging
|
from ...image_processing_utils_fast import BaseImageProcessorFast
|
||||||
|
from ...utils import (
|
||||||
|
CONFIG_NAME,
|
||||||
|
IMAGE_PROCESSOR_NAME,
|
||||||
|
get_file_from_repo,
|
||||||
|
is_torchvision_available,
|
||||||
|
is_vision_available,
|
||||||
|
logging,
|
||||||
|
)
|
||||||
from .auto_factory import _LazyAutoMapping
|
from .auto_factory import _LazyAutoMapping
|
||||||
from .configuration_auto import (
|
from .configuration_auto import (
|
||||||
CONFIG_MAPPING_NAMES,
|
CONFIG_MAPPING_NAMES,
|
||||||
@@ -37,104 +45,125 @@ from .configuration_auto import (
|
|||||||
|
|
||||||
logger = logging.get_logger(__name__)
|
logger = logging.get_logger(__name__)
|
||||||
|
|
||||||
IMAGE_PROCESSOR_MAPPING_NAMES = OrderedDict(
|
|
||||||
[
|
if TYPE_CHECKING:
|
||||||
("align", "EfficientNetImageProcessor"),
|
# This significantly improves completion suggestion performance when
|
||||||
("beit", "BeitImageProcessor"),
|
# the transformers package is used with Microsoft's Pylance language server.
|
||||||
("bit", "BitImageProcessor"),
|
IMAGE_PROCESSOR_MAPPING_NAMES: OrderedDict[str, Tuple[Optional[str], Optional[str]]] = OrderedDict()
|
||||||
("blip", "BlipImageProcessor"),
|
else:
|
||||||
("blip-2", "BlipImageProcessor"),
|
IMAGE_PROCESSOR_MAPPING_NAMES = OrderedDict(
|
||||||
("bridgetower", "BridgeTowerImageProcessor"),
|
[
|
||||||
("chinese_clip", "ChineseCLIPImageProcessor"),
|
("align", ("EfficientNetImageProcessor",)),
|
||||||
("clip", "CLIPImageProcessor"),
|
("beit", ("BeitImageProcessor",)),
|
||||||
("clipseg", "ViTImageProcessor"),
|
("bit", ("BitImageProcessor",)),
|
||||||
("conditional_detr", "ConditionalDetrImageProcessor"),
|
("blip", ("BlipImageProcessor",)),
|
||||||
("convnext", "ConvNextImageProcessor"),
|
("blip-2", ("BlipImageProcessor",)),
|
||||||
("convnextv2", "ConvNextImageProcessor"),
|
("bridgetower", ("BridgeTowerImageProcessor",)),
|
||||||
("cvt", "ConvNextImageProcessor"),
|
("chinese_clip", ("ChineseCLIPImageProcessor",)),
|
||||||
("data2vec-vision", "BeitImageProcessor"),
|
("clip", ("CLIPImageProcessor",)),
|
||||||
("deformable_detr", "DeformableDetrImageProcessor"),
|
("clipseg", ("ViTImageProcessor", "ViTImageProcessorFast")),
|
||||||
("deit", "DeiTImageProcessor"),
|
("conditional_detr", ("ConditionalDetrImageProcessor",)),
|
||||||
("depth_anything", "DPTImageProcessor"),
|
("convnext", ("ConvNextImageProcessor",)),
|
||||||
("deta", "DetaImageProcessor"),
|
("convnextv2", ("ConvNextImageProcessor",)),
|
||||||
("detr", "DetrImageProcessor"),
|
("cvt", ("ConvNextImageProcessor",)),
|
||||||
("dinat", "ViTImageProcessor"),
|
("data2vec-vision", ("BeitImageProcessor",)),
|
||||||
("dinov2", "BitImageProcessor"),
|
("deformable_detr", ("DeformableDetrImageProcessor",)),
|
||||||
("donut-swin", "DonutImageProcessor"),
|
("deit", ("DeiTImageProcessor",)),
|
||||||
("dpt", "DPTImageProcessor"),
|
("depth_anything", ("DPTImageProcessor",)),
|
||||||
("efficientformer", "EfficientFormerImageProcessor"),
|
("deta", ("DetaImageProcessor",)),
|
||||||
("efficientnet", "EfficientNetImageProcessor"),
|
("detr", ("DetrImageProcessor",)),
|
||||||
("flava", "FlavaImageProcessor"),
|
("dinat", ("ViTImageProcessor", "ViTImageProcessorFast")),
|
||||||
("focalnet", "BitImageProcessor"),
|
("dinov2", ("BitImageProcessor",)),
|
||||||
("fuyu", "FuyuImageProcessor"),
|
("donut-swin", ("DonutImageProcessor",)),
|
||||||
("git", "CLIPImageProcessor"),
|
("dpt", ("DPTImageProcessor",)),
|
||||||
("glpn", "GLPNImageProcessor"),
|
("efficientformer", ("EfficientFormerImageProcessor",)),
|
||||||
("grounding-dino", "GroundingDinoImageProcessor"),
|
("efficientnet", ("EfficientNetImageProcessor",)),
|
||||||
("groupvit", "CLIPImageProcessor"),
|
("flava", ("FlavaImageProcessor",)),
|
||||||
("idefics", "IdeficsImageProcessor"),
|
("focalnet", ("BitImageProcessor",)),
|
||||||
("idefics2", "Idefics2ImageProcessor"),
|
("fuyu", ("FuyuImageProcessor",)),
|
||||||
("imagegpt", "ImageGPTImageProcessor"),
|
("git", ("CLIPImageProcessor",)),
|
||||||
("instructblip", "BlipImageProcessor"),
|
("glpn", ("GLPNImageProcessor",)),
|
||||||
("kosmos-2", "CLIPImageProcessor"),
|
("grounding-dino", ("GroundingDinoImageProcessor",)),
|
||||||
("layoutlmv2", "LayoutLMv2ImageProcessor"),
|
("groupvit", ("CLIPImageProcessor",)),
|
||||||
("layoutlmv3", "LayoutLMv3ImageProcessor"),
|
("idefics", ("IdeficsImageProcessor",)),
|
||||||
("levit", "LevitImageProcessor"),
|
("idefics2", ("Idefics2ImageProcessor",)),
|
||||||
("llava", "CLIPImageProcessor"),
|
("imagegpt", ("ImageGPTImageProcessor",)),
|
||||||
("llava_next", "LlavaNextImageProcessor"),
|
("instructblip", ("BlipImageProcessor",)),
|
||||||
("mask2former", "Mask2FormerImageProcessor"),
|
("kosmos-2", ("CLIPImageProcessor",)),
|
||||||
("maskformer", "MaskFormerImageProcessor"),
|
("layoutlmv2", ("LayoutLMv2ImageProcessor",)),
|
||||||
("mgp-str", "ViTImageProcessor"),
|
("layoutlmv3", ("LayoutLMv3ImageProcessor",)),
|
||||||
("mobilenet_v1", "MobileNetV1ImageProcessor"),
|
("levit", ("LevitImageProcessor",)),
|
||||||
("mobilenet_v2", "MobileNetV2ImageProcessor"),
|
("llava", ("CLIPImageProcessor",)),
|
||||||
("mobilevit", "MobileViTImageProcessor"),
|
("llava_next", ("LlavaNextImageProcessor",)),
|
||||||
("mobilevit", "MobileViTImageProcessor"),
|
("mask2former", ("Mask2FormerImageProcessor",)),
|
||||||
("mobilevitv2", "MobileViTImageProcessor"),
|
("maskformer", ("MaskFormerImageProcessor",)),
|
||||||
("nat", "ViTImageProcessor"),
|
("mgp-str", ("ViTImageProcessor", "ViTImageProcessorFast")),
|
||||||
("nougat", "NougatImageProcessor"),
|
("mobilenet_v1", ("MobileNetV1ImageProcessor",)),
|
||||||
("oneformer", "OneFormerImageProcessor"),
|
("mobilenet_v2", ("MobileNetV2ImageProcessor",)),
|
||||||
("owlv2", "Owlv2ImageProcessor"),
|
("mobilevit", ("MobileViTImageProcessor",)),
|
||||||
("owlvit", "OwlViTImageProcessor"),
|
("mobilevit", ("MobileViTImageProcessor",)),
|
||||||
("paligemma", "CLIPImageProcessor"),
|
("mobilevitv2", ("MobileViTImageProcessor",)),
|
||||||
("perceiver", "PerceiverImageProcessor"),
|
("nat", ("ViTImageProcessor", "ViTImageProcessorFast")),
|
||||||
("pix2struct", "Pix2StructImageProcessor"),
|
("nougat", ("NougatImageProcessor",)),
|
||||||
("poolformer", "PoolFormerImageProcessor"),
|
("oneformer", ("OneFormerImageProcessor",)),
|
||||||
("pvt", "PvtImageProcessor"),
|
("owlv2", ("Owlv2ImageProcessor",)),
|
||||||
("pvt_v2", "PvtImageProcessor"),
|
("owlvit", ("OwlViTImageProcessor",)),
|
||||||
("regnet", "ConvNextImageProcessor"),
|
("perceiver", ("PerceiverImageProcessor",)),
|
||||||
("resnet", "ConvNextImageProcessor"),
|
("pix2struct", ("Pix2StructImageProcessor",)),
|
||||||
("sam", "SamImageProcessor"),
|
("poolformer", ("PoolFormerImageProcessor",)),
|
||||||
("segformer", "SegformerImageProcessor"),
|
("pvt", ("PvtImageProcessor",)),
|
||||||
("seggpt", "SegGptImageProcessor"),
|
("pvt_v2", ("PvtImageProcessor",)),
|
||||||
("siglip", "SiglipImageProcessor"),
|
("regnet", ("ConvNextImageProcessor",)),
|
||||||
("swiftformer", "ViTImageProcessor"),
|
("resnet", ("ConvNextImageProcessor",)),
|
||||||
("swin", "ViTImageProcessor"),
|
("sam", ("SamImageProcessor",)),
|
||||||
("swin2sr", "Swin2SRImageProcessor"),
|
("segformer", ("SegformerImageProcessor",)),
|
||||||
("swinv2", "ViTImageProcessor"),
|
("seggpt", ("SegGptImageProcessor",)),
|
||||||
("table-transformer", "DetrImageProcessor"),
|
("siglip", ("SiglipImageProcessor",)),
|
||||||
("timesformer", "VideoMAEImageProcessor"),
|
("swiftformer", ("ViTImageProcessor", "ViTImageProcessorFast")),
|
||||||
("tvlt", "TvltImageProcessor"),
|
("swin", ("ViTImageProcessor", "ViTImageProcessorFast")),
|
||||||
("tvp", "TvpImageProcessor"),
|
("swin2sr", ("Swin2SRImageProcessor",)),
|
||||||
("udop", "LayoutLMv3ImageProcessor"),
|
("swinv2", ("ViTImageProcessor", "ViTImageProcessorFast")),
|
||||||
("upernet", "SegformerImageProcessor"),
|
("table-transformer", ("DetrImageProcessor",)),
|
||||||
("van", "ConvNextImageProcessor"),
|
("timesformer", ("VideoMAEImageProcessor",)),
|
||||||
("video_llava", "VideoLlavaImageProcessor"),
|
("tvlt", ("TvltImageProcessor",)),
|
||||||
("videomae", "VideoMAEImageProcessor"),
|
("tvp", ("TvpImageProcessor",)),
|
||||||
("vilt", "ViltImageProcessor"),
|
("udop", ("LayoutLMv3ImageProcessor",)),
|
||||||
("vipllava", "CLIPImageProcessor"),
|
("upernet", ("SegformerImageProcessor",)),
|
||||||
("vit", "ViTImageProcessor"),
|
("van", ("ConvNextImageProcessor",)),
|
||||||
("vit_hybrid", "ViTHybridImageProcessor"),
|
("videomae", ("VideoMAEImageProcessor",)),
|
||||||
("vit_mae", "ViTImageProcessor"),
|
("vilt", ("ViltImageProcessor",)),
|
||||||
("vit_msn", "ViTImageProcessor"),
|
("vipllava", ("CLIPImageProcessor",)),
|
||||||
("vitmatte", "VitMatteImageProcessor"),
|
("vit", ("ViTImageProcessor", "ViTImageProcessorFast")),
|
||||||
("xclip", "CLIPImageProcessor"),
|
("vit_hybrid", ("ViTHybridImageProcessor",)),
|
||||||
("yolos", "YolosImageProcessor"),
|
("vit_mae", ("ViTImageProcessor", "ViTImageProcessorFast")),
|
||||||
]
|
("vit_msn", ("ViTImageProcessor", "ViTImageProcessorFast")),
|
||||||
)
|
("vitmatte", ("VitMatteImageProcessor",)),
|
||||||
|
("xclip", ("CLIPImageProcessor",)),
|
||||||
|
("yolos", ("YolosImageProcessor",)),
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
|
# Normalise every entry to a `(slow_class_name, fast_class_name)` pair, storing `None` for a processor that is
# not declared or whose backend is not installed.
for model_type, image_processors in IMAGE_PROCESSOR_MAPPING_NAMES.items():
    slow_image_processor_class, *fast_image_processor_class = image_processors

    # The slow processor is dropped when the vision backend is unavailable.
    if not is_vision_available():
        slow_image_processor_class = None

    # Keep the first declared fast class name, if any; it is dropped when torchvision is unavailable.
    fast_image_processor_class = fast_image_processor_class[0] if fast_image_processor_class else None
    if fast_image_processor_class is not None and not is_torchvision_available():
        fast_image_processor_class = None

    IMAGE_PROCESSOR_MAPPING_NAMES[model_type] = (slow_image_processor_class, fast_image_processor_class)
|
||||||
|
|
||||||
|
|
||||||
IMAGE_PROCESSOR_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, IMAGE_PROCESSOR_MAPPING_NAMES)
|
IMAGE_PROCESSOR_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, IMAGE_PROCESSOR_MAPPING_NAMES)
|
||||||
|
|
||||||
|
|
||||||
def image_processor_class_from_name(class_name: str):
|
def image_processor_class_from_name(class_name: str):
|
||||||
|
if class_name == "BaseImageProcessorFast":
|
||||||
|
return BaseImageProcessorFast
|
||||||
|
|
||||||
for module_name, extractors in IMAGE_PROCESSOR_MAPPING_NAMES.items():
|
for module_name, extractors in IMAGE_PROCESSOR_MAPPING_NAMES.items():
|
||||||
if class_name in extractors:
|
if class_name in extractors:
|
||||||
module_name = model_type_to_module_name(module_name)
|
module_name = model_type_to_module_name(module_name)
|
||||||
@@ -145,11 +174,12 @@ def image_processor_class_from_name(class_name: str):
|
|||||||
except AttributeError:
|
except AttributeError:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
for _, extractor in IMAGE_PROCESSOR_MAPPING._extra_content.items():
|
for _, extractors in IMAGE_PROCESSOR_MAPPING._extra_content.items():
|
||||||
if getattr(extractor, "__name__", None) == class_name:
|
for extractor in extractors:
|
||||||
return extractor
|
if getattr(extractor, "__name__", None) == class_name:
|
||||||
|
return extractor
|
||||||
|
|
||||||
# We did not fine the class, but maybe it's because a dep is missing. In that case, the class will be in the main
|
# We did not find the class, but maybe it's because a dep is missing. In that case, the class will be in the main
|
||||||
# init and we return the proper dummy to get an appropriate error message.
|
# init and we return the proper dummy to get an appropriate error message.
|
||||||
main_module = importlib.import_module("transformers")
|
main_module = importlib.import_module("transformers")
|
||||||
if hasattr(main_module, class_name):
|
if hasattr(main_module, class_name):
|
||||||
@@ -258,6 +288,13 @@ def get_image_processor_config(
|
|||||||
return json.load(reader)
|
return json.load(reader)
|
||||||
|
|
||||||
|
|
||||||
|
def _warning_fast_image_processor_available(fast_class):
    """
    Log a warning that `fast_class` is available for the model while the slow image processor is being used.
    """
    message = (
        f"Fast image processor class {fast_class} is available for this model. "
        "Using slow image processor class. To use the fast image processor class set `use_fast=True`."
    )
    logger.warning(message)
|
||||||
|
|
||||||
|
|
||||||
class AutoImageProcessor:
|
class AutoImageProcessor:
|
||||||
r"""
|
r"""
|
||||||
This is a generic image processor class that will be instantiated as one of the image processor classes of the
|
This is a generic image processor class that will be instantiated as one of the image processor classes of the
|
||||||
@@ -274,7 +311,7 @@ class AutoImageProcessor:
|
|||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
@replace_list_option_in_docstrings(IMAGE_PROCESSOR_MAPPING_NAMES)
|
@replace_list_option_in_docstrings(IMAGE_PROCESSOR_MAPPING_NAMES)
|
||||||
def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
|
def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
|
||||||
r"""
|
r"""
|
||||||
Instantiate one of the image processor classes of the library from a pretrained model vocabulary.
|
Instantiate one of the image processor classes of the library from a pretrained model vocabulary.
|
||||||
|
|
||||||
@@ -314,6 +351,10 @@ class AutoImageProcessor:
|
|||||||
The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
|
The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
|
||||||
git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
|
git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
|
||||||
identifier allowed by git.
|
identifier allowed by git.
|
||||||
|
use_fast (`bool`, *optional*, defaults to `False`):
|
||||||
|
Use a fast torchvision-based image processor if it is supported for a given model.
|
||||||
|
If a fast image processor is not available for a given model, a normal numpy-based image processor
|
||||||
|
is returned instead.
|
||||||
return_unused_kwargs (`bool`, *optional*, defaults to `False`):
|
return_unused_kwargs (`bool`, *optional*, defaults to `False`):
|
||||||
If `False`, then this function returns just the final image processor object. If `True`, then this
|
If `False`, then this function returns just the final image processor object. If `True`, then this
|
||||||
functions returns a `Tuple(image_processor, unused_kwargs)` where *unused_kwargs* is a dictionary
|
functions returns a `Tuple(image_processor, unused_kwargs)` where *unused_kwargs* is a dictionary
|
||||||
@@ -358,6 +399,7 @@ class AutoImageProcessor:
|
|||||||
kwargs["token"] = use_auth_token
|
kwargs["token"] = use_auth_token
|
||||||
|
|
||||||
config = kwargs.pop("config", None)
|
config = kwargs.pop("config", None)
|
||||||
|
use_fast = kwargs.pop("use_fast", False)
|
||||||
trust_remote_code = kwargs.pop("trust_remote_code", None)
|
trust_remote_code = kwargs.pop("trust_remote_code", None)
|
||||||
kwargs["_from_auto"] = True
|
kwargs["_from_auto"] = True
|
||||||
|
|
||||||
@@ -387,6 +429,11 @@ class AutoImageProcessor:
|
|||||||
image_processor_auto_map = config.auto_map["AutoImageProcessor"]
|
image_processor_auto_map = config.auto_map["AutoImageProcessor"]
|
||||||
|
|
||||||
if image_processor_class is not None:
|
if image_processor_class is not None:
|
||||||
|
# Update class name to reflect the use_fast option. If class is not found, None is returned.
|
||||||
|
if use_fast and not image_processor_class.endswith("Fast"):
|
||||||
|
image_processor_class += "Fast"
|
||||||
|
elif not use_fast and image_processor_class.endswith("Fast"):
|
||||||
|
image_processor_class = image_processor_class[:-4]
|
||||||
image_processor_class = image_processor_class_from_name(image_processor_class)
|
image_processor_class = image_processor_class_from_name(image_processor_class)
|
||||||
|
|
||||||
has_remote_code = image_processor_auto_map is not None
|
has_remote_code = image_processor_auto_map is not None
|
||||||
@@ -395,10 +442,19 @@ class AutoImageProcessor:
|
|||||||
trust_remote_code, pretrained_model_name_or_path, has_local_code, has_remote_code
|
trust_remote_code, pretrained_model_name_or_path, has_local_code, has_remote_code
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if image_processor_auto_map is not None and not isinstance(image_processor_auto_map, tuple):
|
||||||
|
# In some configs, only the slow image processor class is stored
|
||||||
|
image_processor_auto_map = (image_processor_auto_map, None)
|
||||||
|
|
||||||
if has_remote_code and trust_remote_code:
|
if has_remote_code and trust_remote_code:
|
||||||
image_processor_class = get_class_from_dynamic_module(
|
if not use_fast and image_processor_auto_map[1] is not None:
|
||||||
image_processor_auto_map, pretrained_model_name_or_path, **kwargs
|
_warning_fast_image_processor_available(image_processor_auto_map[1])
|
||||||
)
|
|
||||||
|
if use_fast and image_processor_auto_map[1] is not None:
|
||||||
|
class_ref = image_processor_auto_map[1]
|
||||||
|
else:
|
||||||
|
class_ref = image_processor_auto_map[0]
|
||||||
|
image_processor_class = get_class_from_dynamic_module(class_ref, pretrained_model_name_or_path, **kwargs)
|
||||||
_ = kwargs.pop("code_revision", None)
|
_ = kwargs.pop("code_revision", None)
|
||||||
if os.path.isdir(pretrained_model_name_or_path):
|
if os.path.isdir(pretrained_model_name_or_path):
|
||||||
image_processor_class.register_for_auto_class()
|
image_processor_class.register_for_auto_class()
|
||||||
@@ -407,8 +463,22 @@ class AutoImageProcessor:
|
|||||||
return image_processor_class.from_dict(config_dict, **kwargs)
|
return image_processor_class.from_dict(config_dict, **kwargs)
|
||||||
# Last try: we use the IMAGE_PROCESSOR_MAPPING.
|
# Last try: we use the IMAGE_PROCESSOR_MAPPING.
|
||||||
elif type(config) in IMAGE_PROCESSOR_MAPPING:
|
elif type(config) in IMAGE_PROCESSOR_MAPPING:
|
||||||
image_processor_class = IMAGE_PROCESSOR_MAPPING[type(config)]
|
image_processor_tuple = IMAGE_PROCESSOR_MAPPING[type(config)]
|
||||||
return image_processor_class.from_dict(config_dict, **kwargs)
|
|
||||||
|
image_processor_class_py, image_processor_class_fast = image_processor_tuple
|
||||||
|
|
||||||
|
if not use_fast and image_processor_class_fast is not None:
|
||||||
|
_warning_fast_image_processor_available(image_processor_class_fast)
|
||||||
|
|
||||||
|
if image_processor_class_fast and (use_fast or image_processor_class_py is None):
|
||||||
|
return image_processor_class_fast.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
|
||||||
|
else:
|
||||||
|
if image_processor_class_py is not None:
|
||||||
|
return image_processor_class_py.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
|
||||||
|
else:
|
||||||
|
raise ValueError(
|
||||||
|
"This image processor cannot be instantiated. Please make sure you have `Pillow` installed."
|
||||||
|
)
|
||||||
|
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
f"Unrecognized image processor in {pretrained_model_name_or_path}. Should have a "
|
f"Unrecognized image processor in {pretrained_model_name_or_path}. Should have a "
|
||||||
@@ -417,7 +487,13 @@ class AutoImageProcessor:
|
|||||||
)
|
)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def register(config_class, image_processor_class, exist_ok=False):
|
def register(
|
||||||
|
config_class,
|
||||||
|
image_processor_class=None,
|
||||||
|
slow_image_processor_class=None,
|
||||||
|
fast_image_processor_class=None,
|
||||||
|
exist_ok=False,
|
||||||
|
):
|
||||||
"""
|
"""
|
||||||
Register a new image processor for this class.
|
Register a new image processor for this class.
|
||||||
|
|
||||||
@@ -426,4 +502,43 @@ class AutoImageProcessor:
|
|||||||
The configuration corresponding to the model to register.
|
The configuration corresponding to the model to register.
|
||||||
image_processor_class ([`ImageProcessingMixin`]): The image processor to register.
|
image_processor_class ([`ImageProcessingMixin`]): The image processor to register.
|
||||||
"""
|
"""
|
||||||
IMAGE_PROCESSOR_MAPPING.register(config_class, image_processor_class, exist_ok=exist_ok)
|
# `image_processor_class` is the deprecated spelling of `slow_image_processor_class`; it may not be combined
# with the new argument.
if image_processor_class is not None:
    if slow_image_processor_class is not None:
        raise ValueError("Cannot specify both image_processor_class and slow_image_processor_class")
    warnings.warn(
        "The image_processor_class argument is deprecated and will be removed in v4.42. Please use `slow_image_processor_class`, or `fast_image_processor_class` instead",
        FutureWarning,
    )
    slow_image_processor_class = image_processor_class

if slow_image_processor_class is None and fast_image_processor_class is None:
    raise ValueError("You need to specify either slow_image_processor_class or fast_image_processor_class")
if slow_image_processor_class is not None and issubclass(slow_image_processor_class, BaseImageProcessorFast):
    raise ValueError("You passed a fast image processor in as the `slow_image_processor_class`.")
# `BaseImageProcessorFast` itself subclasses `BaseImageProcessor`, so a bare `issubclass(..., BaseImageProcessor)`
# test would reject every genuine fast processor. Only classes on the slow side of the hierarchy are refused.
if (
    fast_image_processor_class is not None
    and issubclass(fast_image_processor_class, BaseImageProcessor)
    and not issubclass(fast_image_processor_class, BaseImageProcessorFast)
):
    raise ValueError("You passed a slow image processor in as the `fast_image_processor_class`.")

# When the fast class declares which slow class it pairs with, that declaration must agree with the slow class
# being registered. Fast classes that carry no such attribute are not checked.
if (
    slow_image_processor_class is not None
    and fast_image_processor_class is not None
    and issubclass(fast_image_processor_class, BaseImageProcessorFast)
    and hasattr(fast_image_processor_class, "slow_image_processor_class")
    and fast_image_processor_class.slow_image_processor_class != slow_image_processor_class
):
    raise ValueError(
        "The fast processor class you are passing has a `slow_image_processor_class` attribute that is not "
        "consistent with the slow processor class you passed (fast image processor has "
        f"{fast_image_processor_class.slow_image_processor_class} and you passed {slow_image_processor_class}). "
        "Fix one of those so they match!"
    )

# Avoid resetting a set slow/fast image processor if we are passing just the other ones.
if config_class in IMAGE_PROCESSOR_MAPPING._extra_content:
    existing_slow, existing_fast = IMAGE_PROCESSOR_MAPPING[config_class]
    if slow_image_processor_class is None:
        slow_image_processor_class = existing_slow
    if fast_image_processor_class is None:
        fast_image_processor_class = existing_fast

IMAGE_PROCESSOR_MAPPING.register(
    config_class, (slow_image_processor_class, fast_image_processor_class), exist_ok=exist_ok
)
|
||||||
|
|||||||
@@ -19,6 +19,7 @@ from ...utils import (
|
|||||||
is_flax_available,
|
is_flax_available,
|
||||||
is_tf_available,
|
is_tf_available,
|
||||||
is_torch_available,
|
is_torch_available,
|
||||||
|
is_torchvision_available,
|
||||||
is_vision_available,
|
is_vision_available,
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -34,6 +35,15 @@ else:
|
|||||||
_import_structure["feature_extraction_vit"] = ["ViTFeatureExtractor"]
|
_import_structure["feature_extraction_vit"] = ["ViTFeatureExtractor"]
|
||||||
_import_structure["image_processing_vit"] = ["ViTImageProcessor"]
|
_import_structure["image_processing_vit"] = ["ViTImageProcessor"]
|
||||||
|
|
||||||
|
|
||||||
|
try:
|
||||||
|
if not is_torchvision_available():
|
||||||
|
raise OptionalDependencyNotAvailable()
|
||||||
|
except OptionalDependencyNotAvailable:
|
||||||
|
pass
|
||||||
|
else:
|
||||||
|
_import_structure["image_processing_vit_fast"] = ["ViTImageProcessorFast"]
|
||||||
|
|
||||||
try:
|
try:
|
||||||
if not is_torch_available():
|
if not is_torch_available():
|
||||||
raise OptionalDependencyNotAvailable()
|
raise OptionalDependencyNotAvailable()
|
||||||
@@ -83,6 +93,14 @@ if TYPE_CHECKING:
|
|||||||
from .feature_extraction_vit import ViTFeatureExtractor
|
from .feature_extraction_vit import ViTFeatureExtractor
|
||||||
from .image_processing_vit import ViTImageProcessor
|
from .image_processing_vit import ViTImageProcessor
|
||||||
|
|
||||||
|
try:
|
||||||
|
if not is_torchvision_available():
|
||||||
|
raise OptionalDependencyNotAvailable()
|
||||||
|
except OptionalDependencyNotAvailable:
|
||||||
|
pass
|
||||||
|
else:
|
||||||
|
from .image_processing_vit_fast import ViTImageProcessorFast
|
||||||
|
|
||||||
try:
|
try:
|
||||||
if not is_torch_available():
|
if not is_torch_available():
|
||||||
raise OptionalDependencyNotAvailable()
|
raise OptionalDependencyNotAvailable()
|
||||||
|
|||||||
289
src/transformers/models/vit/image_processing_vit_fast.py
Normal file
289
src/transformers/models/vit/image_processing_vit_fast.py
Normal file
@@ -0,0 +1,289 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
"""Fast Image processor class for ViT."""
|
||||||
|
|
||||||
|
import functools
|
||||||
|
from typing import Dict, List, Optional, Union
|
||||||
|
|
||||||
|
from ...image_processing_base import BatchFeature
|
||||||
|
from ...image_processing_utils import get_size_dict
|
||||||
|
from ...image_processing_utils_fast import BaseImageProcessorFast, SizeDict
|
||||||
|
from ...image_transforms import FusedRescaleNormalize, NumpyToTensor, Rescale
|
||||||
|
from ...image_utils import (
|
||||||
|
IMAGENET_STANDARD_MEAN,
|
||||||
|
IMAGENET_STANDARD_STD,
|
||||||
|
ChannelDimension,
|
||||||
|
ImageInput,
|
||||||
|
ImageType,
|
||||||
|
PILImageResampling,
|
||||||
|
get_image_type,
|
||||||
|
make_list_of_images,
|
||||||
|
pil_torch_interpolation_mapping,
|
||||||
|
)
|
||||||
|
from ...utils import TensorType, logging
|
||||||
|
from ...utils.import_utils import is_torch_available, is_torchvision_available
|
||||||
|
|
||||||
|
|
||||||
|
logger = logging.get_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
if is_torch_available():
|
||||||
|
import torch
|
||||||
|
|
||||||
|
|
||||||
|
if is_torchvision_available():
|
||||||
|
from torchvision.transforms import Compose, Normalize, PILToTensor, Resize
|
||||||
|
|
||||||
|
|
||||||
|
class ViTImageProcessorFast(BaseImageProcessorFast):
    r"""
    Constructs a ViT image processor whose preprocessing pipeline is assembled from `torchvision.transforms`.

    Args:
        do_resize (`bool`, *optional*, defaults to `True`):
            Whether to resize the image's (height, width) dimensions to the specified `(size["height"],
            size["width"])`. Can be overridden by the `do_resize` parameter in the `preprocess` method.
        size (`dict`, *optional*, defaults to `{"height": 224, "width": 224}`):
            Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess`
            method.
        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`):
            Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in the
            `preprocess` method.
        do_rescale (`bool`, *optional*, defaults to `True`):
            Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale`
            parameter in the `preprocess` method.
        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
            Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the
            `preprocess` method.
        do_normalize (`bool`, *optional*, defaults to `True`):
            Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
            method.
        image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`):
            Mean to use if normalizing the image. This is a float or list of floats the length of the number of
            channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
        image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`):
            Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
            number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
    """

    model_input_names = ["pixel_values"]
    # Names of the settings forwarded to `get_transforms` / `_build_transforms`.
    _transform_params = [
        "do_resize",
        "do_rescale",
        "do_normalize",
        "size",
        "resample",
        "rescale_factor",
        "image_mean",
        "image_std",
        "image_type",
    ]

    def __init__(
        self,
        do_resize: bool = True,
        size: Optional[Dict[str, int]] = None,
        resample: PILImageResampling = PILImageResampling.BILINEAR,
        do_rescale: bool = True,
        rescale_factor: Union[int, float] = 1 / 255,
        do_normalize: bool = True,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        **kwargs,
    ) -> None:
        """
        Store the default preprocessing settings; unset values fall back to a 224x224 size and the ImageNet
        standard mean / standard deviation. Extra keyword arguments are forwarded to the base class.
        """
        super().__init__(**kwargs)
        size = size if size is not None else {"height": 224, "width": 224}
        size = get_size_dict(size)
        self.do_resize = do_resize
        self.do_rescale = do_rescale
        self.do_normalize = do_normalize
        self.size = size
        self.resample = resample
        self.rescale_factor = rescale_factor
        self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN
        self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD
        # NOTE(review): presumably read by the base class to detect changed settings - confirm in
        # `BaseImageProcessorFast`.
        self._transform_settings = {}

    def _build_transforms(
        self,
        do_resize: bool,
        size: Dict[str, int],
        resample: PILImageResampling,
        do_rescale: bool,
        rescale_factor: float,
        do_normalize: bool,
        image_mean: Union[float, List[float]],
        image_std: Union[float, List[float]],
        image_type: ImageType,
    ) -> "Compose":
        """
        Given the input settings build the image transforms using `torchvision.transforms.Compose`.

        The pipeline is, in order: an optional conversion to a torch tensor (for PIL and numpy inputs), an optional
        `Resize`, and finally rescaling and/or normalization.

        Returns:
            `Compose`: the composed transform to apply to a single image.
        """
        transforms = []

        # All PIL and numpy values need to be converted to a torch tensor
        # to keep cross compatibility with slow image processors
        if image_type == ImageType.PIL:
            transforms.append(PILToTensor())

        elif image_type == ImageType.NUMPY:
            transforms.append(NumpyToTensor())

        if do_resize:
            transforms.append(
                Resize((size["height"], size["width"]), interpolation=pil_torch_interpolation_mapping[resample])
            )

        # We can combine rescale and normalize into a single operation for speed
        if do_rescale and do_normalize:
            transforms.append(FusedRescaleNormalize(image_mean, image_std, rescale_factor=rescale_factor))
        elif do_rescale:
            transforms.append(Rescale(rescale_factor=rescale_factor))
        elif do_normalize:
            # NOTE(review): without a preceding rescale the tensor may still be an integer type, which
            # torchvision's `Normalize` is expected to reject - confirm callers pass float images here.
            transforms.append(Normalize(image_mean, image_std))

        return Compose(transforms)

    # NOTE(review): `lru_cache` on a method keys on `self` and shares a single slot across all instances;
    # every argument must therefore be hashable (see the conversions in `preprocess`).
    @functools.lru_cache(maxsize=1)
    def _validate_input_arguments(
        self,
        return_tensors: Union[str, TensorType],
        do_resize: bool,
        size: Dict[str, int],
        resample: PILImageResampling,
        do_rescale: bool,
        rescale_factor: float,
        do_normalize: bool,
        image_mean: Union[float, List[float]],
        image_std: Union[float, List[float]],
        data_format: Union[str, ChannelDimension],
        image_type: ImageType,
    ) -> None:
        """
        Check that the requested settings are supported and self-consistent.

        Raises:
            ValueError: if `return_tensors` is not `"pt"`, if `data_format` is not channels-first, or if a setting
            required by an enabled step (resize, rescale, normalize) is `None`.
        """
        if return_tensors != "pt":
            raise ValueError("Only returning PyTorch tensors is currently supported.")

        if data_format != ChannelDimension.FIRST:
            raise ValueError("Only channel first data format is currently supported.")

        # Explicit identity checks: a membership test would compare each value to `None` with `==`.
        if do_resize and (size is None or resample is None):
            raise ValueError("Size and resample must be specified if do_resize is True.")

        if do_rescale and rescale_factor is None:
            raise ValueError("Rescale factor must be specified if do_rescale is True.")

        if do_normalize and (image_mean is None or image_std is None):
            raise ValueError("Image mean and standard deviation must be specified if do_normalize is True.")

    def preprocess(
        self,
        images: ImageInput,
        do_resize: Optional[bool] = None,
        size: Optional[Dict[str, int]] = None,
        resample: Optional[PILImageResampling] = None,
        do_rescale: Optional[bool] = None,
        rescale_factor: Optional[float] = None,
        do_normalize: Optional[bool] = None,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        return_tensors: Optional[Union[str, TensorType]] = "pt",
        data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        **kwargs,
    ):
        """
        Preprocess an image or batch of images.

        Args:
            images (`ImageInput`):
                Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
                passing in images with pixel values between 0 and 1, set `do_rescale=False`.
            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
                Whether to resize the image.
            size (`Dict[str, int]`, *optional*, defaults to `self.size`):
                Dictionary in the format `{"height": h, "width": w}` specifying the size of the output image after
                resizing.
            resample (`PILImageResampling` filter, *optional*, defaults to `self.resample`):
                `PILImageResampling` filter to use if resizing the image e.g. `PILImageResampling.BILINEAR`. Only has
                an effect if `do_resize` is set to `True`.
            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
                Whether to rescale the image values between [0 - 1].
            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
                Rescale factor to rescale the image by if `do_rescale` is set to `True`.
            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
                Whether to normalize the image.
            image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
                Image mean to use if `do_normalize` is set to `True`.
            image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
                Image standard deviation to use if `do_normalize` is set to `True`.
            return_tensors (`str` or `TensorType`, *optional*):
                The type of tensors to return. Only "pt" is supported
            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
                The channel dimension format for the output image. The following formats are currently supported:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. If unset, the channel dimension format is inferred
                from the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
                This argument is accepted but not read by this method.

        Returns:
            `BatchFeature`: a batch whose `"pixel_values"` entry stacks the transformed images along a new leading
            batch dimension.

        Raises:
            ValueError: if the first image is not a PIL image, torch tensor or numpy array, or if the settings are
            rejected by `_validate_input_arguments`.
        """
        do_resize = do_resize if do_resize is not None else self.do_resize
        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
        resample = resample if resample is not None else self.resample
        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
        image_mean = image_mean if image_mean is not None else self.image_mean
        image_std = image_std if image_std is not None else self.image_std
        size = size if size is not None else self.size
        # Make hashable for cache
        size = SizeDict(**size)
        image_mean = tuple(image_mean) if isinstance(image_mean, list) else image_mean
        image_std = tuple(image_std) if isinstance(image_std, list) else image_std

        images = make_list_of_images(images)
        # The type of the first image decides the pipeline applied to the whole batch.
        image_type = get_image_type(images[0])

        if image_type not in [ImageType.PIL, ImageType.TORCH, ImageType.NUMPY]:
            raise ValueError(f"Unsupported input image type {image_type}")

        self._validate_input_arguments(
            do_resize=do_resize,
            size=size,
            resample=resample,
            do_rescale=do_rescale,
            rescale_factor=rescale_factor,
            do_normalize=do_normalize,
            image_mean=image_mean,
            image_std=image_std,
            return_tensors=return_tensors,
            data_format=data_format,
            image_type=image_type,
        )

        # `get_transforms` is inherited from the base class.
        transforms = self.get_transforms(
            do_resize=do_resize,
            do_rescale=do_rescale,
            do_normalize=do_normalize,
            size=size,
            resample=resample,
            rescale_factor=rescale_factor,
            image_mean=image_mean,
            image_std=image_std,
            image_type=image_type,
        )
        transformed_images = [transforms(image) for image in images]

        # Stack along a NEW leading dimension. `torch.vstack` would concatenate the per-image tensors along
        # their existing first (channel) axis and produce (N * C, H, W) instead of (N, C, H, W).
        data = {"pixel_values": torch.stack(transformed_images, dim=0)}
        return BatchFeature(data, tensor_type=return_tensors)
|
||||||
16
src/transformers/utils/dummy_torchvision_objects.py
Normal file
16
src/transformers/utils/dummy_torchvision_objects.py
Normal file
@@ -0,0 +1,16 @@
|
|||||||
|
# This file is autogenerated by the command `make fix-copies`, do not edit.
|
||||||
|
from ..utils import DummyObject, requires_backends
|
||||||
|
|
||||||
|
|
||||||
|
# Dummy stand-in declared through the `DummyObject` metaclass and tagged with the
# "torchvision" backend. Constructing it immediately calls `requires_backends`;
# presumably that raises when torchvision is not installed - confirm in `..utils`.
# The header of this file states it is generated by `make fix-copies`.
class BaseImageProcessorFast(metaclass=DummyObject):
    _backends = ["torchvision"]

    def __init__(self, *args, **kwargs):
        requires_backends(self, ["torchvision"])
|
||||||
|
|
||||||
|
|
||||||
|
# Dummy stand-in declared through the `DummyObject` metaclass and tagged with the
# "torchvision" backend. Constructing it immediately calls `requires_backends`;
# presumably that raises when torchvision is not installed - confirm in `..utils`.
# The header of this file states it is generated by `make fix-copies`.
class ViTImageProcessorFast(metaclass=DummyObject):
    _backends = ["torchvision"]

    def __init__(self, *args, **kwargs):
        requires_backends(self, ["torchvision"])
|
||||||
@@ -9,6 +9,13 @@ class ImageProcessingMixin(metaclass=DummyObject):
|
|||||||
requires_backends(self, ["vision"])
|
requires_backends(self, ["vision"])
|
||||||
|
|
||||||
|
|
||||||
|
# Dummy stand-in declared through the `DummyObject` metaclass and tagged with the
# "vision" backend. Constructing it immediately calls `requires_backends`;
# presumably that raises when the vision dependencies are missing - TODO confirm.
class BaseImageProcessor(metaclass=DummyObject):
    _backends = ["vision"]

    def __init__(self, *args, **kwargs):
        requires_backends(self, ["vision"])
|
||||||
|
|
||||||
|
|
||||||
class ImageFeatureExtractionMixin(metaclass=DummyObject):
|
class ImageFeatureExtractionMixin(metaclass=DummyObject):
|
||||||
_backends = ["vision"]
|
_backends = ["vision"]
|
||||||
|
|
||||||
|
|||||||
@@ -27,8 +27,10 @@ from transformers import (
|
|||||||
AutoImageProcessor,
|
AutoImageProcessor,
|
||||||
CLIPConfig,
|
CLIPConfig,
|
||||||
CLIPImageProcessor,
|
CLIPImageProcessor,
|
||||||
|
ViTImageProcessor,
|
||||||
|
ViTImageProcessorFast,
|
||||||
)
|
)
|
||||||
from transformers.testing_utils import DUMMY_UNKNOWN_IDENTIFIER
|
from transformers.testing_utils import DUMMY_UNKNOWN_IDENTIFIER, require_torchvision, require_vision
|
||||||
|
|
||||||
|
|
||||||
sys.path.append(str(Path(__file__).parent.parent.parent.parent / "utils"))
|
sys.path.append(str(Path(__file__).parent.parent.parent.parent / "utils"))
|
||||||
@@ -133,6 +135,23 @@ class AutoImageProcessorTest(unittest.TestCase):
|
|||||||
):
|
):
|
||||||
_ = AutoImageProcessor.from_pretrained("hf-internal-testing/config-no-model")
|
_ = AutoImageProcessor.from_pretrained("hf-internal-testing/config-no-model")
|
||||||
|
|
||||||
|
@require_vision
@require_torchvision
def test_use_fast_selection(self):
    """Check which ViT image processor class `AutoImageProcessor` picks for each `use_fast` setting."""
    checkpoint = "hf-internal-testing/tiny-random-vit"

    # (extra `from_pretrained` kwargs, expected class), checked in this order:
    #   - no `use_fast` argument -> slow image processor is selected by default
    #   - `use_fast=True`        -> fast image processor
    #   - `use_fast=False`       -> slow image processor
    expectations = (
        ({}, ViTImageProcessor),
        ({"use_fast": True}, ViTImageProcessorFast),
        ({"use_fast": False}, ViTImageProcessor),
    )

    for load_kwargs, expected_class in expectations:
        image_processor = AutoImageProcessor.from_pretrained(checkpoint, **load_kwargs)
        self.assertIsInstance(image_processor, expected_class)
|
||||||
|
|
||||||
def test_from_pretrained_dynamic_image_processor(self):
|
def test_from_pretrained_dynamic_image_processor(self):
|
||||||
# If remote code is not set, we will time out when asking whether to load the model.
|
# If remote code is not set, we will time out when asking whether to load the model.
|
||||||
with self.assertRaises(ValueError):
|
with self.assertRaises(ValueError):
|
||||||
|
|||||||
@@ -121,6 +121,7 @@ class BeitImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
|||||||
image_processing_class = BeitImageProcessor if is_vision_available() else None
|
image_processing_class = BeitImageProcessor if is_vision_available() else None
|
||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
|
super().setUp()
|
||||||
self.image_processor_tester = BeitImageProcessingTester(self)
|
self.image_processor_tester = BeitImageProcessingTester(self)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
|||||||
@@ -90,6 +90,7 @@ class BlipImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
|||||||
image_processing_class = BlipImageProcessor if is_vision_available() else None
|
image_processing_class = BlipImageProcessor if is_vision_available() else None
|
||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
|
super().setUp()
|
||||||
self.image_processor_tester = BlipImageProcessingTester(self)
|
self.image_processor_tester = BlipImageProcessingTester(self)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
@@ -112,6 +113,7 @@ class BlipImageProcessingTestFourChannels(ImageProcessingTestMixin, unittest.Tes
|
|||||||
image_processing_class = BlipImageProcessor if is_vision_available() else None
|
image_processing_class = BlipImageProcessor if is_vision_available() else None
|
||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
|
super().setUp()
|
||||||
self.image_processor_tester = BlipImageProcessingTester(self, num_channels=4)
|
self.image_processor_tester = BlipImageProcessingTester(self, num_channels=4)
|
||||||
self.expected_encoded_image_num_channels = 3
|
self.expected_encoded_image_num_channels = 3
|
||||||
|
|
||||||
|
|||||||
@@ -136,6 +136,7 @@ class BridgeTowerImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase
|
|||||||
image_processing_class = BridgeTowerImageProcessor if is_vision_available() else None
|
image_processing_class = BridgeTowerImageProcessor if is_vision_available() else None
|
||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
|
super().setUp()
|
||||||
self.image_processor_tester = BridgeTowerImageProcessingTester(self)
|
self.image_processor_tester = BridgeTowerImageProcessingTester(self)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
|||||||
@@ -98,6 +98,7 @@ class ChineseCLIPImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase
|
|||||||
image_processing_class = ChineseCLIPImageProcessor if is_vision_available() else None
|
image_processing_class = ChineseCLIPImageProcessor if is_vision_available() else None
|
||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
|
super().setUp()
|
||||||
self.image_processor_tester = ChineseCLIPImageProcessingTester(self, do_center_crop=True)
|
self.image_processor_tester = ChineseCLIPImageProcessingTester(self, do_center_crop=True)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
@@ -135,6 +136,7 @@ class ChineseCLIPImageProcessingTestFourChannels(ImageProcessingTestMixin, unitt
|
|||||||
image_processing_class = ChineseCLIPImageProcessor if is_vision_available() else None
|
image_processing_class = ChineseCLIPImageProcessor if is_vision_available() else None
|
||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
|
super().setUp()
|
||||||
self.image_processor_tester = ChineseCLIPImageProcessingTester(self, num_channels=4, do_center_crop=True)
|
self.image_processor_tester = ChineseCLIPImageProcessingTester(self, num_channels=4, do_center_crop=True)
|
||||||
self.expected_encoded_image_num_channels = 3
|
self.expected_encoded_image_num_channels = 3
|
||||||
|
|
||||||
|
|||||||
@@ -94,6 +94,7 @@ class CLIPImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
|||||||
image_processing_class = CLIPImageProcessor if is_vision_available() else None
|
image_processing_class = CLIPImageProcessor if is_vision_available() else None
|
||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
|
super().setUp()
|
||||||
self.image_processor_tester = CLIPImageProcessingTester(self)
|
self.image_processor_tester = CLIPImageProcessingTester(self)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
|||||||
@@ -131,6 +131,7 @@ class ConditionalDetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcess
|
|||||||
image_processing_class = ConditionalDetrImageProcessor if is_vision_available() else None
|
image_processing_class = ConditionalDetrImageProcessor if is_vision_available() else None
|
||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
|
super().setUp()
|
||||||
self.image_processor_tester = ConditionalDetrImageProcessingTester(self)
|
self.image_processor_tester = ConditionalDetrImageProcessingTester(self)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
|||||||
@@ -87,6 +87,7 @@ class ConvNextImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
|||||||
image_processing_class = ConvNextImageProcessor if is_vision_available() else None
|
image_processing_class = ConvNextImageProcessor if is_vision_available() else None
|
||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
|
super().setUp()
|
||||||
self.image_processor_tester = ConvNextImageProcessingTester(self)
|
self.image_processor_tester = ConvNextImageProcessingTester(self)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
|||||||
@@ -131,6 +131,7 @@ class DeformableDetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcessi
|
|||||||
image_processing_class = DeformableDetrImageProcessor if is_vision_available() else None
|
image_processing_class = DeformableDetrImageProcessor if is_vision_available() else None
|
||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
|
super().setUp()
|
||||||
self.image_processor_tester = DeformableDetrImageProcessingTester(self)
|
self.image_processor_tester = DeformableDetrImageProcessingTester(self)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
|||||||
@@ -93,6 +93,7 @@ class DeiTImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
|||||||
test_cast_dtype = True
|
test_cast_dtype = True
|
||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
|
super().setUp()
|
||||||
self.image_processor_tester = DeiTImageProcessingTester(self)
|
self.image_processor_tester = DeiTImageProcessingTester(self)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
|||||||
@@ -130,6 +130,7 @@ class DetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMixi
|
|||||||
image_processing_class = DetrImageProcessor if is_vision_available() else None
|
image_processing_class = DetrImageProcessor if is_vision_available() else None
|
||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
|
super().setUp()
|
||||||
self.image_processor_tester = DetrImageProcessingTester(self)
|
self.image_processor_tester = DetrImageProcessingTester(self)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
|||||||
@@ -99,6 +99,7 @@ class DonutImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
|||||||
image_processing_class = DonutImageProcessor if is_vision_available() else None
|
image_processing_class = DonutImageProcessor if is_vision_available() else None
|
||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
|
super().setUp()
|
||||||
self.image_processor_tester = DonutImageProcessingTester(self)
|
self.image_processor_tester = DonutImageProcessingTester(self)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
|||||||
@@ -86,6 +86,7 @@ class DPTImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
|||||||
image_processing_class = DPTImageProcessor if is_vision_available() else None
|
image_processing_class = DPTImageProcessor if is_vision_available() else None
|
||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
|
super().setUp()
|
||||||
self.image_processor_tester = DPTImageProcessingTester(self)
|
self.image_processor_tester = DPTImageProcessingTester(self)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
|||||||
@@ -86,6 +86,7 @@ class EfficientNetImageProcessorTest(ImageProcessingTestMixin, unittest.TestCase
|
|||||||
image_processing_class = EfficientNetImageProcessor if is_vision_available() else None
|
image_processing_class = EfficientNetImageProcessor if is_vision_available() else None
|
||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
|
super().setUp()
|
||||||
self.image_processor_tester = EfficientNetImageProcessorTester(self)
|
self.image_processor_tester = EfficientNetImageProcessorTester(self)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
|||||||
@@ -175,6 +175,7 @@ class FlavaImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
|||||||
maxDiff = None
|
maxDiff = None
|
||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
|
super().setUp()
|
||||||
self.image_processor_tester = FlavaImageProcessingTester(self)
|
self.image_processor_tester = FlavaImageProcessingTester(self)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
|||||||
@@ -93,6 +93,7 @@ class GLPNImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
|||||||
image_processing_class = GLPNImageProcessor if is_vision_available() else None
|
image_processing_class = GLPNImageProcessor if is_vision_available() else None
|
||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
|
super().setUp()
|
||||||
self.image_processor_tester = GLPNImageProcessingTester(self)
|
self.image_processor_tester = GLPNImageProcessingTester(self)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
|||||||
@@ -146,6 +146,7 @@ class GroundingDinoImageProcessingTest(AnnotationFormatTestMixin, ImageProcessin
|
|||||||
image_processing_class = GroundingDinoImageProcessor if is_vision_available() else None
|
image_processing_class = GroundingDinoImageProcessor if is_vision_available() else None
|
||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
|
super().setUp()
|
||||||
self.image_processor_tester = GroundingDinoImageProcessingTester(self)
|
self.image_processor_tester = GroundingDinoImageProcessingTester(self)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
|||||||
@@ -127,6 +127,7 @@ class IdeficsImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
|||||||
image_processing_class = IdeficsImageProcessor if is_vision_available() else None
|
image_processing_class = IdeficsImageProcessor if is_vision_available() else None
|
||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
|
super().setUp()
|
||||||
self.image_processor_tester = IdeficsImageProcessingTester(self)
|
self.image_processor_tester = IdeficsImageProcessingTester(self)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
|||||||
@@ -185,6 +185,7 @@ class Idefics2ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
|||||||
image_processing_class = Idefics2ImageProcessor if is_vision_available() else None
|
image_processing_class = Idefics2ImageProcessor if is_vision_available() else None
|
||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
|
super().setUp()
|
||||||
self.image_processor_tester = Idefics2ImageProcessingTester(self)
|
self.image_processor_tester = Idefics2ImageProcessingTester(self)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
|||||||
@@ -22,7 +22,8 @@ import unittest
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
from datasets import load_dataset
|
from datasets import load_dataset
|
||||||
|
|
||||||
from transformers.testing_utils import require_torch, require_vision, slow
|
from transformers import AutoImageProcessor
|
||||||
|
from transformers.testing_utils import check_json_file_has_correct_format, require_torch, require_vision, slow
|
||||||
from transformers.utils import is_torch_available, is_vision_available
|
from transformers.utils import is_torch_available, is_vision_available
|
||||||
|
|
||||||
from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs
|
from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs
|
||||||
@@ -96,6 +97,7 @@ class ImageGPTImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
|||||||
image_processing_class = ImageGPTImageProcessor if is_vision_available() else None
|
image_processing_class = ImageGPTImageProcessor if is_vision_available() else None
|
||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
|
super().setUp()
|
||||||
self.image_processor_tester = ImageGPTImageProcessingTester(self)
|
self.image_processor_tester = ImageGPTImageProcessingTester(self)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
@@ -141,18 +143,38 @@ class ImageGPTImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
|||||||
self.assertEqual(image_processor_first[key], value)
|
self.assertEqual(image_processor_first[key], value)
|
||||||
|
|
||||||
def test_image_processor_from_and_save_pretrained(self):
    """
    Round-trip every image processor class in `self.image_processor_list` through
    `save_pretrained` / `from_pretrained` and check the reloaded settings match the originals.
    """
    for image_processing_class in self.image_processor_list:
        # Use the class under iteration (not `self.image_processing_class`), otherwise every
        # pass of the loop would exercise the same class.
        image_processor_first = image_processing_class(**self.image_processor_dict)

        with tempfile.TemporaryDirectory() as tmpdirname:
            image_processor_first.save_pretrained(tmpdirname)
            image_processor_second = image_processing_class.from_pretrained(tmpdirname).to_dict()

        image_processor_first = image_processor_first.to_dict()
        for key, value in image_processor_first.items():
            if key == "clusters":
                # `clusters` is compared with numpy because `==` on arrays is element-wise.
                self.assertTrue(np.array_equal(value, image_processor_second[key]))
            else:
                # Compare against the RELOADED processor; comparing the first dict with
                # itself could never fail.
                self.assertEqual(image_processor_second[key], value)
|
||||||
|
|
||||||
|
def test_image_processor_save_load_with_autoimageprocessor(self):
|
||||||
|
for image_processing_class in self.image_processor_list:
|
||||||
|
image_processor_first = image_processing_class(**self.image_processor_dict)
|
||||||
|
|
||||||
|
with tempfile.TemporaryDirectory() as tmpdirname:
|
||||||
|
saved_file = image_processor_first.save_pretrained(tmpdirname)[0]
|
||||||
|
check_json_file_has_correct_format(saved_file)
|
||||||
|
|
||||||
|
image_processor_second = AutoImageProcessor.from_pretrained(tmpdirname)
|
||||||
|
|
||||||
|
image_processor_first = image_processor_first.to_dict()
|
||||||
|
image_processor_second = image_processor_second.to_dict()
|
||||||
|
|
||||||
|
for key, value in image_processor_first.items():
|
||||||
|
if key == "clusters":
|
||||||
|
self.assertTrue(np.array_equal(value, image_processor_second[key]))
|
||||||
|
else:
|
||||||
|
self.assertEqual(image_processor_first[key], value)
|
||||||
|
|
||||||
@unittest.skip("ImageGPT requires clusters at initialization")
|
@unittest.skip("ImageGPT requires clusters at initialization")
|
||||||
def test_init_without_params(self):
|
def test_init_without_params(self):
|
||||||
|
|||||||
@@ -76,6 +76,7 @@ class LayoutLMv2ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase)
|
|||||||
image_processing_class = LayoutLMv2ImageProcessor if is_pytesseract_available() else None
|
image_processing_class = LayoutLMv2ImageProcessor if is_pytesseract_available() else None
|
||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
|
super().setUp()
|
||||||
self.image_processor_tester = LayoutLMv2ImageProcessingTester(self)
|
self.image_processor_tester = LayoutLMv2ImageProcessingTester(self)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
|||||||
@@ -76,6 +76,7 @@ class LayoutLMv3ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase)
|
|||||||
image_processing_class = LayoutLMv3ImageProcessor if is_pytesseract_available() else None
|
image_processing_class = LayoutLMv3ImageProcessor if is_pytesseract_available() else None
|
||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
|
super().setUp()
|
||||||
self.image_processor_tester = LayoutLMv3ImageProcessingTester(self)
|
self.image_processor_tester = LayoutLMv3ImageProcessingTester(self)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
|||||||
@@ -91,6 +91,7 @@ class LevitImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
|||||||
image_processing_class = LevitImageProcessor if is_vision_available() else None
|
image_processing_class = LevitImageProcessor if is_vision_available() else None
|
||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
|
super().setUp()
|
||||||
self.image_processor_tester = LevitImageProcessingTester(self)
|
self.image_processor_tester = LevitImageProcessingTester(self)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
|||||||
@@ -105,6 +105,7 @@ class LlavaNextImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
|||||||
|
|
||||||
# Copied from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTest.setUp with CLIP->LlavaNext
|
# Copied from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTest.setUp with CLIP->LlavaNext
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
|
super().setUp()
|
||||||
self.image_processor_tester = LlavaNextImageProcessingTester(self)
|
self.image_processor_tester = LlavaNextImageProcessingTester(self)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
|||||||
@@ -149,6 +149,7 @@ class Mask2FormerImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase
|
|||||||
image_processing_class = Mask2FormerImageProcessor if (is_vision_available() and is_torch_available()) else None
|
image_processing_class = Mask2FormerImageProcessor if (is_vision_available() and is_torch_available()) else None
|
||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
|
super().setUp()
|
||||||
self.image_processor_tester = Mask2FormerImageProcessingTester(self)
|
self.image_processor_tester = Mask2FormerImageProcessingTester(self)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
|||||||
@@ -149,6 +149,7 @@ class MaskFormerImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase)
|
|||||||
image_processing_class = MaskFormerImageProcessor if (is_vision_available() and is_torch_available()) else None
|
image_processing_class = MaskFormerImageProcessor if (is_vision_available() and is_torch_available()) else None
|
||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
|
super().setUp()
|
||||||
self.image_processor_tester = MaskFormerImageProcessingTester(self)
|
self.image_processor_tester = MaskFormerImageProcessingTester(self)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
|||||||
@@ -82,6 +82,7 @@ class MobileNetV1ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase
|
|||||||
image_processing_class = MobileNetV1ImageProcessor if is_vision_available() else None
|
image_processing_class = MobileNetV1ImageProcessor if is_vision_available() else None
|
||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
|
super().setUp()
|
||||||
self.image_processor_tester = MobileNetV1ImageProcessingTester(self)
|
self.image_processor_tester = MobileNetV1ImageProcessingTester(self)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
|||||||
@@ -82,6 +82,7 @@ class MobileNetV2ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase
|
|||||||
image_processing_class = MobileNetV2ImageProcessor if is_vision_available() else None
|
image_processing_class = MobileNetV2ImageProcessor if is_vision_available() else None
|
||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
|
super().setUp()
|
||||||
self.image_processor_tester = MobileNetV2ImageProcessingTester(self)
|
self.image_processor_tester = MobileNetV2ImageProcessingTester(self)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
|||||||
@@ -112,6 +112,7 @@ class MobileViTImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
|||||||
image_processing_class = MobileViTImageProcessor if is_vision_available() else None
|
image_processing_class = MobileViTImageProcessor if is_vision_available() else None
|
||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
|
super().setUp()
|
||||||
self.image_processor_tester = MobileViTImageProcessingTester(self)
|
self.image_processor_tester = MobileViTImageProcessingTester(self)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
|||||||
@@ -111,6 +111,7 @@ class NougatImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
|||||||
image_processing_class = NougatImageProcessor if is_vision_available() else None
|
image_processing_class = NougatImageProcessor if is_vision_available() else None
|
||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
|
super().setUp()
|
||||||
self.image_processor_tester = NougatImageProcessingTester(self)
|
self.image_processor_tester = NougatImageProcessingTester(self)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
|||||||
@@ -159,6 +159,7 @@ class OneFormerImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
|||||||
image_processing_class = image_processing_class
|
image_processing_class = image_processing_class
|
||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
|
super().setUp()
|
||||||
self.image_processor_tester = OneFormerImageProcessorTester(self)
|
self.image_processor_tester = OneFormerImageProcessorTester(self)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
|||||||
@@ -90,6 +90,7 @@ class Owlv2ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
|||||||
image_processing_class = Owlv2ImageProcessor if is_vision_available() else None
|
image_processing_class = Owlv2ImageProcessor if is_vision_available() else None
|
||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
|
super().setUp()
|
||||||
self.image_processor_tester = Owlv2ImageProcessingTester(self)
|
self.image_processor_tester = Owlv2ImageProcessingTester(self)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
|||||||
@@ -92,6 +92,7 @@ class OwlViTImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
|||||||
image_processing_class = OwlViTImageProcessor if is_vision_available() else None
|
image_processing_class = OwlViTImageProcessor if is_vision_available() else None
|
||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
|
super().setUp()
|
||||||
self.image_processor_tester = OwlViTImageProcessingTester(self)
|
self.image_processor_tester = OwlViTImageProcessingTester(self)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
|||||||
@@ -87,6 +87,7 @@ class Pix2StructImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase)
|
|||||||
image_processing_class = Pix2StructImageProcessor if is_vision_available() else None
|
image_processing_class = Pix2StructImageProcessor if is_vision_available() else None
|
||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
|
super().setUp()
|
||||||
self.image_processor_tester = Pix2StructImageProcessingTester(self)
|
self.image_processor_tester = Pix2StructImageProcessingTester(self)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
@@ -288,6 +289,7 @@ class Pix2StructImageProcessingTestFourChannels(ImageProcessingTestMixin, unitte
|
|||||||
image_processing_class = Pix2StructImageProcessor if is_vision_available() else None
|
image_processing_class = Pix2StructImageProcessor if is_vision_available() else None
|
||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
|
super().setUp()
|
||||||
self.image_processor_tester = Pix2StructImageProcessingTester(self, num_channels=4)
|
self.image_processor_tester = Pix2StructImageProcessingTester(self, num_channels=4)
|
||||||
self.expected_encoded_image_num_channels = 3
|
self.expected_encoded_image_num_channels = 3
|
||||||
|
|
||||||
|
|||||||
@@ -88,6 +88,7 @@ class PoolFormerImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase)
|
|||||||
image_processing_class = PoolFormerImageProcessor if is_vision_available() else None
|
image_processing_class = PoolFormerImageProcessor if is_vision_available() else None
|
||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
|
super().setUp()
|
||||||
self.image_processor_tester = PoolFormerImageProcessingTester(self)
|
self.image_processor_tester = PoolFormerImageProcessingTester(self)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
|||||||
@@ -84,6 +84,7 @@ class PvtImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
|||||||
image_processing_class = PvtImageProcessor if is_vision_available() else None
|
image_processing_class = PvtImageProcessor if is_vision_available() else None
|
||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
|
super().setUp()
|
||||||
self.image_processor_tester = PvtImageProcessingTester(self)
|
self.image_processor_tester = PvtImageProcessingTester(self)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
|||||||
@@ -112,6 +112,7 @@ class SegformerImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
|||||||
image_processing_class = SegformerImageProcessor if is_vision_available() else None
|
image_processing_class = SegformerImageProcessor if is_vision_available() else None
|
||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
|
super().setUp()
|
||||||
self.image_processor_tester = SegformerImageProcessingTester(self)
|
self.image_processor_tester = SegformerImageProcessingTester(self)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
|||||||
@@ -114,6 +114,7 @@ class SegGptImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
|||||||
image_processing_class = SegGptImageProcessor if is_vision_available() else None
|
image_processing_class = SegGptImageProcessor if is_vision_available() else None
|
||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
|
super().setUp()
|
||||||
self.image_processor_tester = SegGptImageProcessingTester(self)
|
self.image_processor_tester = SegGptImageProcessingTester(self)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
|||||||
@@ -91,6 +91,7 @@ class SiglipImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
|||||||
image_processing_class = SiglipImageProcessor if is_vision_available() else None
|
image_processing_class = SiglipImageProcessor if is_vision_available() else None
|
||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
|
super().setUp()
|
||||||
self.image_processor_tester = SiglipImageProcessingTester(self)
|
self.image_processor_tester = SiglipImageProcessingTester(self)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
|||||||
@@ -77,6 +77,7 @@ class SuperPointImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase)
|
|||||||
image_processing_class = SuperPointImageProcessor if is_vision_available() else None
|
image_processing_class = SuperPointImageProcessor if is_vision_available() else None
|
||||||
|
|
||||||
def setUp(self) -> None:
|
def setUp(self) -> None:
|
||||||
|
super().setUp()
|
||||||
self.image_processor_tester = SuperPointImageProcessingTester(self)
|
self.image_processor_tester = SuperPointImageProcessingTester(self)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
|||||||
@@ -98,6 +98,7 @@ class Swin2SRImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
|||||||
image_processing_class = Swin2SRImageProcessor if is_vision_available() else None
|
image_processing_class = Swin2SRImageProcessor if is_vision_available() else None
|
||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
|
super().setUp()
|
||||||
self.image_processor_tester = Swin2SRImageProcessingTester(self)
|
self.image_processor_tester = Swin2SRImageProcessingTester(self)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
|||||||
@@ -127,6 +127,7 @@ class TvpImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
|||||||
image_processing_class = TvpImageProcessor if is_vision_available() else None
|
image_processing_class = TvpImageProcessor if is_vision_available() else None
|
||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
|
super().setUp()
|
||||||
self.image_processor_tester = TvpImageProcessingTester(self)
|
self.image_processor_tester = TvpImageProcessingTester(self)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
|||||||
@@ -128,6 +128,7 @@ class VideoLlavaImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase)
|
|||||||
|
|
||||||
# Copied from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTest.setUp with CLIP->VideoLlava
|
# Copied from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTest.setUp with CLIP->VideoLlava
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
|
super().setUp()
|
||||||
self.image_processor_tester = VideoLlavaImageProcessingTester(self)
|
self.image_processor_tester = VideoLlavaImageProcessingTester(self)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
|||||||
@@ -99,6 +99,7 @@ class VideoMAEImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
|||||||
image_processing_class = VideoMAEImageProcessor if is_vision_available() else None
|
image_processing_class = VideoMAEImageProcessor if is_vision_available() else None
|
||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
|
super().setUp()
|
||||||
self.image_processor_tester = VideoMAEImageProcessingTester(self)
|
self.image_processor_tester = VideoMAEImageProcessingTester(self)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
|||||||
@@ -130,6 +130,7 @@ class ViltImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
|||||||
image_processing_class = ViltImageProcessor if is_vision_available() else None
|
image_processing_class = ViltImageProcessor if is_vision_available() else None
|
||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
|
super().setUp()
|
||||||
self.image_processor_tester = ViltImageProcessingTester(self)
|
self.image_processor_tester = ViltImageProcessingTester(self)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
|||||||
@@ -84,6 +84,7 @@ class ViTImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
|||||||
image_processing_class = ViTImageProcessor if is_vision_available() else None
|
image_processing_class = ViTImageProcessor if is_vision_available() else None
|
||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
|
super().setUp()
|
||||||
self.image_processor_tester = ViTImageProcessingTester(self)
|
self.image_processor_tester = ViTImageProcessingTester(self)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
@@ -91,16 +92,18 @@ class ViTImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
|||||||
return self.image_processor_tester.prepare_image_processor_dict()
|
return self.image_processor_tester.prepare_image_processor_dict()
|
||||||
|
|
||||||
def test_image_processor_properties(self):
|
def test_image_processor_properties(self):
|
||||||
image_processing = self.image_processing_class(**self.image_processor_dict)
|
for image_processing_class in self.image_processor_list:
|
||||||
self.assertTrue(hasattr(image_processing, "image_mean"))
|
image_processing = image_processing_class(**self.image_processor_dict)
|
||||||
self.assertTrue(hasattr(image_processing, "image_std"))
|
self.assertTrue(hasattr(image_processing, "image_mean"))
|
||||||
self.assertTrue(hasattr(image_processing, "do_normalize"))
|
self.assertTrue(hasattr(image_processing, "image_std"))
|
||||||
self.assertTrue(hasattr(image_processing, "do_resize"))
|
self.assertTrue(hasattr(image_processing, "do_normalize"))
|
||||||
self.assertTrue(hasattr(image_processing, "size"))
|
self.assertTrue(hasattr(image_processing, "do_resize"))
|
||||||
|
self.assertTrue(hasattr(image_processing, "size"))
|
||||||
|
|
||||||
def test_image_processor_from_dict_with_kwargs(self):
|
def test_image_processor_from_dict_with_kwargs(self):
|
||||||
image_processor = self.image_processing_class.from_dict(self.image_processor_dict)
|
for image_processing_class in self.image_processor_list:
|
||||||
self.assertEqual(image_processor.size, {"height": 18, "width": 18})
|
image_processor = image_processing_class.from_dict(self.image_processor_dict)
|
||||||
|
self.assertEqual(image_processor.size, {"height": 18, "width": 18})
|
||||||
|
|
||||||
image_processor = self.image_processing_class.from_dict(self.image_processor_dict, size=42)
|
image_processor = image_processing_class.from_dict(self.image_processor_dict, size=42)
|
||||||
self.assertEqual(image_processor.size, {"height": 42, "width": 42})
|
self.assertEqual(image_processor.size, {"height": 42, "width": 42})
|
||||||
|
|||||||
@@ -94,6 +94,7 @@ class VitMatteImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
|||||||
image_processing_class = VitMatteImageProcessor if is_vision_available() else None
|
image_processing_class = VitMatteImageProcessor if is_vision_available() else None
|
||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
|
super().setUp()
|
||||||
self.image_processor_tester = VitMatteImageProcessingTester(self)
|
self.image_processor_tester = VitMatteImageProcessingTester(self)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
|||||||
@@ -99,6 +99,7 @@ class VivitImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
|||||||
image_processing_class = VivitImageProcessor if is_vision_available() else None
|
image_processing_class = VivitImageProcessor if is_vision_available() else None
|
||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
|
super().setUp()
|
||||||
self.image_processor_tester = VivitImageProcessingTester(self)
|
self.image_processor_tester = VivitImageProcessingTester(self)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
|||||||
@@ -143,6 +143,7 @@ class YolosImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMix
|
|||||||
image_processing_class = YolosImageProcessor if is_vision_available() else None
|
image_processing_class = YolosImageProcessor if is_vision_available() else None
|
||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
|
super().setUp()
|
||||||
self.image_processor_tester = YolosImageProcessingTester(self)
|
self.image_processor_tester = YolosImageProcessingTester(self)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
|||||||
@@ -19,7 +19,9 @@ import os
|
|||||||
import pathlib
|
import pathlib
|
||||||
import tempfile
|
import tempfile
|
||||||
|
|
||||||
from transformers import BatchFeature
|
import requests
|
||||||
|
|
||||||
|
from transformers import AutoImageProcessor, BatchFeature
|
||||||
from transformers.image_utils import AnnotationFormat, AnnotionFormat
|
from transformers.image_utils import AnnotationFormat, AnnotionFormat
|
||||||
from transformers.testing_utils import check_json_file_has_correct_format, require_torch, require_vision
|
from transformers.testing_utils import check_json_file_has_correct_format, require_torch, require_vision
|
||||||
from transformers.utils import is_torch_available, is_vision_available
|
from transformers.utils import is_torch_available, is_vision_available
|
||||||
@@ -129,176 +131,263 @@ def prepare_video_inputs(
|
|||||||
|
|
||||||
class ImageProcessingTestMixin:
|
class ImageProcessingTestMixin:
|
||||||
test_cast_dtype = None
|
test_cast_dtype = None
|
||||||
|
image_processing_class = None
|
||||||
|
fast_image_processing_class = None
|
||||||
|
image_processors_list = None
|
||||||
|
test_slow_image_processor = True
|
||||||
|
test_fast_image_processor = True
|
||||||
|
|
||||||
|
def setUp(self):
|
||||||
|
image_processor_list = []
|
||||||
|
|
||||||
|
if self.test_slow_image_processor and self.image_processing_class:
|
||||||
|
image_processor_list.append(self.image_processing_class)
|
||||||
|
|
||||||
|
if self.test_fast_image_processor and self.fast_image_processing_class:
|
||||||
|
image_processor_list.append(self.fast_image_processing_class)
|
||||||
|
|
||||||
|
self.image_processor_list = image_processor_list
|
||||||
|
|
||||||
|
@require_vision
|
||||||
|
@require_torch
|
||||||
|
def test_slow_fast_equivalence(self):
|
||||||
|
dummy_image = Image.open(
|
||||||
|
requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw
|
||||||
|
)
|
||||||
|
|
||||||
|
if not self.test_slow_image_processor or not self.test_fast_image_processor:
|
||||||
|
self.skipTest("Skipping slow/fast equivalence test")
|
||||||
|
|
||||||
|
if self.image_processing_class is None or self.fast_image_processing_class is None:
|
||||||
|
self.skipTest("Skipping slow/fast equivalence test as one of the image processors is not defined")
|
||||||
|
|
||||||
|
image_processor_slow = self.image_processing_class(**self.image_processor_dict)
|
||||||
|
image_processor_fast = self.fast_image_processing_class(**self.image_processor_dict)
|
||||||
|
|
||||||
|
encoding_slow = image_processor_slow(dummy_image, return_tensors="pt")
|
||||||
|
encoding_fast = image_processor_fast(dummy_image, return_tensors="pt")
|
||||||
|
|
||||||
|
self.assertTrue(torch.allclose(encoding_slow.pixel_values, encoding_fast.pixel_values, atol=1e-3))
|
||||||
|
|
||||||
|
@require_vision
|
||||||
|
@require_torch
|
||||||
|
def test_fast_is_faster_than_slow(self):
|
||||||
|
import time
|
||||||
|
|
||||||
|
def measure_time(self, image_processor, dummy_image):
|
||||||
|
start = time.time()
|
||||||
|
_ = image_processor(dummy_image, return_tensors="pt")
|
||||||
|
return time.time() - start
|
||||||
|
|
||||||
|
dummy_image = Image.open(
|
||||||
|
requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw
|
||||||
|
)
|
||||||
|
|
||||||
|
if not self.test_slow_image_processor or not self.test_fast_image_processor:
|
||||||
|
self.skipTest("Skipping speed test")
|
||||||
|
|
||||||
|
if self.image_processing_class is None or self.fast_image_processing_class is None:
|
||||||
|
self.skipTest("Skipping speed test as one of the image processors is not defined")
|
||||||
|
|
||||||
|
image_processor_slow = self.image_processing_class(**self.image_processor_dict)
|
||||||
|
image_processor_fast = self.fast_image_processing_class(**self.image_processor_dict)
|
||||||
|
|
||||||
|
slow_time = self.measure_time(image_processor_slow, dummy_image)
|
||||||
|
fast_time = self.measure_time(image_processor_fast, dummy_image)
|
||||||
|
|
||||||
|
self.assertLessEqual(fast_time, slow_time)
|
||||||
|
|
||||||
def test_image_processor_to_json_string(self):
|
def test_image_processor_to_json_string(self):
|
||||||
image_processor = self.image_processing_class(**self.image_processor_dict)
|
for image_processing_class in self.image_processor_list:
|
||||||
obj = json.loads(image_processor.to_json_string())
|
image_processor = image_processing_class(**self.image_processor_dict)
|
||||||
for key, value in self.image_processor_dict.items():
|
obj = json.loads(image_processor.to_json_string())
|
||||||
self.assertEqual(obj[key], value)
|
for key, value in self.image_processor_dict.items():
|
||||||
|
self.assertEqual(obj[key], value)
|
||||||
|
|
||||||
def test_image_processor_to_json_file(self):
|
def test_image_processor_to_json_file(self):
|
||||||
image_processor_first = self.image_processing_class(**self.image_processor_dict)
|
for image_processing_class in self.image_processor_list:
|
||||||
|
image_processor_first = image_processing_class(**self.image_processor_dict)
|
||||||
|
|
||||||
with tempfile.TemporaryDirectory() as tmpdirname:
|
with tempfile.TemporaryDirectory() as tmpdirname:
|
||||||
json_file_path = os.path.join(tmpdirname, "image_processor.json")
|
json_file_path = os.path.join(tmpdirname, "image_processor.json")
|
||||||
image_processor_first.to_json_file(json_file_path)
|
image_processor_first.to_json_file(json_file_path)
|
||||||
image_processor_second = self.image_processing_class.from_json_file(json_file_path)
|
image_processor_second = image_processing_class.from_json_file(json_file_path)
|
||||||
|
|
||||||
self.assertEqual(image_processor_second.to_dict(), image_processor_first.to_dict())
|
self.assertEqual(image_processor_second.to_dict(), image_processor_first.to_dict())
|
||||||
|
|
||||||
def test_image_processor_from_and_save_pretrained(self):
|
def test_image_processor_from_and_save_pretrained(self):
|
||||||
image_processor_first = self.image_processing_class(**self.image_processor_dict)
|
for image_processing_class in self.image_processor_list:
|
||||||
|
image_processor_first = image_processing_class(**self.image_processor_dict)
|
||||||
|
|
||||||
with tempfile.TemporaryDirectory() as tmpdirname:
|
with tempfile.TemporaryDirectory() as tmpdirname:
|
||||||
saved_file = image_processor_first.save_pretrained(tmpdirname)[0]
|
saved_file = image_processor_first.save_pretrained(tmpdirname)[0]
|
||||||
check_json_file_has_correct_format(saved_file)
|
check_json_file_has_correct_format(saved_file)
|
||||||
image_processor_second = self.image_processing_class.from_pretrained(tmpdirname)
|
image_processor_second = image_processing_class.from_pretrained(tmpdirname)
|
||||||
|
|
||||||
self.assertEqual(image_processor_second.to_dict(), image_processor_first.to_dict())
|
self.assertEqual(image_processor_second.to_dict(), image_processor_first.to_dict())
|
||||||
|
|
||||||
|
def test_image_processor_save_load_with_autoimageprocessor(self):
|
||||||
|
for image_processing_class in self.image_processor_list:
|
||||||
|
image_processor_first = image_processing_class(**self.image_processor_dict)
|
||||||
|
|
||||||
|
with tempfile.TemporaryDirectory() as tmpdirname:
|
||||||
|
saved_file = image_processor_first.save_pretrained(tmpdirname)[0]
|
||||||
|
check_json_file_has_correct_format(saved_file)
|
||||||
|
|
||||||
|
image_processor_second = AutoImageProcessor.from_pretrained(tmpdirname)
|
||||||
|
|
||||||
|
self.assertEqual(image_processor_second.to_dict(), image_processor_first.to_dict())
|
||||||
|
|
||||||
def test_init_without_params(self):
|
def test_init_without_params(self):
|
||||||
image_processor = self.image_processing_class()
|
for image_processing_class in self.image_processor_list:
|
||||||
self.assertIsNotNone(image_processor)
|
image_processor = image_processing_class()
|
||||||
|
self.assertIsNotNone(image_processor)
|
||||||
|
|
||||||
@require_torch
|
@require_torch
|
||||||
@require_vision
|
@require_vision
|
||||||
def test_cast_dtype_device(self):
|
def test_cast_dtype_device(self):
|
||||||
if self.test_cast_dtype is not None:
|
for image_processing_class in self.image_processor_list:
|
||||||
# Initialize image_processor
|
if self.test_cast_dtype is not None:
|
||||||
image_processor = self.image_processing_class(**self.image_processor_dict)
|
# Initialize image_processor
|
||||||
|
image_processor = image_processing_class(**self.image_processor_dict)
|
||||||
|
|
||||||
|
# create random PyTorch tensors
|
||||||
|
image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, torchify=True)
|
||||||
|
|
||||||
|
encoding = image_processor(image_inputs, return_tensors="pt")
|
||||||
|
# for layoutLM compatiblity
|
||||||
|
self.assertEqual(encoding.pixel_values.device, torch.device("cpu"))
|
||||||
|
self.assertEqual(encoding.pixel_values.dtype, torch.float32)
|
||||||
|
|
||||||
|
encoding = image_processor(image_inputs, return_tensors="pt").to(torch.float16)
|
||||||
|
self.assertEqual(encoding.pixel_values.device, torch.device("cpu"))
|
||||||
|
self.assertEqual(encoding.pixel_values.dtype, torch.float16)
|
||||||
|
|
||||||
|
encoding = image_processor(image_inputs, return_tensors="pt").to("cpu", torch.bfloat16)
|
||||||
|
self.assertEqual(encoding.pixel_values.device, torch.device("cpu"))
|
||||||
|
self.assertEqual(encoding.pixel_values.dtype, torch.bfloat16)
|
||||||
|
|
||||||
|
with self.assertRaises(TypeError):
|
||||||
|
_ = image_processor(image_inputs, return_tensors="pt").to(torch.bfloat16, "cpu")
|
||||||
|
|
||||||
|
# Try with text + image feature
|
||||||
|
encoding = image_processor(image_inputs, return_tensors="pt")
|
||||||
|
encoding.update({"input_ids": torch.LongTensor([[1, 2, 3], [4, 5, 6]])})
|
||||||
|
encoding = encoding.to(torch.float16)
|
||||||
|
|
||||||
|
self.assertEqual(encoding.pixel_values.device, torch.device("cpu"))
|
||||||
|
self.assertEqual(encoding.pixel_values.dtype, torch.float16)
|
||||||
|
self.assertEqual(encoding.input_ids.dtype, torch.long)
|
||||||
|
|
||||||
|
def test_call_pil(self):
|
||||||
|
for image_processing_class in self.image_processor_list:
|
||||||
|
# Initialize image_processing
|
||||||
|
image_processing = image_processing_class(**self.image_processor_dict)
|
||||||
|
# create random PIL images
|
||||||
|
image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False)
|
||||||
|
for image in image_inputs:
|
||||||
|
self.assertIsInstance(image, Image.Image)
|
||||||
|
|
||||||
|
# Test not batched input
|
||||||
|
encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values
|
||||||
|
expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]])
|
||||||
|
self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape))
|
||||||
|
|
||||||
|
# Test batched
|
||||||
|
encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values
|
||||||
|
expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs)
|
||||||
|
self.assertEqual(
|
||||||
|
tuple(encoded_images.shape), (self.image_processor_tester.batch_size, *expected_output_image_shape)
|
||||||
|
)
|
||||||
|
|
||||||
|
def test_call_numpy(self):
|
||||||
|
for image_processing_class in self.image_processor_list:
|
||||||
|
# Initialize image_processing
|
||||||
|
image_processing = image_processing_class(**self.image_processor_dict)
|
||||||
|
# create random numpy tensors
|
||||||
|
image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True)
|
||||||
|
for image in image_inputs:
|
||||||
|
self.assertIsInstance(image, np.ndarray)
|
||||||
|
|
||||||
|
# Test not batched input
|
||||||
|
encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values
|
||||||
|
expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]])
|
||||||
|
self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape))
|
||||||
|
|
||||||
|
# Test batched
|
||||||
|
encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values
|
||||||
|
expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs)
|
||||||
|
self.assertEqual(
|
||||||
|
tuple(encoded_images.shape), (self.image_processor_tester.batch_size, *expected_output_image_shape)
|
||||||
|
)
|
||||||
|
|
||||||
|
def test_call_pytorch(self):
|
||||||
|
for image_processing_class in self.image_processor_list:
|
||||||
|
# Initialize image_processing
|
||||||
|
image_processing = image_processing_class(**self.image_processor_dict)
|
||||||
# create random PyTorch tensors
|
# create random PyTorch tensors
|
||||||
image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, torchify=True)
|
image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, torchify=True)
|
||||||
|
|
||||||
encoding = image_processor(image_inputs, return_tensors="pt")
|
for image in image_inputs:
|
||||||
# for layoutLM compatiblity
|
self.assertIsInstance(image, torch.Tensor)
|
||||||
self.assertEqual(encoding.pixel_values.device, torch.device("cpu"))
|
|
||||||
self.assertEqual(encoding.pixel_values.dtype, torch.float32)
|
|
||||||
|
|
||||||
encoding = image_processor(image_inputs, return_tensors="pt").to(torch.float16)
|
# Test not batched input
|
||||||
self.assertEqual(encoding.pixel_values.device, torch.device("cpu"))
|
encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values
|
||||||
self.assertEqual(encoding.pixel_values.dtype, torch.float16)
|
expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]])
|
||||||
|
self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape))
|
||||||
|
|
||||||
encoding = image_processor(image_inputs, return_tensors="pt").to("cpu", torch.bfloat16)
|
# Test batched
|
||||||
self.assertEqual(encoding.pixel_values.device, torch.device("cpu"))
|
expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs)
|
||||||
self.assertEqual(encoding.pixel_values.dtype, torch.bfloat16)
|
encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values
|
||||||
|
self.assertEqual(
|
||||||
with self.assertRaises(TypeError):
|
tuple(encoded_images.shape),
|
||||||
_ = image_processor(image_inputs, return_tensors="pt").to(torch.bfloat16, "cpu")
|
(self.image_processor_tester.batch_size, *expected_output_image_shape),
|
||||||
|
)
|
||||||
# Try with text + image feature
|
|
||||||
encoding = image_processor(image_inputs, return_tensors="pt")
|
|
||||||
encoding.update({"input_ids": torch.LongTensor([[1, 2, 3], [4, 5, 6]])})
|
|
||||||
encoding = encoding.to(torch.float16)
|
|
||||||
|
|
||||||
self.assertEqual(encoding.pixel_values.device, torch.device("cpu"))
|
|
||||||
self.assertEqual(encoding.pixel_values.dtype, torch.float16)
|
|
||||||
self.assertEqual(encoding.input_ids.dtype, torch.long)
|
|
||||||
|
|
||||||
def test_call_pil(self):
|
|
||||||
# Initialize image_processing
|
|
||||||
image_processing = self.image_processing_class(**self.image_processor_dict)
|
|
||||||
# create random PIL images
|
|
||||||
image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False)
|
|
||||||
for image in image_inputs:
|
|
||||||
self.assertIsInstance(image, Image.Image)
|
|
||||||
|
|
||||||
# Test not batched input
|
|
||||||
encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values
|
|
||||||
expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]])
|
|
||||||
self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape))
|
|
||||||
|
|
||||||
# Test batched
|
|
||||||
encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values
|
|
||||||
expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs)
|
|
||||||
self.assertEqual(
|
|
||||||
tuple(encoded_images.shape), (self.image_processor_tester.batch_size, *expected_output_image_shape)
|
|
||||||
)
|
|
||||||
|
|
||||||
def test_call_numpy(self):
|
|
||||||
# Initialize image_processing
|
|
||||||
image_processing = self.image_processing_class(**self.image_processor_dict)
|
|
||||||
# create random numpy tensors
|
|
||||||
image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True)
|
|
||||||
for image in image_inputs:
|
|
||||||
self.assertIsInstance(image, np.ndarray)
|
|
||||||
|
|
||||||
# Test not batched input
|
|
||||||
encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values
|
|
||||||
expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]])
|
|
||||||
self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape))
|
|
||||||
|
|
||||||
# Test batched
|
|
||||||
encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values
|
|
||||||
expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs)
|
|
||||||
self.assertEqual(
|
|
||||||
tuple(encoded_images.shape), (self.image_processor_tester.batch_size, *expected_output_image_shape)
|
|
||||||
)
|
|
||||||
|
|
||||||
def test_call_pytorch(self):
|
|
||||||
# Initialize image_processing
|
|
||||||
image_processing = self.image_processing_class(**self.image_processor_dict)
|
|
||||||
# create random PyTorch tensors
|
|
||||||
image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, torchify=True)
|
|
||||||
|
|
||||||
for image in image_inputs:
|
|
||||||
self.assertIsInstance(image, torch.Tensor)
|
|
||||||
|
|
||||||
# Test not batched input
|
|
||||||
encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values
|
|
||||||
expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]])
|
|
||||||
self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape))
|
|
||||||
|
|
||||||
# Test batched
|
|
||||||
expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs)
|
|
||||||
encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values
|
|
||||||
self.assertEqual(
|
|
||||||
tuple(encoded_images.shape),
|
|
||||||
(self.image_processor_tester.batch_size, *expected_output_image_shape),
|
|
||||||
)
|
|
||||||
|
|
||||||
def test_call_numpy_4_channels(self):
|
def test_call_numpy_4_channels(self):
|
||||||
# Test that can process images which have an arbitrary number of channels
|
for image_processing_class in self.image_processor_list:
|
||||||
# Initialize image_processing
|
# Test that can process images which have an arbitrary number of channels
|
||||||
image_processor = self.image_processing_class(**self.image_processor_dict)
|
# Initialize image_processing
|
||||||
|
image_processor = image_processing_class(**self.image_processor_dict)
|
||||||
|
|
||||||
# create random numpy tensors
|
# create random numpy tensors
|
||||||
self.image_processor_tester.num_channels = 4
|
self.image_processor_tester.num_channels = 4
|
||||||
image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True)
|
image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True)
|
||||||
|
|
||||||
# Test not batched input
|
# Test not batched input
|
||||||
encoded_images = image_processor(
|
encoded_images = image_processor(
|
||||||
image_inputs[0],
|
image_inputs[0],
|
||||||
return_tensors="pt",
|
return_tensors="pt",
|
||||||
input_data_format="channels_first",
|
input_data_format="channels_first",
|
||||||
image_mean=0,
|
image_mean=0,
|
||||||
image_std=1,
|
image_std=1,
|
||||||
).pixel_values
|
).pixel_values
|
||||||
expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]])
|
expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]])
|
||||||
self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape))
|
self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape))
|
||||||
|
|
||||||
# Test batched
|
# Test batched
|
||||||
encoded_images = image_processor(
|
encoded_images = image_processor(
|
||||||
image_inputs,
|
image_inputs,
|
||||||
return_tensors="pt",
|
return_tensors="pt",
|
||||||
input_data_format="channels_first",
|
input_data_format="channels_first",
|
||||||
image_mean=0,
|
image_mean=0,
|
||||||
image_std=1,
|
image_std=1,
|
||||||
).pixel_values
|
).pixel_values
|
||||||
expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs)
|
expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs)
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
tuple(encoded_images.shape), (self.image_processor_tester.batch_size, *expected_output_image_shape)
|
tuple(encoded_images.shape), (self.image_processor_tester.batch_size, *expected_output_image_shape)
|
||||||
)
|
)
|
||||||
|
|
||||||
def test_image_processor_preprocess_arguments(self):
|
def test_image_processor_preprocess_arguments(self):
|
||||||
image_processor = self.image_processing_class(**self.image_processor_dict)
|
for image_processing_class in self.image_processor_list:
|
||||||
if hasattr(image_processor, "_valid_processor_keys") and hasattr(image_processor, "preprocess"):
|
image_processor = image_processing_class(**self.image_processor_dict)
|
||||||
preprocess_parameter_names = inspect.getfullargspec(image_processor.preprocess).args
|
if hasattr(image_processor, "_valid_processor_keys") and hasattr(image_processor, "preprocess"):
|
||||||
preprocess_parameter_names.remove("self")
|
preprocess_parameter_names = inspect.getfullargspec(image_processor.preprocess).args
|
||||||
preprocess_parameter_names.sort()
|
preprocess_parameter_names.remove("self")
|
||||||
valid_processor_keys = image_processor._valid_processor_keys
|
preprocess_parameter_names.sort()
|
||||||
valid_processor_keys.sort()
|
valid_processor_keys = image_processor._valid_processor_keys
|
||||||
self.assertEqual(preprocess_parameter_names, valid_processor_keys)
|
valid_processor_keys.sort()
|
||||||
|
self.assertEqual(preprocess_parameter_names, valid_processor_keys)
|
||||||
|
|
||||||
|
|
||||||
class AnnotationFormatTestMixin:
|
class AnnotationFormatTestMixin:
|
||||||
|
|||||||
Reference in New Issue
Block a user