Fast image processor (#28847)
* Draft fast image processors * Draft working fast version * py3.8 compatible cache * Enable loading fast image processors through auto * Tidy up; rescale behaviour based on input type * Enable tests for fast image processors * Smarter rescaling * Don't default to Fast * Safer imports * Add necessary Pillow requirement * Woops * Add AutoImageProcessor test * Fix up * Fix test for imagegpt * Fix test * Review comments * Add warning for TF and JAX input types * Rearrange * Return transforms * NumpyToTensor transformation * Rebase - include changes from upstream in ImageProcessingMixin * Safe typing * Fix up * convert mean/std to tesnor to rescale * Don't store transforms in state * Fix up * Update src/transformers/image_processing_utils_fast.py Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> * Update src/transformers/models/auto/image_processing_auto.py Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> * Update src/transformers/models/auto/image_processing_auto.py Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> * Update src/transformers/models/auto/image_processing_auto.py Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> * Warn if fast image processor available * Update src/transformers/models/vit/image_processing_vit_fast.py * Transpose incoming numpy images to be in CHW format * Update mapping names based on packages, auto set fast to None * Fix up * Fix * Add AutoImageProcessor.from_pretrained(checkpoint, use_fast=True) test * Update src/transformers/models/vit/image_processing_vit_fast.py Co-authored-by: Pavel Iakubovskii <qubvel@gmail.com> * Add equivalence and speed tests * Fix up --------- Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> Co-authored-by: Pavel Iakubovskii <qubvel@gmail.com>
This commit is contained in:
@@ -32,3 +32,8 @@ An image processor is in charge of preparing input features for vision models an
|
||||
## BaseImageProcessor
|
||||
|
||||
[[autodoc]] image_processing_utils.BaseImageProcessor
|
||||
|
||||
|
||||
## BaseImageProcessorFast
|
||||
|
||||
[[autodoc]] image_processing_utils_fast.BaseImageProcessorFast
|
||||
|
||||
@@ -62,7 +62,7 @@ Following the original Vision Transformer, some follow-up works have been made:
|
||||
This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code (written in JAX) can be
|
||||
found [here](https://github.com/google-research/vision_transformer).
|
||||
|
||||
Note that we converted the weights from Ross Wightman's [timm library](https://github.com/rwightman/pytorch-image-models),
|
||||
Note that we converted the weights from Ross Wightman's [timm library](https://github.com/rwightman/pytorch-image-models),
|
||||
who already converted the weights from JAX to PyTorch. Credits go to him!
|
||||
|
||||
## Usage tips
|
||||
@@ -158,6 +158,11 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h
|
||||
[[autodoc]] ViTImageProcessor
|
||||
- preprocess
|
||||
|
||||
## ViTImageProcessorFast
|
||||
|
||||
[[autodoc]] ViTImageProcessorFast
|
||||
- preprocess
|
||||
|
||||
<frameworkcontent>
|
||||
<pt>
|
||||
|
||||
|
||||
@@ -29,3 +29,4 @@ timm
|
||||
albumentations >= 1.4.5
|
||||
torchmetrics
|
||||
pycocotools
|
||||
Pillow>=10.0.1,<=15.0
|
||||
|
||||
@@ -1104,7 +1104,8 @@ except OptionalDependencyNotAvailable:
|
||||
name for name in dir(dummy_vision_objects) if not name.startswith("_")
|
||||
]
|
||||
else:
|
||||
_import_structure["image_processing_utils"] = ["ImageProcessingMixin"]
|
||||
_import_structure["image_processing_base"] = ["ImageProcessingMixin"]
|
||||
_import_structure["image_processing_utils"] = ["BaseImageProcessor"]
|
||||
_import_structure["image_utils"] = ["ImageFeatureExtractionMixin"]
|
||||
_import_structure["models.beit"].extend(["BeitFeatureExtractor", "BeitImageProcessor"])
|
||||
_import_structure["models.bit"].extend(["BitImageProcessor"])
|
||||
@@ -1167,6 +1168,18 @@ else:
|
||||
_import_structure["models.vivit"].append("VivitImageProcessor")
|
||||
_import_structure["models.yolos"].extend(["YolosFeatureExtractor", "YolosImageProcessor"])
|
||||
|
||||
try:
|
||||
if not is_torchvision_available():
|
||||
raise OptionalDependencyNotAvailable()
|
||||
except OptionalDependencyNotAvailable:
|
||||
from .utils import dummy_torchvision_objects
|
||||
|
||||
_import_structure["utils.dummy_torchvision_objects"] = [
|
||||
name for name in dir(dummy_torchvision_objects) if not name.startswith("_")
|
||||
]
|
||||
else:
|
||||
_import_structure["image_processing_utils_fast"] = ["BaseImageProcessorFast"]
|
||||
_import_structure["models.vit"].append("ViTImageProcessorFast")
|
||||
|
||||
# PyTorch-backed objects
|
||||
try:
|
||||
@@ -5703,7 +5716,8 @@ if TYPE_CHECKING:
|
||||
except OptionalDependencyNotAvailable:
|
||||
from .utils.dummy_vision_objects import *
|
||||
else:
|
||||
from .image_processing_utils import ImageProcessingMixin
|
||||
from .image_processing_base import ImageProcessingMixin
|
||||
from .image_processing_utils import BaseImageProcessor
|
||||
from .image_utils import ImageFeatureExtractionMixin
|
||||
from .models.beit import BeitFeatureExtractor, BeitImageProcessor
|
||||
from .models.bit import BitImageProcessor
|
||||
@@ -5793,6 +5807,15 @@ if TYPE_CHECKING:
|
||||
from .models.vivit import VivitImageProcessor
|
||||
from .models.yolos import YolosFeatureExtractor, YolosImageProcessor
|
||||
|
||||
try:
|
||||
if not is_torchvision_available():
|
||||
raise OptionalDependencyNotAvailable()
|
||||
except OptionalDependencyNotAvailable:
|
||||
from .utils.dummy_torchvision_objects import *
|
||||
else:
|
||||
from .image_processing_utils_fast import BaseImageProcessorFast
|
||||
from .models.vit import ViTImageProcessorFast
|
||||
|
||||
# Modeling
|
||||
try:
|
||||
if not is_torch_available():
|
||||
|
||||
554
src/transformers/image_processing_base.py
Normal file
554
src/transformers/image_processing_base.py
Normal file
@@ -0,0 +1,554 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2020 The HuggingFace Inc. team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
|
||||
import copy
|
||||
import json
|
||||
import os
|
||||
import warnings
|
||||
from io import BytesIO
|
||||
from typing import Any, Dict, List, Optional, Tuple, Union
|
||||
|
||||
import numpy as np
|
||||
import requests
|
||||
|
||||
from .dynamic_module_utils import custom_object_save
|
||||
from .feature_extraction_utils import BatchFeature as BaseBatchFeature
|
||||
from .utils import (
|
||||
IMAGE_PROCESSOR_NAME,
|
||||
PushToHubMixin,
|
||||
add_model_info_to_auto_map,
|
||||
add_model_info_to_custom_pipelines,
|
||||
cached_file,
|
||||
copy_func,
|
||||
download_url,
|
||||
is_offline_mode,
|
||||
is_remote_url,
|
||||
is_vision_available,
|
||||
logging,
|
||||
)
|
||||
|
||||
|
||||
if is_vision_available():
|
||||
from PIL import Image
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
# TODO: Move BatchFeature to be imported by both feature_extraction_utils and image_processing_utils
|
||||
# We override the class string here, but logic is the same.
|
||||
class BatchFeature(BaseBatchFeature):
    r"""
    Container for the output of an image processor's `__call__` method.

    The class adds no logic on top of its base class; it exists so that the docstring talks about image
    processors. It is derived from a Python dictionary and can be used like one.

    Args:
        data (`dict`):
            Dictionary of lists/arrays/tensors returned by the `__call__` method (`'pixel_values'`, etc.).
        tensor_type (`Union[None, str, TensorType]`, *optional*):
            When given, the values are converted to PyTorch/TensorFlow/NumPy tensors of that type at
            initialization.
    """
|
||||
|
||||
|
||||
# TODO: (Amy) - factor out the common parts of this and the feature extractor
|
||||
class ImageProcessingMixin(PushToHubMixin):
|
||||
"""
|
||||
This is an image processor mixin used to provide saving/loading functionality for sequential and image feature
|
||||
extractors.
|
||||
"""
|
||||
|
||||
_auto_class = None
|
||||
|
||||
def __init__(self, **kwargs):
    """Store every remaining entry of `kwargs` as an attribute of the instance."""
    # Legacy key written when `XXXFeatureExtractor` classes handled image processing. It is misleading for
    # `XXXImageProcessor` classes, so it is dropped.
    kwargs.pop("feature_extractor_type", None)
    # "processor_class" is kept on a private attribute instead of a public one.
    self._processor_class = kwargs.pop("processor_class", None)
    # Everything else is set as-is; these attributes have no default values.
    for attr_name, attr_value in kwargs.items():
        try:
            setattr(self, attr_name, attr_value)
        except AttributeError as err:
            logger.error(f"Can't set {attr_name} with value {attr_value} for {self}")
            raise err
|
||||
|
||||
def _set_processor_class(self, processor_class: str):
    """Record `processor_class` on the private `_processor_class` attribute."""
    setattr(self, "_processor_class", processor_class)
|
||||
|
||||
@classmethod
def from_pretrained(
    cls,
    pretrained_model_name_or_path: Union[str, os.PathLike],
    cache_dir: Optional[Union[str, os.PathLike]] = None,
    force_download: bool = False,
    local_files_only: bool = False,
    token: Optional[Union[str, bool]] = None,
    revision: str = "main",
    **kwargs,
):
    r"""
    Instantiate a type of [`~image_processing_utils.ImageProcessingMixin`] from a saved image processor.

    The loading options are forwarded to `get_image_processor_dict`, and the dictionary it returns is passed
    to `from_dict` together with the keyword arguments that were not consumed.

    Args:
        pretrained_model_name_or_path (`str` or `os.PathLike`):
            One of:

            - a string, the *model id* of a pretrained image_processor hosted inside a model repo on
              huggingface.co.
            - a path to a *directory* containing an image processor file saved using the
              [`~image_processing_utils.ImageProcessingMixin.save_pretrained`] method, e.g.,
              `./my_model_directory/`.
            - a path or url to a saved image processor JSON *file*, e.g.,
              `./my_model_directory/preprocessor_config.json`.
        cache_dir (`str` or `os.PathLike`, *optional*):
            Directory in which a downloaded image processor should be cached when the standard cache should
            not be used.
        force_download (`bool`, *optional*, defaults to `False`):
            Whether or not to (re-)download the image processor files, overriding cached versions if they
            exist.
        resume_download:
            Deprecated and ignored. All downloads are now resumed by default when possible.
            Will be removed in v5 of Transformers.
        proxies (`Dict[str, str]`, *optional*):
            A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
            'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request.
        token (`str` or `bool`, *optional*):
            The token to use as HTTP bearer authorization for remote files. If `True`, or not specified, will
            use the token generated when running `huggingface-cli login` (stored in `~/.huggingface`).
        revision (`str`, *optional*, defaults to `"main"`):
            The specific model version to use. It can be a branch name, a tag name, or a commit id, since we
            use a git-based system for storing models and other artifacts on huggingface.co, so `revision`
            can be any identifier allowed by git.


            <Tip>

            To test a pull request you made on the Hub, you can pass `revision="refs/pr/<pr_number>"`.

            </Tip>

        return_unused_kwargs (`bool`, *optional*, defaults to `False`):
            If `False`, only the image processor object is returned. If `True`, a
            `Tuple(image_processor, unused_kwargs)` is returned, where *unused_kwargs* holds the key/value
            pairs whose keys are not image processor attributes, i.e. the part of `kwargs` that was not used
            to update `image_processor`.
        subfolder (`str`, *optional*, defaults to `""`):
            In case the relevant files are located inside a subfolder of the model repo on huggingface.co,
            you can specify the folder name here.
        kwargs (`Dict[str, Any]`, *optional*):
            Values for keys that are image processor attributes override the loaded values. What happens to
            the other key/value pairs is controlled by `return_unused_kwargs`.

    Returns:
        An image processor of type [`~image_processing_utils.ImageProcessingMixin`].

    Raises:
        ValueError: If both `token` and the deprecated `use_auth_token` are given.

    Examples:

    ```python
    # We can't instantiate directly the base class *ImageProcessingMixin* so let's show the examples on a
    # derived class: *CLIPImageProcessor*
    image_processor = CLIPImageProcessor.from_pretrained(
        "openai/clip-vit-base-patch32"
    )  # Download image_processing_config from huggingface.co and cache.
    image_processor = CLIPImageProcessor.from_pretrained(
        "./test/saved_model/"
    )  # E.g. image processor (or model) was saved using *save_pretrained('./test/saved_model/')*
    image_processor = CLIPImageProcessor.from_pretrained("./test/saved_model/preprocessor_config.json")
    image_processor = CLIPImageProcessor.from_pretrained(
        "openai/clip-vit-base-patch32", do_normalize=False, foo=False
    )
    assert image_processor.do_normalize is False
    image_processor, unused_kwargs = CLIPImageProcessor.from_pretrained(
        "openai/clip-vit-base-patch32", do_normalize=False, foo=False, return_unused_kwargs=True
    )
    assert image_processor.do_normalize is False
    assert unused_kwargs == {"foo": False}
    ```"""
    # Fold the explicit loading options into `kwargs` so they travel with the rest.
    kwargs.update(
        cache_dir=cache_dir,
        force_download=force_download,
        local_files_only=local_files_only,
        revision=revision,
    )

    # `use_auth_token` is the deprecated spelling of `token`; it may not be combined with it.
    deprecated_token = kwargs.pop("use_auth_token", None)
    if deprecated_token is not None:
        warnings.warn(
            "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.",
            FutureWarning,
        )
        if token is not None:
            raise ValueError(
                "`token` and `use_auth_token` are both specified. Please set only the argument `token`."
            )
        token = deprecated_token

    # Only forward a token that was actually provided.
    if token is not None:
        kwargs["token"] = token

    image_processor_dict, remaining_kwargs = cls.get_image_processor_dict(pretrained_model_name_or_path, **kwargs)
    return cls.from_dict(image_processor_dict, **remaining_kwargs)
|
||||
|
||||
def save_pretrained(self, save_directory: Union[str, os.PathLike], push_to_hub: bool = False, **kwargs):
    """
    Save an image processor object to the directory `save_directory`, so that it can be re-loaded using the
    [`~image_processing_utils.ImageProcessingMixin.from_pretrained`] class method.

    Args:
        save_directory (`str` or `os.PathLike`):
            Directory where the image processor JSON file will be saved (will be created if it does not exist).
        push_to_hub (`bool`, *optional*, defaults to `False`):
            Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the
            repository you want to push to with `repo_id` (will default to the name of `save_directory` in your
            namespace).
        kwargs (`Dict[str, Any]`, *optional*):
            Additional key word arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method.

    Returns:
        `List[str]`: A one-element list holding the path of the JSON file that was written.

    Raises:
        ValueError: If both `token` and the deprecated `use_auth_token` are given.
        AssertionError: If `save_directory` points to an existing file.
    """
    # `use_auth_token` is the deprecated spelling of `token`; it may not be combined with it.
    use_auth_token = kwargs.pop("use_auth_token", None)
    if use_auth_token is not None:
        warnings.warn(
            "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.",
            FutureWarning,
        )
        if kwargs.get("token", None) is not None:
            raise ValueError(
                "`token` and `use_auth_token` are both specified. Please set only the argument `token`."
            )
        kwargs["token"] = use_auth_token

    if os.path.isfile(save_directory):
        raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file")

    os.makedirs(save_directory, exist_ok=True)

    if push_to_hub:
        commit_message = kwargs.pop("commit_message", None)
        # The default is evaluated even when `repo_id` is supplied, and an `os.PathLike` such as
        # `pathlib.Path` has no `.split`, so the directory is converted to `str` before taking its last
        # path component.
        default_repo_id = str(save_directory).split(os.path.sep)[-1]
        repo_id = kwargs.pop("repo_id", default_repo_id)
        repo_id = self._create_repo(repo_id, **kwargs)
        # Snapshot taken before writing; it is handed to `_upload_modified_files` below.
        files_timestamps = self._get_files_timestamps(save_directory)

    # If we have a custom config, we copy the file defining it in the folder and set the attributes so it can be
    # loaded from the Hub.
    if self._auto_class is not None:
        custom_object_save(self, save_directory, config=self)

    # If we save using the predefined names, we can load using `from_pretrained`
    output_image_processor_file = os.path.join(save_directory, IMAGE_PROCESSOR_NAME)

    self.to_json_file(output_image_processor_file)
    logger.info(f"Image processor saved in {output_image_processor_file}")

    if push_to_hub:
        self._upload_modified_files(
            save_directory,
            repo_id,
            files_timestamps,
            commit_message=commit_message,
            token=kwargs.get("token"),
        )

    return [output_image_processor_file]
|
||||
|
||||
@classmethod
def get_image_processor_dict(
    cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs
) -> Tuple[Dict[str, Any], Dict[str, Any]]:
    """
    Resolve `pretrained_model_name_or_path` to the dictionary of parameters used to instantiate an image
    processor of type [`~image_processing_utils.ImageProcessingMixin`] with `from_dict`.

    Parameters:
        pretrained_model_name_or_path (`str` or `os.PathLike`):
            The identifier of the pre-trained checkpoint from which we want the dictionary of parameters.
        subfolder (`str`, *optional*, defaults to `""`):
            In case the relevant files are located inside a subfolder of the model repo on huggingface.co, you can
            specify the folder name here.

    Returns:
        `Tuple[Dict, Dict]`: The parameter dictionary read from the JSON file, and the keyword arguments that
        were not consumed here.

    Raises:
        ValueError: If both `token` and the deprecated `use_auth_token` are given.
        EnvironmentError: If the file cannot be resolved or does not contain valid JSON.
    """
    # Loading options are consumed here; anything left in `kwargs` is returned to the caller.
    cache_dir = kwargs.pop("cache_dir", None)
    force_download = kwargs.pop("force_download", False)
    resume_download = kwargs.pop("resume_download", None)
    proxies = kwargs.pop("proxies", None)
    token = kwargs.pop("token", None)
    use_auth_token = kwargs.pop("use_auth_token", None)
    local_files_only = kwargs.pop("local_files_only", False)
    revision = kwargs.pop("revision", None)
    subfolder = kwargs.pop("subfolder", "")

    from_pipeline = kwargs.pop("_from_pipeline", None)
    from_auto_class = kwargs.pop("_from_auto", False)

    if use_auth_token is not None:
        warnings.warn(
            "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.",
            FutureWarning,
        )
        if token is not None:
            raise ValueError(
                "`token` and `use_auth_token` are both specified. Please set only the argument `token`."
            )
        token = use_auth_token

    user_agent = {"file_type": "image processor", "from_auto_class": from_auto_class}
    if from_pipeline is not None:
        user_agent["using_pipeline"] = from_pipeline

    if is_offline_mode() and not local_files_only:
        logger.info("Offline mode: forcing local_files_only=True")
        local_files_only = True

    pretrained_model_name_or_path = str(pretrained_model_name_or_path)
    is_local = os.path.isdir(pretrained_model_name_or_path)
    if is_local:
        image_processor_file = os.path.join(pretrained_model_name_or_path, IMAGE_PROCESSOR_NAME)

    if os.path.isfile(pretrained_model_name_or_path):
        # A direct path to the JSON file: nothing to resolve.
        resolved_image_processor_file = pretrained_model_name_or_path
        is_local = True
    elif is_remote_url(pretrained_model_name_or_path):
        image_processor_file = pretrained_model_name_or_path
        resolved_image_processor_file = download_url(pretrained_model_name_or_path)
    else:
        image_processor_file = IMAGE_PROCESSOR_NAME
        try:
            # Load from local folder or from cache or download from model Hub and cache
            resolved_image_processor_file = cached_file(
                pretrained_model_name_or_path,
                image_processor_file,
                cache_dir=cache_dir,
                force_download=force_download,
                proxies=proxies,
                resume_download=resume_download,
                local_files_only=local_files_only,
                token=token,
                user_agent=user_agent,
                revision=revision,
                subfolder=subfolder,
            )
        except EnvironmentError:
            # Environment errors raised by `cached_file` are propagated unchanged; they already carry a
            # message adapted to the original exception.
            raise
        except Exception:
            # For any other exception, we throw a generic error.
            raise EnvironmentError(
                f"Can't load image processor for '{pretrained_model_name_or_path}'. If you were trying to load"
                " it from 'https://huggingface.co/models', make sure you don't have a local directory with the"
                f" same name. Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a"
                f" directory containing a {IMAGE_PROCESSOR_NAME} file"
            )

    try:
        # Load image_processor dict
        with open(resolved_image_processor_file, "r", encoding="utf-8") as reader:
            image_processor_dict = json.loads(reader.read())
    except json.JSONDecodeError:
        raise EnvironmentError(
            f"It looks like the config file at '{resolved_image_processor_file}' is not a valid JSON file."
        )

    if is_local:
        logger.info(f"loading configuration file {resolved_image_processor_file}")
    else:
        logger.info(
            f"loading configuration file {image_processor_file} from cache at {resolved_image_processor_file}"
        )
        # Entries of files that were not loaded locally are passed through the `add_model_info_*` helpers
        # together with the checkpoint identifier.
        if "auto_map" in image_processor_dict:
            image_processor_dict["auto_map"] = add_model_info_to_auto_map(
                image_processor_dict["auto_map"], pretrained_model_name_or_path
            )
        if "custom_pipelines" in image_processor_dict:
            image_processor_dict["custom_pipelines"] = add_model_info_to_custom_pipelines(
                image_processor_dict["custom_pipelines"], pretrained_model_name_or_path
            )

    return image_processor_dict, kwargs
|
||||
|
||||
@classmethod
def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs):
    """
    Instantiates a type of [`~image_processing_utils.ImageProcessingMixin`] from a Python dictionary of parameters.

    Args:
        image_processor_dict (`Dict[str, Any]`):
            Dictionary used to instantiate the image processor object. Such a dictionary can be retrieved
            from a pretrained checkpoint by leveraging the
            [`~image_processing_utils.ImageProcessingMixin.to_dict`] method. It is copied, not modified.
        kwargs (`Dict[str, Any]`):
            Additional parameters from which to initialize the image processor object.

    Returns:
        [`~image_processing_utils.ImageProcessingMixin`]: The image processor object instantiated from those
        parameters. When `return_unused_kwargs=True` is passed, a tuple of the image processor and the
        keyword arguments that did not match one of its attributes.
    """
    image_processor_dict = image_processor_dict.copy()
    return_unused_kwargs = kwargs.pop("return_unused_kwargs", False)

    # The `size` parameter is a dict and was previously an int or tuple in feature extractors.
    # `size` and `crop_size` overrides are written into the dictionary itself, so that they go through the
    # image processor's own conversion and are not overwritten afterwards.
    for size_key in ("size", "crop_size"):
        if size_key in kwargs and size_key in image_processor_dict:
            image_processor_dict[size_key] = kwargs.pop(size_key)

    image_processor = cls(**image_processor_dict)

    # Remaining kwargs that name an existing attribute override it and are then dropped from `kwargs`.
    for key, value in list(kwargs.items()):
        if hasattr(image_processor, key):
            setattr(image_processor, key, value)
            kwargs.pop(key)

    logger.info(f"Image processor {image_processor}")
    if return_unused_kwargs:
        return image_processor, kwargs
    return image_processor
|
||||
|
||||
def to_dict(self) -> Dict[str, Any]:
    """
    Serializes this instance to a Python dictionary.

    Returns:
        `Dict[str, Any]`: A deep copy of the instance attributes, plus an `"image_processor_type"` entry
        holding the class name.
    """
    serialized = copy.deepcopy(vars(self))
    serialized["image_processor_type"] = self.__class__.__name__
    return serialized
|
||||
|
||||
@classmethod
def from_json_file(cls, json_file: Union[str, os.PathLike]):
    """
    Instantiates an image processor of type [`~image_processing_utils.ImageProcessingMixin`] from the path to
    a JSON file of parameters.

    Args:
        json_file (`str` or `os.PathLike`):
            Path to the JSON file containing the parameters.

    Returns:
        An image processor of type [`~image_processing_utils.ImageProcessingMixin`]: The image_processor
        object instantiated from that JSON file.
    """
    with open(json_file, "r", encoding="utf-8") as handle:
        raw_text = handle.read()
    # The decoded entries are passed to the constructor as keyword arguments.
    return cls(**json.loads(raw_text))
|
||||
|
||||
def to_json_string(self) -> str:
    """
    Serializes this instance to a JSON string.

    Returns:
        `str`: String containing all the attributes that make up this image processor instance in JSON
        format, with sorted keys, two-space indentation and a trailing newline.
    """
    payload = self.to_dict()

    # NumPy arrays are converted to plain lists before dumping.
    for name, entry in payload.items():
        if isinstance(entry, np.ndarray):
            payload[name] = entry.tolist()

    # The private "_processor_class" attribute is saved under the public name "processor_class", and is
    # left out entirely when it is `None`.
    processor_class = payload.pop("_processor_class", None)
    if processor_class is not None:
        payload["processor_class"] = processor_class

    return f"{json.dumps(payload, indent=2, sort_keys=True)}\n"
|
||||
|
||||
def to_json_file(self, json_file_path: Union[str, os.PathLike]):
    """
    Save this instance to a JSON file.

    Args:
        json_file_path (`str` or `os.PathLike`):
            Path to the JSON file in which this image_processor instance's parameters will be saved.
    """
    with open(json_file_path, "w", encoding="utf-8") as out_file:
        serialized = self.to_json_string()
        out_file.write(serialized)
|
||||
|
||||
def __repr__(self):
    """Return the class name followed by the JSON serialization of the instance."""
    class_name = self.__class__.__name__
    return "{} {}".format(class_name, self.to_json_string())
|
||||
|
||||
@classmethod
def register_for_auto_class(cls, auto_class="AutoImageProcessor"):
    """
    Register this class with a given auto class. This should only be used for custom image processors as the ones
    in the library are already mapped with `AutoImageProcessor`.

    <Tip warning={true}>

    This API is experimental and may have some slight breaking changes in the next releases.

    </Tip>

    Args:
        auto_class (`str` or `type`, *optional*, defaults to `"AutoImageProcessor"`):
            The auto class to register this new image processor with.

    Raises:
        ValueError: If `transformers.models.auto` has no attribute with that name.
    """
    # Accept either the class itself or its name.
    auto_class_name = auto_class if isinstance(auto_class, str) else auto_class.__name__

    import transformers.models.auto as auto_module

    if not hasattr(auto_module, auto_class_name):
        raise ValueError(f"{auto_class_name} is not a valid auto class.")

    cls._auto_class = auto_class_name
|
||||
|
||||
def fetch_images(self, image_url_or_urls: Union[str, List[str]]):
    """
    Convert a single or a list of urls into the corresponding `PIL.Image` objects.

    A single url yields a single object; a list yields a list of objects, one per entry.

    Raises:
        ValueError: If `image_url_or_urls` is neither a `str` nor a `list`.
    """
    user_agent = (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0"
        " Safari/537.36"
    )
    headers = {"User-Agent": user_agent}

    if isinstance(image_url_or_urls, list):
        # Each entry is handled by a recursive call.
        return [self.fetch_images(entry) for entry in image_url_or_urls]

    if isinstance(image_url_or_urls, str):
        response = requests.get(image_url_or_urls, stream=True, headers=headers)
        response.raise_for_status()
        return Image.open(BytesIO(response.content))

    raise ValueError(f"only a single or a list of entries is supported but got type={type(image_url_or_urls)}")
|
||||
|
||||
|
||||
# Replace `push_to_hub` with the result of `copy_func` before its docstring is specialised below.
ImageProcessingMixin.push_to_hub = copy_func(ImageProcessingMixin.push_to_hub)
# `__doc__` can be `None`; in that case there is nothing to format.
if ImageProcessingMixin.push_to_hub.__doc__ is not None:
    ImageProcessingMixin.push_to_hub.__doc__ = ImageProcessingMixin.push_to_hub.__doc__.format(
        **{
            "object": "image processor",
            "object_class": "AutoImageProcessor",
            "object_files": "image processor file",
        }
    )
|
||||
@@ -13,38 +13,16 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import copy
|
||||
import json
|
||||
import os
|
||||
import warnings
|
||||
from io import BytesIO
|
||||
from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
|
||||
from typing import Dict, Iterable, Optional, Union
|
||||
|
||||
import numpy as np
|
||||
import requests
|
||||
|
||||
from .dynamic_module_utils import custom_object_save
|
||||
from .feature_extraction_utils import BatchFeature as BaseBatchFeature
|
||||
from .image_processing_base import BatchFeature, ImageProcessingMixin
|
||||
from .image_transforms import center_crop, normalize, rescale
|
||||
from .image_utils import ChannelDimension
|
||||
from .utils import (
|
||||
IMAGE_PROCESSOR_NAME,
|
||||
PushToHubMixin,
|
||||
add_model_info_to_auto_map,
|
||||
add_model_info_to_custom_pipelines,
|
||||
cached_file,
|
||||
copy_func,
|
||||
download_url,
|
||||
is_offline_mode,
|
||||
is_remote_url,
|
||||
is_vision_available,
|
||||
logging,
|
||||
)
|
||||
from .utils import logging
|
||||
|
||||
|
||||
if is_vision_available():
|
||||
from PIL import Image
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
@@ -54,505 +32,6 @@ INIT_SERVICE_KWARGS = [
|
||||
]
|
||||
|
||||
|
||||
# TODO: Move BatchFeature to be imported by both feature_extraction_utils and image_processing_utils
|
||||
# We override the class string here, but logic is the same.
|
||||
class BatchFeature(BaseBatchFeature):
    r"""
    Holds the output of the image processor specific `__call__` methods.

    This class is derived from a python dictionary and can be used as a dictionary. It defines no attributes or
    methods of its own: all behaviour is inherited from the base class it subclasses, and only this docstring differs.

    Args:
        data (`dict`):
            Dictionary of lists/arrays/tensors returned by the __call__ method ('pixel_values', etc.).
        tensor_type (`Union[None, str, TensorType]`, *optional*):
            You can give a tensor_type here to convert the lists of integers in PyTorch/TensorFlow/Numpy Tensors at
            initialization.
    """
|
||||
|
||||
|
||||
# TODO: (Amy) - factor out the common parts of this and the feature extractor
|
||||
class ImageProcessingMixin(PushToHubMixin):
    """
    This is an image processor mixin used to provide saving/loading functionality for sequential and image feature
    extractors.
    """

    # Name of the auto class this processor was registered with through `register_for_auto_class`; `None` otherwise.
    _auto_class = None

    def __init__(self, **kwargs):
        """Set elements of `kwargs` as attributes."""
        # This key was saved while we still used `XXXFeatureExtractor` for image processing. Now we use
        # `XXXImageProcessor`, this attribute and its value are misleading.
        kwargs.pop("feature_extractor_type", None)
        # Pop "processor_class" as it should be saved as private attribute
        self._processor_class = kwargs.pop("processor_class", None)
        # Additional attributes without default values
        for key, value in kwargs.items():
            try:
                setattr(self, key, value)
            except AttributeError as err:
                logger.error(f"Can't set {key} with value {value} for {self}")
                raise err

    def _set_processor_class(self, processor_class: str):
        """Sets processor class as an attribute."""
        self._processor_class = processor_class

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path: Union[str, os.PathLike],
        cache_dir: Optional[Union[str, os.PathLike]] = None,
        force_download: bool = False,
        local_files_only: bool = False,
        token: Optional[Union[str, bool]] = None,
        revision: str = "main",
        **kwargs,
    ):
        r"""
        Instantiate a type of [`~image_processing_utils.ImageProcessingMixin`] from an image processor.

        Args:
            pretrained_model_name_or_path (`str` or `os.PathLike`):
                This can be either:

                - a string, the *model id* of a pretrained image_processor hosted inside a model repo on
                  huggingface.co.
                - a path to a *directory* containing an image processor file saved using the
                  [`~image_processing_utils.ImageProcessingMixin.save_pretrained`] method, e.g.,
                  `./my_model_directory/`.
                - a path or url to a saved image processor JSON *file*, e.g.,
                  `./my_model_directory/preprocessor_config.json`.
            cache_dir (`str` or `os.PathLike`, *optional*):
                Path to a directory in which a downloaded pretrained model image processor should be cached if the
                standard cache should not be used.
            force_download (`bool`, *optional*, defaults to `False`):
                Whether or not to force to (re-)download the image processor files and override the cached versions if
                they exist.
            local_files_only (`bool`, *optional*, defaults to `False`):
                Forwarded to the file resolution step; it is forced to `True` when offline mode is active.
            resume_download:
                Deprecated and ignored. All downloads are now resumed by default when possible.
                Will be removed in v5 of Transformers.
            proxies (`Dict[str, str]`, *optional*):
                A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
                'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request.
            token (`str` or `bool`, *optional*):
                The token to use as HTTP bearer authorization for remote files. If `True`, or not specified, will use
                the token generated when running `huggingface-cli login` (stored in `~/.huggingface`).
            revision (`str`, *optional*, defaults to `"main"`):
                The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
                git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
                identifier allowed by git.


                <Tip>

                To test a pull request you made on the Hub, you can pass `revision="refs/pr/<pr_number>"`.

                </Tip>

            return_unused_kwargs (`bool`, *optional*, defaults to `False`):
                If `False`, then this function returns just the final image processor object. If `True`, then this
                functions returns a `Tuple(image_processor, unused_kwargs)` where *unused_kwargs* is a dictionary
                consisting of the key/value pairs whose keys are not image processor attributes: i.e., the part of
                `kwargs` which has not been used to update `image_processor` and is otherwise ignored.
            subfolder (`str`, *optional*, defaults to `""`):
                In case the relevant files are located inside a subfolder of the model repo on huggingface.co, you can
                specify the folder name here.
            kwargs (`Dict[str, Any]`, *optional*):
                The values in kwargs of any keys which are image processor attributes will be used to override the
                loaded values. Behavior concerning key/value pairs whose keys are *not* image processor attributes is
                controlled by the `return_unused_kwargs` keyword parameter.

        Returns:
            An image processor of type [`~image_processing_utils.ImageProcessingMixin`].

        Raises:
            ValueError: If both `token` and the deprecated `use_auth_token` are given.

        Examples:

        ```python
        # We can't instantiate directly the base class *ImageProcessingMixin* so let's show the examples on a
        # derived class: *CLIPImageProcessor*
        image_processor = CLIPImageProcessor.from_pretrained(
            "openai/clip-vit-base-patch32"
        )  # Download image_processing_config from huggingface.co and cache.
        image_processor = CLIPImageProcessor.from_pretrained(
            "./test/saved_model/"
        )  # E.g. image processor (or model) was saved using *save_pretrained('./test/saved_model/')*
        image_processor = CLIPImageProcessor.from_pretrained("./test/saved_model/preprocessor_config.json")
        image_processor = CLIPImageProcessor.from_pretrained(
            "openai/clip-vit-base-patch32", do_normalize=False, foo=False
        )
        assert image_processor.do_normalize is False
        image_processor, unused_kwargs = CLIPImageProcessor.from_pretrained(
            "openai/clip-vit-base-patch32", do_normalize=False, foo=False, return_unused_kwargs=True
        )
        assert image_processor.do_normalize is False
        assert unused_kwargs == {"foo": False}
        ```"""
        # The explicit download options travel through `kwargs` so `get_image_processor_dict` can pop them.
        kwargs["cache_dir"] = cache_dir
        kwargs["force_download"] = force_download
        kwargs["local_files_only"] = local_files_only
        kwargs["revision"] = revision

        use_auth_token = kwargs.pop("use_auth_token", None)
        if use_auth_token is not None:
            warnings.warn(
                "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.",
                FutureWarning,
            )
            if token is not None:
                raise ValueError(
                    "`token` and `use_auth_token` are both specified. Please set only the argument `token`."
                )
            token = use_auth_token

        if token is not None:
            kwargs["token"] = token

        image_processor_dict, kwargs = cls.get_image_processor_dict(pretrained_model_name_or_path, **kwargs)

        return cls.from_dict(image_processor_dict, **kwargs)

    def save_pretrained(self, save_directory: Union[str, os.PathLike], push_to_hub: bool = False, **kwargs):
        """
        Save an image processor object to the directory `save_directory`, so that it can be re-loaded using the
        [`~image_processing_utils.ImageProcessingMixin.from_pretrained`] class method.

        Args:
            save_directory (`str` or `os.PathLike`):
                Directory where the image processor JSON file will be saved (will be created if it does not exist).
            push_to_hub (`bool`, *optional*, defaults to `False`):
                Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the
                repository you want to push to with `repo_id` (will default to the name of `save_directory` in your
                namespace).
            kwargs (`Dict[str, Any]`, *optional*):
                Additional key word arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method.

        Returns:
            `List[str]`: A one-element list holding the path of the JSON file that was written.

        Raises:
            ValueError: If both `token` and the deprecated `use_auth_token` are given.
            AssertionError: If `save_directory` points to an existing file.
        """
        use_auth_token = kwargs.pop("use_auth_token", None)

        if use_auth_token is not None:
            warnings.warn(
                "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.",
                FutureWarning,
            )
            if kwargs.get("token", None) is not None:
                raise ValueError(
                    "`token` and `use_auth_token` are both specified. Please set only the argument `token`."
                )
            kwargs["token"] = use_auth_token

        if os.path.isfile(save_directory):
            raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file")

        os.makedirs(save_directory, exist_ok=True)

        if push_to_hub:
            commit_message = kwargs.pop("commit_message", None)
            # `save_directory` may be an `os.PathLike`, which has no `.split`; convert it to `str` before deriving
            # the default repo name. The default is evaluated even when `repo_id` is supplied explicitly.
            repo_id = kwargs.pop("repo_id", str(save_directory).split(os.path.sep)[-1])
            repo_id = self._create_repo(repo_id, **kwargs)
            files_timestamps = self._get_files_timestamps(save_directory)

        # If we have a custom config, we copy the file defining it in the folder and set the attributes so it can be
        # loaded from the Hub.
        if self._auto_class is not None:
            custom_object_save(self, save_directory, config=self)

        # If we save using the predefined names, we can load using `from_pretrained`
        output_image_processor_file = os.path.join(save_directory, IMAGE_PROCESSOR_NAME)

        self.to_json_file(output_image_processor_file)
        logger.info(f"Image processor saved in {output_image_processor_file}")

        if push_to_hub:
            self._upload_modified_files(
                save_directory,
                repo_id,
                files_timestamps,
                commit_message=commit_message,
                token=kwargs.get("token"),
            )

        return [output_image_processor_file]

    @classmethod
    def get_image_processor_dict(
        cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs
    ) -> Tuple[Dict[str, Any], Dict[str, Any]]:
        """
        From a `pretrained_model_name_or_path`, resolve to a dictionary of parameters, to be used for instantiating a
        image processor of type [`~image_processing_utils.ImageProcessingMixin`] using `from_dict`.

        Parameters:
            pretrained_model_name_or_path (`str` or `os.PathLike`):
                The identifier of the pre-trained checkpoint from which we want the dictionary of parameters.
            subfolder (`str`, *optional*, defaults to `""`):
                In case the relevant files are located inside a subfolder of the model repo on huggingface.co, you can
                specify the folder name here.

        Returns:
            `Tuple[Dict, Dict]`: The dictionary(ies) that will be used to instantiate the image processor object.

        Raises:
            ValueError: If both `token` and the deprecated `use_auth_token` are given.
            EnvironmentError: If the image processor file cannot be resolved or is not valid JSON.
        """
        cache_dir = kwargs.pop("cache_dir", None)
        force_download = kwargs.pop("force_download", False)
        resume_download = kwargs.pop("resume_download", None)
        proxies = kwargs.pop("proxies", None)
        token = kwargs.pop("token", None)
        use_auth_token = kwargs.pop("use_auth_token", None)
        local_files_only = kwargs.pop("local_files_only", False)
        revision = kwargs.pop("revision", None)
        subfolder = kwargs.pop("subfolder", "")

        from_pipeline = kwargs.pop("_from_pipeline", None)
        from_auto_class = kwargs.pop("_from_auto", False)

        if use_auth_token is not None:
            warnings.warn(
                "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.",
                FutureWarning,
            )
            if token is not None:
                raise ValueError(
                    "`token` and `use_auth_token` are both specified. Please set only the argument `token`."
                )
            token = use_auth_token

        user_agent = {"file_type": "image processor", "from_auto_class": from_auto_class}
        if from_pipeline is not None:
            user_agent["using_pipeline"] = from_pipeline

        if is_offline_mode() and not local_files_only:
            logger.info("Offline mode: forcing local_files_only=True")
            local_files_only = True

        pretrained_model_name_or_path = str(pretrained_model_name_or_path)
        is_local = os.path.isdir(pretrained_model_name_or_path)
        if os.path.isdir(pretrained_model_name_or_path):
            image_processor_file = os.path.join(pretrained_model_name_or_path, IMAGE_PROCESSOR_NAME)
        if os.path.isfile(pretrained_model_name_or_path):
            # A direct path to the JSON file: nothing to resolve.
            resolved_image_processor_file = pretrained_model_name_or_path
            is_local = True
        elif is_remote_url(pretrained_model_name_or_path):
            image_processor_file = pretrained_model_name_or_path
            resolved_image_processor_file = download_url(pretrained_model_name_or_path)
        else:
            image_processor_file = IMAGE_PROCESSOR_NAME
            try:
                # Load from local folder or from cache or download from model Hub and cache
                resolved_image_processor_file = cached_file(
                    pretrained_model_name_or_path,
                    image_processor_file,
                    cache_dir=cache_dir,
                    force_download=force_download,
                    proxies=proxies,
                    resume_download=resume_download,
                    local_files_only=local_files_only,
                    token=token,
                    user_agent=user_agent,
                    revision=revision,
                    subfolder=subfolder,
                )
            except EnvironmentError:
                # Raise any environment error raise by `cached_file`. It will have a helpful error message adapted to
                # the original exception.
                raise
            except Exception as err:
                # For any other exception, we throw a generic error.
                raise EnvironmentError(
                    f"Can't load image processor for '{pretrained_model_name_or_path}'. If you were trying to load"
                    " it from 'https://huggingface.co/models', make sure you don't have a local directory with the"
                    f" same name. Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a"
                    f" directory containing a {IMAGE_PROCESSOR_NAME} file"
                ) from err

        try:
            # Load image_processor dict
            with open(resolved_image_processor_file, "r", encoding="utf-8") as reader:
                text = reader.read()
            image_processor_dict = json.loads(text)

        except json.JSONDecodeError as err:
            raise EnvironmentError(
                f"It looks like the config file at '{resolved_image_processor_file}' is not a valid JSON file."
            ) from err

        if is_local:
            logger.info(f"loading configuration file {resolved_image_processor_file}")
        else:
            logger.info(
                f"loading configuration file {image_processor_file} from cache at {resolved_image_processor_file}"
            )

        if not is_local:
            # Entries of remotely loaded files are passed through the `add_model_info_*` helpers together with the
            # checkpoint identifier.
            if "auto_map" in image_processor_dict:
                image_processor_dict["auto_map"] = add_model_info_to_auto_map(
                    image_processor_dict["auto_map"], pretrained_model_name_or_path
                )
            if "custom_pipelines" in image_processor_dict:
                image_processor_dict["custom_pipelines"] = add_model_info_to_custom_pipelines(
                    image_processor_dict["custom_pipelines"], pretrained_model_name_or_path
                )
        return image_processor_dict, kwargs

    @classmethod
    def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs):
        """
        Instantiates a type of [`~image_processing_utils.ImageProcessingMixin`] from a Python dictionary of parameters.

        Args:
            image_processor_dict (`Dict[str, Any]`):
                Dictionary that will be used to instantiate the image processor object. Such a dictionary can be
                retrieved from a pretrained checkpoint by leveraging the
                [`~image_processing_utils.ImageProcessingMixin.to_dict`] method.
            kwargs (`Dict[str, Any]`):
                Additional parameters from which to initialize the image processor object.

        Returns:
            [`~image_processing_utils.ImageProcessingMixin`]: The image processor object instantiated from those
            parameters. When `return_unused_kwargs=True` is passed, a tuple of the image processor and the remaining
            `kwargs` is returned instead.
        """
        # Work on a shallow copy so the caller's dictionary is not modified.
        image_processor_dict = image_processor_dict.copy()
        return_unused_kwargs = kwargs.pop("return_unused_kwargs", False)

        # The `size` parameter is a dict and was previously an int or tuple in feature extractors.
        # We set `size` here directly to the `image_processor_dict` so that it is converted to the appropriate
        # dict within the image processor and isn't overwritten if `size` is passed in as a kwarg.
        if "size" in kwargs and "size" in image_processor_dict:
            image_processor_dict["size"] = kwargs.pop("size")
        if "crop_size" in kwargs and "crop_size" in image_processor_dict:
            image_processor_dict["crop_size"] = kwargs.pop("crop_size")

        image_processor = cls(**image_processor_dict)

        # Update image_processor with kwargs if needed
        to_remove = []
        for key, value in kwargs.items():
            if hasattr(image_processor, key):
                setattr(image_processor, key, value)
                to_remove.append(key)
        for key in to_remove:
            kwargs.pop(key, None)

        logger.info(f"Image processor {image_processor}")
        if return_unused_kwargs:
            return image_processor, kwargs
        else:
            return image_processor

    def to_dict(self) -> Dict[str, Any]:
        """
        Serializes this instance to a Python dictionary.

        Returns:
            `Dict[str, Any]`: Dictionary of all the attributes that make up this image processor instance.
        """
        output = copy.deepcopy(self.__dict__)
        output["image_processor_type"] = self.__class__.__name__

        return output

    @classmethod
    def from_json_file(cls, json_file: Union[str, os.PathLike]):
        """
        Instantiates an image processor of type [`~image_processing_utils.ImageProcessingMixin`] from the path to a
        JSON file of parameters.

        Args:
            json_file (`str` or `os.PathLike`):
                Path to the JSON file containing the parameters.

        Returns:
            An image processor of type [`~image_processing_utils.ImageProcessingMixin`]: The image_processor object
            instantiated from that JSON file.
        """
        with open(json_file, "r", encoding="utf-8") as reader:
            text = reader.read()
        image_processor_dict = json.loads(text)
        return cls(**image_processor_dict)

    def to_json_string(self) -> str:
        """
        Serializes this instance to a JSON string.

        Returns:
            `str`: String containing all the attributes that make up this image processor instance in JSON format.
        """
        dictionary = self.to_dict()

        # NumPy arrays are not JSON serializable; store them as nested lists.
        for key, value in dictionary.items():
            if isinstance(value, np.ndarray):
                dictionary[key] = value.tolist()

        # make sure private name "_processor_class" is correctly
        # saved as "processor_class"
        _processor_class = dictionary.pop("_processor_class", None)
        if _processor_class is not None:
            dictionary["processor_class"] = _processor_class

        return json.dumps(dictionary, indent=2, sort_keys=True) + "\n"

    def to_json_file(self, json_file_path: Union[str, os.PathLike]):
        """
        Save this instance to a JSON file.

        Args:
            json_file_path (`str` or `os.PathLike`):
                Path to the JSON file in which this image_processor instance's parameters will be saved.
        """
        with open(json_file_path, "w", encoding="utf-8") as writer:
            writer.write(self.to_json_string())

    def __repr__(self):
        """Return the class name followed by the JSON serialization of this instance."""
        return f"{self.__class__.__name__} {self.to_json_string()}"

    @classmethod
    def register_for_auto_class(cls, auto_class="AutoImageProcessor"):
        """
        Register this class with a given auto class. This should only be used for custom image processors as the ones
        in the library are already mapped with `AutoImageProcessor`.

        <Tip warning={true}>

        This API is experimental and may have some slight breaking changes in the next releases.

        </Tip>

        Args:
            auto_class (`str` or `type`, *optional*, defaults to `"AutoImageProcessor"`):
                The auto class to register this new image processor with.

        Raises:
            ValueError: If `transformers.models.auto` has no attribute named after `auto_class`.
        """
        if not isinstance(auto_class, str):
            auto_class = auto_class.__name__

        # Imported here rather than at module level.
        import transformers.models.auto as auto_module

        if not hasattr(auto_module, auto_class):
            raise ValueError(f"{auto_class} is not a valid auto class.")

        cls._auto_class = auto_class

    def fetch_images(self, image_url_or_urls: Union[str, List[str]]):
        """
        Convert a single or a list of urls into the corresponding `PIL.Image` objects.

        If a single url is passed, the return value will be a single object. If a list is passed a list of objects is
        returned.

        Raises:
            ValueError: If the argument is neither a `str` nor a `list`.
        """
        # A browser-like User-Agent is sent with every request.
        headers = {
            "User-Agent": (
                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0"
                " Safari/537.36"
            )
        }
        if isinstance(image_url_or_urls, list):
            return [self.fetch_images(x) for x in image_url_or_urls]
        elif isinstance(image_url_or_urls, str):
            # NOTE(review): no `timeout` is passed, so a stalled server can block this call indefinitely.
            response = requests.get(image_url_or_urls, stream=True, headers=headers)
            response.raise_for_status()
            return Image.open(BytesIO(response.content))
        else:
            raise ValueError(f"only a single or a list of entries is supported but got type={type(image_url_or_urls)}")
|
||||
|
||||
|
||||
class BaseImageProcessor(ImageProcessingMixin):
|
||||
def __init__(self, **kwargs):
|
||||
super().__init__(**kwargs)
|
||||
@@ -801,10 +280,3 @@ def select_best_resolution(original_size: tuple, possible_resolutions: list) ->
|
||||
best_fit = (height, width)
|
||||
|
||||
return best_fit
|
||||
|
||||
|
||||
# Rebind `push_to_hub` on ImageProcessingMixin to the object returned by `copy_func` before touching its docstring.
# NOTE(review): presumably `copy_func` returns an independent function object so that the formatted docstring does
# not leak onto the implementation shared with other mixins - confirm against `copy_func`.
ImageProcessingMixin.push_to_hub = copy_func(ImageProcessingMixin.push_to_hub)
# `__doc__` is None when docstrings are stripped (e.g. `python -OO`), so only format it when it is present.
if ImageProcessingMixin.push_to_hub.__doc__ is not None:
    # Fill the `{object}`, `{object_class}` and `{object_files}` placeholders with image-processor wording.
    ImageProcessingMixin.push_to_hub.__doc__ = ImageProcessingMixin.push_to_hub.__doc__.format(
        object="image processor", object_class="AutoImageProcessor", object_files="image processor file"
    )
|
||||
|
||||
63
src/transformers/image_processing_utils_fast.py
Normal file
63
src/transformers/image_processing_utils_fast.py
Normal file
@@ -0,0 +1,63 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2024 The HuggingFace Inc. team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import functools
|
||||
from dataclasses import dataclass
|
||||
|
||||
from .image_processing_utils import BaseImageProcessor
|
||||
from .utils.import_utils import is_torchvision_available
|
||||
|
||||
|
||||
if is_torchvision_available():
|
||||
from torchvision.transforms import Compose
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class SizeDict:
    """
    Hashable dictionary to store image size information.

    The dataclass is frozen, so instances are immutable and hashable. Every field defaults to `None`, and fields
    can be read either as attributes (`size.height`) or with dictionary-style indexing (`size["height"]`).
    """

    height: int = None
    width: int = None
    longest_edge: int = None
    shortest_edge: int = None
    max_height: int = None
    max_width: int = None

    def __getitem__(self, key):
        """
        Return the value stored for the size field named `key`.

        Args:
            key (`str`):
                Name of one of the declared fields, e.g. `"height"`.

        Returns:
            The value of that field (`None` when it was not set).

        Raises:
            KeyError: If `key` is not the name of a declared field.
        """
        # Only declared fields are valid keys. A plain `hasattr` check would also match methods and dunder
        # attributes (e.g. `size["__class__"]`) and would raise `TypeError` instead of `KeyError` for
        # non-string keys.
        if isinstance(key, str) and key in self.__dataclass_fields__:
            return getattr(self, key)
        raise KeyError(f"Key {key} not found in SizeDict.")
|
||||
|
||||
|
||||
class BaseImageProcessorFast(BaseImageProcessor):
    """
    Base class for image processors that build their preprocessing pipeline from keyword settings.

    Subclasses are expected to set `_transform_params` to the collection of setting names they accept and to
    implement `_build_transforms`.
    """

    # Names of the keyword arguments accepted by `get_transforms`; left as `None` in this base class.
    _transform_params = None

    def _build_transforms(self, **kwargs) -> "Compose":
        """
        Given the input settings e.g. do_resize, build the image transforms.

        Raises:
            NotImplementedError: Always, in this base class; subclasses must override it.
        """
        raise NotImplementedError

    def _validate_params(self, **kwargs) -> None:
        """
        Check that every keyword argument name is listed in `_transform_params`.

        Raises:
            ValueError: If a keyword argument is not an accepted transform parameter.
        """
        # `_transform_params` defaults to `None`; treat that as "no parameter is accepted" so the caller gets the
        # same explicit `ValueError` instead of a `TypeError` from testing membership in `None`.
        allowed = self._transform_params if self._transform_params is not None else ()
        for k, v in kwargs.items():
            if k not in allowed:
                raise ValueError(f"Invalid transform parameter {k}={v}.")

    # The cache key is built from `self` and the keyword arguments, so both must be hashable. The cache belongs to
    # the function object on the class and holds a single entry (`maxsize=1`): only the most recent
    # (instance, settings) combination is kept, and that instance stays referenced until the entry is replaced.
    @functools.lru_cache(maxsize=1)
    def get_transforms(self, **kwargs) -> "Compose":
        """
        Validate the given settings and return the transforms built from them.

        Raises:
            ValueError: If a keyword argument is not an accepted transform parameter.
        """
        self._validate_params(**kwargs)
        return self._build_transforms(**kwargs)
|
||||
@@ -31,6 +31,7 @@ from .utils.import_utils import (
|
||||
is_flax_available,
|
||||
is_tf_available,
|
||||
is_torch_available,
|
||||
is_torchvision_available,
|
||||
is_vision_available,
|
||||
requires_backends,
|
||||
)
|
||||
@@ -50,6 +51,9 @@ if is_tf_available():
|
||||
if is_flax_available():
|
||||
import jax.numpy as jnp
|
||||
|
||||
if is_torchvision_available():
|
||||
from torchvision.transforms import functional as F
|
||||
|
||||
|
||||
def to_channel_dimension_format(
|
||||
image: np.ndarray,
|
||||
@@ -374,6 +378,7 @@ def normalize(
|
||||
|
||||
if input_data_format is None:
|
||||
input_data_format = infer_channel_dimension_format(image)
|
||||
|
||||
channel_axis = get_channel_dimension_axis(image, input_data_format=input_data_format)
|
||||
num_channels = image.shape[channel_axis]
|
||||
|
||||
@@ -802,3 +807,48 @@ def flip_channel_order(
|
||||
if data_format is not None:
|
||||
image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
|
||||
return image
|
||||
|
||||
|
||||
def _cast_tensor_to_float(x):
    """
    Return `x` unchanged when it already has a floating point dtype, otherwise the result of `x.float()`.
    """
    return x if x.is_floating_point() else x.float()
|
||||
|
||||
|
||||
class FusedRescaleNormalize:
    """
    Rescale and normalize the input image in one step.

    Normalizing `image * rescale_factor` with `(mean, std)` gives the same result as normalizing `image` with
    `(mean / rescale_factor, std / rescale_factor)`, so the rescaling is folded into the stored statistics.
    """

    @staticmethod
    def _scaled(values, rescale_factor):
        # Convert the statistic to a tensor and divide it by the rescale factor (as a multiplication by its inverse).
        return torch.tensor(values) * (1.0 / rescale_factor)

    def __init__(self, mean, std, rescale_factor: float = 1.0, inplace: bool = False):
        self.mean = self._scaled(mean, rescale_factor)
        self.std = self._scaled(std, rescale_factor)
        self.inplace = inplace

    def __call__(self, image: "torch.Tensor"):
        # The image is converted to a floating point tensor (if it is not one already) before normalization.
        float_image = _cast_tensor_to_float(image)
        return F.normalize(float_image, self.mean, self.std, inplace=self.inplace)
|
||||
|
||||
|
||||
class Rescale:
    """
    Multiply the input image by a fixed rescale factor and return the product.
    """

    def __init__(self, rescale_factor: float = 1.0):
        self.rescale_factor = rescale_factor

    def __call__(self, image: "torch.Tensor"):
        # Returns a new value (`image * factor`); the argument itself is not reassigned in the caller.
        return image * self.rescale_factor
|
||||
|
||||
|
||||
class NumpyToTensor:
    """
    Convert a numpy array to a PyTorch tensor.
    """

    def __call__(self, image: np.ndarray):
        # Same as in PyTorch, we assume incoming numpy images are in HWC format
        # c.f. https://github.com/pytorch/vision/blob/61d97f41bc209e1407dcfbd685d2ee2da9c1cdad/torchvision/transforms/functional.py#L154
        channels_first = image.transpose((2, 0, 1))
        tensor = torch.from_numpy(channels_first)
        # The transposed view is not laid out contiguously; return a contiguous tensor.
        return tensor.contiguous()
|
||||
|
||||
@@ -25,9 +25,11 @@ from packaging import version
|
||||
from .utils import (
|
||||
ExplicitEnum,
|
||||
is_jax_tensor,
|
||||
is_numpy_array,
|
||||
is_tf_tensor,
|
||||
is_torch_available,
|
||||
is_torch_tensor,
|
||||
is_torchvision_available,
|
||||
is_vision_available,
|
||||
logging,
|
||||
requires_backends,
|
||||
@@ -52,6 +54,20 @@ if is_vision_available():
|
||||
else:
|
||||
PILImageResampling = PIL.Image
|
||||
|
||||
if is_torchvision_available():
    from torchvision.transforms import InterpolationMode

    # Maps each PIL resampling filter to the torchvision `InterpolationMode` member of the same name.
    # Each key is listed exactly once (a repeated key in a dict literal silently overrides the earlier entry).
    pil_torch_interpolation_mapping = {
        PILImageResampling.NEAREST: InterpolationMode.NEAREST,
        PILImageResampling.BOX: InterpolationMode.BOX,
        PILImageResampling.BILINEAR: InterpolationMode.BILINEAR,
        PILImageResampling.HAMMING: InterpolationMode.HAMMING,
        PILImageResampling.BICUBIC: InterpolationMode.BICUBIC,
        PILImageResampling.LANCZOS: InterpolationMode.LANCZOS,
    }
|
||||
|
||||
|
||||
if TYPE_CHECKING:
|
||||
if is_torch_available():
|
||||
import torch
|
||||
@@ -90,14 +106,30 @@ def is_pil_image(img):
|
||||
return is_vision_available() and isinstance(img, PIL.Image.Image)
|
||||
|
||||
|
||||
class ImageType(ExplicitEnum):
    """
    Enumeration of the image container kinds distinguished by `get_image_type`: PIL images, torch tensors, numpy
    arrays, TensorFlow tensors and JAX tensors.
    """

    PIL = "pillow"
    TORCH = "torch"
    NUMPY = "numpy"
    TENSORFLOW = "tensorflow"
    JAX = "jax"
|
||||
|
||||
|
||||
def get_image_type(image):
    """
    Return the `ImageType` member describing which kind of container `image` is.

    The checks run in a fixed order (PIL, torch, numpy, TensorFlow, JAX) and the first match wins.

    Raises:
        ValueError: If `image` matches none of the supported kinds.
    """
    ordered_checks = (
        (is_pil_image, ImageType.PIL),
        (is_torch_tensor, ImageType.TORCH),
        (is_numpy_array, ImageType.NUMPY),
        (is_tf_tensor, ImageType.TENSORFLOW),
        (is_jax_tensor, ImageType.JAX),
    )
    for matches, image_type in ordered_checks:
        if matches(image):
            return image_type
    raise ValueError(f"Unrecognised image type {type(image)}")
|
||||
|
||||
|
||||
def is_valid_image(img):
    """
    Return whether `img` is one of the supported image containers: a PIL image, a numpy array, or a torch,
    TensorFlow or JAX tensor.

    The check is delegated to the same `is_*` helpers used elsewhere in this module, in a single return
    statement.
    """
    # NOTE(review): `is_numpy_array` is assumed to be equivalent to `isinstance(img, np.ndarray)` - confirm.
    return is_pil_image(img) or is_numpy_array(img) or is_torch_tensor(img) or is_tf_tensor(img) or is_jax_tensor(img)
|
||||
|
||||
|
||||
def valid_images(imgs):
|
||||
|
||||
@@ -19,13 +19,21 @@ import json
|
||||
import os
|
||||
import warnings
|
||||
from collections import OrderedDict
|
||||
from typing import Dict, Optional, Union
|
||||
from typing import TYPE_CHECKING, Dict, Optional, Tuple, Union
|
||||
|
||||
# Build the list of all image processors
|
||||
from ...configuration_utils import PretrainedConfig
|
||||
from ...dynamic_module_utils import get_class_from_dynamic_module, resolve_trust_remote_code
|
||||
from ...image_processing_utils import ImageProcessingMixin
|
||||
from ...utils import CONFIG_NAME, IMAGE_PROCESSOR_NAME, get_file_from_repo, logging
|
||||
from ...image_processing_utils import BaseImageProcessor, ImageProcessingMixin
|
||||
from ...image_processing_utils_fast import BaseImageProcessorFast
|
||||
from ...utils import (
|
||||
CONFIG_NAME,
|
||||
IMAGE_PROCESSOR_NAME,
|
||||
get_file_from_repo,
|
||||
is_torchvision_available,
|
||||
is_vision_available,
|
||||
logging,
|
||||
)
|
||||
from .auto_factory import _LazyAutoMapping
|
||||
from .configuration_auto import (
|
||||
CONFIG_MAPPING_NAMES,
|
||||
@@ -37,104 +45,125 @@ from .configuration_auto import (
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
IMAGE_PROCESSOR_MAPPING_NAMES = OrderedDict(
|
||||
[
|
||||
("align", "EfficientNetImageProcessor"),
|
||||
("beit", "BeitImageProcessor"),
|
||||
("bit", "BitImageProcessor"),
|
||||
("blip", "BlipImageProcessor"),
|
||||
("blip-2", "BlipImageProcessor"),
|
||||
("bridgetower", "BridgeTowerImageProcessor"),
|
||||
("chinese_clip", "ChineseCLIPImageProcessor"),
|
||||
("clip", "CLIPImageProcessor"),
|
||||
("clipseg", "ViTImageProcessor"),
|
||||
("conditional_detr", "ConditionalDetrImageProcessor"),
|
||||
("convnext", "ConvNextImageProcessor"),
|
||||
("convnextv2", "ConvNextImageProcessor"),
|
||||
("cvt", "ConvNextImageProcessor"),
|
||||
("data2vec-vision", "BeitImageProcessor"),
|
||||
("deformable_detr", "DeformableDetrImageProcessor"),
|
||||
("deit", "DeiTImageProcessor"),
|
||||
("depth_anything", "DPTImageProcessor"),
|
||||
("deta", "DetaImageProcessor"),
|
||||
("detr", "DetrImageProcessor"),
|
||||
("dinat", "ViTImageProcessor"),
|
||||
("dinov2", "BitImageProcessor"),
|
||||
("donut-swin", "DonutImageProcessor"),
|
||||
("dpt", "DPTImageProcessor"),
|
||||
("efficientformer", "EfficientFormerImageProcessor"),
|
||||
("efficientnet", "EfficientNetImageProcessor"),
|
||||
("flava", "FlavaImageProcessor"),
|
||||
("focalnet", "BitImageProcessor"),
|
||||
("fuyu", "FuyuImageProcessor"),
|
||||
("git", "CLIPImageProcessor"),
|
||||
("glpn", "GLPNImageProcessor"),
|
||||
("grounding-dino", "GroundingDinoImageProcessor"),
|
||||
("groupvit", "CLIPImageProcessor"),
|
||||
("idefics", "IdeficsImageProcessor"),
|
||||
("idefics2", "Idefics2ImageProcessor"),
|
||||
("imagegpt", "ImageGPTImageProcessor"),
|
||||
("instructblip", "BlipImageProcessor"),
|
||||
("kosmos-2", "CLIPImageProcessor"),
|
||||
("layoutlmv2", "LayoutLMv2ImageProcessor"),
|
||||
("layoutlmv3", "LayoutLMv3ImageProcessor"),
|
||||
("levit", "LevitImageProcessor"),
|
||||
("llava", "CLIPImageProcessor"),
|
||||
("llava_next", "LlavaNextImageProcessor"),
|
||||
("mask2former", "Mask2FormerImageProcessor"),
|
||||
("maskformer", "MaskFormerImageProcessor"),
|
||||
("mgp-str", "ViTImageProcessor"),
|
||||
("mobilenet_v1", "MobileNetV1ImageProcessor"),
|
||||
("mobilenet_v2", "MobileNetV2ImageProcessor"),
|
||||
("mobilevit", "MobileViTImageProcessor"),
|
||||
("mobilevit", "MobileViTImageProcessor"),
|
||||
("mobilevitv2", "MobileViTImageProcessor"),
|
||||
("nat", "ViTImageProcessor"),
|
||||
("nougat", "NougatImageProcessor"),
|
||||
("oneformer", "OneFormerImageProcessor"),
|
||||
("owlv2", "Owlv2ImageProcessor"),
|
||||
("owlvit", "OwlViTImageProcessor"),
|
||||
("paligemma", "CLIPImageProcessor"),
|
||||
("perceiver", "PerceiverImageProcessor"),
|
||||
("pix2struct", "Pix2StructImageProcessor"),
|
||||
("poolformer", "PoolFormerImageProcessor"),
|
||||
("pvt", "PvtImageProcessor"),
|
||||
("pvt_v2", "PvtImageProcessor"),
|
||||
("regnet", "ConvNextImageProcessor"),
|
||||
("resnet", "ConvNextImageProcessor"),
|
||||
("sam", "SamImageProcessor"),
|
||||
("segformer", "SegformerImageProcessor"),
|
||||
("seggpt", "SegGptImageProcessor"),
|
||||
("siglip", "SiglipImageProcessor"),
|
||||
("swiftformer", "ViTImageProcessor"),
|
||||
("swin", "ViTImageProcessor"),
|
||||
("swin2sr", "Swin2SRImageProcessor"),
|
||||
("swinv2", "ViTImageProcessor"),
|
||||
("table-transformer", "DetrImageProcessor"),
|
||||
("timesformer", "VideoMAEImageProcessor"),
|
||||
("tvlt", "TvltImageProcessor"),
|
||||
("tvp", "TvpImageProcessor"),
|
||||
("udop", "LayoutLMv3ImageProcessor"),
|
||||
("upernet", "SegformerImageProcessor"),
|
||||
("van", "ConvNextImageProcessor"),
|
||||
("video_llava", "VideoLlavaImageProcessor"),
|
||||
("videomae", "VideoMAEImageProcessor"),
|
||||
("vilt", "ViltImageProcessor"),
|
||||
("vipllava", "CLIPImageProcessor"),
|
||||
("vit", "ViTImageProcessor"),
|
||||
("vit_hybrid", "ViTHybridImageProcessor"),
|
||||
("vit_mae", "ViTImageProcessor"),
|
||||
("vit_msn", "ViTImageProcessor"),
|
||||
("vitmatte", "VitMatteImageProcessor"),
|
||||
("xclip", "CLIPImageProcessor"),
|
||||
("yolos", "YolosImageProcessor"),
|
||||
]
|
||||
)
|
||||
|
||||
if TYPE_CHECKING:
    # This significantly improves completion suggestion performance when
    # the transformers package is used with Microsoft's Pylance language server.
    IMAGE_PROCESSOR_MAPPING_NAMES: OrderedDict[str, Tuple[Optional[str], Optional[str]]] = OrderedDict()
else:
    # Each value is a tuple of class names: the first entry is the slow image processor, the optional second
    # entry is the fast one. Every model type must appear exactly once.
    IMAGE_PROCESSOR_MAPPING_NAMES = OrderedDict(
        [
            ("align", ("EfficientNetImageProcessor",)),
            ("beit", ("BeitImageProcessor",)),
            ("bit", ("BitImageProcessor",)),
            ("blip", ("BlipImageProcessor",)),
            ("blip-2", ("BlipImageProcessor",)),
            ("bridgetower", ("BridgeTowerImageProcessor",)),
            ("chinese_clip", ("ChineseCLIPImageProcessor",)),
            ("clip", ("CLIPImageProcessor",)),
            ("clipseg", ("ViTImageProcessor", "ViTImageProcessorFast")),
            ("conditional_detr", ("ConditionalDetrImageProcessor",)),
            ("convnext", ("ConvNextImageProcessor",)),
            ("convnextv2", ("ConvNextImageProcessor",)),
            ("cvt", ("ConvNextImageProcessor",)),
            ("data2vec-vision", ("BeitImageProcessor",)),
            ("deformable_detr", ("DeformableDetrImageProcessor",)),
            ("deit", ("DeiTImageProcessor",)),
            ("depth_anything", ("DPTImageProcessor",)),
            ("deta", ("DetaImageProcessor",)),
            ("detr", ("DetrImageProcessor",)),
            ("dinat", ("ViTImageProcessor", "ViTImageProcessorFast")),
            ("dinov2", ("BitImageProcessor",)),
            ("donut-swin", ("DonutImageProcessor",)),
            ("dpt", ("DPTImageProcessor",)),
            ("efficientformer", ("EfficientFormerImageProcessor",)),
            ("efficientnet", ("EfficientNetImageProcessor",)),
            ("flava", ("FlavaImageProcessor",)),
            ("focalnet", ("BitImageProcessor",)),
            ("fuyu", ("FuyuImageProcessor",)),
            ("git", ("CLIPImageProcessor",)),
            ("glpn", ("GLPNImageProcessor",)),
            ("grounding-dino", ("GroundingDinoImageProcessor",)),
            ("groupvit", ("CLIPImageProcessor",)),
            ("idefics", ("IdeficsImageProcessor",)),
            ("idefics2", ("Idefics2ImageProcessor",)),
            ("imagegpt", ("ImageGPTImageProcessor",)),
            ("instructblip", ("BlipImageProcessor",)),
            ("kosmos-2", ("CLIPImageProcessor",)),
            ("layoutlmv2", ("LayoutLMv2ImageProcessor",)),
            ("layoutlmv3", ("LayoutLMv3ImageProcessor",)),
            ("levit", ("LevitImageProcessor",)),
            ("llava", ("CLIPImageProcessor",)),
            ("llava_next", ("LlavaNextImageProcessor",)),
            ("mask2former", ("Mask2FormerImageProcessor",)),
            ("maskformer", ("MaskFormerImageProcessor",)),
            ("mgp-str", ("ViTImageProcessor", "ViTImageProcessorFast")),
            ("mobilenet_v1", ("MobileNetV1ImageProcessor",)),
            ("mobilenet_v2", ("MobileNetV2ImageProcessor",)),
            ("mobilevit", ("MobileViTImageProcessor",)),
            ("mobilevitv2", ("MobileViTImageProcessor",)),
            ("nat", ("ViTImageProcessor", "ViTImageProcessorFast")),
            ("nougat", ("NougatImageProcessor",)),
            ("oneformer", ("OneFormerImageProcessor",)),
            ("owlv2", ("Owlv2ImageProcessor",)),
            ("owlvit", ("OwlViTImageProcessor",)),
            ("paligemma", ("CLIPImageProcessor",)),
            ("perceiver", ("PerceiverImageProcessor",)),
            ("pix2struct", ("Pix2StructImageProcessor",)),
            ("poolformer", ("PoolFormerImageProcessor",)),
            ("pvt", ("PvtImageProcessor",)),
            ("pvt_v2", ("PvtImageProcessor",)),
            ("regnet", ("ConvNextImageProcessor",)),
            ("resnet", ("ConvNextImageProcessor",)),
            ("sam", ("SamImageProcessor",)),
            ("segformer", ("SegformerImageProcessor",)),
            ("seggpt", ("SegGptImageProcessor",)),
            ("siglip", ("SiglipImageProcessor",)),
            ("swiftformer", ("ViTImageProcessor", "ViTImageProcessorFast")),
            ("swin", ("ViTImageProcessor", "ViTImageProcessorFast")),
            ("swin2sr", ("Swin2SRImageProcessor",)),
            ("swinv2", ("ViTImageProcessor", "ViTImageProcessorFast")),
            ("table-transformer", ("DetrImageProcessor",)),
            ("timesformer", ("VideoMAEImageProcessor",)),
            ("tvlt", ("TvltImageProcessor",)),
            ("tvp", ("TvpImageProcessor",)),
            ("udop", ("LayoutLMv3ImageProcessor",)),
            ("upernet", ("SegformerImageProcessor",)),
            ("van", ("ConvNextImageProcessor",)),
            ("video_llava", ("VideoLlavaImageProcessor",)),
            ("videomae", ("VideoMAEImageProcessor",)),
            ("vilt", ("ViltImageProcessor",)),
            ("vipllava", ("CLIPImageProcessor",)),
            ("vit", ("ViTImageProcessor", "ViTImageProcessorFast")),
            ("vit_hybrid", ("ViTHybridImageProcessor",)),
            ("vit_mae", ("ViTImageProcessor", "ViTImageProcessorFast")),
            ("vit_msn", ("ViTImageProcessor", "ViTImageProcessorFast")),
            ("vitmatte", ("VitMatteImageProcessor",)),
            ("xclip", ("CLIPImageProcessor",)),
            ("yolos", ("YolosImageProcessor",)),
        ]
    )
|
||||
|
||||
# Rewrite every mapping value in place as a `(slow, fast)` pair, replacing a class name with `None` whenever the
# backend it needs is not installed. Only existing keys are reassigned, so iterating `.items()` stays valid.
for model_type, image_processors in IMAGE_PROCESSOR_MAPPING_NAMES.items():
    slow_image_processor_class, *fast_image_processor_class = image_processors

    # Slow image processors need the vision backend; without it there is nothing to expose.
    slow_image_processor_class = slow_image_processor_class if is_vision_available() else None

    # Keep the fast class name only if one is declared, it is not None, and torchvision is available.
    if fast_image_processor_class and fast_image_processor_class[0] is not None and is_torchvision_available():
        fast_image_processor_class = fast_image_processor_class[0]
    else:
        fast_image_processor_class = None

    IMAGE_PROCESSOR_MAPPING_NAMES[model_type] = (slow_image_processor_class, fast_image_processor_class)
|
||||
|
||||
|
||||
IMAGE_PROCESSOR_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, IMAGE_PROCESSOR_MAPPING_NAMES)
|
||||
|
||||
|
||||
def image_processor_class_from_name(class_name: str):
|
||||
if class_name == "BaseImageProcessorFast":
|
||||
return BaseImageProcessorFast
|
||||
|
||||
for module_name, extractors in IMAGE_PROCESSOR_MAPPING_NAMES.items():
|
||||
if class_name in extractors:
|
||||
module_name = model_type_to_module_name(module_name)
|
||||
@@ -145,11 +174,12 @@ def image_processor_class_from_name(class_name: str):
|
||||
except AttributeError:
|
||||
continue
|
||||
|
||||
for _, extractor in IMAGE_PROCESSOR_MAPPING._extra_content.items():
|
||||
if getattr(extractor, "__name__", None) == class_name:
|
||||
return extractor
|
||||
for _, extractors in IMAGE_PROCESSOR_MAPPING._extra_content.items():
|
||||
for extractor in extractors:
|
||||
if getattr(extractor, "__name__", None) == class_name:
|
||||
return extractor
|
||||
|
||||
# We did not fine the class, but maybe it's because a dep is missing. In that case, the class will be in the main
|
||||
# We did not find the class, but maybe it's because a dep is missing. In that case, the class will be in the main
|
||||
# init and we return the proper dummy to get an appropriate error message.
|
||||
main_module = importlib.import_module("transformers")
|
||||
if hasattr(main_module, class_name):
|
||||
@@ -258,6 +288,13 @@ def get_image_processor_config(
|
||||
return json.load(reader)
|
||||
|
||||
|
||||
def _warning_fast_image_processor_available(fast_class):
    """
    Emit a warning telling the user that `fast_class` exists for this model but the slow image processor is
    being used, and how to opt in to the fast one.

    Args:
        fast_class: The fast image processor (class or reference) to mention in the message; it is interpolated
            into the warning text as-is.
    """
    message = (
        f"Fast image processor class {fast_class} is available for this model. "
        "Using slow image processor class. To use the fast image processor class set `use_fast=True`."
    )
    logger.warning(message)
|
||||
|
||||
|
||||
class AutoImageProcessor:
|
||||
r"""
|
||||
This is a generic image processor class that will be instantiated as one of the image processor classes of the
|
||||
@@ -274,7 +311,7 @@ class AutoImageProcessor:
|
||||
|
||||
@classmethod
|
||||
@replace_list_option_in_docstrings(IMAGE_PROCESSOR_MAPPING_NAMES)
|
||||
def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
|
||||
def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
|
||||
r"""
|
||||
Instantiate one of the image processor classes of the library from a pretrained model vocabulary.
|
||||
|
||||
@@ -314,6 +351,10 @@ class AutoImageProcessor:
|
||||
The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
|
||||
git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
|
||||
identifier allowed by git.
|
||||
use_fast (`bool`, *optional*, defaults to `False`):
|
||||
Use a fast torchvision-based image processor if it is supported for a given model.
|
||||
If a fast image processor is not available for a given model, a normal numpy-based image processor
|
||||
is returned instead.
|
||||
return_unused_kwargs (`bool`, *optional*, defaults to `False`):
|
||||
If `False`, then this function returns just the final image processor object. If `True`, then this
|
||||
functions returns a `Tuple(image_processor, unused_kwargs)` where *unused_kwargs* is a dictionary
|
||||
@@ -358,6 +399,7 @@ class AutoImageProcessor:
|
||||
kwargs["token"] = use_auth_token
|
||||
|
||||
config = kwargs.pop("config", None)
|
||||
use_fast = kwargs.pop("use_fast", False)
|
||||
trust_remote_code = kwargs.pop("trust_remote_code", None)
|
||||
kwargs["_from_auto"] = True
|
||||
|
||||
@@ -387,6 +429,11 @@ class AutoImageProcessor:
|
||||
image_processor_auto_map = config.auto_map["AutoImageProcessor"]
|
||||
|
||||
if image_processor_class is not None:
|
||||
# Update class name to reflect the use_fast option. If class is not found, None is returned.
|
||||
if use_fast and not image_processor_class.endswith("Fast"):
|
||||
image_processor_class += "Fast"
|
||||
elif not use_fast and image_processor_class.endswith("Fast"):
|
||||
image_processor_class = image_processor_class[:-4]
|
||||
image_processor_class = image_processor_class_from_name(image_processor_class)
|
||||
|
||||
has_remote_code = image_processor_auto_map is not None
|
||||
@@ -395,10 +442,19 @@ class AutoImageProcessor:
|
||||
trust_remote_code, pretrained_model_name_or_path, has_local_code, has_remote_code
|
||||
)
|
||||
|
||||
if image_processor_auto_map is not None and not isinstance(image_processor_auto_map, tuple):
|
||||
# In some configs, only the slow image processor class is stored
|
||||
image_processor_auto_map = (image_processor_auto_map, None)
|
||||
|
||||
if has_remote_code and trust_remote_code:
|
||||
image_processor_class = get_class_from_dynamic_module(
|
||||
image_processor_auto_map, pretrained_model_name_or_path, **kwargs
|
||||
)
|
||||
if not use_fast and image_processor_auto_map[1] is not None:
|
||||
_warning_fast_image_processor_available(image_processor_auto_map[1])
|
||||
|
||||
if use_fast and image_processor_auto_map[1] is not None:
|
||||
class_ref = image_processor_auto_map[1]
|
||||
else:
|
||||
class_ref = image_processor_auto_map[0]
|
||||
image_processor_class = get_class_from_dynamic_module(class_ref, pretrained_model_name_or_path, **kwargs)
|
||||
_ = kwargs.pop("code_revision", None)
|
||||
if os.path.isdir(pretrained_model_name_or_path):
|
||||
image_processor_class.register_for_auto_class()
|
||||
@@ -407,8 +463,22 @@ class AutoImageProcessor:
|
||||
return image_processor_class.from_dict(config_dict, **kwargs)
|
||||
# Last try: we use the IMAGE_PROCESSOR_MAPPING.
|
||||
elif type(config) in IMAGE_PROCESSOR_MAPPING:
|
||||
image_processor_class = IMAGE_PROCESSOR_MAPPING[type(config)]
|
||||
return image_processor_class.from_dict(config_dict, **kwargs)
|
||||
image_processor_tuple = IMAGE_PROCESSOR_MAPPING[type(config)]
|
||||
|
||||
image_processor_class_py, image_processor_class_fast = image_processor_tuple
|
||||
|
||||
if not use_fast and image_processor_class_fast is not None:
|
||||
_warning_fast_image_processor_available(image_processor_class_fast)
|
||||
|
||||
if image_processor_class_fast and (use_fast or image_processor_class_py is None):
|
||||
return image_processor_class_fast.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
|
||||
else:
|
||||
if image_processor_class_py is not None:
|
||||
return image_processor_class_py.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
|
||||
else:
|
||||
raise ValueError(
|
||||
"This image processor cannot be instantiated. Please make sure you have `Pillow` installed."
|
||||
)
|
||||
|
||||
raise ValueError(
|
||||
f"Unrecognized image processor in {pretrained_model_name_or_path}. Should have a "
|
||||
@@ -417,7 +487,13 @@ class AutoImageProcessor:
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def register(config_class, image_processor_class, exist_ok=False):
|
||||
def register(
|
||||
config_class,
|
||||
image_processor_class=None,
|
||||
slow_image_processor_class=None,
|
||||
fast_image_processor_class=None,
|
||||
exist_ok=False,
|
||||
):
|
||||
"""
|
||||
Register a new image processor for this class.
|
||||
|
||||
@@ -426,4 +502,43 @@ class AutoImageProcessor:
|
||||
The configuration corresponding to the model to register.
|
||||
image_processor_class ([`ImageProcessingMixin`]): The image processor to register.
|
||||
"""
|
||||
IMAGE_PROCESSOR_MAPPING.register(config_class, image_processor_class, exist_ok=exist_ok)
|
||||
if image_processor_class is not None:
|
||||
if slow_image_processor_class is not None:
|
||||
raise ValueError("Cannot specify both image_processor_class and slow_image_processor_class")
|
||||
warnings.warn(
|
||||
"The image_processor_class argument is deprecated and will be removed in v4.42. Please use `slow_image_processor_class`, or `fast_image_processor_class` instead",
|
||||
FutureWarning,
|
||||
)
|
||||
slow_image_processor_class = image_processor_class
|
||||
|
||||
if slow_image_processor_class is None and fast_image_processor_class is None:
|
||||
raise ValueError("You need to specify either slow_image_processor_class or fast_image_processor_class")
|
||||
if slow_image_processor_class is not None and issubclass(slow_image_processor_class, BaseImageProcessorFast):
|
||||
raise ValueError("You passed a fast image processor in as the `slow_image_processor_class`.")
|
||||
if fast_image_processor_class is not None and issubclass(fast_image_processor_class, BaseImageProcessor):
|
||||
raise ValueError("You passed a slow image processor in as the `fast_image_processor_class`.")
|
||||
|
||||
if (
|
||||
slow_image_processor_class is not None
|
||||
and fast_image_processor_class is not None
|
||||
and issubclass(fast_image_processor_class, BaseImageProcessorFast)
|
||||
and fast_image_processor_class.slow_image_processor_class != slow_image_processor_class
|
||||
):
|
||||
raise ValueError(
|
||||
"The fast processor class you are passing has a `slow_image_processor_class` attribute that is not "
|
||||
"consistent with the slow processor class you passed (fast tokenizer has "
|
||||
f"{fast_image_processor_class.slow_image_processor_class} and you passed {slow_image_processor_class}. Fix one of those "
|
||||
"so they match!"
|
||||
)
|
||||
|
||||
# Avoid resetting a set slow/fast image processor if we are passing just the other ones.
|
||||
if config_class in IMAGE_PROCESSOR_MAPPING._extra_content:
|
||||
existing_slow, existing_fast = IMAGE_PROCESSOR_MAPPING[config_class]
|
||||
if slow_image_processor_class is None:
|
||||
slow_image_processor_class = existing_slow
|
||||
if fast_image_processor_class is None:
|
||||
fast_image_processor_class = existing_fast
|
||||
|
||||
IMAGE_PROCESSOR_MAPPING.register(
|
||||
config_class, (slow_image_processor_class, fast_image_processor_class), exist_ok=exist_ok
|
||||
)
|
||||
|
||||
@@ -19,6 +19,7 @@ from ...utils import (
|
||||
is_flax_available,
|
||||
is_tf_available,
|
||||
is_torch_available,
|
||||
is_torchvision_available,
|
||||
is_vision_available,
|
||||
)
|
||||
|
||||
@@ -34,6 +35,15 @@ else:
|
||||
_import_structure["feature_extraction_vit"] = ["ViTFeatureExtractor"]
|
||||
_import_structure["image_processing_vit"] = ["ViTImageProcessor"]
|
||||
|
||||
|
||||
try:
|
||||
if not is_torchvision_available():
|
||||
raise OptionalDependencyNotAvailable()
|
||||
except OptionalDependencyNotAvailable:
|
||||
pass
|
||||
else:
|
||||
_import_structure["image_processing_vit_fast"] = ["ViTImageProcessorFast"]
|
||||
|
||||
try:
|
||||
if not is_torch_available():
|
||||
raise OptionalDependencyNotAvailable()
|
||||
@@ -83,6 +93,14 @@ if TYPE_CHECKING:
|
||||
from .feature_extraction_vit import ViTFeatureExtractor
|
||||
from .image_processing_vit import ViTImageProcessor
|
||||
|
||||
try:
|
||||
if not is_torchvision_available():
|
||||
raise OptionalDependencyNotAvailable()
|
||||
except OptionalDependencyNotAvailable:
|
||||
pass
|
||||
else:
|
||||
from .image_processing_vit_fast import ViTImageProcessorFast
|
||||
|
||||
try:
|
||||
if not is_torch_available():
|
||||
raise OptionalDependencyNotAvailable()
|
||||
|
||||
289
src/transformers/models/vit/image_processing_vit_fast.py
Normal file
289
src/transformers/models/vit/image_processing_vit_fast.py
Normal file
@@ -0,0 +1,289 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Fast Image processor class for ViT."""
|
||||
|
||||
import functools
|
||||
from typing import Dict, List, Optional, Union
|
||||
|
||||
from ...image_processing_base import BatchFeature
|
||||
from ...image_processing_utils import get_size_dict
|
||||
from ...image_processing_utils_fast import BaseImageProcessorFast, SizeDict
|
||||
from ...image_transforms import FusedRescaleNormalize, NumpyToTensor, Rescale
|
||||
from ...image_utils import (
|
||||
IMAGENET_STANDARD_MEAN,
|
||||
IMAGENET_STANDARD_STD,
|
||||
ChannelDimension,
|
||||
ImageInput,
|
||||
ImageType,
|
||||
PILImageResampling,
|
||||
get_image_type,
|
||||
make_list_of_images,
|
||||
pil_torch_interpolation_mapping,
|
||||
)
|
||||
from ...utils import TensorType, logging
|
||||
from ...utils.import_utils import is_torch_available, is_torchvision_available
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
if is_torch_available():
|
||||
import torch
|
||||
|
||||
|
||||
if is_torchvision_available():
|
||||
from torchvision.transforms import Compose, Normalize, PILToTensor, Resize
|
||||
|
||||
|
||||
class ViTImageProcessorFast(BaseImageProcessorFast):
|
||||
r"""
|
||||
Constructs a ViT image processor.
|
||||
|
||||
Args:
|
||||
do_resize (`bool`, *optional*, defaults to `True`):
|
||||
Whether to resize the image's (height, width) dimensions to the specified `(size["height"],
|
||||
size["width"])`. Can be overridden by the `do_resize` parameter in the `preprocess` method.
|
||||
size (`dict`, *optional*, defaults to `{"height": 224, "width": 224}`):
|
||||
Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess`
|
||||
method.
|
||||
resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`):
|
||||
Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in the
|
||||
`preprocess` method.
|
||||
do_rescale (`bool`, *optional*, defaults to `True`):
|
||||
Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale`
|
||||
parameter in the `preprocess` method.
|
||||
rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
|
||||
Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the
|
||||
`preprocess` method.
|
||||
do_normalize (`bool`, *optional*, defaults to `True`):
|
||||
Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
|
||||
method.
|
||||
image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`):
|
||||
Mean to use if normalizing the image. This is a float or list of floats the length of the number of
|
||||
channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
|
||||
image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`):
|
||||
Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
|
||||
number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
|
||||
"""
|
||||
|
||||
model_input_names = ["pixel_values"]
|
||||
_transform_params = [
|
||||
"do_resize",
|
||||
"do_rescale",
|
||||
"do_normalize",
|
||||
"size",
|
||||
"resample",
|
||||
"rescale_factor",
|
||||
"image_mean",
|
||||
"image_std",
|
||||
"image_type",
|
||||
]
|
||||
|
||||
def __init__(
    self,
    do_resize: bool = True,
    size: Optional[Dict[str, int]] = None,
    resample: PILImageResampling = PILImageResampling.BILINEAR,
    do_rescale: bool = True,
    rescale_factor: Union[int, float] = 1 / 255,
    do_normalize: bool = True,
    image_mean: Optional[Union[float, List[float]]] = None,
    image_std: Optional[Union[float, List[float]]] = None,
    **kwargs,
) -> None:
    """
    Store the default preprocessing settings on the instance.

    Unset values fall back to: `size` -> `{"height": 224, "width": 224}` (passed through `get_size_dict`),
    `image_mean` -> `IMAGENET_STANDARD_MEAN`, `image_std` -> `IMAGENET_STANDARD_STD`. Any extra keyword
    arguments are forwarded to the parent constructor.
    """
    super().__init__(**kwargs)

    # Resolve the fallbacks up front so the attribute assignments below stay uniform.
    if size is None:
        size = {"height": 224, "width": 224}
    normalized_size = get_size_dict(size)
    if image_mean is None:
        image_mean = IMAGENET_STANDARD_MEAN
    if image_std is None:
        image_std = IMAGENET_STANDARD_STD

    self.do_resize = do_resize
    self.do_rescale = do_rescale
    self.do_normalize = do_normalize
    self.size = normalized_size
    self.resample = resample
    self.rescale_factor = rescale_factor
    self.image_mean = image_mean
    self.image_std = image_std
    # Starts empty; nothing in this constructor populates it.
    self._transform_settings = {}
|
||||
|
||||
def _build_transforms(
    self,
    do_resize: bool,
    size: Dict[str, int],
    resample: PILImageResampling,
    do_rescale: bool,
    rescale_factor: float,
    do_normalize: bool,
    image_mean: Union[float, List[float]],
    image_std: Union[float, List[float]],
    image_type: ImageType,
) -> "Compose":
    """
    Assemble the preprocessing pipeline for the given settings as a `torchvision.transforms.Compose`.

    Steps are added in this order, each only when applicable: conversion to a tensor (for PIL or numpy
    inputs), resizing to `(size["height"], size["width"])`, then rescaling and/or normalization.
    """
    pipeline = []

    # PIL and numpy inputs are first turned into tensors to keep cross compatibility with the slow
    # image processors; other image types get no conversion step.
    if image_type == ImageType.PIL:
        pipeline.append(PILToTensor())
    elif image_type == ImageType.NUMPY:
        pipeline.append(NumpyToTensor())

    if do_resize:
        target_hw = (size["height"], size["width"])
        pipeline.append(Resize(target_hw, interpolation=pil_torch_interpolation_mapping[resample]))

    if do_rescale:
        if do_normalize:
            # Both requested: a single fused step is used for speed.
            pipeline.append(FusedRescaleNormalize(image_mean, image_std, rescale_factor=rescale_factor))
        else:
            pipeline.append(Rescale(rescale_factor=rescale_factor))
    elif do_normalize:
        pipeline.append(Normalize(image_mean, image_std))

    return Compose(pipeline)
|
||||
|
||||
# NOTE(review): `lru_cache` on a method includes `self` in the cache key, so the most recently validated
# instance stays referenced by the cache, and every argument must be hashable - confirm this is intended.
@functools.lru_cache(maxsize=1)
def _validate_input_arguments(
    self,
    return_tensors: Union[str, TensorType],
    do_resize: bool,
    size: Dict[str, int],
    resample: PILImageResampling,
    do_rescale: bool,
    rescale_factor: float,
    do_normalize: bool,
    image_mean: Union[float, List[float]],
    image_std: Union[float, List[float]],
    data_format: Union[str, ChannelDimension],
    image_type: ImageType,
):
    """
    Check that the preprocessing arguments form a supported, complete combination.

    Returns nothing on success. `image_type` is accepted but not inspected here.

    Raises:
        ValueError: If `return_tensors` is not `"pt"`, if `data_format` is not `ChannelDimension.FIRST`, or if
            an enabled step (`do_resize`, `do_rescale`, `do_normalize`) is missing one of its settings.
    """
    # Output restrictions come first: only PyTorch tensors in channels-first layout are produced.
    if return_tensors != "pt":
        raise ValueError("Only returning PyTorch tensors is currently supported.")
    if data_format != ChannelDimension.FIRST:
        raise ValueError("Only channel first data format is currently supported.")

    # Each enabled step needs all of its settings.
    if do_resize:
        if None in (size, resample):
            raise ValueError("Size and resample must be specified if do_resize is True.")

    if do_rescale:
        if rescale_factor is None:
            raise ValueError("Rescale factor must be specified if do_rescale is True.")

    if do_normalize:
        if None in (image_mean, image_std):
            raise ValueError("Image mean and standard deviation must be specified if do_normalize is True.")
|
||||
|
||||
def preprocess(
|
||||
self,
|
||||
images: ImageInput,
|
||||
do_resize: Optional[bool] = None,
|
||||
size: Dict[str, int] = None,
|
||||
resample: PILImageResampling = None,
|
||||
do_rescale: Optional[bool] = None,
|
||||
rescale_factor: Optional[float] = None,
|
||||
do_normalize: Optional[bool] = None,
|
||||
image_mean: Optional[Union[float, List[float]]] = None,
|
||||
image_std: Optional[Union[float, List[float]]] = None,
|
||||
return_tensors: Optional[Union[str, TensorType]] = "pt",
|
||||
data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,
|
||||
input_data_format: Optional[Union[str, ChannelDimension]] = None,
|
||||
**kwargs,
|
||||
):
|
||||
"""
|
||||
Preprocess an image or batch of images.
|
||||
|
||||
Args:
|
||||
images (`ImageInput`):
|
||||
Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
|
||||
passing in images with pixel values between 0 and 1, set `do_rescale=False`.
|
||||
do_resize (`bool`, *optional*, defaults to `self.do_resize`):
|
||||
Whether to resize the image.
|
||||
size (`Dict[str, int]`, *optional*, defaults to `self.size`):
|
||||
Dictionary in the format `{"height": h, "width": w}` specifying the size of the output image after
|
||||
resizing.
|
||||
resample (`PILImageResampling` filter, *optional*, defaults to `self.resample`):
|
||||
`PILImageResampling` filter to use if resizing the image e.g. `PILImageResampling.BILINEAR`. Only has
|
||||
an effect if `do_resize` is set to `True`.
|
||||
do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
|
||||
Whether to rescale the image values between [0 - 1].
|
||||
rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
|
||||
Rescale factor to rescale the image by if `do_rescale` is set to `True`.
|
||||
do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
|
||||
Whether to normalize the image.
|
||||
image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
|
||||
Image mean to use if `do_normalize` is set to `True`.
|
||||
image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
|
||||
Image standard deviation to use if `do_normalize` is set to `True`.
|
||||
return_tensors (`str` or `TensorType`, *optional*):
|
||||
The type of tensors to return. Only "pt" is supported
|
||||
data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
|
||||
The channel dimension format for the output image. The following formats are currently supported:
|
||||
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
|
||||
input_data_format (`ChannelDimension` or `str`, *optional*):
|
||||
The channel dimension format for the input image. If unset, the channel dimension format is inferred
|
||||
from the input image. Can be one of:
|
||||
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
|
||||
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
|
||||
- `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
|
||||
"""
|
||||
do_resize = do_resize if do_resize is not None else self.do_resize
|
||||
do_rescale = do_rescale if do_rescale is not None else self.do_rescale
|
||||
do_normalize = do_normalize if do_normalize is not None else self.do_normalize
|
||||
resample = resample if resample is not None else self.resample
|
||||
rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
|
||||
image_mean = image_mean if image_mean is not None else self.image_mean
|
||||
image_std = image_std if image_std is not None else self.image_std
|
||||
size = size if size is not None else self.size
|
||||
# Make hashable for cache
|
||||
size = SizeDict(**size)
|
||||
image_mean = tuple(image_mean) if isinstance(image_mean, list) else image_mean
|
||||
image_std = tuple(image_std) if isinstance(image_std, list) else image_std
|
||||
|
||||
images = make_list_of_images(images)
|
||||
image_type = get_image_type(images[0])
|
||||
|
||||
if image_type not in [ImageType.PIL, ImageType.TORCH, ImageType.NUMPY]:
|
||||
raise ValueError(f"Unsupported input image type {image_type}")
|
||||
|
||||
self._validate_input_arguments(
|
||||
do_resize=do_resize,
|
||||
size=size,
|
||||
resample=resample,
|
||||
do_rescale=do_rescale,
|
||||
rescale_factor=rescale_factor,
|
||||
do_normalize=do_normalize,
|
||||
image_mean=image_mean,
|
||||
image_std=image_std,
|
||||
return_tensors=return_tensors,
|
||||
data_format=data_format,
|
||||
image_type=image_type,
|
||||
)
|
||||
|
||||
transforms = self.get_transforms(
|
||||
do_resize=do_resize,
|
||||
do_rescale=do_rescale,
|
||||
do_normalize=do_normalize,
|
||||
size=size,
|
||||
resample=resample,
|
||||
rescale_factor=rescale_factor,
|
||||
image_mean=image_mean,
|
||||
image_std=image_std,
|
||||
image_type=image_type,
|
||||
)
|
||||
transformed_images = [transforms(image) for image in images]
|
||||
|
||||
data = {"pixel_values": torch.vstack(transformed_images)}
|
||||
return BatchFeature(data, tensor_type=return_tensors)
|
||||
16
src/transformers/utils/dummy_torchvision_objects.py
Normal file
16
src/transformers/utils/dummy_torchvision_objects.py
Normal file
@@ -0,0 +1,16 @@
|
||||
# This file is autogenerated by the command `make fix-copies`, do not edit.
|
||||
from ..utils import DummyObject, requires_backends
|
||||
|
||||
|
||||
class BaseImageProcessorFast(metaclass=DummyObject):
|
||||
_backends = ["torchvision"]
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
requires_backends(self, ["torchvision"])
|
||||
|
||||
|
||||
class ViTImageProcessorFast(metaclass=DummyObject):
|
||||
_backends = ["torchvision"]
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
requires_backends(self, ["torchvision"])
|
||||
@@ -9,6 +9,13 @@ class ImageProcessingMixin(metaclass=DummyObject):
|
||||
requires_backends(self, ["vision"])
|
||||
|
||||
|
||||
class BaseImageProcessor(metaclass=DummyObject):
|
||||
_backends = ["vision"]
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
requires_backends(self, ["vision"])
|
||||
|
||||
|
||||
class ImageFeatureExtractionMixin(metaclass=DummyObject):
|
||||
_backends = ["vision"]
|
||||
|
||||
|
||||
@@ -27,8 +27,10 @@ from transformers import (
|
||||
AutoImageProcessor,
|
||||
CLIPConfig,
|
||||
CLIPImageProcessor,
|
||||
ViTImageProcessor,
|
||||
ViTImageProcessorFast,
|
||||
)
|
||||
from transformers.testing_utils import DUMMY_UNKNOWN_IDENTIFIER
|
||||
from transformers.testing_utils import DUMMY_UNKNOWN_IDENTIFIER, require_torchvision, require_vision
|
||||
|
||||
|
||||
sys.path.append(str(Path(__file__).parent.parent.parent.parent / "utils"))
|
||||
@@ -133,6 +135,23 @@ class AutoImageProcessorTest(unittest.TestCase):
|
||||
):
|
||||
_ = AutoImageProcessor.from_pretrained("hf-internal-testing/config-no-model")
|
||||
|
||||
@require_vision
@require_torchvision
def test_use_fast_selection(self):
    """Check which ViT image processor class `AutoImageProcessor` returns for each `use_fast` setting."""
    checkpoint = "hf-internal-testing/tiny-random-vit"

    # Each entry: (extra `from_pretrained` kwargs, class the loaded processor must be an instance of).
    scenarios = (
        ({}, ViTImageProcessor),  # slow image processor is selected by default
        ({"use_fast": True}, ViTImageProcessorFast),  # fast image processor when use_fast=True
        ({"use_fast": False}, ViTImageProcessor),  # slow image processor when use_fast=False
    )
    for load_kwargs, expected_class in scenarios:
        loaded_processor = AutoImageProcessor.from_pretrained(checkpoint, **load_kwargs)
        self.assertIsInstance(loaded_processor, expected_class)
|
||||
|
||||
def test_from_pretrained_dynamic_image_processor(self):
|
||||
# If remote code is not set, we will time out when asking whether to load the model.
|
||||
with self.assertRaises(ValueError):
|
||||
|
||||
@@ -121,6 +121,7 @@ class BeitImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
||||
image_processing_class = BeitImageProcessor if is_vision_available() else None
|
||||
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
self.image_processor_tester = BeitImageProcessingTester(self)
|
||||
|
||||
@property
|
||||
|
||||
@@ -90,6 +90,7 @@ class BlipImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
||||
image_processing_class = BlipImageProcessor if is_vision_available() else None
|
||||
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
self.image_processor_tester = BlipImageProcessingTester(self)
|
||||
|
||||
@property
|
||||
@@ -112,6 +113,7 @@ class BlipImageProcessingTestFourChannels(ImageProcessingTestMixin, unittest.Tes
|
||||
image_processing_class = BlipImageProcessor if is_vision_available() else None
|
||||
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
self.image_processor_tester = BlipImageProcessingTester(self, num_channels=4)
|
||||
self.expected_encoded_image_num_channels = 3
|
||||
|
||||
|
||||
@@ -136,6 +136,7 @@ class BridgeTowerImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase
|
||||
image_processing_class = BridgeTowerImageProcessor if is_vision_available() else None
|
||||
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
self.image_processor_tester = BridgeTowerImageProcessingTester(self)
|
||||
|
||||
@property
|
||||
|
||||
@@ -98,6 +98,7 @@ class ChineseCLIPImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase
|
||||
image_processing_class = ChineseCLIPImageProcessor if is_vision_available() else None
|
||||
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
self.image_processor_tester = ChineseCLIPImageProcessingTester(self, do_center_crop=True)
|
||||
|
||||
@property
|
||||
@@ -135,6 +136,7 @@ class ChineseCLIPImageProcessingTestFourChannels(ImageProcessingTestMixin, unitt
|
||||
image_processing_class = ChineseCLIPImageProcessor if is_vision_available() else None
|
||||
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
self.image_processor_tester = ChineseCLIPImageProcessingTester(self, num_channels=4, do_center_crop=True)
|
||||
self.expected_encoded_image_num_channels = 3
|
||||
|
||||
|
||||
@@ -94,6 +94,7 @@ class CLIPImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
||||
image_processing_class = CLIPImageProcessor if is_vision_available() else None
|
||||
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
self.image_processor_tester = CLIPImageProcessingTester(self)
|
||||
|
||||
@property
|
||||
|
||||
@@ -131,6 +131,7 @@ class ConditionalDetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcess
|
||||
image_processing_class = ConditionalDetrImageProcessor if is_vision_available() else None
|
||||
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
self.image_processor_tester = ConditionalDetrImageProcessingTester(self)
|
||||
|
||||
@property
|
||||
|
||||
@@ -87,6 +87,7 @@ class ConvNextImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
||||
image_processing_class = ConvNextImageProcessor if is_vision_available() else None
|
||||
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
self.image_processor_tester = ConvNextImageProcessingTester(self)
|
||||
|
||||
@property
|
||||
|
||||
@@ -131,6 +131,7 @@ class DeformableDetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcessi
|
||||
image_processing_class = DeformableDetrImageProcessor if is_vision_available() else None
|
||||
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
self.image_processor_tester = DeformableDetrImageProcessingTester(self)
|
||||
|
||||
@property
|
||||
|
||||
@@ -93,6 +93,7 @@ class DeiTImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
||||
test_cast_dtype = True
|
||||
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
self.image_processor_tester = DeiTImageProcessingTester(self)
|
||||
|
||||
@property
|
||||
|
||||
@@ -130,6 +130,7 @@ class DetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMixi
|
||||
image_processing_class = DetrImageProcessor if is_vision_available() else None
|
||||
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
self.image_processor_tester = DetrImageProcessingTester(self)
|
||||
|
||||
@property
|
||||
|
||||
@@ -99,6 +99,7 @@ class DonutImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
||||
image_processing_class = DonutImageProcessor if is_vision_available() else None
|
||||
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
self.image_processor_tester = DonutImageProcessingTester(self)
|
||||
|
||||
@property
|
||||
|
||||
@@ -86,6 +86,7 @@ class DPTImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
||||
image_processing_class = DPTImageProcessor if is_vision_available() else None
|
||||
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
self.image_processor_tester = DPTImageProcessingTester(self)
|
||||
|
||||
@property
|
||||
|
||||
@@ -86,6 +86,7 @@ class EfficientNetImageProcessorTest(ImageProcessingTestMixin, unittest.TestCase
|
||||
image_processing_class = EfficientNetImageProcessor if is_vision_available() else None
|
||||
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
self.image_processor_tester = EfficientNetImageProcessorTester(self)
|
||||
|
||||
@property
|
||||
|
||||
@@ -175,6 +175,7 @@ class FlavaImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
||||
maxDiff = None
|
||||
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
self.image_processor_tester = FlavaImageProcessingTester(self)
|
||||
|
||||
@property
|
||||
|
||||
@@ -93,6 +93,7 @@ class GLPNImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
||||
image_processing_class = GLPNImageProcessor if is_vision_available() else None
|
||||
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
self.image_processor_tester = GLPNImageProcessingTester(self)
|
||||
|
||||
@property
|
||||
|
||||
@@ -146,6 +146,7 @@ class GroundingDinoImageProcessingTest(AnnotationFormatTestMixin, ImageProcessin
|
||||
image_processing_class = GroundingDinoImageProcessor if is_vision_available() else None
|
||||
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
self.image_processor_tester = GroundingDinoImageProcessingTester(self)
|
||||
|
||||
@property
|
||||
|
||||
@@ -127,6 +127,7 @@ class IdeficsImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
||||
image_processing_class = IdeficsImageProcessor if is_vision_available() else None
|
||||
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
self.image_processor_tester = IdeficsImageProcessingTester(self)
|
||||
|
||||
@property
|
||||
|
||||
@@ -185,6 +185,7 @@ class Idefics2ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
||||
image_processing_class = Idefics2ImageProcessor if is_vision_available() else None
|
||||
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
self.image_processor_tester = Idefics2ImageProcessingTester(self)
|
||||
|
||||
@property
|
||||
|
||||
@@ -22,7 +22,8 @@ import unittest
|
||||
import numpy as np
|
||||
from datasets import load_dataset
|
||||
|
||||
from transformers.testing_utils import require_torch, require_vision, slow
|
||||
from transformers import AutoImageProcessor
|
||||
from transformers.testing_utils import check_json_file_has_correct_format, require_torch, require_vision, slow
|
||||
from transformers.utils import is_torch_available, is_vision_available
|
||||
|
||||
from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs
|
||||
@@ -96,6 +97,7 @@ class ImageGPTImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
||||
image_processing_class = ImageGPTImageProcessor if is_vision_available() else None
|
||||
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
self.image_processor_tester = ImageGPTImageProcessingTester(self)
|
||||
|
||||
@property
|
||||
@@ -141,18 +143,38 @@ class ImageGPTImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
||||
self.assertEqual(image_processor_first[key], value)
|
||||
|
||||
def test_image_processor_from_and_save_pretrained(self):
|
||||
image_processor_first = self.image_processing_class(**self.image_processor_dict)
|
||||
for image_processing_class in self.image_processor_list:
|
||||
image_processor_first = self.image_processing_class(**self.image_processor_dict)
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdirname:
|
||||
image_processor_first.save_pretrained(tmpdirname)
|
||||
image_processor_second = self.image_processing_class.from_pretrained(tmpdirname).to_dict()
|
||||
with tempfile.TemporaryDirectory() as tmpdirname:
|
||||
image_processor_first.save_pretrained(tmpdirname)
|
||||
image_processor_second = self.image_processing_class.from_pretrained(tmpdirname).to_dict()
|
||||
|
||||
image_processor_first = image_processor_first.to_dict()
|
||||
for key, value in image_processor_first.items():
|
||||
if key == "clusters":
|
||||
self.assertTrue(np.array_equal(value, image_processor_second[key]))
|
||||
else:
|
||||
self.assertEqual(image_processor_first[key], value)
|
||||
image_processor_first = image_processor_first.to_dict()
|
||||
for key, value in image_processor_first.items():
|
||||
if key == "clusters":
|
||||
self.assertTrue(np.array_equal(value, image_processor_second[key]))
|
||||
else:
|
||||
self.assertEqual(image_processor_first[key], value)
|
||||
|
||||
def test_image_processor_save_load_with_autoimageprocessor(self):
    """Round-trip every image processor class through `save_pretrained` and `AutoImageProcessor.from_pretrained`.

    The saved JSON file is checked for correct formatting, then every entry of the original processor's
    `to_dict()` is compared with the reloaded processor's entry for the same key.
    """
    for image_processing_class in self.image_processor_list:
        image_processor_first = image_processing_class(**self.image_processor_dict)

        with tempfile.TemporaryDirectory() as tmpdirname:
            saved_file = image_processor_first.save_pretrained(tmpdirname)[0]
            check_json_file_has_correct_format(saved_file)

            # Must be loaded while the temporary directory still exists.
            image_processor_second = AutoImageProcessor.from_pretrained(tmpdirname)

        first_dict = image_processor_first.to_dict()
        second_dict = image_processor_second.to_dict()

        for key, value in first_dict.items():
            if key == "clusters":
                # Compared with numpy so that array-valued entries are checked element-wise.
                self.assertTrue(np.array_equal(value, second_dict[key]))
            else:
                # Compare against the *reloaded* processor; comparing the first dict with itself
                # would pass regardless of what was saved and loaded.
                self.assertEqual(second_dict[key], value)
|
||||
|
||||
@unittest.skip("ImageGPT requires clusters at initialization")
|
||||
def test_init_without_params(self):
|
||||
|
||||
@@ -76,6 +76,7 @@ class LayoutLMv2ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase)
|
||||
image_processing_class = LayoutLMv2ImageProcessor if is_pytesseract_available() else None
|
||||
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
self.image_processor_tester = LayoutLMv2ImageProcessingTester(self)
|
||||
|
||||
@property
|
||||
|
||||
@@ -76,6 +76,7 @@ class LayoutLMv3ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase)
|
||||
image_processing_class = LayoutLMv3ImageProcessor if is_pytesseract_available() else None
|
||||
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
self.image_processor_tester = LayoutLMv3ImageProcessingTester(self)
|
||||
|
||||
@property
|
||||
|
||||
@@ -91,6 +91,7 @@ class LevitImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
||||
image_processing_class = LevitImageProcessor if is_vision_available() else None
|
||||
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
self.image_processor_tester = LevitImageProcessingTester(self)
|
||||
|
||||
@property
|
||||
|
||||
@@ -105,6 +105,7 @@ class LlavaNextImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
||||
|
||||
# Copied from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTest.setUp with CLIP->LlavaNext
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
self.image_processor_tester = LlavaNextImageProcessingTester(self)
|
||||
|
||||
@property
|
||||
|
||||
@@ -149,6 +149,7 @@ class Mask2FormerImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase
|
||||
image_processing_class = Mask2FormerImageProcessor if (is_vision_available() and is_torch_available()) else None
|
||||
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
self.image_processor_tester = Mask2FormerImageProcessingTester(self)
|
||||
|
||||
@property
|
||||
|
||||
@@ -149,6 +149,7 @@ class MaskFormerImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase)
|
||||
image_processing_class = MaskFormerImageProcessor if (is_vision_available() and is_torch_available()) else None
|
||||
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
self.image_processor_tester = MaskFormerImageProcessingTester(self)
|
||||
|
||||
@property
|
||||
|
||||
@@ -82,6 +82,7 @@ class MobileNetV1ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase
|
||||
image_processing_class = MobileNetV1ImageProcessor if is_vision_available() else None
|
||||
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
self.image_processor_tester = MobileNetV1ImageProcessingTester(self)
|
||||
|
||||
@property
|
||||
|
||||
@@ -82,6 +82,7 @@ class MobileNetV2ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase
|
||||
image_processing_class = MobileNetV2ImageProcessor if is_vision_available() else None
|
||||
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
self.image_processor_tester = MobileNetV2ImageProcessingTester(self)
|
||||
|
||||
@property
|
||||
|
||||
@@ -112,6 +112,7 @@ class MobileViTImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
||||
image_processing_class = MobileViTImageProcessor if is_vision_available() else None
|
||||
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
self.image_processor_tester = MobileViTImageProcessingTester(self)
|
||||
|
||||
@property
|
||||
|
||||
@@ -111,6 +111,7 @@ class NougatImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
||||
image_processing_class = NougatImageProcessor if is_vision_available() else None
|
||||
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
self.image_processor_tester = NougatImageProcessingTester(self)
|
||||
|
||||
@property
|
||||
|
||||
@@ -159,6 +159,7 @@ class OneFormerImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
||||
image_processing_class = image_processing_class
|
||||
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
self.image_processor_tester = OneFormerImageProcessorTester(self)
|
||||
|
||||
@property
|
||||
|
||||
@@ -90,6 +90,7 @@ class Owlv2ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
||||
image_processing_class = Owlv2ImageProcessor if is_vision_available() else None
|
||||
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
self.image_processor_tester = Owlv2ImageProcessingTester(self)
|
||||
|
||||
@property
|
||||
|
||||
@@ -92,6 +92,7 @@ class OwlViTImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
||||
image_processing_class = OwlViTImageProcessor if is_vision_available() else None
|
||||
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
self.image_processor_tester = OwlViTImageProcessingTester(self)
|
||||
|
||||
@property
|
||||
|
||||
@@ -87,6 +87,7 @@ class Pix2StructImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase)
|
||||
image_processing_class = Pix2StructImageProcessor if is_vision_available() else None
|
||||
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
self.image_processor_tester = Pix2StructImageProcessingTester(self)
|
||||
|
||||
@property
|
||||
@@ -288,6 +289,7 @@ class Pix2StructImageProcessingTestFourChannels(ImageProcessingTestMixin, unitte
|
||||
image_processing_class = Pix2StructImageProcessor if is_vision_available() else None
|
||||
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
self.image_processor_tester = Pix2StructImageProcessingTester(self, num_channels=4)
|
||||
self.expected_encoded_image_num_channels = 3
|
||||
|
||||
|
||||
@@ -88,6 +88,7 @@ class PoolFormerImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase)
|
||||
image_processing_class = PoolFormerImageProcessor if is_vision_available() else None
|
||||
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
self.image_processor_tester = PoolFormerImageProcessingTester(self)
|
||||
|
||||
@property
|
||||
|
||||
@@ -84,6 +84,7 @@ class PvtImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
||||
image_processing_class = PvtImageProcessor if is_vision_available() else None
|
||||
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
self.image_processor_tester = PvtImageProcessingTester(self)
|
||||
|
||||
@property
|
||||
|
||||
@@ -112,6 +112,7 @@ class SegformerImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
||||
image_processing_class = SegformerImageProcessor if is_vision_available() else None
|
||||
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
self.image_processor_tester = SegformerImageProcessingTester(self)
|
||||
|
||||
@property
|
||||
|
||||
@@ -114,6 +114,7 @@ class SegGptImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
||||
image_processing_class = SegGptImageProcessor if is_vision_available() else None
|
||||
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
self.image_processor_tester = SegGptImageProcessingTester(self)
|
||||
|
||||
@property
|
||||
|
||||
@@ -91,6 +91,7 @@ class SiglipImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
||||
image_processing_class = SiglipImageProcessor if is_vision_available() else None
|
||||
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
self.image_processor_tester = SiglipImageProcessingTester(self)
|
||||
|
||||
@property
|
||||
|
||||
@@ -77,6 +77,7 @@ class SuperPointImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase)
|
||||
image_processing_class = SuperPointImageProcessor if is_vision_available() else None
|
||||
|
||||
def setUp(self) -> None:
|
||||
super().setUp()
|
||||
self.image_processor_tester = SuperPointImageProcessingTester(self)
|
||||
|
||||
@property
|
||||
|
||||
@@ -98,6 +98,7 @@ class Swin2SRImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
||||
image_processing_class = Swin2SRImageProcessor if is_vision_available() else None
|
||||
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
self.image_processor_tester = Swin2SRImageProcessingTester(self)
|
||||
|
||||
@property
|
||||
|
||||
@@ -127,6 +127,7 @@ class TvpImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
||||
image_processing_class = TvpImageProcessor if is_vision_available() else None
|
||||
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
self.image_processor_tester = TvpImageProcessingTester(self)
|
||||
|
||||
@property
|
||||
|
||||
@@ -128,6 +128,7 @@ class VideoLlavaImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase)
|
||||
|
||||
# Copied from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTest.setUp with CLIP->VideoLlava
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
self.image_processor_tester = VideoLlavaImageProcessingTester(self)
|
||||
|
||||
@property
|
||||
|
||||
@@ -99,6 +99,7 @@ class VideoMAEImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
||||
image_processing_class = VideoMAEImageProcessor if is_vision_available() else None
|
||||
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
self.image_processor_tester = VideoMAEImageProcessingTester(self)
|
||||
|
||||
@property
|
||||
|
||||
@@ -130,6 +130,7 @@ class ViltImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
||||
image_processing_class = ViltImageProcessor if is_vision_available() else None
|
||||
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
self.image_processor_tester = ViltImageProcessingTester(self)
|
||||
|
||||
@property
|
||||
|
||||
@@ -84,6 +84,7 @@ class ViTImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
||||
image_processing_class = ViTImageProcessor if is_vision_available() else None
|
||||
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
self.image_processor_tester = ViTImageProcessingTester(self)
|
||||
|
||||
@property
|
||||
@@ -91,16 +92,18 @@ class ViTImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
||||
return self.image_processor_tester.prepare_image_processor_dict()
|
||||
|
||||
def test_image_processor_properties(self):
|
||||
image_processing = self.image_processing_class(**self.image_processor_dict)
|
||||
self.assertTrue(hasattr(image_processing, "image_mean"))
|
||||
self.assertTrue(hasattr(image_processing, "image_std"))
|
||||
self.assertTrue(hasattr(image_processing, "do_normalize"))
|
||||
self.assertTrue(hasattr(image_processing, "do_resize"))
|
||||
self.assertTrue(hasattr(image_processing, "size"))
|
||||
for image_processing_class in self.image_processor_list:
|
||||
image_processing = image_processing_class(**self.image_processor_dict)
|
||||
self.assertTrue(hasattr(image_processing, "image_mean"))
|
||||
self.assertTrue(hasattr(image_processing, "image_std"))
|
||||
self.assertTrue(hasattr(image_processing, "do_normalize"))
|
||||
self.assertTrue(hasattr(image_processing, "do_resize"))
|
||||
self.assertTrue(hasattr(image_processing, "size"))
|
||||
|
||||
def test_image_processor_from_dict_with_kwargs(self):
|
||||
image_processor = self.image_processing_class.from_dict(self.image_processor_dict)
|
||||
self.assertEqual(image_processor.size, {"height": 18, "width": 18})
|
||||
for image_processing_class in self.image_processor_list:
|
||||
image_processor = image_processing_class.from_dict(self.image_processor_dict)
|
||||
self.assertEqual(image_processor.size, {"height": 18, "width": 18})
|
||||
|
||||
image_processor = self.image_processing_class.from_dict(self.image_processor_dict, size=42)
|
||||
self.assertEqual(image_processor.size, {"height": 42, "width": 42})
|
||||
image_processor = image_processing_class.from_dict(self.image_processor_dict, size=42)
|
||||
self.assertEqual(image_processor.size, {"height": 42, "width": 42})
|
||||
|
||||
@@ -94,6 +94,7 @@ class VitMatteImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
||||
image_processing_class = VitMatteImageProcessor if is_vision_available() else None
|
||||
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
self.image_processor_tester = VitMatteImageProcessingTester(self)
|
||||
|
||||
@property
|
||||
|
||||
@@ -99,6 +99,7 @@ class VivitImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
||||
image_processing_class = VivitImageProcessor if is_vision_available() else None
|
||||
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
self.image_processor_tester = VivitImageProcessingTester(self)
|
||||
|
||||
@property
|
||||
|
||||
@@ -143,6 +143,7 @@ class YolosImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMix
|
||||
image_processing_class = YolosImageProcessor if is_vision_available() else None
|
||||
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
self.image_processor_tester = YolosImageProcessingTester(self)
|
||||
|
||||
@property
|
||||
|
||||
@@ -19,7 +19,9 @@ import os
|
||||
import pathlib
|
||||
import tempfile
|
||||
|
||||
from transformers import BatchFeature
|
||||
import requests
|
||||
|
||||
from transformers import AutoImageProcessor, BatchFeature
|
||||
from transformers.image_utils import AnnotationFormat, AnnotionFormat
|
||||
from transformers.testing_utils import check_json_file_has_correct_format, require_torch, require_vision
|
||||
from transformers.utils import is_torch_available, is_vision_available
|
||||
@@ -129,176 +131,263 @@ def prepare_video_inputs(
|
||||
|
||||
class ImageProcessingTestMixin:
|
||||
test_cast_dtype = None
|
||||
image_processing_class = None
|
||||
fast_image_processing_class = None
|
||||
image_processors_list = None
|
||||
test_slow_image_processor = True
|
||||
test_fast_image_processor = True
|
||||
|
||||
def setUp(self):
    """Collect the image processor classes that the shared tests iterate over.

    A class is included only when its `test_*_image_processor` switch is truthy and the corresponding class
    attribute is set. The slow class, when included, comes before the fast one. The result is stored on
    `self.image_processor_list`.
    """
    switch_and_class_attrs = (
        ("test_slow_image_processor", "image_processing_class"),
        ("test_fast_image_processor", "fast_image_processing_class"),
    )
    self.image_processor_list = [
        getattr(self, class_attr)
        for switch_attr, class_attr in switch_and_class_attrs
        if getattr(self, switch_attr) and getattr(self, class_attr)
    ]
|
||||
|
||||
@require_vision
@require_torch
def test_slow_fast_equivalence(self):
    """Check that the slow and fast image processors produce close `pixel_values` for the same image.

    The test is skipped when either processor is disabled through the `test_*_image_processor` switches, or
    when one of the two processor classes is not defined for the model under test.
    """
    # Evaluate the skip conditions first so that skipped tests never download the sample image.
    if not self.test_slow_image_processor or not self.test_fast_image_processor:
        self.skipTest("Skipping slow/fast equivalence test")

    if self.image_processing_class is None or self.fast_image_processing_class is None:
        self.skipTest("Skipping slow/fast equivalence test as one of the image processors is not defined")

    dummy_image = Image.open(
        requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw
    )

    image_processor_slow = self.image_processing_class(**self.image_processor_dict)
    image_processor_fast = self.fast_image_processing_class(**self.image_processor_dict)

    encoding_slow = image_processor_slow(dummy_image, return_tensors="pt")
    encoding_fast = image_processor_fast(dummy_image, return_tensors="pt")

    # The two outputs only need to agree within an absolute tolerance of 1e-3, not bit-for-bit.
    self.assertTrue(torch.allclose(encoding_slow.pixel_values, encoding_fast.pixel_values, atol=1e-3))
|
||||
|
||||
@require_vision
@require_torch
def test_fast_is_faster_than_slow(self):
    """Check that one preprocessing call of the fast image processor takes no longer than the slow one.

    The test is skipped when either processor is disabled through the `test_*_image_processor` switches, or
    when one of the two processor classes is not defined for the model under test.
    """
    import time

    def measure_time(image_processor, image):
        # Plain local helper: it is not an attribute of the test case, so it must be called directly
        # rather than through `self`. `perf_counter` is the clock intended for measuring short intervals.
        start = time.perf_counter()
        _ = image_processor(image, return_tensors="pt")
        return time.perf_counter() - start

    # Evaluate the skip conditions first so that skipped tests never download the sample image.
    if not self.test_slow_image_processor or not self.test_fast_image_processor:
        self.skipTest("Skipping speed test")

    if self.image_processing_class is None or self.fast_image_processing_class is None:
        self.skipTest("Skipping speed test as one of the image processors is not defined")

    dummy_image = Image.open(
        requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw
    )

    image_processor_slow = self.image_processing_class(**self.image_processor_dict)
    image_processor_fast = self.fast_image_processing_class(**self.image_processor_dict)

    slow_time = measure_time(image_processor_slow, dummy_image)
    fast_time = measure_time(image_processor_fast, dummy_image)

    self.assertLessEqual(fast_time, slow_time)
|
||||
|
||||
def test_image_processor_to_json_string(self):
|
||||
image_processor = self.image_processing_class(**self.image_processor_dict)
|
||||
obj = json.loads(image_processor.to_json_string())
|
||||
for key, value in self.image_processor_dict.items():
|
||||
self.assertEqual(obj[key], value)
|
||||
for image_processing_class in self.image_processor_list:
|
||||
image_processor = image_processing_class(**self.image_processor_dict)
|
||||
obj = json.loads(image_processor.to_json_string())
|
||||
for key, value in self.image_processor_dict.items():
|
||||
self.assertEqual(obj[key], value)
|
||||
|
||||
def test_image_processor_to_json_file(self):
|
||||
image_processor_first = self.image_processing_class(**self.image_processor_dict)
|
||||
for image_processing_class in self.image_processor_list:
|
||||
image_processor_first = image_processing_class(**self.image_processor_dict)
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdirname:
|
||||
json_file_path = os.path.join(tmpdirname, "image_processor.json")
|
||||
image_processor_first.to_json_file(json_file_path)
|
||||
image_processor_second = self.image_processing_class.from_json_file(json_file_path)
|
||||
with tempfile.TemporaryDirectory() as tmpdirname:
|
||||
json_file_path = os.path.join(tmpdirname, "image_processor.json")
|
||||
image_processor_first.to_json_file(json_file_path)
|
||||
image_processor_second = image_processing_class.from_json_file(json_file_path)
|
||||
|
||||
self.assertEqual(image_processor_second.to_dict(), image_processor_first.to_dict())
|
||||
self.assertEqual(image_processor_second.to_dict(), image_processor_first.to_dict())
|
||||
|
||||
def test_image_processor_from_and_save_pretrained(self):
    """Check that ``save_pretrained`` followed by ``from_pretrained`` round-trips.

    For each class in ``self.image_processor_list`` an instance is saved to a
    temporary directory, the first saved file is passed to
    ``check_json_file_has_correct_format``, and the directory is reloaded with
    the same class; the two ``to_dict()`` results must match.
    """
    for image_processing_class in self.image_processor_list:
        image_processor_first = image_processing_class(**self.image_processor_dict)

        with tempfile.TemporaryDirectory() as tmpdirname:
            saved_file = image_processor_first.save_pretrained(tmpdirname)[0]
            check_json_file_has_correct_format(saved_file)
            # Reload with the class under test, not a fixed default class.
            image_processor_second = image_processing_class.from_pretrained(tmpdirname)

        self.assertEqual(image_processor_second.to_dict(), image_processor_first.to_dict())
|
||||
|
||||
def test_image_processor_save_load_with_autoimageprocessor(self):
    """Check that a saved processor can be reloaded through ``AutoImageProcessor``.

    Each class in ``self.image_processor_list`` is instantiated, saved to a
    temporary directory, format-checked, and loaded back with
    ``AutoImageProcessor.from_pretrained``; the ``to_dict()`` outputs of the
    original and reloaded processors must be equal.
    """
    for processor_cls in self.image_processor_list:
        original = processor_cls(**self.image_processor_dict)

        with tempfile.TemporaryDirectory() as save_dir:
            written_files = original.save_pretrained(save_dir)
            check_json_file_has_correct_format(written_files[0])

            reloaded = AutoImageProcessor.from_pretrained(save_dir)

        self.assertEqual(reloaded.to_dict(), original.to_dict())
|
||||
|
||||
def test_init_without_params(self):
|
||||
image_processor = self.image_processing_class()
|
||||
self.assertIsNotNone(image_processor)
|
||||
for image_processing_class in self.image_processor_list:
|
||||
image_processor = image_processing_class()
|
||||
self.assertIsNotNone(image_processor)
|
||||
|
||||
@require_torch
@require_vision
def test_cast_dtype_device(self):
    """Check dtype/device casting of the encoding returned by each image processor.

    Does nothing when ``self.test_cast_dtype`` is ``None``. Otherwise, for each
    class in ``self.image_processor_list``, the encoding of random torch inputs
    is checked for its default dtype and device, cast with ``.to(...)`` to
    float16 and bfloat16, checked to raise ``TypeError`` when dtype and device
    are passed in the wrong order, and checked to leave integer ``input_ids``
    untouched by a float cast.
    """
    if self.test_cast_dtype is None:
        return

    for image_processing_class in self.image_processor_list:
        # Initialize image_processor
        image_processor = image_processing_class(**self.image_processor_dict)

        # create random PyTorch tensors
        image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, torchify=True)

        encoding = image_processor(image_inputs, return_tensors="pt")
        # for layoutLM compatibility
        self.assertEqual(encoding.pixel_values.device, torch.device("cpu"))
        self.assertEqual(encoding.pixel_values.dtype, torch.float32)

        encoding = image_processor(image_inputs, return_tensors="pt").to(torch.float16)
        self.assertEqual(encoding.pixel_values.device, torch.device("cpu"))
        self.assertEqual(encoding.pixel_values.dtype, torch.float16)

        encoding = image_processor(image_inputs, return_tensors="pt").to("cpu", torch.bfloat16)
        self.assertEqual(encoding.pixel_values.device, torch.device("cpu"))
        self.assertEqual(encoding.pixel_values.dtype, torch.bfloat16)

        # Passing the dtype before the device is expected to be rejected.
        with self.assertRaises(TypeError):
            _ = image_processor(image_inputs, return_tensors="pt").to(torch.bfloat16, "cpu")

        # Try with text + image feature
        encoding = image_processor(image_inputs, return_tensors="pt")
        encoding.update({"input_ids": torch.LongTensor([[1, 2, 3], [4, 5, 6]])})
        encoding = encoding.to(torch.float16)

        self.assertEqual(encoding.pixel_values.device, torch.device("cpu"))
        self.assertEqual(encoding.pixel_values.dtype, torch.float16)
        self.assertEqual(encoding.input_ids.dtype, torch.long)
|
||||
|
||||
def test_call_pil(self):
    """Check output shapes for PIL inputs, unbatched and batched, for every processor class."""
    for image_processing_class in self.image_processor_list:
        # Initialize image_processing
        image_processing = image_processing_class(**self.image_processor_dict)
        # create random PIL images
        image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False)
        for image in image_inputs:
            self.assertIsInstance(image, Image.Image)

        # Test not batched input
        encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values
        expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]])
        self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape))

        # Test batched
        encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values
        expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs)
        self.assertEqual(
            tuple(encoded_images.shape), (self.image_processor_tester.batch_size, *expected_output_image_shape)
        )

def test_call_numpy(self):
    """Check output shapes for numpy inputs, unbatched and batched, for every processor class."""
    for image_processing_class in self.image_processor_list:
        # Initialize image_processing
        image_processing = image_processing_class(**self.image_processor_dict)
        # create random numpy tensors
        image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True)
        for image in image_inputs:
            self.assertIsInstance(image, np.ndarray)

        # Test not batched input
        encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values
        expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]])
        self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape))

        # Test batched
        encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values
        expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs)
        self.assertEqual(
            tuple(encoded_images.shape), (self.image_processor_tester.batch_size, *expected_output_image_shape)
        )

def test_call_pytorch(self):
    """Check output shapes for torch inputs, unbatched and batched, for every processor class."""
    for image_processing_class in self.image_processor_list:
        # Initialize image_processing
        image_processing = image_processing_class(**self.image_processor_dict)
        # create random PyTorch tensors
        image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, torchify=True)

        for image in image_inputs:
            self.assertIsInstance(image, torch.Tensor)

        # Test not batched input
        encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values
        expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]])
        self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape))

        # Test batched
        expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs)
        encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values
        self.assertEqual(
            tuple(encoded_images.shape),
            (self.image_processor_tester.batch_size, *expected_output_image_shape),
        )
|
||||
|
||||
def test_call_numpy_4_channels(self):
|
||||
# Test that can process images which have an arbitrary number of channels
|
||||
# Initialize image_processing
|
||||
image_processor = self.image_processing_class(**self.image_processor_dict)
|
||||
for image_processing_class in self.image_processor_list:
|
||||
# Test that can process images which have an arbitrary number of channels
|
||||
# Initialize image_processing
|
||||
image_processor = image_processing_class(**self.image_processor_dict)
|
||||
|
||||
# create random numpy tensors
|
||||
self.image_processor_tester.num_channels = 4
|
||||
image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True)
|
||||
# create random numpy tensors
|
||||
self.image_processor_tester.num_channels = 4
|
||||
image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True)
|
||||
|
||||
# Test not batched input
|
||||
encoded_images = image_processor(
|
||||
image_inputs[0],
|
||||
return_tensors="pt",
|
||||
input_data_format="channels_first",
|
||||
image_mean=0,
|
||||
image_std=1,
|
||||
).pixel_values
|
||||
expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]])
|
||||
self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape))
|
||||
# Test not batched input
|
||||
encoded_images = image_processor(
|
||||
image_inputs[0],
|
||||
return_tensors="pt",
|
||||
input_data_format="channels_first",
|
||||
image_mean=0,
|
||||
image_std=1,
|
||||
).pixel_values
|
||||
expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]])
|
||||
self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape))
|
||||
|
||||
# Test batched
|
||||
encoded_images = image_processor(
|
||||
image_inputs,
|
||||
return_tensors="pt",
|
||||
input_data_format="channels_first",
|
||||
image_mean=0,
|
||||
image_std=1,
|
||||
).pixel_values
|
||||
expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs)
|
||||
self.assertEqual(
|
||||
tuple(encoded_images.shape), (self.image_processor_tester.batch_size, *expected_output_image_shape)
|
||||
)
|
||||
# Test batched
|
||||
encoded_images = image_processor(
|
||||
image_inputs,
|
||||
return_tensors="pt",
|
||||
input_data_format="channels_first",
|
||||
image_mean=0,
|
||||
image_std=1,
|
||||
).pixel_values
|
||||
expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs)
|
||||
self.assertEqual(
|
||||
tuple(encoded_images.shape), (self.image_processor_tester.batch_size, *expected_output_image_shape)
|
||||
)
|
||||
|
||||
def test_image_processor_preprocess_arguments(self):
|
||||
image_processor = self.image_processing_class(**self.image_processor_dict)
|
||||
if hasattr(image_processor, "_valid_processor_keys") and hasattr(image_processor, "preprocess"):
|
||||
preprocess_parameter_names = inspect.getfullargspec(image_processor.preprocess).args
|
||||
preprocess_parameter_names.remove("self")
|
||||
preprocess_parameter_names.sort()
|
||||
valid_processor_keys = image_processor._valid_processor_keys
|
||||
valid_processor_keys.sort()
|
||||
self.assertEqual(preprocess_parameter_names, valid_processor_keys)
|
||||
for image_processing_class in self.image_processor_list:
|
||||
image_processor = image_processing_class(**self.image_processor_dict)
|
||||
if hasattr(image_processor, "_valid_processor_keys") and hasattr(image_processor, "preprocess"):
|
||||
preprocess_parameter_names = inspect.getfullargspec(image_processor.preprocess).args
|
||||
preprocess_parameter_names.remove("self")
|
||||
preprocess_parameter_names.sort()
|
||||
valid_processor_keys = image_processor._valid_processor_keys
|
||||
valid_processor_keys.sort()
|
||||
self.assertEqual(preprocess_parameter_names, valid_processor_keys)
|
||||
|
||||
|
||||
class AnnotationFormatTestMixin:
|
||||
|
||||
Reference in New Issue
Block a user