Vision processors - replace FE with IPs (#20590)
* Replace FE references with IPs
* Update processor tests
* Update src/transformers/models/clip/processing_clip.py
  Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
* Update src/transformers/models/clip/processing_clip.py
  Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
* Update warning messages v4.27 -> v5
* Fixup
* Update Chinese CLIP processor
* Add feature_extractor property
* Add attributes
* Add tests

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
This commit is contained in:
@@ -287,8 +287,8 @@ class AutoProcessor:
|
|||||||
|
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
f"Unrecognized processing class in {pretrained_model_name_or_path}. Can't instantiate a processor, a "
|
f"Unrecognized processing class in {pretrained_model_name_or_path}. Can't instantiate a processor, a "
|
||||||
"tokenizer or a feature extractor for this model. Make sure the repository contains the files of at least "
|
"tokenizer, an image processor or a feature extractor for this model. Make sure the repository contains"
|
||||||
"one of those processing classes."
|
"the files of at least one of those processing classes."
|
||||||
)
|
)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
|
|||||||
@@ -15,39 +15,56 @@
|
|||||||
"""
|
"""
|
||||||
Image/Text processor class for Chinese-CLIP
|
Image/Text processor class for Chinese-CLIP
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
import warnings
|
||||||
|
|
||||||
from ...processing_utils import ProcessorMixin
|
from ...processing_utils import ProcessorMixin
|
||||||
from ...tokenization_utils_base import BatchEncoding
|
from ...tokenization_utils_base import BatchEncoding
|
||||||
|
|
||||||
|
|
||||||
class ChineseCLIPProcessor(ProcessorMixin):
|
class ChineseCLIPProcessor(ProcessorMixin):
|
||||||
r"""
|
r"""
|
||||||
Constructs a Chinese-CLIP processor which wraps a Chinese-CLIP feature extractor and a Chinese-CLIP tokenizer into
|
Constructs a Chinese-CLIP processor which wraps a Chinese-CLIP image processor and a Chinese-CLIP tokenizer into a
|
||||||
a single processor.
|
single processor.
|
||||||
|
|
||||||
[`ChineseCLIPProcessor`] offers all the functionalities of [`ChineseCLIPFeatureExtractor`] and
|
[`ChineseCLIPProcessor`] offers all the functionalities of [`ChineseCLIPImageProcessor`] and [`BertTokenizerFast`].
|
||||||
[`BertTokenizerFast`]. See the [`~ChineseCLIPProcessor.__call__`] and [`~ChineseCLIPProcessor.decode`] for more
|
See the [`~ChineseCLIPProcessor.__call__`] and [`~ChineseCLIPProcessor.decode`] for more information.
|
||||||
information.
|
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
feature_extractor ([`ChineseCLIPFeatureExtractor`]):
|
image_processor ([`ChineseCLIPImageProcessor`]):
|
||||||
The feature extractor is a required input.
|
The image processor is a required input.
|
||||||
tokenizer ([`BertTokenizerFast`]):
|
tokenizer ([`BertTokenizerFast`]):
|
||||||
The tokenizer is a required input.
|
The tokenizer is a required input.
|
||||||
"""
|
"""
|
||||||
feature_extractor_class = "ChineseCLIPFeatureExtractor"
|
attributes = ["image_processor", "tokenizer"]
|
||||||
|
image_processor_class = "ChineseCLIPImageProcessor"
|
||||||
tokenizer_class = ("BertTokenizer", "BertTokenizerFast")
|
tokenizer_class = ("BertTokenizer", "BertTokenizerFast")
|
||||||
|
|
||||||
def __init__(self, feature_extractor, tokenizer):
|
def __init__(self, image_processor=None, tokenizer=None, **kwargs):
|
||||||
super().__init__(feature_extractor, tokenizer)
|
if "feature_extractor" in kwargs:
|
||||||
self.current_processor = self.feature_extractor
|
warnings.warn(
|
||||||
|
"The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`"
|
||||||
|
" instead.",
|
||||||
|
FutureWarning,
|
||||||
|
)
|
||||||
|
feature_extractor = kwargs.pop("feature_extractor")
|
||||||
|
|
||||||
|
image_processor = image_processor if image_processor is not None else feature_extractor
|
||||||
|
if image_processor is None:
|
||||||
|
raise ValueError("You need to specify an `image_processor`.")
|
||||||
|
if tokenizer is None:
|
||||||
|
raise ValueError("You need to specify a `tokenizer`.")
|
||||||
|
|
||||||
|
super().__init__(image_processor, tokenizer)
|
||||||
|
self.current_processor = self.image_processor
|
||||||
|
|
||||||
def __call__(self, text=None, images=None, return_tensors=None, **kwargs):
|
def __call__(self, text=None, images=None, return_tensors=None, **kwargs):
|
||||||
"""
|
"""
|
||||||
Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text`
|
Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text`
|
||||||
and `kwargs` arguments to BertTokenizerFast's [`~BertTokenizerFast.__call__`] if `text` is not `None` to encode
|
and `kwargs` arguments to BertTokenizerFast's [`~BertTokenizerFast.__call__`] if `text` is not `None` to encode
|
||||||
the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
|
the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
|
||||||
CLIPFeatureExtractor's [`~CLIPFeatureExtractor.__call__`] if `images` is not `None`. Please refer to the
|
CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
|
||||||
docstring of the above two methods for more information.
|
of the above two methods for more information.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
text (`str`, `List[str]`, `List[List[str]]`):
|
text (`str`, `List[str]`, `List[List[str]]`):
|
||||||
@@ -84,7 +101,7 @@ class ChineseCLIPProcessor(ProcessorMixin):
|
|||||||
encoding = self.tokenizer(text, return_tensors=return_tensors, **kwargs)
|
encoding = self.tokenizer(text, return_tensors=return_tensors, **kwargs)
|
||||||
|
|
||||||
if images is not None:
|
if images is not None:
|
||||||
image_features = self.feature_extractor(images, return_tensors=return_tensors, **kwargs)
|
image_features = self.image_processor(images, return_tensors=return_tensors, **kwargs)
|
||||||
|
|
||||||
if text is not None and images is not None:
|
if text is not None and images is not None:
|
||||||
encoding["pixel_values"] = image_features.pixel_values
|
encoding["pixel_values"] = image_features.pixel_values
|
||||||
@@ -111,5 +128,13 @@ class ChineseCLIPProcessor(ProcessorMixin):
|
|||||||
@property
|
@property
|
||||||
def model_input_names(self):
|
def model_input_names(self):
|
||||||
tokenizer_input_names = self.tokenizer.model_input_names
|
tokenizer_input_names = self.tokenizer.model_input_names
|
||||||
feature_extractor_input_names = self.feature_extractor.model_input_names
|
image_processor_input_names = self.image_processor.model_input_names
|
||||||
return list(dict.fromkeys(tokenizer_input_names + feature_extractor_input_names))
|
return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
|
||||||
|
|
||||||
|
@property
|
||||||
|
def feature_extractor_class(self):
|
||||||
|
warnings.warn(
|
||||||
|
"`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead.",
|
||||||
|
FutureWarning,
|
||||||
|
)
|
||||||
|
return self.image_processor_class
|
||||||
|
|||||||
@@ -15,37 +15,54 @@
|
|||||||
"""
|
"""
|
||||||
Image/Text processor class for CLIP
|
Image/Text processor class for CLIP
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
import warnings
|
||||||
|
|
||||||
from ...processing_utils import ProcessorMixin
|
from ...processing_utils import ProcessorMixin
|
||||||
from ...tokenization_utils_base import BatchEncoding
|
from ...tokenization_utils_base import BatchEncoding
|
||||||
|
|
||||||
|
|
||||||
class CLIPProcessor(ProcessorMixin):
|
class CLIPProcessor(ProcessorMixin):
|
||||||
r"""
|
r"""
|
||||||
Constructs a CLIP processor which wraps a CLIP feature extractor and a CLIP tokenizer into a single processor.
|
Constructs a CLIP processor which wraps a CLIP image processor and a CLIP tokenizer into a single processor.
|
||||||
|
|
||||||
[`CLIPProcessor`] offers all the functionalities of [`CLIPFeatureExtractor`] and [`CLIPTokenizerFast`]. See the
|
[`CLIPProcessor`] offers all the functionalities of [`CLIPImageProcessor`] and [`CLIPTokenizerFast`]. See the
|
||||||
[`~CLIPProcessor.__call__`] and [`~CLIPProcessor.decode`] for more information.
|
[`~CLIPProcessor.__call__`] and [`~CLIPProcessor.decode`] for more information.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
feature_extractor ([`CLIPFeatureExtractor`]):
|
image_processor ([`CLIPImageProcessor`]):
|
||||||
The feature extractor is a required input.
|
The image processor is a required input.
|
||||||
tokenizer ([`CLIPTokenizerFast`]):
|
tokenizer ([`CLIPTokenizerFast`]):
|
||||||
The tokenizer is a required input.
|
The tokenizer is a required input.
|
||||||
"""
|
"""
|
||||||
feature_extractor_class = "CLIPFeatureExtractor"
|
attributes = ["image_processor", "tokenizer"]
|
||||||
|
image_processor_class = "CLIPImageProcessor"
|
||||||
tokenizer_class = ("CLIPTokenizer", "CLIPTokenizerFast")
|
tokenizer_class = ("CLIPTokenizer", "CLIPTokenizerFast")
|
||||||
|
|
||||||
def __init__(self, feature_extractor, tokenizer):
|
def __init__(self, image_processor=None, tokenizer=None, **kwargs):
|
||||||
super().__init__(feature_extractor, tokenizer)
|
if "feature_extractor" in kwargs:
|
||||||
self.current_processor = self.feature_extractor
|
warnings.warn(
|
||||||
|
"The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`"
|
||||||
|
" instead.",
|
||||||
|
FutureWarning,
|
||||||
|
)
|
||||||
|
feature_extractor = kwargs.pop("feature_extractor")
|
||||||
|
|
||||||
|
image_processor = image_processor if image_processor is not None else feature_extractor
|
||||||
|
if image_processor is None:
|
||||||
|
raise ValueError("You need to specify an `image_processor`.")
|
||||||
|
if tokenizer is None:
|
||||||
|
raise ValueError("You need to specify a `tokenizer`.")
|
||||||
|
|
||||||
|
super().__init__(image_processor, tokenizer)
|
||||||
|
|
||||||
def __call__(self, text=None, images=None, return_tensors=None, **kwargs):
|
def __call__(self, text=None, images=None, return_tensors=None, **kwargs):
|
||||||
"""
|
"""
|
||||||
Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text`
|
Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text`
|
||||||
and `kwargs` arguments to CLIPTokenizerFast's [`~CLIPTokenizerFast.__call__`] if `text` is not `None` to encode
|
and `kwargs` arguments to CLIPTokenizerFast's [`~CLIPTokenizerFast.__call__`] if `text` is not `None` to encode
|
||||||
the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
|
the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
|
||||||
CLIPFeatureExtractor's [`~CLIPFeatureExtractor.__call__`] if `images` is not `None`. Please refer to the
|
CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
|
||||||
docstring of the above two methods for more information.
|
of the above two methods for more information.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
text (`str`, `List[str]`, `List[List[str]]`):
|
text (`str`, `List[str]`, `List[List[str]]`):
|
||||||
@@ -82,7 +99,7 @@ class CLIPProcessor(ProcessorMixin):
|
|||||||
encoding = self.tokenizer(text, return_tensors=return_tensors, **kwargs)
|
encoding = self.tokenizer(text, return_tensors=return_tensors, **kwargs)
|
||||||
|
|
||||||
if images is not None:
|
if images is not None:
|
||||||
image_features = self.feature_extractor(images, return_tensors=return_tensors, **kwargs)
|
image_features = self.image_processor(images, return_tensors=return_tensors, **kwargs)
|
||||||
|
|
||||||
if text is not None and images is not None:
|
if text is not None and images is not None:
|
||||||
encoding["pixel_values"] = image_features.pixel_values
|
encoding["pixel_values"] = image_features.pixel_values
|
||||||
@@ -109,5 +126,21 @@ class CLIPProcessor(ProcessorMixin):
|
|||||||
@property
|
@property
|
||||||
def model_input_names(self):
|
def model_input_names(self):
|
||||||
tokenizer_input_names = self.tokenizer.model_input_names
|
tokenizer_input_names = self.tokenizer.model_input_names
|
||||||
feature_extractor_input_names = self.feature_extractor.model_input_names
|
image_processor_input_names = self.image_processor.model_input_names
|
||||||
return list(dict.fromkeys(tokenizer_input_names + feature_extractor_input_names))
|
return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
|
||||||
|
|
||||||
|
@property
|
||||||
|
def feature_extractor_class(self):
|
||||||
|
warnings.warn(
|
||||||
|
"`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead.",
|
||||||
|
FutureWarning,
|
||||||
|
)
|
||||||
|
return self.image_processor_class
|
||||||
|
|
||||||
|
@property
|
||||||
|
def feature_extractor(self):
|
||||||
|
warnings.warn(
|
||||||
|
"`feature_extractor` is deprecated and will be removed in v5. Use `image_processor` instead.",
|
||||||
|
FutureWarning,
|
||||||
|
)
|
||||||
|
return self.image_processor
|
||||||
|
|||||||
@@ -15,38 +15,54 @@
|
|||||||
"""
|
"""
|
||||||
Image/Text processor class for CLIPSeg
|
Image/Text processor class for CLIPSeg
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
import warnings
|
||||||
|
|
||||||
from ...processing_utils import ProcessorMixin
|
from ...processing_utils import ProcessorMixin
|
||||||
from ...tokenization_utils_base import BatchEncoding
|
from ...tokenization_utils_base import BatchEncoding
|
||||||
|
|
||||||
|
|
||||||
class CLIPSegProcessor(ProcessorMixin):
|
class CLIPSegProcessor(ProcessorMixin):
|
||||||
r"""
|
r"""
|
||||||
Constructs a CLIPSeg processor which wraps a CLIPSeg feature extractor and a CLIP tokenizer into a single
|
Constructs a CLIPSeg processor which wraps a CLIPSeg image processor and a CLIP tokenizer into a single processor.
|
||||||
processor.
|
|
||||||
|
|
||||||
[`CLIPSegProcessor`] offers all the functionalities of [`ViTFeatureExtractor`] and [`CLIPTokenizerFast`]. See the
|
[`CLIPSegProcessor`] offers all the functionalities of [`ViTImageProcessor`] and [`CLIPTokenizerFast`]. See the
|
||||||
[`~CLIPSegProcessor.__call__`] and [`~CLIPSegProcessor.decode`] for more information.
|
[`~CLIPSegProcessor.__call__`] and [`~CLIPSegProcessor.decode`] for more information.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
feature_extractor ([`ViTFeatureExtractor`]):
|
image_processor ([`ViTImageProcessor`]):
|
||||||
The feature extractor is a required input.
|
The image processor is a required input.
|
||||||
tokenizer ([`CLIPTokenizerFast`]):
|
tokenizer ([`CLIPTokenizerFast`]):
|
||||||
The tokenizer is a required input.
|
The tokenizer is a required input.
|
||||||
"""
|
"""
|
||||||
feature_extractor_class = "ViTFeatureExtractor"
|
attributes = ["image_processor", "tokenizer"]
|
||||||
|
image_processor_class = "ViTImageProcessor"
|
||||||
tokenizer_class = ("CLIPTokenizer", "CLIPTokenizerFast")
|
tokenizer_class = ("CLIPTokenizer", "CLIPTokenizerFast")
|
||||||
|
|
||||||
def __init__(self, feature_extractor, tokenizer):
|
def __init__(self, image_processor=None, tokenizer=None, **kwargs):
|
||||||
super().__init__(feature_extractor, tokenizer)
|
if "feature_extractor" in kwargs:
|
||||||
self.current_processor = self.feature_extractor
|
warnings.warn(
|
||||||
|
"The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`"
|
||||||
|
" instead.",
|
||||||
|
FutureWarning,
|
||||||
|
)
|
||||||
|
feature_extractor = kwargs.pop("feature_extractor")
|
||||||
|
|
||||||
|
image_processor = image_processor if image_processor is not None else feature_extractor
|
||||||
|
if image_processor is None:
|
||||||
|
raise ValueError("You need to specify an `image_processor`.")
|
||||||
|
if tokenizer is None:
|
||||||
|
raise ValueError("You need to specify a `tokenizer`.")
|
||||||
|
|
||||||
|
super().__init__(image_processor, tokenizer)
|
||||||
|
|
||||||
def __call__(self, text=None, images=None, return_tensors=None, **kwargs):
|
def __call__(self, text=None, images=None, return_tensors=None, **kwargs):
|
||||||
"""
|
"""
|
||||||
Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text`
|
Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text`
|
||||||
and `kwargs` arguments to CLIPTokenizerFast's [`~CLIPTokenizerFast.__call__`] if `text` is not `None` to encode
|
and `kwargs` arguments to CLIPTokenizerFast's [`~CLIPTokenizerFast.__call__`] if `text` is not `None` to encode
|
||||||
the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
|
the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
|
||||||
ViTFeatureExtractor's [`~ViTFeatureExtractor.__call__`] if `images` is not `None`. Please refer to the
|
ViTImageProcessor's [`~ViTImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring of
|
||||||
docstring of the above two methods for more information.
|
the above two methods for more information.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
text (`str`, `List[str]`, `List[List[str]]`):
|
text (`str`, `List[str]`, `List[List[str]]`):
|
||||||
@@ -83,7 +99,7 @@ class CLIPSegProcessor(ProcessorMixin):
|
|||||||
encoding = self.tokenizer(text, return_tensors=return_tensors, **kwargs)
|
encoding = self.tokenizer(text, return_tensors=return_tensors, **kwargs)
|
||||||
|
|
||||||
if images is not None:
|
if images is not None:
|
||||||
image_features = self.feature_extractor(images, return_tensors=return_tensors, **kwargs)
|
image_features = self.image_processor(images, return_tensors=return_tensors, **kwargs)
|
||||||
|
|
||||||
if text is not None and images is not None:
|
if text is not None and images is not None:
|
||||||
encoding["pixel_values"] = image_features.pixel_values
|
encoding["pixel_values"] = image_features.pixel_values
|
||||||
@@ -106,3 +122,19 @@ class CLIPSegProcessor(ProcessorMixin):
|
|||||||
the docstring of this method for more information.
|
the docstring of this method for more information.
|
||||||
"""
|
"""
|
||||||
return self.tokenizer.decode(*args, **kwargs)
|
return self.tokenizer.decode(*args, **kwargs)
|
||||||
|
|
||||||
|
@property
|
||||||
|
def feature_extractor_class(self):
|
||||||
|
warnings.warn(
|
||||||
|
"`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead.",
|
||||||
|
FutureWarning,
|
||||||
|
)
|
||||||
|
return self.image_processor_class
|
||||||
|
|
||||||
|
@property
|
||||||
|
def feature_extractor(self):
|
||||||
|
warnings.warn(
|
||||||
|
"`feature_extractor` is deprecated and will be removed in v5. Use `image_processor` instead.",
|
||||||
|
FutureWarning,
|
||||||
|
)
|
||||||
|
return self.image_processor
|
||||||
|
|||||||
@@ -24,7 +24,7 @@ from ...processing_utils import ProcessorMixin
|
|||||||
|
|
||||||
class DonutProcessor(ProcessorMixin):
|
class DonutProcessor(ProcessorMixin):
|
||||||
r"""
|
r"""
|
||||||
Constructs a Donut processor which wraps a Donut feature extractor and an XLMRoBERTa tokenizer into a single
|
Constructs a Donut processor which wraps a Donut image processor and an XLMRoBERTa tokenizer into a single
|
||||||
processor.
|
processor.
|
||||||
|
|
||||||
[`DonutProcessor`] offers all the functionalities of [`DonutFeatureExtractor`] and
|
[`DonutProcessor`] offers all the functionalities of [`DonutFeatureExtractor`] and
|
||||||
@@ -32,8 +32,8 @@ class DonutProcessor(ProcessorMixin):
|
|||||||
[`~DonutProcessor.decode`] for more information.
|
[`~DonutProcessor.decode`] for more information.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
feature_extractor ([`DonutFeatureExtractor`]):
|
image_processor ([`DonutFeatureExtractor`]):
|
||||||
An instance of [`DonutFeatureExtractor`]. The feature extractor is a required input.
|
An instance of [`DonutFeatureExtractor`]. The image processor is a required input.
|
||||||
tokenizer ([`XLMRobertaTokenizer`/`XLMRobertaTokenizerFast`]):
|
tokenizer ([`XLMRobertaTokenizer`/`XLMRobertaTokenizerFast`]):
|
||||||
An instance of [`XLMRobertaTokenizer`/`XLMRobertaTokenizerFast`]. The tokenizer is a required input.
|
An instance of [`XLMRobertaTokenizer`/`XLMRobertaTokenizerFast`]. The tokenizer is a required input.
|
||||||
"""
|
"""
|
||||||
@@ -44,7 +44,7 @@ class DonutProcessor(ProcessorMixin):
|
|||||||
def __init__(self, image_processor=None, tokenizer=None, **kwargs):
|
def __init__(self, image_processor=None, tokenizer=None, **kwargs):
|
||||||
if "feature_extractor" in kwargs:
|
if "feature_extractor" in kwargs:
|
||||||
warnings.warn(
|
warnings.warn(
|
||||||
"The `feature_extractor` argument is deprecated and will be removed in v4.27, use `image_processor`"
|
"The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`"
|
||||||
" instead.",
|
" instead.",
|
||||||
FutureWarning,
|
FutureWarning,
|
||||||
)
|
)
|
||||||
@@ -176,8 +176,15 @@ class DonutProcessor(ProcessorMixin):
|
|||||||
@property
|
@property
|
||||||
def feature_extractor_class(self):
|
def feature_extractor_class(self):
|
||||||
warnings.warn(
|
warnings.warn(
|
||||||
"`feature_extractor_class` is deprecated and will be removed in v4.27. Use `image_processor_class`"
|
"`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead.",
|
||||||
" instead.",
|
|
||||||
FutureWarning,
|
FutureWarning,
|
||||||
)
|
)
|
||||||
return self.image_processor_class
|
return self.image_processor_class
|
||||||
|
|
||||||
|
@property
|
||||||
|
def feature_extractor(self):
|
||||||
|
warnings.warn(
|
||||||
|
"`feature_extractor` is deprecated and will be removed in v5. Use `image_processor` instead.",
|
||||||
|
FutureWarning,
|
||||||
|
)
|
||||||
|
return self.image_processor
|
||||||
|
|||||||
@@ -15,6 +15,8 @@
|
|||||||
"""
|
"""
|
||||||
Image/Text processor class for FLAVA
|
Image/Text processor class for FLAVA
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
import warnings
|
||||||
from typing import List, Optional, Union
|
from typing import List, Optional, Union
|
||||||
|
|
||||||
from ...image_utils import ImageInput
|
from ...image_utils import ImageInput
|
||||||
@@ -25,21 +27,36 @@ from ...utils import TensorType
|
|||||||
|
|
||||||
class FlavaProcessor(ProcessorMixin):
|
class FlavaProcessor(ProcessorMixin):
|
||||||
r"""
|
r"""
|
||||||
Constructs a FLAVA processor which wraps a FLAVA feature extractor and a FLAVA tokenizer into a single processor.
|
Constructs a FLAVA processor which wraps a FLAVA image processor and a FLAVA tokenizer into a single processor.
|
||||||
|
|
||||||
[`FlavaProcessor`] offers all the functionalities of [`FlavaFeatureExtractor`] and [`BertTokenizerFast`]. See the
|
[`FlavaProcessor`] offers all the functionalities of [`FlavaFeatureExtractor`] and [`BertTokenizerFast`]. See the
|
||||||
[`~FlavaProcessor.__call__`] and [`~FlavaProcessor.decode`] for more information.
|
[`~FlavaProcessor.__call__`] and [`~FlavaProcessor.decode`] for more information.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
feature_extractor ([`FlavaFeatureExtractor`]): The feature extractor is a required input.
|
image_processor ([`FlavaFeatureExtractor`]): The image processor is a required input.
|
||||||
tokenizer ([`BertTokenizerFast`]): The tokenizer is a required input.
|
tokenizer ([`BertTokenizerFast`]): The tokenizer is a required input.
|
||||||
"""
|
"""
|
||||||
feature_extractor_class = "FlavaFeatureExtractor"
|
attributes = ["image_processor", "tokenizer"]
|
||||||
|
image_processor_class = "FlavaFeatureExtractor"
|
||||||
tokenizer_class = ("BertTokenizer", "BertTokenizerFast")
|
tokenizer_class = ("BertTokenizer", "BertTokenizerFast")
|
||||||
|
|
||||||
def __init__(self, feature_extractor, tokenizer):
|
def __init__(self, image_processor=None, tokenizer=None, **kwargs):
|
||||||
super().__init__(feature_extractor, tokenizer)
|
if "feature_extractor" in kwargs:
|
||||||
self.current_processor = self.feature_extractor
|
warnings.warn(
|
||||||
|
"The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`"
|
||||||
|
" instead.",
|
||||||
|
FutureWarning,
|
||||||
|
)
|
||||||
|
feature_extractor = kwargs.pop("feature_extractor")
|
||||||
|
|
||||||
|
image_processor = image_processor if image_processor is not None else feature_extractor
|
||||||
|
if image_processor is None:
|
||||||
|
raise ValueError("You need to specify an `image_processor`.")
|
||||||
|
if tokenizer is None:
|
||||||
|
raise ValueError("You need to specify a `tokenizer`.")
|
||||||
|
|
||||||
|
super().__init__(image_processor, tokenizer)
|
||||||
|
self.current_processor = self.image_processor
|
||||||
|
|
||||||
def __call__(
|
def __call__(
|
||||||
self,
|
self,
|
||||||
@@ -93,7 +110,7 @@ class FlavaProcessor(ProcessorMixin):
|
|||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
if images is not None:
|
if images is not None:
|
||||||
image_features = self.feature_extractor(
|
image_features = self.image_processor(
|
||||||
images,
|
images,
|
||||||
return_image_mask=return_image_mask,
|
return_image_mask=return_image_mask,
|
||||||
return_codebook_pixels=return_codebook_pixels,
|
return_codebook_pixels=return_codebook_pixels,
|
||||||
@@ -126,5 +143,21 @@ class FlavaProcessor(ProcessorMixin):
|
|||||||
@property
|
@property
|
||||||
def model_input_names(self):
|
def model_input_names(self):
|
||||||
tokenizer_input_names = self.tokenizer.model_input_names
|
tokenizer_input_names = self.tokenizer.model_input_names
|
||||||
feature_extractor_input_names = self.feature_extractor.model_input_names
|
image_processor_input_names = self.image_processor.model_input_names
|
||||||
return list(dict.fromkeys(tokenizer_input_names + feature_extractor_input_names))
|
return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
|
||||||
|
|
||||||
|
@property
|
||||||
|
def feature_extractor_class(self):
|
||||||
|
warnings.warn(
|
||||||
|
"`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead.",
|
||||||
|
FutureWarning,
|
||||||
|
)
|
||||||
|
return self.image_processor_class
|
||||||
|
|
||||||
|
@property
|
||||||
|
def feature_extractor(self):
|
||||||
|
warnings.warn(
|
||||||
|
"`feature_extractor` is deprecated and will be removed in v5. Use `image_processor` instead.",
|
||||||
|
FutureWarning,
|
||||||
|
)
|
||||||
|
return self.image_processor
|
||||||
|
|||||||
@@ -15,6 +15,8 @@
|
|||||||
"""
|
"""
|
||||||
Processor class for LayoutLMv2.
|
Processor class for LayoutLMv2.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
import warnings
|
||||||
from typing import List, Optional, Union
|
from typing import List, Optional, Union
|
||||||
|
|
||||||
from ...processing_utils import ProcessorMixin
|
from ...processing_utils import ProcessorMixin
|
||||||
@@ -24,26 +26,44 @@ from ...utils import TensorType
|
|||||||
|
|
||||||
class LayoutLMv2Processor(ProcessorMixin):
|
class LayoutLMv2Processor(ProcessorMixin):
|
||||||
r"""
|
r"""
|
||||||
Constructs a LayoutLMv2 processor which combines a LayoutLMv2 feature extractor and a LayoutLMv2 tokenizer into a
|
Constructs a LayoutLMv2 processor which combines a LayoutLMv2 image processor and a LayoutLMv2 tokenizer into a
|
||||||
single processor.
|
single processor.
|
||||||
|
|
||||||
[`LayoutLMv2Processor`] offers all the functionalities you need to prepare data for the model.
|
[`LayoutLMv2Processor`] offers all the functionalities you need to prepare data for the model.
|
||||||
|
|
||||||
It first uses [`LayoutLMv2FeatureExtractor`] to resize document images to a fixed size, and optionally applies OCR
|
It first uses [`LayoutLMv2ImageProcessor`] to resize document images to a fixed size, and optionally applies OCR to
|
||||||
to get words and normalized bounding boxes. These are then provided to [`LayoutLMv2Tokenizer`] or
|
get words and normalized bounding boxes. These are then provided to [`LayoutLMv2Tokenizer`] or
|
||||||
[`LayoutLMv2TokenizerFast`], which turns the words and bounding boxes into token-level `input_ids`,
|
[`LayoutLMv2TokenizerFast`], which turns the words and bounding boxes into token-level `input_ids`,
|
||||||
`attention_mask`, `token_type_ids`, `bbox`. Optionally, one can provide integer `word_labels`, which are turned
|
`attention_mask`, `token_type_ids`, `bbox`. Optionally, one can provide integer `word_labels`, which are turned
|
||||||
into token-level `labels` for token classification tasks (such as FUNSD, CORD).
|
into token-level `labels` for token classification tasks (such as FUNSD, CORD).
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
feature_extractor (`LayoutLMv2FeatureExtractor`):
|
image_processor (`LayoutLMv2ImageProcessor`):
|
||||||
An instance of [`LayoutLMv2FeatureExtractor`]. The feature extractor is a required input.
|
An instance of [`LayoutLMv2ImageProcessor`]. The image processor is a required input.
|
||||||
tokenizer (`LayoutLMv2Tokenizer` or `LayoutLMv2TokenizerFast`):
|
tokenizer (`LayoutLMv2Tokenizer` or `LayoutLMv2TokenizerFast`):
|
||||||
An instance of [`LayoutLMv2Tokenizer`] or [`LayoutLMv2TokenizerFast`]. The tokenizer is a required input.
|
An instance of [`LayoutLMv2Tokenizer`] or [`LayoutLMv2TokenizerFast`]. The tokenizer is a required input.
|
||||||
"""
|
"""
|
||||||
feature_extractor_class = "LayoutLMv2FeatureExtractor"
|
attributes = ["image_processor", "tokenizer"]
|
||||||
|
image_processor_class = "LayoutLMv2ImageProcessor"
|
||||||
tokenizer_class = ("LayoutLMv2Tokenizer", "LayoutLMv2TokenizerFast")
|
tokenizer_class = ("LayoutLMv2Tokenizer", "LayoutLMv2TokenizerFast")
|
||||||
|
|
||||||
|
def __init__(self, image_processor=None, tokenizer=None, **kwargs):
|
||||||
|
if "feature_extractor" in kwargs:
|
||||||
|
warnings.warn(
|
||||||
|
"The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`"
|
||||||
|
" instead.",
|
||||||
|
FutureWarning,
|
||||||
|
)
|
||||||
|
feature_extractor = kwargs.pop("feature_extractor")
|
||||||
|
|
||||||
|
image_processor = image_processor if image_processor is not None else feature_extractor
|
||||||
|
if image_processor is None:
|
||||||
|
raise ValueError("You need to specify an `image_processor`.")
|
||||||
|
if tokenizer is None:
|
||||||
|
raise ValueError("You need to specify a `tokenizer`.")
|
||||||
|
|
||||||
|
super().__init__(image_processor, tokenizer)
|
||||||
|
|
||||||
def __call__(
|
def __call__(
|
||||||
self,
|
self,
|
||||||
images,
|
images,
|
||||||
@@ -68,37 +88,36 @@ class LayoutLMv2Processor(ProcessorMixin):
|
|||||||
**kwargs
|
**kwargs
|
||||||
) -> BatchEncoding:
|
) -> BatchEncoding:
|
||||||
"""
|
"""
|
||||||
This method first forwards the `images` argument to [`~LayoutLMv2FeatureExtractor.__call__`]. In case
|
This method first forwards the `images` argument to [`~LayoutLMv2ImageProcessor.__call__`]. In case
|
||||||
[`LayoutLMv2FeatureExtractor`] was initialized with `apply_ocr` set to `True`, it passes the obtained words and
|
[`LayoutLMv2ImageProcessor`] was initialized with `apply_ocr` set to `True`, it passes the obtained words and
|
||||||
bounding boxes along with the additional arguments to [`~LayoutLMv2Tokenizer.__call__`] and returns the output,
|
bounding boxes along with the additional arguments to [`~LayoutLMv2Tokenizer.__call__`] and returns the output,
|
||||||
together with resized `images`. In case [`LayoutLMv2FeatureExtractor`] was initialized with `apply_ocr` set to
|
together with resized `images`. In case [`LayoutLMv2ImageProcessor`] was initialized with `apply_ocr` set to
|
||||||
`False`, it passes the words (`text`/``text_pair`) and `boxes` specified by the user along with the additional
|
`False`, it passes the words (`text`/``text_pair`) and `boxes` specified by the user along with the additional
|
||||||
arguments to [`~LayoutLMv2Tokenizer.__call__`] and returns the output, together with resized `images``.
|
arguments to [`~LayoutLMv2Tokenizer.__call__`] and returns the output, together with resized `images``.
|
||||||
|
|
||||||
Please refer to the docstring of the above two methods for more information.
|
Please refer to the docstring of the above two methods for more information.
|
||||||
"""
|
"""
|
||||||
# verify input
|
# verify input
|
||||||
if self.feature_extractor.apply_ocr and (boxes is not None):
|
if self.image_processor.apply_ocr and (boxes is not None):
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"You cannot provide bounding boxes "
|
"You cannot provide bounding boxes if you initialized the image processor with apply_ocr set to True."
|
||||||
"if you initialized the feature extractor with apply_ocr set to True."
|
|
||||||
)
|
)
|
||||||
|
|
||||||
if self.feature_extractor.apply_ocr and (word_labels is not None):
|
if self.image_processor.apply_ocr and (word_labels is not None):
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"You cannot provide word labels if you initialized the feature extractor with apply_ocr set to True."
|
"You cannot provide word labels if you initialized the image processor with apply_ocr set to True."
|
||||||
)
|
)
|
||||||
|
|
||||||
if return_overflowing_tokens is True and return_offsets_mapping is False:
|
if return_overflowing_tokens is True and return_offsets_mapping is False:
|
||||||
raise ValueError("You cannot return overflowing tokens without returning the offsets mapping.")
|
raise ValueError("You cannot return overflowing tokens without returning the offsets mapping.")
|
||||||
|
|
||||||
# first, apply the feature extractor
|
# first, apply the image processor
|
||||||
features = self.feature_extractor(images=images, return_tensors=return_tensors)
|
features = self.image_processor(images=images, return_tensors=return_tensors)
|
||||||
|
|
||||||
# second, apply the tokenizer
|
# second, apply the tokenizer
|
||||||
if text is not None and self.feature_extractor.apply_ocr and text_pair is None:
|
if text is not None and self.image_processor.apply_ocr and text_pair is None:
|
||||||
if isinstance(text, str):
|
if isinstance(text, str):
|
||||||
text = [text] # add batch dimension (as the feature extractor always adds a batch dimension)
|
text = [text] # add batch dimension (as the image processor always adds a batch dimension)
|
||||||
text_pair = features["words"]
|
text_pair = features["words"]
|
||||||
|
|
||||||
encoded_inputs = self.tokenizer(
|
encoded_inputs = self.tokenizer(
|
||||||
@@ -162,3 +181,19 @@ class LayoutLMv2Processor(ProcessorMixin):
|
|||||||
@property
|
@property
|
||||||
def model_input_names(self):
|
def model_input_names(self):
|
||||||
return ["input_ids", "bbox", "token_type_ids", "attention_mask", "image"]
|
return ["input_ids", "bbox", "token_type_ids", "attention_mask", "image"]
|
||||||
|
|
||||||
|
@property
|
||||||
|
def feature_extractor_class(self):
|
||||||
|
warnings.warn(
|
||||||
|
"`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead.",
|
||||||
|
FutureWarning,
|
||||||
|
)
|
||||||
|
return self.image_processor_class
|
||||||
|
|
||||||
|
@property
|
||||||
|
def feature_extractor(self):
|
||||||
|
warnings.warn(
|
||||||
|
"`feature_extractor` is deprecated and will be removed in v5. Use `image_processor` instead.",
|
||||||
|
FutureWarning,
|
||||||
|
)
|
||||||
|
return self.image_processor
|
||||||
|
|||||||
@@ -15,6 +15,8 @@
|
|||||||
"""
|
"""
|
||||||
Processor class for LayoutLMv3.
|
Processor class for LayoutLMv3.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
import warnings
|
||||||
from typing import List, Optional, Union
|
from typing import List, Optional, Union
|
||||||
|
|
||||||
from ...processing_utils import ProcessorMixin
|
from ...processing_utils import ProcessorMixin
|
||||||
@@ -24,26 +26,44 @@ from ...utils import TensorType
|
|||||||
|
|
||||||
class LayoutLMv3Processor(ProcessorMixin):
|
class LayoutLMv3Processor(ProcessorMixin):
|
||||||
r"""
|
r"""
|
||||||
Constructs a LayoutLMv3 processor which combines a LayoutLMv3 feature extractor and a LayoutLMv3 tokenizer into a
|
Constructs a LayoutLMv3 processor which combines a LayoutLMv3 image processor and a LayoutLMv3 tokenizer into a
|
||||||
single processor.
|
single processor.
|
||||||
|
|
||||||
[`LayoutLMv3Processor`] offers all the functionalities you need to prepare data for the model.
|
[`LayoutLMv3Processor`] offers all the functionalities you need to prepare data for the model.
|
||||||
|
|
||||||
It first uses [`LayoutLMv3FeatureExtractor`] to resize and normalize document images, and optionally applies OCR to
|
It first uses [`LayoutLMv3ImageProcessor`] to resize and normalize document images, and optionally applies OCR to
|
||||||
get words and normalized bounding boxes. These are then provided to [`LayoutLMv3Tokenizer`] or
|
get words and normalized bounding boxes. These are then provided to [`LayoutLMv3Tokenizer`] or
|
||||||
[`LayoutLMv3TokenizerFast`], which turns the words and bounding boxes into token-level `input_ids`,
|
[`LayoutLMv3TokenizerFast`], which turns the words and bounding boxes into token-level `input_ids`,
|
||||||
`attention_mask`, `token_type_ids`, `bbox`. Optionally, one can provide integer `word_labels`, which are turned
|
`attention_mask`, `token_type_ids`, `bbox`. Optionally, one can provide integer `word_labels`, which are turned
|
||||||
into token-level `labels` for token classification tasks (such as FUNSD, CORD).
|
into token-level `labels` for token classification tasks (such as FUNSD, CORD).
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
feature_extractor (`LayoutLMv3FeatureExtractor`):
|
image_processor (`LayoutLMv3ImageProcessor`):
|
||||||
An instance of [`LayoutLMv3FeatureExtractor`]. The feature extractor is a required input.
|
An instance of [`LayoutLMv3ImageProcessor`]. The image processor is a required input.
|
||||||
tokenizer (`LayoutLMv3Tokenizer` or `LayoutLMv3TokenizerFast`):
|
tokenizer (`LayoutLMv3Tokenizer` or `LayoutLMv3TokenizerFast`):
|
||||||
An instance of [`LayoutLMv3Tokenizer`] or [`LayoutLMv3TokenizerFast`]. The tokenizer is a required input.
|
An instance of [`LayoutLMv3Tokenizer`] or [`LayoutLMv3TokenizerFast`]. The tokenizer is a required input.
|
||||||
"""
|
"""
|
||||||
feature_extractor_class = "LayoutLMv3FeatureExtractor"
|
attributes = ["image_processor", "tokenizer"]
|
||||||
|
image_processor_class = "LayoutLMv3ImageProcessor"
|
||||||
tokenizer_class = ("LayoutLMv3Tokenizer", "LayoutLMv3TokenizerFast")
|
tokenizer_class = ("LayoutLMv3Tokenizer", "LayoutLMv3TokenizerFast")
|
||||||
|
|
||||||
|
def __init__(self, image_processor=None, tokenizer=None, **kwargs):
|
||||||
|
if "feature_extractor" in kwargs:
|
||||||
|
warnings.warn(
|
||||||
|
"The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`"
|
||||||
|
" instead.",
|
||||||
|
FutureWarning,
|
||||||
|
)
|
||||||
|
feature_extractor = kwargs.pop("feature_extractor")
|
||||||
|
|
||||||
|
image_processor = image_processor if image_processor is not None else feature_extractor
|
||||||
|
if image_processor is None:
|
||||||
|
raise ValueError("You need to specify an `image_processor`.")
|
||||||
|
if tokenizer is None:
|
||||||
|
raise ValueError("You need to specify a `tokenizer`.")
|
||||||
|
|
||||||
|
super().__init__(image_processor, tokenizer)
|
||||||
|
|
||||||
def __call__(
|
def __call__(
|
||||||
self,
|
self,
|
||||||
images,
|
images,
|
||||||
@@ -68,35 +88,34 @@ class LayoutLMv3Processor(ProcessorMixin):
|
|||||||
**kwargs
|
**kwargs
|
||||||
) -> BatchEncoding:
|
) -> BatchEncoding:
|
||||||
"""
|
"""
|
||||||
This method first forwards the `images` argument to [`~LayoutLMv3FeatureExtractor.__call__`]. In case
|
This method first forwards the `images` argument to [`~LayoutLMv3ImageProcessor.__call__`]. In case
|
||||||
[`LayoutLMv3FeatureExtractor`] was initialized with `apply_ocr` set to `True`, it passes the obtained words and
|
[`LayoutLMv3ImageProcessor`] was initialized with `apply_ocr` set to `True`, it passes the obtained words and
|
||||||
bounding boxes along with the additional arguments to [`~LayoutLMv3Tokenizer.__call__`] and returns the output,
|
bounding boxes along with the additional arguments to [`~LayoutLMv3Tokenizer.__call__`] and returns the output,
|
||||||
together with resized and normalized `pixel_values`. In case [`LayoutLMv3FeatureExtractor`] was initialized
|
together with resized and normalized `pixel_values`. In case [`LayoutLMv3ImageProcessor`] was initialized with
|
||||||
with `apply_ocr` set to `False`, it passes the words (`text`/``text_pair`) and `boxes` specified by the user
|
`apply_ocr` set to `False`, it passes the words (`text`/``text_pair`) and `boxes` specified by the user along
|
||||||
along with the additional arguments to [`~LayoutLMv3Tokenizer.__call__`] and returns the output, together with
|
with the additional arguments to [`~LayoutLMv3Tokenizer.__call__`] and returns the output, together with
|
||||||
resized and normalized `pixel_values`.
|
resized and normalized `pixel_values`.
|
||||||
|
|
||||||
Please refer to the docstring of the above two methods for more information.
|
Please refer to the docstring of the above two methods for more information.
|
||||||
"""
|
"""
|
||||||
# verify input
|
# verify input
|
||||||
if self.feature_extractor.apply_ocr and (boxes is not None):
|
if self.image_processor.apply_ocr and (boxes is not None):
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"You cannot provide bounding boxes "
|
"You cannot provide bounding boxes if you initialized the image processor with apply_ocr set to True."
|
||||||
"if you initialized the feature extractor with apply_ocr set to True."
|
|
||||||
)
|
)
|
||||||
|
|
||||||
if self.feature_extractor.apply_ocr and (word_labels is not None):
|
if self.image_processor.apply_ocr and (word_labels is not None):
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"You cannot provide word labels if you initialized the feature extractor with apply_ocr set to True."
|
"You cannot provide word labels if you initialized the image processor with apply_ocr set to True."
|
||||||
)
|
)
|
||||||
|
|
||||||
# first, apply the feature extractor
|
# first, apply the image processor
|
||||||
features = self.feature_extractor(images=images, return_tensors=return_tensors)
|
features = self.image_processor(images=images, return_tensors=return_tensors)
|
||||||
|
|
||||||
# second, apply the tokenizer
|
# second, apply the tokenizer
|
||||||
if text is not None and self.feature_extractor.apply_ocr and text_pair is None:
|
if text is not None and self.image_processor.apply_ocr and text_pair is None:
|
||||||
if isinstance(text, str):
|
if isinstance(text, str):
|
||||||
text = [text] # add batch dimension (as the feature extractor always adds a batch dimension)
|
text = [text] # add batch dimension (as the image processor always adds a batch dimension)
|
||||||
text_pair = features["words"]
|
text_pair = features["words"]
|
||||||
|
|
||||||
encoded_inputs = self.tokenizer(
|
encoded_inputs = self.tokenizer(
|
||||||
@@ -160,3 +179,19 @@ class LayoutLMv3Processor(ProcessorMixin):
|
|||||||
@property
|
@property
|
||||||
def model_input_names(self):
|
def model_input_names(self):
|
||||||
return ["input_ids", "bbox", "attention_mask", "pixel_values"]
|
return ["input_ids", "bbox", "attention_mask", "pixel_values"]
|
||||||
|
|
||||||
|
@property
|
||||||
|
def feature_extractor_class(self):
|
||||||
|
warnings.warn(
|
||||||
|
"`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead.",
|
||||||
|
FutureWarning,
|
||||||
|
)
|
||||||
|
return self.image_processor_class
|
||||||
|
|
||||||
|
@property
|
||||||
|
def feature_extractor(self):
|
||||||
|
warnings.warn(
|
||||||
|
"`feature_extractor` is deprecated and will be removed in v5. Use `image_processor` instead.",
|
||||||
|
FutureWarning,
|
||||||
|
)
|
||||||
|
return self.image_processor
|
||||||
|
|||||||
@@ -81,10 +81,10 @@ def box_iou(boxes1, boxes2):
|
|||||||
|
|
||||||
class OwlViTImageProcessor(BaseImageProcessor):
|
class OwlViTImageProcessor(BaseImageProcessor):
|
||||||
r"""
|
r"""
|
||||||
Constructs an OWL-ViT feature extractor.
|
Constructs an OWL-ViT image processor.
|
||||||
|
|
||||||
This feature extractor inherits from [`FeatureExtractionMixin`] which contains most of the main methods. Users
|
This image processor inherits from [`ImageProcessingMixin`] which contains most of the main methods. Users should
|
||||||
should refer to this superclass for more information regarding those methods.
|
refer to this superclass for more information regarding those methods.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
do_resize (`bool`, *optional*, defaults to `True`):
|
do_resize (`bool`, *optional*, defaults to `True`):
|
||||||
@@ -115,7 +115,6 @@ class OwlViTImageProcessor(BaseImageProcessor):
|
|||||||
image_std (`List[int]`, *optional*, defaults to `[0.26862954, 0.26130258, 0.27577711]`):
|
image_std (`List[int]`, *optional*, defaults to `[0.26862954, 0.26130258, 0.27577711]`):
|
||||||
The sequence of standard deviations for each channel, to be used when normalizing images.
|
The sequence of standard deviations for each channel, to be used when normalizing images.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
model_input_names = ["pixel_values"]
|
model_input_names = ["pixel_values"]
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
@@ -139,7 +138,7 @@ class OwlViTImageProcessor(BaseImageProcessor):
|
|||||||
crop_size = get_size_dict(crop_size, default_to_square=True)
|
crop_size = get_size_dict(crop_size, default_to_square=True)
|
||||||
|
|
||||||
# Early versions of the OWL-ViT config on the hub had "rescale" as a flag. This clashes with the
|
# Early versions of the OWL-ViT config on the hub had "rescale" as a flag. This clashes with the
|
||||||
# vision feature extractor method `rescale` as it would be set as an attribute during the super().__init__
|
# vision image processor method `rescale` as it would be set as an attribute during the super().__init__
|
||||||
# call. This is for backwards compatibility.
|
# call. This is for backwards compatibility.
|
||||||
if "rescale" in kwargs:
|
if "rescale" in kwargs:
|
||||||
rescale_val = kwargs.pop("rescale")
|
rescale_val = kwargs.pop("rescale")
|
||||||
|
|||||||
@@ -16,6 +16,7 @@
|
|||||||
Image/Text processor class for OWL-ViT
|
Image/Text processor class for OWL-ViT
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
import warnings
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
@@ -28,29 +29,44 @@ from ...tokenization_utils_base import BatchEncoding
|
|||||||
|
|
||||||
class OwlViTProcessor(ProcessorMixin):
|
class OwlViTProcessor(ProcessorMixin):
|
||||||
r"""
|
r"""
|
||||||
Constructs an OWL-ViT processor which wraps [`OwlViTFeatureExtractor`] and [`CLIPTokenizer`]/[`CLIPTokenizerFast`]
|
Constructs an OWL-ViT processor which wraps [`OwlViTImageProcessor`] and [`CLIPTokenizer`]/[`CLIPTokenizerFast`]
|
||||||
into a single processor that interits both the feature extractor and tokenizer functionalities. See the
|
into a single processor that interits both the image processor and tokenizer functionalities. See the
|
||||||
[`~OwlViTProcessor.__call__`] and [`~OwlViTProcessor.decode`] for more information.
|
[`~OwlViTProcessor.__call__`] and [`~OwlViTProcessor.decode`] for more information.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
feature_extractor ([`OwlViTFeatureExtractor`]):
|
image_processor ([`OwlViTImageProcessor`]):
|
||||||
The image processor is a required input.
|
The image processor is a required input.
|
||||||
tokenizer ([`CLIPTokenizer`, `CLIPTokenizerFast`]):
|
tokenizer ([`CLIPTokenizer`, `CLIPTokenizerFast`]):
|
||||||
The tokenizer is a required input.
|
The tokenizer is a required input.
|
||||||
"""
|
"""
|
||||||
feature_extractor_class = "OwlViTFeatureExtractor"
|
attributes = ["image_processor", "tokenizer"]
|
||||||
|
image_processor_class = "OwlViTImageProcessor"
|
||||||
tokenizer_class = ("CLIPTokenizer", "CLIPTokenizerFast")
|
tokenizer_class = ("CLIPTokenizer", "CLIPTokenizerFast")
|
||||||
|
|
||||||
def __init__(self, *args, **kwargs):
|
def __init__(self, image_processor=None, tokenizer=None, **kwargs):
|
||||||
super().__init__(*args, **kwargs)
|
if "feature_extractor" in kwargs:
|
||||||
|
warnings.warn(
|
||||||
|
"The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`"
|
||||||
|
" instead.",
|
||||||
|
FutureWarning,
|
||||||
|
)
|
||||||
|
feature_extractor = kwargs.pop("feature_extractor")
|
||||||
|
|
||||||
|
image_processor = image_processor if image_processor is not None else feature_extractor
|
||||||
|
if image_processor is None:
|
||||||
|
raise ValueError("You need to specify an `image_processor`.")
|
||||||
|
if tokenizer is None:
|
||||||
|
raise ValueError("You need to specify a `tokenizer`.")
|
||||||
|
|
||||||
|
super().__init__(image_processor, tokenizer)
|
||||||
|
|
||||||
def __call__(self, text=None, images=None, query_images=None, padding="max_length", return_tensors="np", **kwargs):
|
def __call__(self, text=None, images=None, query_images=None, padding="max_length", return_tensors="np", **kwargs):
|
||||||
"""
|
"""
|
||||||
Main method to prepare for the model one or several text(s) and image(s). This method forwards the `text` and
|
Main method to prepare for the model one or several text(s) and image(s). This method forwards the `text` and
|
||||||
`kwargs` arguments to CLIPTokenizerFast's [`~CLIPTokenizerFast.__call__`] if `text` is not `None` to encode:
|
`kwargs` arguments to CLIPTokenizerFast's [`~CLIPTokenizerFast.__call__`] if `text` is not `None` to encode:
|
||||||
the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
|
the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
|
||||||
CLIPFeatureExtractor's [`~CLIPFeatureExtractor.__call__`] if `images` is not `None`. Please refer to the
|
CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the doctsring
|
||||||
doctsring of the above two methods for more information.
|
of the above two methods for more information.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
text (`str`, `List[str]`, `List[List[str]]`):
|
text (`str`, `List[str]`, `List[List[str]]`):
|
||||||
@@ -137,13 +153,13 @@ class OwlViTProcessor(ProcessorMixin):
|
|||||||
|
|
||||||
if query_images is not None:
|
if query_images is not None:
|
||||||
encoding = BatchEncoding()
|
encoding = BatchEncoding()
|
||||||
query_pixel_values = self.feature_extractor(
|
query_pixel_values = self.image_processor(
|
||||||
query_images, return_tensors=return_tensors, **kwargs
|
query_images, return_tensors=return_tensors, **kwargs
|
||||||
).pixel_values
|
).pixel_values
|
||||||
encoding["query_pixel_values"] = query_pixel_values
|
encoding["query_pixel_values"] = query_pixel_values
|
||||||
|
|
||||||
if images is not None:
|
if images is not None:
|
||||||
image_features = self.feature_extractor(images, return_tensors=return_tensors, **kwargs)
|
image_features = self.image_processor(images, return_tensors=return_tensors, **kwargs)
|
||||||
|
|
||||||
if text is not None and images is not None:
|
if text is not None and images is not None:
|
||||||
encoding["pixel_values"] = image_features.pixel_values
|
encoding["pixel_values"] = image_features.pixel_values
|
||||||
@@ -158,17 +174,17 @@ class OwlViTProcessor(ProcessorMixin):
|
|||||||
|
|
||||||
def post_process(self, *args, **kwargs):
|
def post_process(self, *args, **kwargs):
|
||||||
"""
|
"""
|
||||||
This method forwards all its arguments to [`OwlViTFeatureExtractor.post_process`]. Please refer to the
|
This method forwards all its arguments to [`OwlViTImageProcessor.post_process`]. Please refer to the docstring
|
||||||
docstring of this method for more information.
|
of this method for more information.
|
||||||
"""
|
"""
|
||||||
return self.feature_extractor.post_process(*args, **kwargs)
|
return self.image_processor.post_process(*args, **kwargs)
|
||||||
|
|
||||||
def post_process_image_guided_detection(self, *args, **kwargs):
|
def post_process_image_guided_detection(self, *args, **kwargs):
|
||||||
"""
|
"""
|
||||||
This method forwards all its arguments to [`OwlViTFeatureExtractor.post_process_one_shot_object_detection`].
|
This method forwards all its arguments to [`OwlViTImageProcessor.post_process_one_shot_object_detection`].
|
||||||
Please refer to the docstring of this method for more information.
|
Please refer to the docstring of this method for more information.
|
||||||
"""
|
"""
|
||||||
return self.feature_extractor.post_process_image_guided_detection(*args, **kwargs)
|
return self.image_processor.post_process_image_guided_detection(*args, **kwargs)
|
||||||
|
|
||||||
def batch_decode(self, *args, **kwargs):
|
def batch_decode(self, *args, **kwargs):
|
||||||
"""
|
"""
|
||||||
@@ -183,3 +199,19 @@ class OwlViTProcessor(ProcessorMixin):
|
|||||||
the docstring of this method for more information.
|
the docstring of this method for more information.
|
||||||
"""
|
"""
|
||||||
return self.tokenizer.decode(*args, **kwargs)
|
return self.tokenizer.decode(*args, **kwargs)
|
||||||
|
|
||||||
|
@property
|
||||||
|
def feature_extractor_class(self):
|
||||||
|
warnings.warn(
|
||||||
|
"`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead.",
|
||||||
|
FutureWarning,
|
||||||
|
)
|
||||||
|
return self.image_processor_class
|
||||||
|
|
||||||
|
@property
|
||||||
|
def feature_extractor(self):
|
||||||
|
warnings.warn(
|
||||||
|
"`feature_extractor` is deprecated and will be removed in v5. Use `image_processor` instead.",
|
||||||
|
FutureWarning,
|
||||||
|
)
|
||||||
|
return self.image_processor
|
||||||
|
|||||||
@@ -42,7 +42,7 @@ class TrOCRProcessor(ProcessorMixin):
|
|||||||
def __init__(self, image_processor=None, tokenizer=None, **kwargs):
|
def __init__(self, image_processor=None, tokenizer=None, **kwargs):
|
||||||
if "feature_extractor" in kwargs:
|
if "feature_extractor" in kwargs:
|
||||||
warnings.warn(
|
warnings.warn(
|
||||||
"The `feature_extractor` argument is deprecated and will be removed in v4.27, use `image_processor`"
|
"The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`"
|
||||||
" instead.",
|
" instead.",
|
||||||
FutureWarning,
|
FutureWarning,
|
||||||
)
|
)
|
||||||
@@ -124,8 +124,15 @@ class TrOCRProcessor(ProcessorMixin):
|
|||||||
@property
|
@property
|
||||||
def feature_extractor_class(self):
|
def feature_extractor_class(self):
|
||||||
warnings.warn(
|
warnings.warn(
|
||||||
"`feature_extractor_class` is deprecated and will be removed in v4.27. Use `image_processor_class`"
|
"`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead.",
|
||||||
" instead.",
|
|
||||||
FutureWarning,
|
FutureWarning,
|
||||||
)
|
)
|
||||||
return self.image_processor_class
|
return self.image_processor_class
|
||||||
|
|
||||||
|
@property
|
||||||
|
def feature_extractor(self):
|
||||||
|
warnings.warn(
|
||||||
|
"`feature_extractor` is deprecated and will be removed in v5. Use `image_processor` instead.",
|
||||||
|
FutureWarning,
|
||||||
|
)
|
||||||
|
return self.image_processor
|
||||||
|
|||||||
@@ -16,6 +16,7 @@
|
|||||||
Processor class for ViLT.
|
Processor class for ViLT.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
import warnings
|
||||||
from typing import List, Optional, Union
|
from typing import List, Optional, Union
|
||||||
|
|
||||||
from ...processing_utils import ProcessorMixin
|
from ...processing_utils import ProcessorMixin
|
||||||
@@ -25,23 +26,38 @@ from ...utils import TensorType
|
|||||||
|
|
||||||
class ViltProcessor(ProcessorMixin):
|
class ViltProcessor(ProcessorMixin):
|
||||||
r"""
|
r"""
|
||||||
Constructs a ViLT processor which wraps a BERT tokenizer and ViLT feature extractor into a single processor.
|
Constructs a ViLT processor which wraps a BERT tokenizer and ViLT image processor into a single processor.
|
||||||
|
|
||||||
[`ViltProcessor`] offers all the functionalities of [`ViltFeatureExtractor`] and [`BertTokenizerFast`]. See the
|
[`ViltProcessor`] offers all the functionalities of [`ViltFeatureExtractor`] and [`BertTokenizerFast`]. See the
|
||||||
docstring of [`~ViltProcessor.__call__`] and [`~ViltProcessor.decode`] for more information.
|
docstring of [`~ViltProcessor.__call__`] and [`~ViltProcessor.decode`] for more information.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
feature_extractor (`ViltFeatureExtractor`):
|
image_processor (`ViltFeatureExtractor`):
|
||||||
An instance of [`ViltFeatureExtractor`]. The feature extractor is a required input.
|
An instance of [`ViltFeatureExtractor`]. The image processor is a required input.
|
||||||
tokenizer (`BertTokenizerFast`):
|
tokenizer (`BertTokenizerFast`):
|
||||||
An instance of ['BertTokenizerFast`]. The tokenizer is a required input.
|
An instance of ['BertTokenizerFast`]. The tokenizer is a required input.
|
||||||
"""
|
"""
|
||||||
feature_extractor_class = "ViltFeatureExtractor"
|
attributes = ["image_processor", "tokenizer"]
|
||||||
|
image_processor_class = "ViltFeatureExtractor"
|
||||||
tokenizer_class = ("BertTokenizer", "BertTokenizerFast")
|
tokenizer_class = ("BertTokenizer", "BertTokenizerFast")
|
||||||
|
|
||||||
def __init__(self, feature_extractor, tokenizer):
|
def __init__(self, image_processor=None, tokenizer=None, **kwargs):
|
||||||
super().__init__(feature_extractor, tokenizer)
|
if "feature_extractor" in kwargs:
|
||||||
self.current_processor = self.feature_extractor
|
warnings.warn(
|
||||||
|
"The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`"
|
||||||
|
" instead.",
|
||||||
|
FutureWarning,
|
||||||
|
)
|
||||||
|
feature_extractor = kwargs.pop("feature_extractor")
|
||||||
|
|
||||||
|
image_processor = image_processor if image_processor is not None else feature_extractor
|
||||||
|
if image_processor is None:
|
||||||
|
raise ValueError("You need to specify an `image_processor`.")
|
||||||
|
if tokenizer is None:
|
||||||
|
raise ValueError("You need to specify a `tokenizer`.")
|
||||||
|
|
||||||
|
super().__init__(image_processor, tokenizer)
|
||||||
|
self.current_processor = self.image_processor
|
||||||
|
|
||||||
def __call__(
|
def __call__(
|
||||||
self,
|
self,
|
||||||
@@ -88,8 +104,8 @@ class ViltProcessor(ProcessorMixin):
|
|||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
# add pixel_values + pixel_mask
|
# add pixel_values + pixel_mask
|
||||||
encoding_feature_extractor = self.feature_extractor(images, return_tensors=return_tensors)
|
encoding_image_processor = self.image_processor(images, return_tensors=return_tensors)
|
||||||
encoding.update(encoding_feature_extractor)
|
encoding.update(encoding_image_processor)
|
||||||
|
|
||||||
return encoding
|
return encoding
|
||||||
|
|
||||||
@@ -110,5 +126,21 @@ class ViltProcessor(ProcessorMixin):
|
|||||||
@property
|
@property
|
||||||
def model_input_names(self):
|
def model_input_names(self):
|
||||||
tokenizer_input_names = self.tokenizer.model_input_names
|
tokenizer_input_names = self.tokenizer.model_input_names
|
||||||
feature_extractor_input_names = self.feature_extractor.model_input_names
|
image_processor_input_names = self.image_processor.model_input_names
|
||||||
return list(dict.fromkeys(tokenizer_input_names + feature_extractor_input_names))
|
return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
|
||||||
|
|
||||||
|
@property
|
||||||
|
def feature_extractor_class(self):
|
||||||
|
warnings.warn(
|
||||||
|
"`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead.",
|
||||||
|
FutureWarning,
|
||||||
|
)
|
||||||
|
return self.image_processor_class
|
||||||
|
|
||||||
|
@property
|
||||||
|
def feature_extractor(self):
|
||||||
|
warnings.warn(
|
||||||
|
"`feature_extractor` is deprecated and will be removed in v5. Use `image_processor` instead.",
|
||||||
|
FutureWarning,
|
||||||
|
)
|
||||||
|
return self.image_processor
|
||||||
|
|||||||
@@ -44,7 +44,7 @@ class VisionTextDualEncoderProcessor(ProcessorMixin):
|
|||||||
def __init__(self, image_processor=None, tokenizer=None, **kwargs):
|
def __init__(self, image_processor=None, tokenizer=None, **kwargs):
|
||||||
if "feature_extractor" in kwargs:
|
if "feature_extractor" in kwargs:
|
||||||
warnings.warn(
|
warnings.warn(
|
||||||
"The `feature_extractor` argument is deprecated and will be removed in v4.27, use `image_processor`"
|
"The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`"
|
||||||
" instead.",
|
" instead.",
|
||||||
FutureWarning,
|
FutureWarning,
|
||||||
)
|
)
|
||||||
@@ -132,10 +132,18 @@ class VisionTextDualEncoderProcessor(ProcessorMixin):
|
|||||||
image_processor_input_names = self.image_processor.model_input_names
|
image_processor_input_names = self.image_processor.model_input_names
|
||||||
return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
|
return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
|
||||||
|
|
||||||
|
@property
|
||||||
def feature_extractor_class(self):
|
def feature_extractor_class(self):
|
||||||
warnings.warn(
|
warnings.warn(
|
||||||
"`feature_extractor_class` is deprecated and will be removed in v4.27. Use `image_processor_class`"
|
"`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead.",
|
||||||
" instead.",
|
|
||||||
FutureWarning,
|
FutureWarning,
|
||||||
)
|
)
|
||||||
return self.image_processor_class
|
return self.image_processor_class
|
||||||
|
|
||||||
|
@property
|
||||||
|
def feature_extractor(self):
|
||||||
|
warnings.warn(
|
||||||
|
"`feature_extractor` is deprecated and will be removed in v5. Use `image_processor` instead.",
|
||||||
|
FutureWarning,
|
||||||
|
)
|
||||||
|
return self.image_processor
|
||||||
|
|||||||
@@ -15,38 +15,55 @@
|
|||||||
"""
|
"""
|
||||||
Image/Text processor class for XCLIP
|
Image/Text processor class for XCLIP
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
import warnings
|
||||||
|
|
||||||
from ...processing_utils import ProcessorMixin
|
from ...processing_utils import ProcessorMixin
|
||||||
from ...tokenization_utils_base import BatchEncoding
|
from ...tokenization_utils_base import BatchEncoding
|
||||||
|
|
||||||
|
|
||||||
class XCLIPProcessor(ProcessorMixin):
|
class XCLIPProcessor(ProcessorMixin):
|
||||||
r"""
|
r"""
|
||||||
Constructs an X-CLIP processor which wraps a VideoMAE feature extractor and a CLIP tokenizer into a single
|
Constructs an X-CLIP processor which wraps a VideoMAE image processor and a CLIP tokenizer into a single processor.
|
||||||
processor.
|
|
||||||
|
|
||||||
[`XCLIPProcessor`] offers all the functionalities of [`VideoMAEFeatureExtractor`] and [`CLIPTokenizerFast`]. See
|
[`XCLIPProcessor`] offers all the functionalities of [`VideoMAEImageProcessor`] and [`CLIPTokenizerFast`]. See the
|
||||||
the [`~XCLIPProcessor.__call__`] and [`~XCLIPProcessor.decode`] for more information.
|
[`~XCLIPProcessor.__call__`] and [`~XCLIPProcessor.decode`] for more information.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
feature_extractor ([`VideoMAEFeatureExtractor`]):
|
image_processor ([`VideoMAEImageProcessor`]):
|
||||||
The feature extractor is a required input.
|
The image processor is a required input.
|
||||||
tokenizer ([`CLIPTokenizerFast`]):
|
tokenizer ([`CLIPTokenizerFast`]):
|
||||||
The tokenizer is a required input.
|
The tokenizer is a required input.
|
||||||
"""
|
"""
|
||||||
feature_extractor_class = "VideoMAEFeatureExtractor"
|
attributes = ["image_processor", "tokenizer"]
|
||||||
|
image_processor_class = "VideoMAEImageProcessor"
|
||||||
tokenizer_class = ("CLIPTokenizer", "CLIPTokenizerFast")
|
tokenizer_class = ("CLIPTokenizer", "CLIPTokenizerFast")
|
||||||
|
|
||||||
def __init__(self, feature_extractor, tokenizer):
|
def __init__(self, image_processor=None, tokenizer=None, **kwargs):
|
||||||
super().__init__(feature_extractor, tokenizer)
|
if "feature_extractor" in kwargs:
|
||||||
self.current_processor = self.feature_extractor
|
warnings.warn(
|
||||||
|
"The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`"
|
||||||
|
" instead.",
|
||||||
|
FutureWarning,
|
||||||
|
)
|
||||||
|
feature_extractor = kwargs.pop("feature_extractor")
|
||||||
|
|
||||||
|
image_processor = image_processor if image_processor is not None else feature_extractor
|
||||||
|
if image_processor is None:
|
||||||
|
raise ValueError("You need to specify an `image_processor`.")
|
||||||
|
if tokenizer is None:
|
||||||
|
raise ValueError("You need to specify a `tokenizer`.")
|
||||||
|
|
||||||
|
super().__init__(image_processor, tokenizer)
|
||||||
|
self.current_processor = self.image_processor
|
||||||
|
|
||||||
def __call__(self, text=None, videos=None, return_tensors=None, **kwargs):
|
def __call__(self, text=None, videos=None, return_tensors=None, **kwargs):
|
||||||
"""
|
"""
|
||||||
Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text`
|
Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text`
|
||||||
and `kwargs` arguments to CLIPTokenizerFast's [`~CLIPTokenizerFast.__call__`] if `text` is not `None` to encode
|
and `kwargs` arguments to CLIPTokenizerFast's [`~CLIPTokenizerFast.__call__`] if `text` is not `None` to encode
|
||||||
the text. To prepare the image(s), this method forwards the `videos` and `kwargs` arguments to
|
the text. To prepare the image(s), this method forwards the `videos` and `kwargs` arguments to
|
||||||
VideoMAEFeatureExtractor's [`~VideoMAEFeatureExtractor.__call__`] if `videos` is not `None`. Please refer to
|
VideoMAEImageProcessor's [`~VideoMAEImageProcessor.__call__`] if `videos` is not `None`. Please refer to the
|
||||||
the docstring of the above two methods for more information.
|
docstring of the above two methods for more information.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
text (`str`, `List[str]`, `List[List[str]]`):
|
text (`str`, `List[str]`, `List[List[str]]`):
|
||||||
@@ -84,7 +101,7 @@ class XCLIPProcessor(ProcessorMixin):
|
|||||||
encoding = self.tokenizer(text, return_tensors=return_tensors, **kwargs)
|
encoding = self.tokenizer(text, return_tensors=return_tensors, **kwargs)
|
||||||
|
|
||||||
if videos is not None:
|
if videos is not None:
|
||||||
image_features = self.feature_extractor(videos, return_tensors=return_tensors, **kwargs)
|
image_features = self.image_processor(videos, return_tensors=return_tensors, **kwargs)
|
||||||
|
|
||||||
if text is not None and videos is not None:
|
if text is not None and videos is not None:
|
||||||
encoding["pixel_values"] = image_features.pixel_values
|
encoding["pixel_values"] = image_features.pixel_values
|
||||||
@@ -111,3 +128,19 @@ class XCLIPProcessor(ProcessorMixin):
|
|||||||
@property
|
@property
|
||||||
def model_input_names(self):
|
def model_input_names(self):
|
||||||
return ["input_ids", "attention_mask", "position_ids", "pixel_values"]
|
return ["input_ids", "attention_mask", "position_ids", "pixel_values"]
|
||||||
|
|
||||||
|
@property
|
||||||
|
def feature_extractor_class(self):
|
||||||
|
warnings.warn(
|
||||||
|
"`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead.",
|
||||||
|
FutureWarning,
|
||||||
|
)
|
||||||
|
return self.image_processor_class
|
||||||
|
|
||||||
|
@property
|
||||||
|
def feature_extractor(self):
|
||||||
|
warnings.warn(
|
||||||
|
"`feature_extractor` is deprecated and will be removed in v5. Use `image_processor` instead.",
|
||||||
|
FutureWarning,
|
||||||
|
)
|
||||||
|
return self.image_processor
|
||||||
|
|||||||
@@ -158,7 +158,8 @@ class ProcessorMixin(PushToHubMixin):
|
|||||||
<Tip>
|
<Tip>
|
||||||
|
|
||||||
This class method is simply calling the feature extractor
|
This class method is simply calling the feature extractor
|
||||||
[`~feature_extraction_utils.FeatureExtractionMixin.from_pretrained`] and the tokenizer
|
[`~feature_extraction_utils.FeatureExtractionMixin.from_pretrained`], image processor
|
||||||
|
[`~image_processing_utils.ImageProcessingMixin.from_pretrained`] and the tokenizer
|
||||||
[`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`] methods. Please refer to the docstrings of the
|
[`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`] methods. Please refer to the docstrings of the
|
||||||
methods above for more information.
|
methods above for more information.
|
||||||
|
|
||||||
|
|||||||
@@ -30,7 +30,7 @@ from transformers.utils import FEATURE_EXTRACTOR_NAME, is_vision_available
|
|||||||
if is_vision_available():
|
if is_vision_available():
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
|
|
||||||
from transformers import ChineseCLIPFeatureExtractor, ChineseCLIPProcessor
|
from transformers import ChineseCLIPImageProcessor, ChineseCLIPProcessor
|
||||||
|
|
||||||
|
|
||||||
@require_vision
|
@require_vision
|
||||||
@@ -62,7 +62,7 @@ class ChineseCLIPProcessorTest(unittest.TestCase):
|
|||||||
with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
|
with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
|
||||||
vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
|
vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
|
||||||
|
|
||||||
feature_extractor_map = {
|
image_processor_map = {
|
||||||
"do_resize": True,
|
"do_resize": True,
|
||||||
"size": {"height": 224, "width": 224},
|
"size": {"height": 224, "width": 224},
|
||||||
"do_center_crop": True,
|
"do_center_crop": True,
|
||||||
@@ -72,9 +72,9 @@ class ChineseCLIPProcessorTest(unittest.TestCase):
|
|||||||
"image_std": [0.26862954, 0.26130258, 0.27577711],
|
"image_std": [0.26862954, 0.26130258, 0.27577711],
|
||||||
"do_convert_rgb": True,
|
"do_convert_rgb": True,
|
||||||
}
|
}
|
||||||
self.feature_extractor_file = os.path.join(self.tmpdirname, FEATURE_EXTRACTOR_NAME)
|
self.image_processor_file = os.path.join(self.tmpdirname, FEATURE_EXTRACTOR_NAME)
|
||||||
with open(self.feature_extractor_file, "w", encoding="utf-8") as fp:
|
with open(self.image_processor_file, "w", encoding="utf-8") as fp:
|
||||||
json.dump(feature_extractor_map, fp)
|
json.dump(image_processor_map, fp)
|
||||||
|
|
||||||
def get_tokenizer(self, **kwargs):
|
def get_tokenizer(self, **kwargs):
|
||||||
return BertTokenizer.from_pretrained(self.tmpdirname, **kwargs)
|
return BertTokenizer.from_pretrained(self.tmpdirname, **kwargs)
|
||||||
@@ -82,8 +82,8 @@ class ChineseCLIPProcessorTest(unittest.TestCase):
|
|||||||
def get_rust_tokenizer(self, **kwargs):
|
def get_rust_tokenizer(self, **kwargs):
|
||||||
return BertTokenizerFast.from_pretrained(self.tmpdirname, **kwargs)
|
return BertTokenizerFast.from_pretrained(self.tmpdirname, **kwargs)
|
||||||
|
|
||||||
def get_feature_extractor(self, **kwargs):
|
def get_image_processor(self, **kwargs):
|
||||||
return ChineseCLIPFeatureExtractor.from_pretrained(self.tmpdirname, **kwargs)
|
return ChineseCLIPImageProcessor.from_pretrained(self.tmpdirname, **kwargs)
|
||||||
|
|
||||||
def tearDown(self):
|
def tearDown(self):
|
||||||
shutil.rmtree(self.tmpdirname)
|
shutil.rmtree(self.tmpdirname)
|
||||||
@@ -102,13 +102,13 @@ class ChineseCLIPProcessorTest(unittest.TestCase):
|
|||||||
def test_save_load_pretrained_default(self):
|
def test_save_load_pretrained_default(self):
|
||||||
tokenizer_slow = self.get_tokenizer()
|
tokenizer_slow = self.get_tokenizer()
|
||||||
tokenizer_fast = self.get_rust_tokenizer()
|
tokenizer_fast = self.get_rust_tokenizer()
|
||||||
feature_extractor = self.get_feature_extractor()
|
image_processor = self.get_image_processor()
|
||||||
|
|
||||||
processor_slow = ChineseCLIPProcessor(tokenizer=tokenizer_slow, feature_extractor=feature_extractor)
|
processor_slow = ChineseCLIPProcessor(tokenizer=tokenizer_slow, image_processor=image_processor)
|
||||||
processor_slow.save_pretrained(self.tmpdirname)
|
processor_slow.save_pretrained(self.tmpdirname)
|
||||||
processor_slow = ChineseCLIPProcessor.from_pretrained(self.tmpdirname, use_fast=False)
|
processor_slow = ChineseCLIPProcessor.from_pretrained(self.tmpdirname, use_fast=False)
|
||||||
|
|
||||||
processor_fast = ChineseCLIPProcessor(tokenizer=tokenizer_fast, feature_extractor=feature_extractor)
|
processor_fast = ChineseCLIPProcessor(tokenizer=tokenizer_fast, image_processor=image_processor)
|
||||||
processor_fast.save_pretrained(self.tmpdirname)
|
processor_fast.save_pretrained(self.tmpdirname)
|
||||||
processor_fast = ChineseCLIPProcessor.from_pretrained(self.tmpdirname)
|
processor_fast = ChineseCLIPProcessor.from_pretrained(self.tmpdirname)
|
||||||
|
|
||||||
@@ -118,19 +118,17 @@ class ChineseCLIPProcessorTest(unittest.TestCase):
|
|||||||
self.assertIsInstance(processor_slow.tokenizer, BertTokenizer)
|
self.assertIsInstance(processor_slow.tokenizer, BertTokenizer)
|
||||||
self.assertIsInstance(processor_fast.tokenizer, BertTokenizerFast)
|
self.assertIsInstance(processor_fast.tokenizer, BertTokenizerFast)
|
||||||
|
|
||||||
self.assertEqual(processor_slow.feature_extractor.to_json_string(), feature_extractor.to_json_string())
|
self.assertEqual(processor_slow.image_processor.to_json_string(), image_processor.to_json_string())
|
||||||
self.assertEqual(processor_fast.feature_extractor.to_json_string(), feature_extractor.to_json_string())
|
self.assertEqual(processor_fast.image_processor.to_json_string(), image_processor.to_json_string())
|
||||||
self.assertIsInstance(processor_slow.feature_extractor, ChineseCLIPFeatureExtractor)
|
self.assertIsInstance(processor_slow.image_processor, ChineseCLIPImageProcessor)
|
||||||
self.assertIsInstance(processor_fast.feature_extractor, ChineseCLIPFeatureExtractor)
|
self.assertIsInstance(processor_fast.image_processor, ChineseCLIPImageProcessor)
|
||||||
|
|
||||||
def test_save_load_pretrained_additional_features(self):
|
def test_save_load_pretrained_additional_features(self):
|
||||||
processor = ChineseCLIPProcessor(
|
processor = ChineseCLIPProcessor(tokenizer=self.get_tokenizer(), image_processor=self.get_image_processor())
|
||||||
tokenizer=self.get_tokenizer(), feature_extractor=self.get_feature_extractor()
|
|
||||||
)
|
|
||||||
processor.save_pretrained(self.tmpdirname)
|
processor.save_pretrained(self.tmpdirname)
|
||||||
|
|
||||||
tokenizer_add_kwargs = self.get_tokenizer(cls_token="(CLS)", sep_token="(SEP)")
|
tokenizer_add_kwargs = self.get_tokenizer(cls_token="(CLS)", sep_token="(SEP)")
|
||||||
feature_extractor_add_kwargs = self.get_feature_extractor(do_normalize=False)
|
image_processor_add_kwargs = self.get_image_processor(do_normalize=False)
|
||||||
|
|
||||||
processor = ChineseCLIPProcessor.from_pretrained(
|
processor = ChineseCLIPProcessor.from_pretrained(
|
||||||
self.tmpdirname, cls_token="(CLS)", sep_token="(SEP)", do_normalize=False
|
self.tmpdirname, cls_token="(CLS)", sep_token="(SEP)", do_normalize=False
|
||||||
@@ -139,28 +137,28 @@ class ChineseCLIPProcessorTest(unittest.TestCase):
|
|||||||
self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab())
|
self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab())
|
||||||
self.assertIsInstance(processor.tokenizer, BertTokenizerFast)
|
self.assertIsInstance(processor.tokenizer, BertTokenizerFast)
|
||||||
|
|
||||||
self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor_add_kwargs.to_json_string())
|
self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string())
|
||||||
self.assertIsInstance(processor.feature_extractor, ChineseCLIPFeatureExtractor)
|
self.assertIsInstance(processor.image_processor, ChineseCLIPImageProcessor)
|
||||||
|
|
||||||
def test_feature_extractor(self):
|
def test_image_processor(self):
|
||||||
feature_extractor = self.get_feature_extractor()
|
image_processor = self.get_image_processor()
|
||||||
tokenizer = self.get_tokenizer()
|
tokenizer = self.get_tokenizer()
|
||||||
|
|
||||||
processor = ChineseCLIPProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
|
processor = ChineseCLIPProcessor(tokenizer=tokenizer, image_processor=image_processor)
|
||||||
|
|
||||||
image_input = self.prepare_image_inputs()
|
image_input = self.prepare_image_inputs()
|
||||||
|
|
||||||
input_feat_extract = feature_extractor(image_input, return_tensors="np")
|
input_feat_extract = image_processor(image_input, return_tensors="np")
|
||||||
input_processor = processor(images=image_input, return_tensors="np")
|
input_processor = processor(images=image_input, return_tensors="np")
|
||||||
|
|
||||||
for key in input_feat_extract.keys():
|
for key in input_feat_extract.keys():
|
||||||
self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2)
|
self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2)
|
||||||
|
|
||||||
def test_tokenizer(self):
|
def test_tokenizer(self):
|
||||||
feature_extractor = self.get_feature_extractor()
|
image_processor = self.get_image_processor()
|
||||||
tokenizer = self.get_tokenizer()
|
tokenizer = self.get_tokenizer()
|
||||||
|
|
||||||
processor = ChineseCLIPProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
|
processor = ChineseCLIPProcessor(tokenizer=tokenizer, image_processor=image_processor)
|
||||||
|
|
||||||
input_str = "Alexandra,T-shirt的价格是15便士。"
|
input_str = "Alexandra,T-shirt的价格是15便士。"
|
||||||
|
|
||||||
@@ -172,10 +170,10 @@ class ChineseCLIPProcessorTest(unittest.TestCase):
|
|||||||
self.assertListEqual(encoded_tok[key], encoded_processor[key])
|
self.assertListEqual(encoded_tok[key], encoded_processor[key])
|
||||||
|
|
||||||
def test_processor(self):
|
def test_processor(self):
|
||||||
feature_extractor = self.get_feature_extractor()
|
image_processor = self.get_image_processor()
|
||||||
tokenizer = self.get_tokenizer()
|
tokenizer = self.get_tokenizer()
|
||||||
|
|
||||||
processor = ChineseCLIPProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
|
processor = ChineseCLIPProcessor(tokenizer=tokenizer, image_processor=image_processor)
|
||||||
|
|
||||||
input_str = "Alexandra,T-shirt的价格是15便士。"
|
input_str = "Alexandra,T-shirt的价格是15便士。"
|
||||||
image_input = self.prepare_image_inputs()
|
image_input = self.prepare_image_inputs()
|
||||||
@@ -189,10 +187,10 @@ class ChineseCLIPProcessorTest(unittest.TestCase):
|
|||||||
processor()
|
processor()
|
||||||
|
|
||||||
def test_tokenizer_decode(self):
|
def test_tokenizer_decode(self):
|
||||||
feature_extractor = self.get_feature_extractor()
|
image_processor = self.get_image_processor()
|
||||||
tokenizer = self.get_tokenizer()
|
tokenizer = self.get_tokenizer()
|
||||||
|
|
||||||
processor = ChineseCLIPProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
|
processor = ChineseCLIPProcessor(tokenizer=tokenizer, image_processor=image_processor)
|
||||||
|
|
||||||
predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]]
|
predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]]
|
||||||
|
|
||||||
@@ -202,10 +200,10 @@ class ChineseCLIPProcessorTest(unittest.TestCase):
|
|||||||
self.assertListEqual(decoded_tok, decoded_processor)
|
self.assertListEqual(decoded_tok, decoded_processor)
|
||||||
|
|
||||||
def test_model_input_names(self):
|
def test_model_input_names(self):
|
||||||
feature_extractor = self.get_feature_extractor()
|
image_processor = self.get_image_processor()
|
||||||
tokenizer = self.get_tokenizer()
|
tokenizer = self.get_tokenizer()
|
||||||
|
|
||||||
processor = ChineseCLIPProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
|
processor = ChineseCLIPProcessor(tokenizer=tokenizer, image_processor=image_processor)
|
||||||
|
|
||||||
input_str = "Alexandra,T-shirt的价格是15便士。"
|
input_str = "Alexandra,T-shirt的价格是15便士。"
|
||||||
image_input = self.prepare_image_inputs()
|
image_input = self.prepare_image_inputs()
|
||||||
|
|||||||
@@ -24,13 +24,13 @@ import pytest
|
|||||||
from transformers import CLIPTokenizer, CLIPTokenizerFast
|
from transformers import CLIPTokenizer, CLIPTokenizerFast
|
||||||
from transformers.models.clip.tokenization_clip import VOCAB_FILES_NAMES
|
from transformers.models.clip.tokenization_clip import VOCAB_FILES_NAMES
|
||||||
from transformers.testing_utils import require_vision
|
from transformers.testing_utils import require_vision
|
||||||
from transformers.utils import FEATURE_EXTRACTOR_NAME, is_vision_available
|
from transformers.utils import IMAGE_PROCESSOR_NAME, is_vision_available
|
||||||
|
|
||||||
|
|
||||||
if is_vision_available():
|
if is_vision_available():
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
|
|
||||||
from transformers import CLIPFeatureExtractor, CLIPProcessor
|
from transformers import CLIPImageProcessor, CLIPProcessor
|
||||||
|
|
||||||
|
|
||||||
@require_vision
|
@require_vision
|
||||||
@@ -52,7 +52,7 @@ class CLIPProcessorTest(unittest.TestCase):
|
|||||||
with open(self.merges_file, "w", encoding="utf-8") as fp:
|
with open(self.merges_file, "w", encoding="utf-8") as fp:
|
||||||
fp.write("\n".join(merges))
|
fp.write("\n".join(merges))
|
||||||
|
|
||||||
feature_extractor_map = {
|
image_processor_map = {
|
||||||
"do_resize": True,
|
"do_resize": True,
|
||||||
"size": 20,
|
"size": 20,
|
||||||
"do_center_crop": True,
|
"do_center_crop": True,
|
||||||
@@ -61,9 +61,9 @@ class CLIPProcessorTest(unittest.TestCase):
|
|||||||
"image_mean": [0.48145466, 0.4578275, 0.40821073],
|
"image_mean": [0.48145466, 0.4578275, 0.40821073],
|
||||||
"image_std": [0.26862954, 0.26130258, 0.27577711],
|
"image_std": [0.26862954, 0.26130258, 0.27577711],
|
||||||
}
|
}
|
||||||
self.feature_extractor_file = os.path.join(self.tmpdirname, FEATURE_EXTRACTOR_NAME)
|
self.image_processor_file = os.path.join(self.tmpdirname, IMAGE_PROCESSOR_NAME)
|
||||||
with open(self.feature_extractor_file, "w", encoding="utf-8") as fp:
|
with open(self.image_processor_file, "w", encoding="utf-8") as fp:
|
||||||
json.dump(feature_extractor_map, fp)
|
json.dump(image_processor_map, fp)
|
||||||
|
|
||||||
def get_tokenizer(self, **kwargs):
|
def get_tokenizer(self, **kwargs):
|
||||||
return CLIPTokenizer.from_pretrained(self.tmpdirname, **kwargs)
|
return CLIPTokenizer.from_pretrained(self.tmpdirname, **kwargs)
|
||||||
@@ -71,8 +71,8 @@ class CLIPProcessorTest(unittest.TestCase):
|
|||||||
def get_rust_tokenizer(self, **kwargs):
|
def get_rust_tokenizer(self, **kwargs):
|
||||||
return CLIPTokenizerFast.from_pretrained(self.tmpdirname, **kwargs)
|
return CLIPTokenizerFast.from_pretrained(self.tmpdirname, **kwargs)
|
||||||
|
|
||||||
def get_feature_extractor(self, **kwargs):
|
def get_image_processor(self, **kwargs):
|
||||||
return CLIPFeatureExtractor.from_pretrained(self.tmpdirname, **kwargs)
|
return CLIPImageProcessor.from_pretrained(self.tmpdirname, **kwargs)
|
||||||
|
|
||||||
def tearDown(self):
|
def tearDown(self):
|
||||||
shutil.rmtree(self.tmpdirname)
|
shutil.rmtree(self.tmpdirname)
|
||||||
@@ -91,13 +91,13 @@ class CLIPProcessorTest(unittest.TestCase):
|
|||||||
def test_save_load_pretrained_default(self):
|
def test_save_load_pretrained_default(self):
|
||||||
tokenizer_slow = self.get_tokenizer()
|
tokenizer_slow = self.get_tokenizer()
|
||||||
tokenizer_fast = self.get_rust_tokenizer()
|
tokenizer_fast = self.get_rust_tokenizer()
|
||||||
feature_extractor = self.get_feature_extractor()
|
image_processor = self.get_image_processor()
|
||||||
|
|
||||||
processor_slow = CLIPProcessor(tokenizer=tokenizer_slow, feature_extractor=feature_extractor)
|
processor_slow = CLIPProcessor(tokenizer=tokenizer_slow, image_processor=image_processor)
|
||||||
processor_slow.save_pretrained(self.tmpdirname)
|
processor_slow.save_pretrained(self.tmpdirname)
|
||||||
processor_slow = CLIPProcessor.from_pretrained(self.tmpdirname, use_fast=False)
|
processor_slow = CLIPProcessor.from_pretrained(self.tmpdirname, use_fast=False)
|
||||||
|
|
||||||
processor_fast = CLIPProcessor(tokenizer=tokenizer_fast, feature_extractor=feature_extractor)
|
processor_fast = CLIPProcessor(tokenizer=tokenizer_fast, image_processor=image_processor)
|
||||||
processor_fast.save_pretrained(self.tmpdirname)
|
processor_fast.save_pretrained(self.tmpdirname)
|
||||||
processor_fast = CLIPProcessor.from_pretrained(self.tmpdirname)
|
processor_fast = CLIPProcessor.from_pretrained(self.tmpdirname)
|
||||||
|
|
||||||
@@ -107,17 +107,17 @@ class CLIPProcessorTest(unittest.TestCase):
|
|||||||
self.assertIsInstance(processor_slow.tokenizer, CLIPTokenizer)
|
self.assertIsInstance(processor_slow.tokenizer, CLIPTokenizer)
|
||||||
self.assertIsInstance(processor_fast.tokenizer, CLIPTokenizerFast)
|
self.assertIsInstance(processor_fast.tokenizer, CLIPTokenizerFast)
|
||||||
|
|
||||||
self.assertEqual(processor_slow.feature_extractor.to_json_string(), feature_extractor.to_json_string())
|
self.assertEqual(processor_slow.image_processor.to_json_string(), image_processor.to_json_string())
|
||||||
self.assertEqual(processor_fast.feature_extractor.to_json_string(), feature_extractor.to_json_string())
|
self.assertEqual(processor_fast.image_processor.to_json_string(), image_processor.to_json_string())
|
||||||
self.assertIsInstance(processor_slow.feature_extractor, CLIPFeatureExtractor)
|
self.assertIsInstance(processor_slow.image_processor, CLIPImageProcessor)
|
||||||
self.assertIsInstance(processor_fast.feature_extractor, CLIPFeatureExtractor)
|
self.assertIsInstance(processor_fast.image_processor, CLIPImageProcessor)
|
||||||
|
|
||||||
def test_save_load_pretrained_additional_features(self):
|
def test_save_load_pretrained_additional_features(self):
|
||||||
processor = CLIPProcessor(tokenizer=self.get_tokenizer(), feature_extractor=self.get_feature_extractor())
|
processor = CLIPProcessor(tokenizer=self.get_tokenizer(), image_processor=self.get_image_processor())
|
||||||
processor.save_pretrained(self.tmpdirname)
|
processor.save_pretrained(self.tmpdirname)
|
||||||
|
|
||||||
tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)")
|
tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)")
|
||||||
feature_extractor_add_kwargs = self.get_feature_extractor(do_normalize=False, padding_value=1.0)
|
image_processor_add_kwargs = self.get_image_processor(do_normalize=False, padding_value=1.0)
|
||||||
|
|
||||||
processor = CLIPProcessor.from_pretrained(
|
processor = CLIPProcessor.from_pretrained(
|
||||||
self.tmpdirname, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False, padding_value=1.0
|
self.tmpdirname, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False, padding_value=1.0
|
||||||
@@ -126,28 +126,28 @@ class CLIPProcessorTest(unittest.TestCase):
|
|||||||
self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab())
|
self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab())
|
||||||
self.assertIsInstance(processor.tokenizer, CLIPTokenizerFast)
|
self.assertIsInstance(processor.tokenizer, CLIPTokenizerFast)
|
||||||
|
|
||||||
self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor_add_kwargs.to_json_string())
|
self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string())
|
||||||
self.assertIsInstance(processor.feature_extractor, CLIPFeatureExtractor)
|
self.assertIsInstance(processor.image_processor, CLIPImageProcessor)
|
||||||
|
|
||||||
def test_feature_extractor(self):
|
def test_image_processor(self):
|
||||||
feature_extractor = self.get_feature_extractor()
|
image_processor = self.get_image_processor()
|
||||||
tokenizer = self.get_tokenizer()
|
tokenizer = self.get_tokenizer()
|
||||||
|
|
||||||
processor = CLIPProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
|
processor = CLIPProcessor(tokenizer=tokenizer, image_processor=image_processor)
|
||||||
|
|
||||||
image_input = self.prepare_image_inputs()
|
image_input = self.prepare_image_inputs()
|
||||||
|
|
||||||
input_feat_extract = feature_extractor(image_input, return_tensors="np")
|
input_image_proc = image_processor(image_input, return_tensors="np")
|
||||||
input_processor = processor(images=image_input, return_tensors="np")
|
input_processor = processor(images=image_input, return_tensors="np")
|
||||||
|
|
||||||
for key in input_feat_extract.keys():
|
for key in input_image_proc.keys():
|
||||||
self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2)
|
self.assertAlmostEqual(input_image_proc[key].sum(), input_processor[key].sum(), delta=1e-2)
|
||||||
|
|
||||||
def test_tokenizer(self):
|
def test_tokenizer(self):
|
||||||
feature_extractor = self.get_feature_extractor()
|
image_processor = self.get_image_processor()
|
||||||
tokenizer = self.get_tokenizer()
|
tokenizer = self.get_tokenizer()
|
||||||
|
|
||||||
processor = CLIPProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
|
processor = CLIPProcessor(tokenizer=tokenizer, image_processor=image_processor)
|
||||||
|
|
||||||
input_str = "lower newer"
|
input_str = "lower newer"
|
||||||
|
|
||||||
@@ -159,10 +159,10 @@ class CLIPProcessorTest(unittest.TestCase):
|
|||||||
self.assertListEqual(encoded_tok[key], encoded_processor[key])
|
self.assertListEqual(encoded_tok[key], encoded_processor[key])
|
||||||
|
|
||||||
def test_processor(self):
|
def test_processor(self):
|
||||||
feature_extractor = self.get_feature_extractor()
|
image_processor = self.get_image_processor()
|
||||||
tokenizer = self.get_tokenizer()
|
tokenizer = self.get_tokenizer()
|
||||||
|
|
||||||
processor = CLIPProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
|
processor = CLIPProcessor(tokenizer=tokenizer, image_processor=image_processor)
|
||||||
|
|
||||||
input_str = "lower newer"
|
input_str = "lower newer"
|
||||||
image_input = self.prepare_image_inputs()
|
image_input = self.prepare_image_inputs()
|
||||||
@@ -176,10 +176,10 @@ class CLIPProcessorTest(unittest.TestCase):
|
|||||||
processor()
|
processor()
|
||||||
|
|
||||||
def test_tokenizer_decode(self):
|
def test_tokenizer_decode(self):
|
||||||
feature_extractor = self.get_feature_extractor()
|
image_processor = self.get_image_processor()
|
||||||
tokenizer = self.get_tokenizer()
|
tokenizer = self.get_tokenizer()
|
||||||
|
|
||||||
processor = CLIPProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
|
processor = CLIPProcessor(tokenizer=tokenizer, image_processor=image_processor)
|
||||||
|
|
||||||
predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]]
|
predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]]
|
||||||
|
|
||||||
@@ -189,10 +189,10 @@ class CLIPProcessorTest(unittest.TestCase):
|
|||||||
self.assertListEqual(decoded_tok, decoded_processor)
|
self.assertListEqual(decoded_tok, decoded_processor)
|
||||||
|
|
||||||
def test_model_input_names(self):
|
def test_model_input_names(self):
|
||||||
feature_extractor = self.get_feature_extractor()
|
image_processor = self.get_image_processor()
|
||||||
tokenizer = self.get_tokenizer()
|
tokenizer = self.get_tokenizer()
|
||||||
|
|
||||||
processor = CLIPProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
|
processor = CLIPProcessor(tokenizer=tokenizer, image_processor=image_processor)
|
||||||
|
|
||||||
input_str = "lower newer"
|
input_str = "lower newer"
|
||||||
image_input = self.prepare_image_inputs()
|
image_input = self.prepare_image_inputs()
|
||||||
|
|||||||
@@ -24,13 +24,13 @@ import pytest
|
|||||||
from transformers import CLIPTokenizer, CLIPTokenizerFast
|
from transformers import CLIPTokenizer, CLIPTokenizerFast
|
||||||
from transformers.models.clip.tokenization_clip import VOCAB_FILES_NAMES
|
from transformers.models.clip.tokenization_clip import VOCAB_FILES_NAMES
|
||||||
from transformers.testing_utils import require_vision
|
from transformers.testing_utils import require_vision
|
||||||
from transformers.utils import FEATURE_EXTRACTOR_NAME, is_vision_available
|
from transformers.utils import IMAGE_PROCESSOR_NAME, is_vision_available
|
||||||
|
|
||||||
|
|
||||||
if is_vision_available():
|
if is_vision_available():
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
|
|
||||||
from transformers import CLIPSegProcessor, ViTFeatureExtractor
|
from transformers import CLIPSegProcessor, ViTImageProcessor
|
||||||
|
|
||||||
|
|
||||||
@require_vision
|
@require_vision
|
||||||
@@ -52,7 +52,7 @@ class CLIPSegProcessorTest(unittest.TestCase):
|
|||||||
with open(self.merges_file, "w", encoding="utf-8") as fp:
|
with open(self.merges_file, "w", encoding="utf-8") as fp:
|
||||||
fp.write("\n".join(merges))
|
fp.write("\n".join(merges))
|
||||||
|
|
||||||
feature_extractor_map = {
|
image_processor_map = {
|
||||||
"do_resize": True,
|
"do_resize": True,
|
||||||
"size": 20,
|
"size": 20,
|
||||||
"do_center_crop": True,
|
"do_center_crop": True,
|
||||||
@@ -61,9 +61,9 @@ class CLIPSegProcessorTest(unittest.TestCase):
|
|||||||
"image_mean": [0.48145466, 0.4578275, 0.40821073],
|
"image_mean": [0.48145466, 0.4578275, 0.40821073],
|
||||||
"image_std": [0.26862954, 0.26130258, 0.27577711],
|
"image_std": [0.26862954, 0.26130258, 0.27577711],
|
||||||
}
|
}
|
||||||
self.feature_extractor_file = os.path.join(self.tmpdirname, FEATURE_EXTRACTOR_NAME)
|
self.image_processor_file = os.path.join(self.tmpdirname, IMAGE_PROCESSOR_NAME)
|
||||||
with open(self.feature_extractor_file, "w", encoding="utf-8") as fp:
|
with open(self.image_processor_file, "w", encoding="utf-8") as fp:
|
||||||
json.dump(feature_extractor_map, fp)
|
json.dump(image_processor_map, fp)
|
||||||
|
|
||||||
def get_tokenizer(self, **kwargs):
|
def get_tokenizer(self, **kwargs):
|
||||||
return CLIPTokenizer.from_pretrained(self.tmpdirname, **kwargs)
|
return CLIPTokenizer.from_pretrained(self.tmpdirname, **kwargs)
|
||||||
@@ -71,8 +71,8 @@ class CLIPSegProcessorTest(unittest.TestCase):
|
|||||||
def get_rust_tokenizer(self, **kwargs):
|
def get_rust_tokenizer(self, **kwargs):
|
||||||
return CLIPTokenizerFast.from_pretrained(self.tmpdirname, **kwargs)
|
return CLIPTokenizerFast.from_pretrained(self.tmpdirname, **kwargs)
|
||||||
|
|
||||||
def get_feature_extractor(self, **kwargs):
|
def get_image_processor(self, **kwargs):
|
||||||
return ViTFeatureExtractor.from_pretrained(self.tmpdirname, **kwargs)
|
return ViTImageProcessor.from_pretrained(self.tmpdirname, **kwargs)
|
||||||
|
|
||||||
def tearDown(self):
|
def tearDown(self):
|
||||||
shutil.rmtree(self.tmpdirname)
|
shutil.rmtree(self.tmpdirname)
|
||||||
@@ -90,13 +90,13 @@ class CLIPSegProcessorTest(unittest.TestCase):
|
|||||||
def test_save_load_pretrained_default(self):
|
def test_save_load_pretrained_default(self):
|
||||||
tokenizer_slow = self.get_tokenizer()
|
tokenizer_slow = self.get_tokenizer()
|
||||||
tokenizer_fast = self.get_rust_tokenizer()
|
tokenizer_fast = self.get_rust_tokenizer()
|
||||||
feature_extractor = self.get_feature_extractor()
|
image_processor = self.get_image_processor()
|
||||||
|
|
||||||
processor_slow = CLIPSegProcessor(tokenizer=tokenizer_slow, feature_extractor=feature_extractor)
|
processor_slow = CLIPSegProcessor(tokenizer=tokenizer_slow, image_processor=image_processor)
|
||||||
processor_slow.save_pretrained(self.tmpdirname)
|
processor_slow.save_pretrained(self.tmpdirname)
|
||||||
processor_slow = CLIPSegProcessor.from_pretrained(self.tmpdirname, use_fast=False)
|
processor_slow = CLIPSegProcessor.from_pretrained(self.tmpdirname, use_fast=False)
|
||||||
|
|
||||||
processor_fast = CLIPSegProcessor(tokenizer=tokenizer_fast, feature_extractor=feature_extractor)
|
processor_fast = CLIPSegProcessor(tokenizer=tokenizer_fast, image_processor=image_processor)
|
||||||
processor_fast.save_pretrained(self.tmpdirname)
|
processor_fast.save_pretrained(self.tmpdirname)
|
||||||
processor_fast = CLIPSegProcessor.from_pretrained(self.tmpdirname)
|
processor_fast = CLIPSegProcessor.from_pretrained(self.tmpdirname)
|
||||||
|
|
||||||
@@ -106,17 +106,17 @@ class CLIPSegProcessorTest(unittest.TestCase):
|
|||||||
self.assertIsInstance(processor_slow.tokenizer, CLIPTokenizer)
|
self.assertIsInstance(processor_slow.tokenizer, CLIPTokenizer)
|
||||||
self.assertIsInstance(processor_fast.tokenizer, CLIPTokenizerFast)
|
self.assertIsInstance(processor_fast.tokenizer, CLIPTokenizerFast)
|
||||||
|
|
||||||
self.assertEqual(processor_slow.feature_extractor.to_json_string(), feature_extractor.to_json_string())
|
self.assertEqual(processor_slow.image_processor.to_json_string(), image_processor.to_json_string())
|
||||||
self.assertEqual(processor_fast.feature_extractor.to_json_string(), feature_extractor.to_json_string())
|
self.assertEqual(processor_fast.image_processor.to_json_string(), image_processor.to_json_string())
|
||||||
self.assertIsInstance(processor_slow.feature_extractor, ViTFeatureExtractor)
|
self.assertIsInstance(processor_slow.image_processor, ViTImageProcessor)
|
||||||
self.assertIsInstance(processor_fast.feature_extractor, ViTFeatureExtractor)
|
self.assertIsInstance(processor_fast.image_processor, ViTImageProcessor)
|
||||||
|
|
||||||
def test_save_load_pretrained_additional_features(self):
|
def test_save_load_pretrained_additional_features(self):
|
||||||
processor = CLIPSegProcessor(tokenizer=self.get_tokenizer(), feature_extractor=self.get_feature_extractor())
|
processor = CLIPSegProcessor(tokenizer=self.get_tokenizer(), image_processor=self.get_image_processor())
|
||||||
processor.save_pretrained(self.tmpdirname)
|
processor.save_pretrained(self.tmpdirname)
|
||||||
|
|
||||||
tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)")
|
tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)")
|
||||||
feature_extractor_add_kwargs = self.get_feature_extractor(do_normalize=False, padding_value=1.0)
|
image_processor_add_kwargs = self.get_image_processor(do_normalize=False, padding_value=1.0)
|
||||||
|
|
||||||
processor = CLIPSegProcessor.from_pretrained(
|
processor = CLIPSegProcessor.from_pretrained(
|
||||||
self.tmpdirname, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False, padding_value=1.0
|
self.tmpdirname, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False, padding_value=1.0
|
||||||
@@ -125,28 +125,28 @@ class CLIPSegProcessorTest(unittest.TestCase):
|
|||||||
self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab())
|
self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab())
|
||||||
self.assertIsInstance(processor.tokenizer, CLIPTokenizerFast)
|
self.assertIsInstance(processor.tokenizer, CLIPTokenizerFast)
|
||||||
|
|
||||||
self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor_add_kwargs.to_json_string())
|
self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string())
|
||||||
self.assertIsInstance(processor.feature_extractor, ViTFeatureExtractor)
|
self.assertIsInstance(processor.image_processor, ViTImageProcessor)
|
||||||
|
|
||||||
def test_feature_extractor(self):
|
def test_image_processor(self):
|
||||||
feature_extractor = self.get_feature_extractor()
|
image_processor = self.get_image_processor()
|
||||||
tokenizer = self.get_tokenizer()
|
tokenizer = self.get_tokenizer()
|
||||||
|
|
||||||
processor = CLIPSegProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
|
processor = CLIPSegProcessor(tokenizer=tokenizer, image_processor=image_processor)
|
||||||
|
|
||||||
image_input = self.prepare_image_inputs()
|
image_input = self.prepare_image_inputs()
|
||||||
|
|
||||||
input_feat_extract = feature_extractor(image_input, return_tensors="np")
|
input_feat_extract = image_processor(image_input, return_tensors="np")
|
||||||
input_processor = processor(images=image_input, return_tensors="np")
|
input_processor = processor(images=image_input, return_tensors="np")
|
||||||
|
|
||||||
for key in input_feat_extract.keys():
|
for key in input_feat_extract.keys():
|
||||||
self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2)
|
self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2)
|
||||||
|
|
||||||
def test_tokenizer(self):
|
def test_tokenizer(self):
|
||||||
feature_extractor = self.get_feature_extractor()
|
image_processor = self.get_image_processor()
|
||||||
tokenizer = self.get_tokenizer()
|
tokenizer = self.get_tokenizer()
|
||||||
|
|
||||||
processor = CLIPSegProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
|
processor = CLIPSegProcessor(tokenizer=tokenizer, image_processor=image_processor)
|
||||||
|
|
||||||
input_str = "lower newer"
|
input_str = "lower newer"
|
||||||
|
|
||||||
@@ -158,10 +158,10 @@ class CLIPSegProcessorTest(unittest.TestCase):
|
|||||||
self.assertListEqual(encoded_tok[key], encoded_processor[key])
|
self.assertListEqual(encoded_tok[key], encoded_processor[key])
|
||||||
|
|
||||||
def test_processor(self):
|
def test_processor(self):
|
||||||
feature_extractor = self.get_feature_extractor()
|
image_processor = self.get_image_processor()
|
||||||
tokenizer = self.get_tokenizer()
|
tokenizer = self.get_tokenizer()
|
||||||
|
|
||||||
processor = CLIPSegProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
|
processor = CLIPSegProcessor(tokenizer=tokenizer, image_processor=image_processor)
|
||||||
|
|
||||||
input_str = "lower newer"
|
input_str = "lower newer"
|
||||||
image_input = self.prepare_image_inputs()
|
image_input = self.prepare_image_inputs()
|
||||||
@@ -175,10 +175,10 @@ class CLIPSegProcessorTest(unittest.TestCase):
|
|||||||
processor()
|
processor()
|
||||||
|
|
||||||
def test_tokenizer_decode(self):
|
def test_tokenizer_decode(self):
|
||||||
feature_extractor = self.get_feature_extractor()
|
image_processor = self.get_image_processor()
|
||||||
tokenizer = self.get_tokenizer()
|
tokenizer = self.get_tokenizer()
|
||||||
|
|
||||||
processor = CLIPSegProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
|
processor = CLIPSegProcessor(tokenizer=tokenizer, image_processor=image_processor)
|
||||||
|
|
||||||
predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]]
|
predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]]
|
||||||
|
|
||||||
|
|||||||
@@ -25,13 +25,13 @@ import pytest
|
|||||||
from transformers import BertTokenizer, BertTokenizerFast
|
from transformers import BertTokenizer, BertTokenizerFast
|
||||||
from transformers.models.bert.tokenization_bert import VOCAB_FILES_NAMES
|
from transformers.models.bert.tokenization_bert import VOCAB_FILES_NAMES
|
||||||
from transformers.testing_utils import require_vision
|
from transformers.testing_utils import require_vision
|
||||||
from transformers.utils import FEATURE_EXTRACTOR_NAME, is_vision_available
|
from transformers.utils import IMAGE_PROCESSOR_NAME, is_vision_available
|
||||||
|
|
||||||
|
|
||||||
if is_vision_available():
|
if is_vision_available():
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
|
|
||||||
from transformers import FlavaFeatureExtractor, FlavaProcessor
|
from transformers import FlavaImageProcessor, FlavaProcessor
|
||||||
from transformers.models.flava.image_processing_flava import (
|
from transformers.models.flava.image_processing_flava import (
|
||||||
FLAVA_CODEBOOK_MEAN,
|
FLAVA_CODEBOOK_MEAN,
|
||||||
FLAVA_CODEBOOK_STD,
|
FLAVA_CODEBOOK_STD,
|
||||||
@@ -53,7 +53,7 @@ class FlavaProcessorTest(unittest.TestCase):
|
|||||||
with open(self.vocab_file, "w", encoding="utf-8") as fp:
|
with open(self.vocab_file, "w", encoding="utf-8") as fp:
|
||||||
fp.write("".join([x + "\n" for x in vocab_tokens]))
|
fp.write("".join([x + "\n" for x in vocab_tokens]))
|
||||||
|
|
||||||
feature_extractor_map = {
|
image_processor_map = {
|
||||||
"image_mean": FLAVA_IMAGE_MEAN,
|
"image_mean": FLAVA_IMAGE_MEAN,
|
||||||
"image_std": FLAVA_IMAGE_STD,
|
"image_std": FLAVA_IMAGE_STD,
|
||||||
"do_normalize": True,
|
"do_normalize": True,
|
||||||
@@ -77,9 +77,9 @@ class FlavaProcessorTest(unittest.TestCase):
|
|||||||
"codebook_image_std": FLAVA_CODEBOOK_STD,
|
"codebook_image_std": FLAVA_CODEBOOK_STD,
|
||||||
}
|
}
|
||||||
|
|
||||||
self.feature_extractor_file = os.path.join(self.tmpdirname, FEATURE_EXTRACTOR_NAME)
|
self.image_processor_file = os.path.join(self.tmpdirname, IMAGE_PROCESSOR_NAME)
|
||||||
with open(self.feature_extractor_file, "w", encoding="utf-8") as fp:
|
with open(self.image_processor_file, "w", encoding="utf-8") as fp:
|
||||||
json.dump(feature_extractor_map, fp)
|
json.dump(image_processor_map, fp)
|
||||||
|
|
||||||
def get_tokenizer(self, **kwargs):
|
def get_tokenizer(self, **kwargs):
|
||||||
return BertTokenizer.from_pretrained(self.tmpdirname, **kwargs)
|
return BertTokenizer.from_pretrained(self.tmpdirname, **kwargs)
|
||||||
@@ -87,8 +87,8 @@ class FlavaProcessorTest(unittest.TestCase):
|
|||||||
def get_rust_tokenizer(self, **kwargs):
|
def get_rust_tokenizer(self, **kwargs):
|
||||||
return BertTokenizerFast.from_pretrained(self.tmpdirname, **kwargs)
|
return BertTokenizerFast.from_pretrained(self.tmpdirname, **kwargs)
|
||||||
|
|
||||||
def get_feature_extractor(self, **kwargs):
|
def get_image_processor(self, **kwargs):
|
||||||
return FlavaFeatureExtractor.from_pretrained(self.tmpdirname, **kwargs)
|
return FlavaImageProcessor.from_pretrained(self.tmpdirname, **kwargs)
|
||||||
|
|
||||||
def tearDown(self):
|
def tearDown(self):
|
||||||
shutil.rmtree(self.tmpdirname)
|
shutil.rmtree(self.tmpdirname)
|
||||||
@@ -107,13 +107,13 @@ class FlavaProcessorTest(unittest.TestCase):
|
|||||||
def test_save_load_pretrained_default(self):
|
def test_save_load_pretrained_default(self):
|
||||||
tokenizer_slow = self.get_tokenizer()
|
tokenizer_slow = self.get_tokenizer()
|
||||||
tokenizer_fast = self.get_rust_tokenizer()
|
tokenizer_fast = self.get_rust_tokenizer()
|
||||||
feature_extractor = self.get_feature_extractor()
|
image_processor = self.get_image_processor()
|
||||||
|
|
||||||
processor_slow = FlavaProcessor(tokenizer=tokenizer_slow, feature_extractor=feature_extractor)
|
processor_slow = FlavaProcessor(tokenizer=tokenizer_slow, image_processor=image_processor)
|
||||||
processor_slow.save_pretrained(self.tmpdirname)
|
processor_slow.save_pretrained(self.tmpdirname)
|
||||||
processor_slow = FlavaProcessor.from_pretrained(self.tmpdirname, use_fast=False)
|
processor_slow = FlavaProcessor.from_pretrained(self.tmpdirname, use_fast=False)
|
||||||
|
|
||||||
processor_fast = FlavaProcessor(tokenizer=tokenizer_fast, feature_extractor=feature_extractor)
|
processor_fast = FlavaProcessor(tokenizer=tokenizer_fast, image_processor=image_processor)
|
||||||
processor_fast.save_pretrained(self.tmpdirname)
|
processor_fast.save_pretrained(self.tmpdirname)
|
||||||
processor_fast = FlavaProcessor.from_pretrained(self.tmpdirname)
|
processor_fast = FlavaProcessor.from_pretrained(self.tmpdirname)
|
||||||
|
|
||||||
@@ -123,17 +123,17 @@ class FlavaProcessorTest(unittest.TestCase):
|
|||||||
self.assertIsInstance(processor_slow.tokenizer, BertTokenizer)
|
self.assertIsInstance(processor_slow.tokenizer, BertTokenizer)
|
||||||
self.assertIsInstance(processor_fast.tokenizer, BertTokenizerFast)
|
self.assertIsInstance(processor_fast.tokenizer, BertTokenizerFast)
|
||||||
|
|
||||||
self.assertEqual(processor_slow.feature_extractor.to_json_string(), feature_extractor.to_json_string())
|
self.assertEqual(processor_slow.image_processor.to_json_string(), image_processor.to_json_string())
|
||||||
self.assertEqual(processor_fast.feature_extractor.to_json_string(), feature_extractor.to_json_string())
|
self.assertEqual(processor_fast.image_processor.to_json_string(), image_processor.to_json_string())
|
||||||
self.assertIsInstance(processor_slow.feature_extractor, FlavaFeatureExtractor)
|
self.assertIsInstance(processor_slow.image_processor, FlavaImageProcessor)
|
||||||
self.assertIsInstance(processor_fast.feature_extractor, FlavaFeatureExtractor)
|
self.assertIsInstance(processor_fast.image_processor, FlavaImageProcessor)
|
||||||
|
|
||||||
def test_save_load_pretrained_additional_features(self):
|
def test_save_load_pretrained_additional_features(self):
|
||||||
processor = FlavaProcessor(tokenizer=self.get_tokenizer(), feature_extractor=self.get_feature_extractor())
|
processor = FlavaProcessor(tokenizer=self.get_tokenizer(), image_processor=self.get_image_processor())
|
||||||
processor.save_pretrained(self.tmpdirname)
|
processor.save_pretrained(self.tmpdirname)
|
||||||
|
|
||||||
tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)")
|
tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)")
|
||||||
feature_extractor_add_kwargs = self.get_feature_extractor(do_normalize=False, padding_value=1.0)
|
image_processor_add_kwargs = self.get_image_processor(do_normalize=False, padding_value=1.0)
|
||||||
|
|
||||||
processor = FlavaProcessor.from_pretrained(
|
processor = FlavaProcessor.from_pretrained(
|
||||||
self.tmpdirname, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False, padding_value=1.0
|
self.tmpdirname, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False, padding_value=1.0
|
||||||
@@ -142,18 +142,18 @@ class FlavaProcessorTest(unittest.TestCase):
|
|||||||
self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab())
|
self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab())
|
||||||
self.assertIsInstance(processor.tokenizer, BertTokenizerFast)
|
self.assertIsInstance(processor.tokenizer, BertTokenizerFast)
|
||||||
|
|
||||||
self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor_add_kwargs.to_json_string())
|
self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string())
|
||||||
self.assertIsInstance(processor.feature_extractor, FlavaFeatureExtractor)
|
self.assertIsInstance(processor.image_processor, FlavaImageProcessor)
|
||||||
|
|
||||||
def test_feature_extractor(self):
|
def test_image_processor(self):
|
||||||
feature_extractor = self.get_feature_extractor()
|
image_processor = self.get_image_processor()
|
||||||
tokenizer = self.get_tokenizer()
|
tokenizer = self.get_tokenizer()
|
||||||
|
|
||||||
processor = FlavaProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
|
processor = FlavaProcessor(tokenizer=tokenizer, image_processor=image_processor)
|
||||||
|
|
||||||
image_input = self.prepare_image_inputs()
|
image_input = self.prepare_image_inputs()
|
||||||
|
|
||||||
input_feat_extract = feature_extractor(image_input, return_tensors="np")
|
input_feat_extract = image_processor(image_input, return_tensors="np")
|
||||||
input_processor = processor(images=image_input, return_tensors="np")
|
input_processor = processor(images=image_input, return_tensors="np")
|
||||||
|
|
||||||
for key in input_feat_extract.keys():
|
for key in input_feat_extract.keys():
|
||||||
@@ -161,7 +161,7 @@ class FlavaProcessorTest(unittest.TestCase):
|
|||||||
|
|
||||||
# With rest of the args
|
# With rest of the args
|
||||||
random.seed(1234)
|
random.seed(1234)
|
||||||
input_feat_extract = feature_extractor(
|
input_feat_extract = image_processor(
|
||||||
image_input, return_image_mask=True, return_codebook_pixels=True, return_tensors="np"
|
image_input, return_image_mask=True, return_codebook_pixels=True, return_tensors="np"
|
||||||
)
|
)
|
||||||
random.seed(1234)
|
random.seed(1234)
|
||||||
@@ -173,10 +173,10 @@ class FlavaProcessorTest(unittest.TestCase):
|
|||||||
self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2)
|
self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2)
|
||||||
|
|
||||||
def test_tokenizer(self):
|
def test_tokenizer(self):
|
||||||
feature_extractor = self.get_feature_extractor()
|
image_processor = self.get_image_processor()
|
||||||
tokenizer = self.get_tokenizer()
|
tokenizer = self.get_tokenizer()
|
||||||
|
|
||||||
processor = FlavaProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
|
processor = FlavaProcessor(tokenizer=tokenizer, image_processor=image_processor)
|
||||||
|
|
||||||
input_str = "lower newer"
|
input_str = "lower newer"
|
||||||
|
|
||||||
@@ -188,10 +188,10 @@ class FlavaProcessorTest(unittest.TestCase):
|
|||||||
self.assertListEqual(encoded_tok[key], encoded_processor[key])
|
self.assertListEqual(encoded_tok[key], encoded_processor[key])
|
||||||
|
|
||||||
def test_processor(self):
|
def test_processor(self):
|
||||||
feature_extractor = self.get_feature_extractor()
|
image_processor = self.get_image_processor()
|
||||||
tokenizer = self.get_tokenizer()
|
tokenizer = self.get_tokenizer()
|
||||||
|
|
||||||
processor = FlavaProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
|
processor = FlavaProcessor(tokenizer=tokenizer, image_processor=image_processor)
|
||||||
|
|
||||||
input_str = "lower newer"
|
input_str = "lower newer"
|
||||||
image_input = self.prepare_image_inputs()
|
image_input = self.prepare_image_inputs()
|
||||||
@@ -220,10 +220,10 @@ class FlavaProcessorTest(unittest.TestCase):
|
|||||||
processor()
|
processor()
|
||||||
|
|
||||||
def test_tokenizer_decode(self):
|
def test_tokenizer_decode(self):
|
||||||
feature_extractor = self.get_feature_extractor()
|
image_processor = self.get_image_processor()
|
||||||
tokenizer = self.get_tokenizer()
|
tokenizer = self.get_tokenizer()
|
||||||
|
|
||||||
processor = FlavaProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
|
processor = FlavaProcessor(tokenizer=tokenizer, image_processor=image_processor)
|
||||||
|
|
||||||
predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]]
|
predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]]
|
||||||
|
|
||||||
@@ -233,10 +233,10 @@ class FlavaProcessorTest(unittest.TestCase):
|
|||||||
self.assertListEqual(decoded_tok, decoded_processor)
|
self.assertListEqual(decoded_tok, decoded_processor)
|
||||||
|
|
||||||
def test_model_input_names(self):
|
def test_model_input_names(self):
|
||||||
feature_extractor = self.get_feature_extractor()
|
image_processor = self.get_image_processor()
|
||||||
tokenizer = self.get_tokenizer()
|
tokenizer = self.get_tokenizer()
|
||||||
|
|
||||||
processor = FlavaProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
|
processor = FlavaProcessor(tokenizer=tokenizer, image_processor=image_processor)
|
||||||
|
|
||||||
input_str = "lower newer"
|
input_str = "lower newer"
|
||||||
image_input = self.prepare_image_inputs()
|
image_input = self.prepare_image_inputs()
|
||||||
|
|||||||
@@ -31,7 +31,7 @@ from transformers.utils import FEATURE_EXTRACTOR_NAME, cached_property, is_pytes
|
|||||||
if is_pytesseract_available():
|
if is_pytesseract_available():
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
|
|
||||||
from transformers import LayoutLMv2FeatureExtractor, LayoutLMv2Processor
|
from transformers import LayoutLMv2ImageProcessor, LayoutLMv2Processor
|
||||||
|
|
||||||
|
|
||||||
@require_pytesseract
|
@require_pytesseract
|
||||||
@@ -59,7 +59,7 @@ class LayoutLMv2ProcessorTest(unittest.TestCase):
|
|||||||
"lowest",
|
"lowest",
|
||||||
]
|
]
|
||||||
|
|
||||||
feature_extractor_map = {
|
image_processor_map = {
|
||||||
"do_resize": True,
|
"do_resize": True,
|
||||||
"size": 224,
|
"size": 224,
|
||||||
"apply_ocr": True,
|
"apply_ocr": True,
|
||||||
@@ -69,9 +69,9 @@ class LayoutLMv2ProcessorTest(unittest.TestCase):
|
|||||||
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
|
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
|
||||||
with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
|
with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
|
||||||
vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
|
vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
|
||||||
self.feature_extraction_file = os.path.join(self.tmpdirname, FEATURE_EXTRACTOR_NAME)
|
self.image_processing_file = os.path.join(self.tmpdirname, FEATURE_EXTRACTOR_NAME)
|
||||||
with open(self.feature_extraction_file, "w", encoding="utf-8") as fp:
|
with open(self.image_processing_file, "w", encoding="utf-8") as fp:
|
||||||
fp.write(json.dumps(feature_extractor_map) + "\n")
|
fp.write(json.dumps(image_processor_map) + "\n")
|
||||||
|
|
||||||
def get_tokenizer(self, **kwargs) -> PreTrainedTokenizer:
|
def get_tokenizer(self, **kwargs) -> PreTrainedTokenizer:
|
||||||
return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)
|
return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)
|
||||||
@@ -82,8 +82,8 @@ class LayoutLMv2ProcessorTest(unittest.TestCase):
|
|||||||
def get_tokenizers(self, **kwargs) -> List[PreTrainedTokenizerBase]:
|
def get_tokenizers(self, **kwargs) -> List[PreTrainedTokenizerBase]:
|
||||||
return [self.get_tokenizer(**kwargs), self.get_rust_tokenizer(**kwargs)]
|
return [self.get_tokenizer(**kwargs), self.get_rust_tokenizer(**kwargs)]
|
||||||
|
|
||||||
def get_feature_extractor(self, **kwargs):
|
def get_image_processor(self, **kwargs):
|
||||||
return LayoutLMv2FeatureExtractor.from_pretrained(self.tmpdirname, **kwargs)
|
return LayoutLMv2ImageProcessor.from_pretrained(self.tmpdirname, **kwargs)
|
||||||
|
|
||||||
def tearDown(self):
|
def tearDown(self):
|
||||||
shutil.rmtree(self.tmpdirname)
|
shutil.rmtree(self.tmpdirname)
|
||||||
@@ -100,10 +100,10 @@ class LayoutLMv2ProcessorTest(unittest.TestCase):
|
|||||||
return image_inputs
|
return image_inputs
|
||||||
|
|
||||||
def test_save_load_pretrained_default(self):
|
def test_save_load_pretrained_default(self):
|
||||||
feature_extractor = self.get_feature_extractor()
|
image_processor = self.get_image_processor()
|
||||||
tokenizers = self.get_tokenizers()
|
tokenizers = self.get_tokenizers()
|
||||||
for tokenizer in tokenizers:
|
for tokenizer in tokenizers:
|
||||||
processor = LayoutLMv2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
|
processor = LayoutLMv2Processor(image_processor=image_processor, tokenizer=tokenizer)
|
||||||
|
|
||||||
processor.save_pretrained(self.tmpdirname)
|
processor.save_pretrained(self.tmpdirname)
|
||||||
processor = LayoutLMv2Processor.from_pretrained(self.tmpdirname)
|
processor = LayoutLMv2Processor.from_pretrained(self.tmpdirname)
|
||||||
@@ -111,16 +111,16 @@ class LayoutLMv2ProcessorTest(unittest.TestCase):
|
|||||||
self.assertEqual(processor.tokenizer.get_vocab(), tokenizer.get_vocab())
|
self.assertEqual(processor.tokenizer.get_vocab(), tokenizer.get_vocab())
|
||||||
self.assertIsInstance(processor.tokenizer, (LayoutLMv2Tokenizer, LayoutLMv2TokenizerFast))
|
self.assertIsInstance(processor.tokenizer, (LayoutLMv2Tokenizer, LayoutLMv2TokenizerFast))
|
||||||
|
|
||||||
self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor.to_json_string())
|
self.assertEqual(processor.image_processor.to_json_string(), image_processor.to_json_string())
|
||||||
self.assertIsInstance(processor.feature_extractor, LayoutLMv2FeatureExtractor)
|
self.assertIsInstance(processor.image_processor, LayoutLMv2ImageProcessor)
|
||||||
|
|
||||||
def test_save_load_pretrained_additional_features(self):
|
def test_save_load_pretrained_additional_features(self):
|
||||||
processor = LayoutLMv2Processor(feature_extractor=self.get_feature_extractor(), tokenizer=self.get_tokenizer())
|
processor = LayoutLMv2Processor(image_processor=self.get_image_processor(), tokenizer=self.get_tokenizer())
|
||||||
processor.save_pretrained(self.tmpdirname)
|
processor.save_pretrained(self.tmpdirname)
|
||||||
|
|
||||||
# slow tokenizer
|
# slow tokenizer
|
||||||
tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)")
|
tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)")
|
||||||
feature_extractor_add_kwargs = self.get_feature_extractor(do_resize=False, size=30)
|
image_processor_add_kwargs = self.get_image_processor(do_resize=False, size=30)
|
||||||
|
|
||||||
processor = LayoutLMv2Processor.from_pretrained(
|
processor = LayoutLMv2Processor.from_pretrained(
|
||||||
self.tmpdirname, use_fast=False, bos_token="(BOS)", eos_token="(EOS)", do_resize=False, size=30
|
self.tmpdirname, use_fast=False, bos_token="(BOS)", eos_token="(EOS)", do_resize=False, size=30
|
||||||
@@ -129,12 +129,12 @@ class LayoutLMv2ProcessorTest(unittest.TestCase):
|
|||||||
self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab())
|
self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab())
|
||||||
self.assertIsInstance(processor.tokenizer, LayoutLMv2Tokenizer)
|
self.assertIsInstance(processor.tokenizer, LayoutLMv2Tokenizer)
|
||||||
|
|
||||||
self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor_add_kwargs.to_json_string())
|
self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string())
|
||||||
self.assertIsInstance(processor.feature_extractor, LayoutLMv2FeatureExtractor)
|
self.assertIsInstance(processor.image_processor, LayoutLMv2ImageProcessor)
|
||||||
|
|
||||||
# fast tokenizer
|
# fast tokenizer
|
||||||
tokenizer_add_kwargs = self.get_rust_tokenizer(bos_token="(BOS)", eos_token="(EOS)")
|
tokenizer_add_kwargs = self.get_rust_tokenizer(bos_token="(BOS)", eos_token="(EOS)")
|
||||||
feature_extractor_add_kwargs = self.get_feature_extractor(do_resize=False, size=30)
|
image_processor_add_kwargs = self.get_image_processor(do_resize=False, size=30)
|
||||||
|
|
||||||
processor = LayoutLMv2Processor.from_pretrained(
|
processor = LayoutLMv2Processor.from_pretrained(
|
||||||
self.tmpdirname, bos_token="(BOS)", eos_token="(EOS)", do_resize=False, size=30
|
self.tmpdirname, bos_token="(BOS)", eos_token="(EOS)", do_resize=False, size=30
|
||||||
@@ -143,14 +143,14 @@ class LayoutLMv2ProcessorTest(unittest.TestCase):
|
|||||||
self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab())
|
self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab())
|
||||||
self.assertIsInstance(processor.tokenizer, LayoutLMv2TokenizerFast)
|
self.assertIsInstance(processor.tokenizer, LayoutLMv2TokenizerFast)
|
||||||
|
|
||||||
self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor_add_kwargs.to_json_string())
|
self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string())
|
||||||
self.assertIsInstance(processor.feature_extractor, LayoutLMv2FeatureExtractor)
|
self.assertIsInstance(processor.image_processor, LayoutLMv2ImageProcessor)
|
||||||
|
|
||||||
def test_model_input_names(self):
|
def test_model_input_names(self):
|
||||||
feature_extractor = self.get_feature_extractor()
|
image_processor = self.get_image_processor()
|
||||||
tokenizer = self.get_tokenizer()
|
tokenizer = self.get_tokenizer()
|
||||||
|
|
||||||
processor = LayoutLMv2Processor(tokenizer=tokenizer, feature_extractor=feature_extractor)
|
processor = LayoutLMv2Processor(tokenizer=tokenizer, image_processor=image_processor)
|
||||||
|
|
||||||
input_str = "lower newer"
|
input_str = "lower newer"
|
||||||
image_input = self.prepare_image_inputs()
|
image_input = self.prepare_image_inputs()
|
||||||
@@ -220,15 +220,15 @@ class LayoutLMv2ProcessorIntegrationTests(unittest.TestCase):
|
|||||||
def test_processor_case_1(self):
|
def test_processor_case_1(self):
|
||||||
# case 1: document image classification (training, inference) + token classification (inference), apply_ocr = True
|
# case 1: document image classification (training, inference) + token classification (inference), apply_ocr = True
|
||||||
|
|
||||||
feature_extractor = LayoutLMv2FeatureExtractor()
|
image_processor = LayoutLMv2ImageProcessor()
|
||||||
tokenizers = self.get_tokenizers
|
tokenizers = self.get_tokenizers
|
||||||
images = self.get_images
|
images = self.get_images
|
||||||
|
|
||||||
for tokenizer in tokenizers:
|
for tokenizer in tokenizers:
|
||||||
processor = LayoutLMv2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
|
processor = LayoutLMv2Processor(image_processor=image_processor, tokenizer=tokenizer)
|
||||||
|
|
||||||
# not batched
|
# not batched
|
||||||
input_feat_extract = feature_extractor(images[0], return_tensors="pt")
|
input_image_proc = image_processor(images[0], return_tensors="pt")
|
||||||
input_processor = processor(images[0], return_tensors="pt")
|
input_processor = processor(images[0], return_tensors="pt")
|
||||||
|
|
||||||
# verify keys
|
# verify keys
|
||||||
@@ -237,9 +237,7 @@ class LayoutLMv2ProcessorIntegrationTests(unittest.TestCase):
|
|||||||
self.assertListEqual(actual_keys, expected_keys)
|
self.assertListEqual(actual_keys, expected_keys)
|
||||||
|
|
||||||
# verify image
|
# verify image
|
||||||
self.assertAlmostEqual(
|
self.assertAlmostEqual(input_image_proc["pixel_values"].sum(), input_processor["image"].sum(), delta=1e-2)
|
||||||
input_feat_extract["pixel_values"].sum(), input_processor["image"].sum(), delta=1e-2
|
|
||||||
)
|
|
||||||
|
|
||||||
# verify input_ids
|
# verify input_ids
|
||||||
# this was obtained with Tesseract 4.1.1
|
# this was obtained with Tesseract 4.1.1
|
||||||
@@ -250,7 +248,7 @@ class LayoutLMv2ProcessorIntegrationTests(unittest.TestCase):
|
|||||||
self.assertSequenceEqual(decoding, expected_decoding)
|
self.assertSequenceEqual(decoding, expected_decoding)
|
||||||
|
|
||||||
# batched
|
# batched
|
||||||
input_feat_extract = feature_extractor(images, return_tensors="pt")
|
input_image_proc = image_processor(images, return_tensors="pt")
|
||||||
input_processor = processor(images, padding=True, return_tensors="pt")
|
input_processor = processor(images, padding=True, return_tensors="pt")
|
||||||
|
|
||||||
# verify keys
|
# verify keys
|
||||||
@@ -259,9 +257,7 @@ class LayoutLMv2ProcessorIntegrationTests(unittest.TestCase):
|
|||||||
self.assertListEqual(actual_keys, expected_keys)
|
self.assertListEqual(actual_keys, expected_keys)
|
||||||
|
|
||||||
# verify images
|
# verify images
|
||||||
self.assertAlmostEqual(
|
self.assertAlmostEqual(input_image_proc["pixel_values"].sum(), input_processor["image"].sum(), delta=1e-2)
|
||||||
input_feat_extract["pixel_values"].sum(), input_processor["image"].sum(), delta=1e-2
|
|
||||||
)
|
|
||||||
|
|
||||||
# verify input_ids
|
# verify input_ids
|
||||||
# this was obtained with Tesseract 4.1.1
|
# this was obtained with Tesseract 4.1.1
|
||||||
@@ -275,12 +271,12 @@ class LayoutLMv2ProcessorIntegrationTests(unittest.TestCase):
|
|||||||
def test_processor_case_2(self):
|
def test_processor_case_2(self):
|
||||||
# case 2: document image classification (training, inference) + token classification (inference), apply_ocr=False
|
# case 2: document image classification (training, inference) + token classification (inference), apply_ocr=False
|
||||||
|
|
||||||
feature_extractor = LayoutLMv2FeatureExtractor(apply_ocr=False)
|
image_processor = LayoutLMv2ImageProcessor(apply_ocr=False)
|
||||||
tokenizers = self.get_tokenizers
|
tokenizers = self.get_tokenizers
|
||||||
images = self.get_images
|
images = self.get_images
|
||||||
|
|
||||||
for tokenizer in tokenizers:
|
for tokenizer in tokenizers:
|
||||||
processor = LayoutLMv2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
|
processor = LayoutLMv2Processor(image_processor=image_processor, tokenizer=tokenizer)
|
||||||
|
|
||||||
# not batched
|
# not batched
|
||||||
words = ["hello", "world"]
|
words = ["hello", "world"]
|
||||||
@@ -329,12 +325,12 @@ class LayoutLMv2ProcessorIntegrationTests(unittest.TestCase):
|
|||||||
def test_processor_case_3(self):
|
def test_processor_case_3(self):
|
||||||
# case 3: token classification (training), apply_ocr=False
|
# case 3: token classification (training), apply_ocr=False
|
||||||
|
|
||||||
feature_extractor = LayoutLMv2FeatureExtractor(apply_ocr=False)
|
image_processor = LayoutLMv2ImageProcessor(apply_ocr=False)
|
||||||
tokenizers = self.get_tokenizers
|
tokenizers = self.get_tokenizers
|
||||||
images = self.get_images
|
images = self.get_images
|
||||||
|
|
||||||
for tokenizer in tokenizers:
|
for tokenizer in tokenizers:
|
||||||
processor = LayoutLMv2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
|
processor = LayoutLMv2Processor(image_processor=image_processor, tokenizer=tokenizer)
|
||||||
|
|
||||||
# not batched
|
# not batched
|
||||||
words = ["weirdly", "world"]
|
words = ["weirdly", "world"]
|
||||||
@@ -394,12 +390,12 @@ class LayoutLMv2ProcessorIntegrationTests(unittest.TestCase):
|
|||||||
def test_processor_case_4(self):
|
def test_processor_case_4(self):
|
||||||
# case 4: visual question answering (inference), apply_ocr=True
|
# case 4: visual question answering (inference), apply_ocr=True
|
||||||
|
|
||||||
feature_extractor = LayoutLMv2FeatureExtractor()
|
image_processor = LayoutLMv2ImageProcessor()
|
||||||
tokenizers = self.get_tokenizers
|
tokenizers = self.get_tokenizers
|
||||||
images = self.get_images
|
images = self.get_images
|
||||||
|
|
||||||
for tokenizer in tokenizers:
|
for tokenizer in tokenizers:
|
||||||
processor = LayoutLMv2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
|
processor = LayoutLMv2Processor(image_processor=image_processor, tokenizer=tokenizer)
|
||||||
|
|
||||||
# not batched
|
# not batched
|
||||||
question = "What's his name?"
|
question = "What's his name?"
|
||||||
@@ -445,12 +441,12 @@ class LayoutLMv2ProcessorIntegrationTests(unittest.TestCase):
|
|||||||
def test_processor_case_5(self):
|
def test_processor_case_5(self):
|
||||||
# case 5: visual question answering (inference), apply_ocr=False
|
# case 5: visual question answering (inference), apply_ocr=False
|
||||||
|
|
||||||
feature_extractor = LayoutLMv2FeatureExtractor(apply_ocr=False)
|
image_processor = LayoutLMv2ImageProcessor(apply_ocr=False)
|
||||||
tokenizers = self.get_tokenizers
|
tokenizers = self.get_tokenizers
|
||||||
images = self.get_images
|
images = self.get_images
|
||||||
|
|
||||||
for tokenizer in tokenizers:
|
for tokenizer in tokenizers:
|
||||||
processor = LayoutLMv2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
|
processor = LayoutLMv2Processor(image_processor=image_processor, tokenizer=tokenizer)
|
||||||
|
|
||||||
# not batched
|
# not batched
|
||||||
question = "What's his name?"
|
question = "What's his name?"
|
||||||
|
|||||||
@@ -31,7 +31,7 @@ from transformers.utils import FEATURE_EXTRACTOR_NAME, cached_property, is_pytes
|
|||||||
if is_pytesseract_available():
|
if is_pytesseract_available():
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
|
|
||||||
from transformers import LayoutLMv3FeatureExtractor, LayoutLMv3Processor
|
from transformers import LayoutLMv3ImageProcessor, LayoutLMv3Processor
|
||||||
|
|
||||||
|
|
||||||
@require_pytesseract
|
@require_pytesseract
|
||||||
@@ -76,7 +76,7 @@ class LayoutLMv3ProcessorTest(unittest.TestCase):
|
|||||||
with open(self.merges_file, "w", encoding="utf-8") as fp:
|
with open(self.merges_file, "w", encoding="utf-8") as fp:
|
||||||
fp.write("\n".join(merges))
|
fp.write("\n".join(merges))
|
||||||
|
|
||||||
feature_extractor_map = {
|
image_processor_map = {
|
||||||
"do_resize": True,
|
"do_resize": True,
|
||||||
"size": 224,
|
"size": 224,
|
||||||
"apply_ocr": True,
|
"apply_ocr": True,
|
||||||
@@ -84,7 +84,7 @@ class LayoutLMv3ProcessorTest(unittest.TestCase):
|
|||||||
|
|
||||||
self.feature_extraction_file = os.path.join(self.tmpdirname, FEATURE_EXTRACTOR_NAME)
|
self.feature_extraction_file = os.path.join(self.tmpdirname, FEATURE_EXTRACTOR_NAME)
|
||||||
with open(self.feature_extraction_file, "w", encoding="utf-8") as fp:
|
with open(self.feature_extraction_file, "w", encoding="utf-8") as fp:
|
||||||
fp.write(json.dumps(feature_extractor_map) + "\n")
|
fp.write(json.dumps(image_processor_map) + "\n")
|
||||||
|
|
||||||
def get_tokenizer(self, **kwargs) -> PreTrainedTokenizer:
|
def get_tokenizer(self, **kwargs) -> PreTrainedTokenizer:
|
||||||
return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)
|
return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)
|
||||||
@@ -95,8 +95,8 @@ class LayoutLMv3ProcessorTest(unittest.TestCase):
|
|||||||
def get_tokenizers(self, **kwargs) -> List[PreTrainedTokenizerBase]:
|
def get_tokenizers(self, **kwargs) -> List[PreTrainedTokenizerBase]:
|
||||||
return [self.get_tokenizer(**kwargs), self.get_rust_tokenizer(**kwargs)]
|
return [self.get_tokenizer(**kwargs), self.get_rust_tokenizer(**kwargs)]
|
||||||
|
|
||||||
def get_feature_extractor(self, **kwargs):
|
def get_image_processor(self, **kwargs):
|
||||||
return LayoutLMv3FeatureExtractor.from_pretrained(self.tmpdirname, **kwargs)
|
return LayoutLMv3ImageProcessor.from_pretrained(self.tmpdirname, **kwargs)
|
||||||
|
|
||||||
def tearDown(self):
|
def tearDown(self):
|
||||||
shutil.rmtree(self.tmpdirname)
|
shutil.rmtree(self.tmpdirname)
|
||||||
@@ -113,10 +113,10 @@ class LayoutLMv3ProcessorTest(unittest.TestCase):
|
|||||||
return image_inputs
|
return image_inputs
|
||||||
|
|
||||||
def test_save_load_pretrained_default(self):
|
def test_save_load_pretrained_default(self):
|
||||||
feature_extractor = self.get_feature_extractor()
|
image_processor = self.get_image_processor()
|
||||||
tokenizers = self.get_tokenizers()
|
tokenizers = self.get_tokenizers()
|
||||||
for tokenizer in tokenizers:
|
for tokenizer in tokenizers:
|
||||||
processor = LayoutLMv3Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
|
processor = LayoutLMv3Processor(image_processor=image_processor, tokenizer=tokenizer)
|
||||||
|
|
||||||
processor.save_pretrained(self.tmpdirname)
|
processor.save_pretrained(self.tmpdirname)
|
||||||
processor = LayoutLMv3Processor.from_pretrained(self.tmpdirname)
|
processor = LayoutLMv3Processor.from_pretrained(self.tmpdirname)
|
||||||
@@ -124,16 +124,16 @@ class LayoutLMv3ProcessorTest(unittest.TestCase):
|
|||||||
self.assertEqual(processor.tokenizer.get_vocab(), tokenizer.get_vocab())
|
self.assertEqual(processor.tokenizer.get_vocab(), tokenizer.get_vocab())
|
||||||
self.assertIsInstance(processor.tokenizer, (LayoutLMv3Tokenizer, LayoutLMv3TokenizerFast))
|
self.assertIsInstance(processor.tokenizer, (LayoutLMv3Tokenizer, LayoutLMv3TokenizerFast))
|
||||||
|
|
||||||
self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor.to_json_string())
|
self.assertEqual(processor.image_processor.to_json_string(), image_processor.to_json_string())
|
||||||
self.assertIsInstance(processor.feature_extractor, LayoutLMv3FeatureExtractor)
|
self.assertIsInstance(processor.image_processor, LayoutLMv3ImageProcessor)
|
||||||
|
|
||||||
def test_save_load_pretrained_additional_features(self):
|
def test_save_load_pretrained_additional_features(self):
|
||||||
processor = LayoutLMv3Processor(feature_extractor=self.get_feature_extractor(), tokenizer=self.get_tokenizer())
|
processor = LayoutLMv3Processor(image_processor=self.get_image_processor(), tokenizer=self.get_tokenizer())
|
||||||
processor.save_pretrained(self.tmpdirname)
|
processor.save_pretrained(self.tmpdirname)
|
||||||
|
|
||||||
# slow tokenizer
|
# slow tokenizer
|
||||||
tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)")
|
tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)")
|
||||||
feature_extractor_add_kwargs = self.get_feature_extractor(do_resize=False, size=30)
|
image_processor_add_kwargs = self.get_image_processor(do_resize=False, size=30)
|
||||||
|
|
||||||
processor = LayoutLMv3Processor.from_pretrained(
|
processor = LayoutLMv3Processor.from_pretrained(
|
||||||
self.tmpdirname, use_fast=False, bos_token="(BOS)", eos_token="(EOS)", do_resize=False, size=30
|
self.tmpdirname, use_fast=False, bos_token="(BOS)", eos_token="(EOS)", do_resize=False, size=30
|
||||||
@@ -142,12 +142,12 @@ class LayoutLMv3ProcessorTest(unittest.TestCase):
|
|||||||
self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab())
|
self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab())
|
||||||
self.assertIsInstance(processor.tokenizer, LayoutLMv3Tokenizer)
|
self.assertIsInstance(processor.tokenizer, LayoutLMv3Tokenizer)
|
||||||
|
|
||||||
self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor_add_kwargs.to_json_string())
|
self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string())
|
||||||
self.assertIsInstance(processor.feature_extractor, LayoutLMv3FeatureExtractor)
|
self.assertIsInstance(processor.image_processor, LayoutLMv3ImageProcessor)
|
||||||
|
|
||||||
# fast tokenizer
|
# fast tokenizer
|
||||||
tokenizer_add_kwargs = self.get_rust_tokenizer(bos_token="(BOS)", eos_token="(EOS)")
|
tokenizer_add_kwargs = self.get_rust_tokenizer(bos_token="(BOS)", eos_token="(EOS)")
|
||||||
feature_extractor_add_kwargs = self.get_feature_extractor(do_resize=False, size=30)
|
image_processor_add_kwargs = self.get_image_processor(do_resize=False, size=30)
|
||||||
|
|
||||||
processor = LayoutLMv3Processor.from_pretrained(
|
processor = LayoutLMv3Processor.from_pretrained(
|
||||||
self.tmpdirname, bos_token="(BOS)", eos_token="(EOS)", do_resize=False, size=30
|
self.tmpdirname, bos_token="(BOS)", eos_token="(EOS)", do_resize=False, size=30
|
||||||
@@ -156,14 +156,14 @@ class LayoutLMv3ProcessorTest(unittest.TestCase):
|
|||||||
self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab())
|
self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab())
|
||||||
self.assertIsInstance(processor.tokenizer, LayoutLMv3TokenizerFast)
|
self.assertIsInstance(processor.tokenizer, LayoutLMv3TokenizerFast)
|
||||||
|
|
||||||
self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor_add_kwargs.to_json_string())
|
self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string())
|
||||||
self.assertIsInstance(processor.feature_extractor, LayoutLMv3FeatureExtractor)
|
self.assertIsInstance(processor.image_processor, LayoutLMv3ImageProcessor)
|
||||||
|
|
||||||
def test_model_input_names(self):
|
def test_model_input_names(self):
|
||||||
feature_extractor = self.get_feature_extractor()
|
image_processor = self.get_image_processor()
|
||||||
tokenizer = self.get_tokenizer()
|
tokenizer = self.get_tokenizer()
|
||||||
|
|
||||||
processor = LayoutLMv3Processor(tokenizer=tokenizer, feature_extractor=feature_extractor)
|
processor = LayoutLMv3Processor(tokenizer=tokenizer, image_processor=image_processor)
|
||||||
|
|
||||||
input_str = "lower newer"
|
input_str = "lower newer"
|
||||||
image_input = self.prepare_image_inputs()
|
image_input = self.prepare_image_inputs()
|
||||||
@@ -200,15 +200,15 @@ class LayoutLMv3ProcessorIntegrationTests(unittest.TestCase):
|
|||||||
def test_processor_case_1(self):
|
def test_processor_case_1(self):
|
||||||
# case 1: document image classification (training, inference) + token classification (inference), apply_ocr = True
|
# case 1: document image classification (training, inference) + token classification (inference), apply_ocr = True
|
||||||
|
|
||||||
feature_extractor = LayoutLMv3FeatureExtractor()
|
image_processor = LayoutLMv3ImageProcessor()
|
||||||
tokenizers = self.get_tokenizers
|
tokenizers = self.get_tokenizers
|
||||||
images = self.get_images
|
images = self.get_images
|
||||||
|
|
||||||
for tokenizer in tokenizers:
|
for tokenizer in tokenizers:
|
||||||
processor = LayoutLMv3Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
|
processor = LayoutLMv3Processor(image_processor=image_processor, tokenizer=tokenizer)
|
||||||
|
|
||||||
# not batched
|
# not batched
|
||||||
input_feat_extract = feature_extractor(images[0], return_tensors="pt")
|
input_image_proc = image_processor(images[0], return_tensors="pt")
|
||||||
input_processor = processor(images[0], return_tensors="pt")
|
input_processor = processor(images[0], return_tensors="pt")
|
||||||
|
|
||||||
# verify keys
|
# verify keys
|
||||||
@@ -218,7 +218,7 @@ class LayoutLMv3ProcessorIntegrationTests(unittest.TestCase):
|
|||||||
|
|
||||||
# verify image
|
# verify image
|
||||||
self.assertAlmostEqual(
|
self.assertAlmostEqual(
|
||||||
input_feat_extract["pixel_values"].sum(), input_processor["pixel_values"].sum(), delta=1e-2
|
input_image_proc["pixel_values"].sum(), input_processor["pixel_values"].sum(), delta=1e-2
|
||||||
)
|
)
|
||||||
|
|
||||||
# verify input_ids
|
# verify input_ids
|
||||||
@@ -230,7 +230,7 @@ class LayoutLMv3ProcessorIntegrationTests(unittest.TestCase):
|
|||||||
self.assertSequenceEqual(decoding, expected_decoding)
|
self.assertSequenceEqual(decoding, expected_decoding)
|
||||||
|
|
||||||
# batched
|
# batched
|
||||||
input_feat_extract = feature_extractor(images, return_tensors="pt")
|
input_image_proc = image_processor(images, return_tensors="pt")
|
||||||
input_processor = processor(images, padding=True, return_tensors="pt")
|
input_processor = processor(images, padding=True, return_tensors="pt")
|
||||||
|
|
||||||
# verify keys
|
# verify keys
|
||||||
@@ -240,7 +240,7 @@ class LayoutLMv3ProcessorIntegrationTests(unittest.TestCase):
|
|||||||
|
|
||||||
# verify images
|
# verify images
|
||||||
self.assertAlmostEqual(
|
self.assertAlmostEqual(
|
||||||
input_feat_extract["pixel_values"].sum(), input_processor["pixel_values"].sum(), delta=1e-2
|
input_image_proc["pixel_values"].sum(), input_processor["pixel_values"].sum(), delta=1e-2
|
||||||
)
|
)
|
||||||
|
|
||||||
# verify input_ids
|
# verify input_ids
|
||||||
@@ -255,12 +255,12 @@ class LayoutLMv3ProcessorIntegrationTests(unittest.TestCase):
|
|||||||
def test_processor_case_2(self):
|
def test_processor_case_2(self):
|
||||||
# case 2: document image classification (training, inference) + token classification (inference), apply_ocr=False
|
# case 2: document image classification (training, inference) + token classification (inference), apply_ocr=False
|
||||||
|
|
||||||
feature_extractor = LayoutLMv3FeatureExtractor(apply_ocr=False)
|
image_processor = LayoutLMv3ImageProcessor(apply_ocr=False)
|
||||||
tokenizers = self.get_tokenizers
|
tokenizers = self.get_tokenizers
|
||||||
images = self.get_images
|
images = self.get_images
|
||||||
|
|
||||||
for tokenizer in tokenizers:
|
for tokenizer in tokenizers:
|
||||||
processor = LayoutLMv3Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
|
processor = LayoutLMv3Processor(image_processor=image_processor, tokenizer=tokenizer)
|
||||||
|
|
||||||
# not batched
|
# not batched
|
||||||
words = ["hello", "world"]
|
words = ["hello", "world"]
|
||||||
@@ -309,12 +309,12 @@ class LayoutLMv3ProcessorIntegrationTests(unittest.TestCase):
|
|||||||
def test_processor_case_3(self):
|
def test_processor_case_3(self):
|
||||||
# case 3: token classification (training), apply_ocr=False
|
# case 3: token classification (training), apply_ocr=False
|
||||||
|
|
||||||
feature_extractor = LayoutLMv3FeatureExtractor(apply_ocr=False)
|
image_processor = LayoutLMv3ImageProcessor(apply_ocr=False)
|
||||||
tokenizers = self.get_tokenizers
|
tokenizers = self.get_tokenizers
|
||||||
images = self.get_images
|
images = self.get_images
|
||||||
|
|
||||||
for tokenizer in tokenizers:
|
for tokenizer in tokenizers:
|
||||||
processor = LayoutLMv3Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
|
processor = LayoutLMv3Processor(image_processor=image_processor, tokenizer=tokenizer)
|
||||||
|
|
||||||
# not batched
|
# not batched
|
||||||
words = ["weirdly", "world"]
|
words = ["weirdly", "world"]
|
||||||
@@ -374,12 +374,12 @@ class LayoutLMv3ProcessorIntegrationTests(unittest.TestCase):
|
|||||||
def test_processor_case_4(self):
|
def test_processor_case_4(self):
|
||||||
# case 4: visual question answering (inference), apply_ocr=True
|
# case 4: visual question answering (inference), apply_ocr=True
|
||||||
|
|
||||||
feature_extractor = LayoutLMv3FeatureExtractor()
|
image_processor = LayoutLMv3ImageProcessor()
|
||||||
tokenizers = self.get_tokenizers
|
tokenizers = self.get_tokenizers
|
||||||
images = self.get_images
|
images = self.get_images
|
||||||
|
|
||||||
for tokenizer in tokenizers:
|
for tokenizer in tokenizers:
|
||||||
processor = LayoutLMv3Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
|
processor = LayoutLMv3Processor(image_processor=image_processor, tokenizer=tokenizer)
|
||||||
|
|
||||||
# not batched
|
# not batched
|
||||||
question = "What's his name?"
|
question = "What's his name?"
|
||||||
@@ -425,12 +425,12 @@ class LayoutLMv3ProcessorIntegrationTests(unittest.TestCase):
|
|||||||
def test_processor_case_5(self):
|
def test_processor_case_5(self):
|
||||||
# case 5: visual question answering (inference), apply_ocr=False
|
# case 5: visual question answering (inference), apply_ocr=False
|
||||||
|
|
||||||
feature_extractor = LayoutLMv3FeatureExtractor(apply_ocr=False)
|
image_processor = LayoutLMv3ImageProcessor(apply_ocr=False)
|
||||||
tokenizers = self.get_tokenizers
|
tokenizers = self.get_tokenizers
|
||||||
images = self.get_images
|
images = self.get_images
|
||||||
|
|
||||||
for tokenizer in tokenizers:
|
for tokenizer in tokenizers:
|
||||||
processor = LayoutLMv3Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
|
processor = LayoutLMv3Processor(image_processor=image_processor, tokenizer=tokenizer)
|
||||||
|
|
||||||
# not batched
|
# not batched
|
||||||
question = "What's his name?"
|
question = "What's his name?"
|
||||||
|
|||||||
@@ -24,13 +24,13 @@ import pytest
|
|||||||
from transformers import CLIPTokenizer, CLIPTokenizerFast
|
from transformers import CLIPTokenizer, CLIPTokenizerFast
|
||||||
from transformers.models.clip.tokenization_clip import VOCAB_FILES_NAMES
|
from transformers.models.clip.tokenization_clip import VOCAB_FILES_NAMES
|
||||||
from transformers.testing_utils import require_vision
|
from transformers.testing_utils import require_vision
|
||||||
from transformers.utils import FEATURE_EXTRACTOR_NAME, is_vision_available
|
from transformers.utils import IMAGE_PROCESSOR_NAME, is_vision_available
|
||||||
|
|
||||||
|
|
||||||
if is_vision_available():
|
if is_vision_available():
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
|
|
||||||
from transformers import OwlViTFeatureExtractor, OwlViTProcessor
|
from transformers import OwlViTImageProcessor, OwlViTProcessor
|
||||||
|
|
||||||
|
|
||||||
@require_vision
|
@require_vision
|
||||||
@@ -52,7 +52,7 @@ class OwlViTProcessorTest(unittest.TestCase):
|
|||||||
with open(self.merges_file, "w", encoding="utf-8") as fp:
|
with open(self.merges_file, "w", encoding="utf-8") as fp:
|
||||||
fp.write("\n".join(merges))
|
fp.write("\n".join(merges))
|
||||||
|
|
||||||
feature_extractor_map = {
|
image_processor_map = {
|
||||||
"do_resize": True,
|
"do_resize": True,
|
||||||
"size": 20,
|
"size": 20,
|
||||||
"do_center_crop": True,
|
"do_center_crop": True,
|
||||||
@@ -61,9 +61,9 @@ class OwlViTProcessorTest(unittest.TestCase):
|
|||||||
"image_mean": [0.48145466, 0.4578275, 0.40821073],
|
"image_mean": [0.48145466, 0.4578275, 0.40821073],
|
||||||
"image_std": [0.26862954, 0.26130258, 0.27577711],
|
"image_std": [0.26862954, 0.26130258, 0.27577711],
|
||||||
}
|
}
|
||||||
self.feature_extractor_file = os.path.join(self.tmpdirname, FEATURE_EXTRACTOR_NAME)
|
self.image_processor_file = os.path.join(self.tmpdirname, IMAGE_PROCESSOR_NAME)
|
||||||
with open(self.feature_extractor_file, "w", encoding="utf-8") as fp:
|
with open(self.image_processor_file, "w", encoding="utf-8") as fp:
|
||||||
json.dump(feature_extractor_map, fp)
|
json.dump(image_processor_map, fp)
|
||||||
|
|
||||||
def get_tokenizer(self, **kwargs):
|
def get_tokenizer(self, **kwargs):
|
||||||
return CLIPTokenizer.from_pretrained(self.tmpdirname, pad_token="!", **kwargs)
|
return CLIPTokenizer.from_pretrained(self.tmpdirname, pad_token="!", **kwargs)
|
||||||
@@ -71,8 +71,8 @@ class OwlViTProcessorTest(unittest.TestCase):
|
|||||||
def get_rust_tokenizer(self, **kwargs):
|
def get_rust_tokenizer(self, **kwargs):
|
||||||
return CLIPTokenizerFast.from_pretrained(self.tmpdirname, pad_token="!", **kwargs)
|
return CLIPTokenizerFast.from_pretrained(self.tmpdirname, pad_token="!", **kwargs)
|
||||||
|
|
||||||
def get_feature_extractor(self, **kwargs):
|
def get_image_processor(self, **kwargs):
|
||||||
return OwlViTFeatureExtractor.from_pretrained(self.tmpdirname, **kwargs)
|
return OwlViTImageProcessor.from_pretrained(self.tmpdirname, **kwargs)
|
||||||
|
|
||||||
def tearDown(self):
|
def tearDown(self):
|
||||||
shutil.rmtree(self.tmpdirname)
|
shutil.rmtree(self.tmpdirname)
|
||||||
@@ -91,13 +91,13 @@ class OwlViTProcessorTest(unittest.TestCase):
|
|||||||
def test_save_load_pretrained_default(self):
|
def test_save_load_pretrained_default(self):
|
||||||
tokenizer_slow = self.get_tokenizer()
|
tokenizer_slow = self.get_tokenizer()
|
||||||
tokenizer_fast = self.get_rust_tokenizer()
|
tokenizer_fast = self.get_rust_tokenizer()
|
||||||
feature_extractor = self.get_feature_extractor()
|
image_processor = self.get_image_processor()
|
||||||
|
|
||||||
processor_slow = OwlViTProcessor(tokenizer=tokenizer_slow, feature_extractor=feature_extractor)
|
processor_slow = OwlViTProcessor(tokenizer=tokenizer_slow, image_processor=image_processor)
|
||||||
processor_slow.save_pretrained(self.tmpdirname)
|
processor_slow.save_pretrained(self.tmpdirname)
|
||||||
processor_slow = OwlViTProcessor.from_pretrained(self.tmpdirname, use_fast=False)
|
processor_slow = OwlViTProcessor.from_pretrained(self.tmpdirname, use_fast=False)
|
||||||
|
|
||||||
processor_fast = OwlViTProcessor(tokenizer=tokenizer_fast, feature_extractor=feature_extractor)
|
processor_fast = OwlViTProcessor(tokenizer=tokenizer_fast, image_processor=image_processor)
|
||||||
processor_fast.save_pretrained(self.tmpdirname)
|
processor_fast.save_pretrained(self.tmpdirname)
|
||||||
processor_fast = OwlViTProcessor.from_pretrained(self.tmpdirname)
|
processor_fast = OwlViTProcessor.from_pretrained(self.tmpdirname)
|
||||||
|
|
||||||
@@ -107,17 +107,17 @@ class OwlViTProcessorTest(unittest.TestCase):
|
|||||||
self.assertIsInstance(processor_slow.tokenizer, CLIPTokenizer)
|
self.assertIsInstance(processor_slow.tokenizer, CLIPTokenizer)
|
||||||
self.assertIsInstance(processor_fast.tokenizer, CLIPTokenizerFast)
|
self.assertIsInstance(processor_fast.tokenizer, CLIPTokenizerFast)
|
||||||
|
|
||||||
self.assertEqual(processor_slow.feature_extractor.to_json_string(), feature_extractor.to_json_string())
|
self.assertEqual(processor_slow.image_processor.to_json_string(), image_processor.to_json_string())
|
||||||
self.assertEqual(processor_fast.feature_extractor.to_json_string(), feature_extractor.to_json_string())
|
self.assertEqual(processor_fast.image_processor.to_json_string(), image_processor.to_json_string())
|
||||||
self.assertIsInstance(processor_slow.feature_extractor, OwlViTFeatureExtractor)
|
self.assertIsInstance(processor_slow.image_processor, OwlViTImageProcessor)
|
||||||
self.assertIsInstance(processor_fast.feature_extractor, OwlViTFeatureExtractor)
|
self.assertIsInstance(processor_fast.image_processor, OwlViTImageProcessor)
|
||||||
|
|
||||||
def test_save_load_pretrained_additional_features(self):
|
def test_save_load_pretrained_additional_features(self):
|
||||||
processor = OwlViTProcessor(tokenizer=self.get_tokenizer(), feature_extractor=self.get_feature_extractor())
|
processor = OwlViTProcessor(tokenizer=self.get_tokenizer(), image_processor=self.get_image_processor())
|
||||||
processor.save_pretrained(self.tmpdirname)
|
processor.save_pretrained(self.tmpdirname)
|
||||||
|
|
||||||
tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)")
|
tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)")
|
||||||
feature_extractor_add_kwargs = self.get_feature_extractor(do_normalize=False)
|
image_processor_add_kwargs = self.get_image_processor(do_normalize=False)
|
||||||
|
|
||||||
processor = OwlViTProcessor.from_pretrained(
|
processor = OwlViTProcessor.from_pretrained(
|
||||||
self.tmpdirname, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False
|
self.tmpdirname, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False
|
||||||
@@ -126,28 +126,28 @@ class OwlViTProcessorTest(unittest.TestCase):
|
|||||||
self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab())
|
self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab())
|
||||||
self.assertIsInstance(processor.tokenizer, CLIPTokenizerFast)
|
self.assertIsInstance(processor.tokenizer, CLIPTokenizerFast)
|
||||||
|
|
||||||
self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor_add_kwargs.to_json_string())
|
self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string())
|
||||||
self.assertIsInstance(processor.feature_extractor, OwlViTFeatureExtractor)
|
self.assertIsInstance(processor.image_processor, OwlViTImageProcessor)
|
||||||
|
|
||||||
def test_feature_extractor(self):
|
def test_image_processor(self):
|
||||||
feature_extractor = self.get_feature_extractor()
|
image_processor = self.get_image_processor()
|
||||||
tokenizer = self.get_tokenizer()
|
tokenizer = self.get_tokenizer()
|
||||||
|
|
||||||
processor = OwlViTProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
|
processor = OwlViTProcessor(tokenizer=tokenizer, image_processor=image_processor)
|
||||||
|
|
||||||
image_input = self.prepare_image_inputs()
|
image_input = self.prepare_image_inputs()
|
||||||
|
|
||||||
input_feat_extract = feature_extractor(image_input, return_tensors="np")
|
input_image_proc = image_processor(image_input, return_tensors="np")
|
||||||
input_processor = processor(images=image_input, return_tensors="np")
|
input_processor = processor(images=image_input, return_tensors="np")
|
||||||
|
|
||||||
for key in input_feat_extract.keys():
|
for key in input_image_proc.keys():
|
||||||
self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2)
|
self.assertAlmostEqual(input_image_proc[key].sum(), input_processor[key].sum(), delta=1e-2)
|
||||||
|
|
||||||
def test_tokenizer(self):
|
def test_tokenizer(self):
|
||||||
feature_extractor = self.get_feature_extractor()
|
image_processor = self.get_image_processor()
|
||||||
tokenizer = self.get_tokenizer()
|
tokenizer = self.get_tokenizer()
|
||||||
|
|
||||||
processor = OwlViTProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
|
processor = OwlViTProcessor(tokenizer=tokenizer, image_processor=image_processor)
|
||||||
|
|
||||||
input_str = "lower newer"
|
input_str = "lower newer"
|
||||||
|
|
||||||
@@ -159,10 +159,10 @@ class OwlViTProcessorTest(unittest.TestCase):
|
|||||||
self.assertListEqual(encoded_tok[key][0].tolist(), encoded_processor[key][0].tolist())
|
self.assertListEqual(encoded_tok[key][0].tolist(), encoded_processor[key][0].tolist())
|
||||||
|
|
||||||
def test_processor(self):
|
def test_processor(self):
|
||||||
feature_extractor = self.get_feature_extractor()
|
image_processor = self.get_image_processor()
|
||||||
tokenizer = self.get_tokenizer()
|
tokenizer = self.get_tokenizer()
|
||||||
|
|
||||||
processor = OwlViTProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
|
processor = OwlViTProcessor(tokenizer=tokenizer, image_processor=image_processor)
|
||||||
|
|
||||||
input_str = "lower newer"
|
input_str = "lower newer"
|
||||||
image_input = self.prepare_image_inputs()
|
image_input = self.prepare_image_inputs()
|
||||||
@@ -228,10 +228,10 @@ class OwlViTProcessorTest(unittest.TestCase):
|
|||||||
self.assertListEqual(list(input_ids[1]), predicted_ids[1])
|
self.assertListEqual(list(input_ids[1]), predicted_ids[1])
|
||||||
|
|
||||||
def test_processor_case2(self):
|
def test_processor_case2(self):
|
||||||
feature_extractor = self.get_feature_extractor()
|
image_processor = self.get_image_processor()
|
||||||
tokenizer = self.get_tokenizer()
|
tokenizer = self.get_tokenizer()
|
||||||
|
|
||||||
processor = OwlViTProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
|
processor = OwlViTProcessor(tokenizer=tokenizer, image_processor=image_processor)
|
||||||
|
|
||||||
image_input = self.prepare_image_inputs()
|
image_input = self.prepare_image_inputs()
|
||||||
query_input = self.prepare_image_inputs()
|
query_input = self.prepare_image_inputs()
|
||||||
@@ -245,10 +245,10 @@ class OwlViTProcessorTest(unittest.TestCase):
|
|||||||
processor()
|
processor()
|
||||||
|
|
||||||
def test_tokenizer_decode(self):
|
def test_tokenizer_decode(self):
|
||||||
feature_extractor = self.get_feature_extractor()
|
image_processor = self.get_image_processor()
|
||||||
tokenizer = self.get_tokenizer()
|
tokenizer = self.get_tokenizer()
|
||||||
|
|
||||||
processor = OwlViTProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
|
processor = OwlViTProcessor(tokenizer=tokenizer, image_processor=image_processor)
|
||||||
|
|
||||||
predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]]
|
predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]]
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user