From e16775d103e4d0c04e72ba92586b808524c86f2d Mon Sep 17 00:00:00 2001 From: "Vinh H. Pham" Date: Mon, 14 Apr 2025 20:06:41 +0700 Subject: [PATCH] Add Fast Image Processor for LayoutLMv2 (#37203) * add support layoutlmv2 * make style * Apply suggestions from code review Co-authored-by: Yoni Gozlan <74535834+yonigozlan@users.noreply.github.com> * add warning and clean up * make style * Update src/transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py Co-authored-by: Yoni Gozlan <74535834+yonigozlan@users.noreply.github.com> --------- Co-authored-by: Yoni Gozlan <74535834+yonigozlan@users.noreply.github.com> --- docs/source/en/model_doc/layoutlmv2.md | 5 + .../models/auto/image_processing_auto.py | 2 +- .../models/layoutlmv2/__init__.py | 1 + .../image_processing_layoutlmv2_fast.py | 164 ++++++++++++++++++ .../test_image_processing_layoutlmv2.py | 133 ++++++++++---- 5 files changed, 275 insertions(+), 30 deletions(-) create mode 100644 src/transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py diff --git a/docs/source/en/model_doc/layoutlmv2.md b/docs/source/en/model_doc/layoutlmv2.md index 7fc5ae3619..af20687571 100644 --- a/docs/source/en/model_doc/layoutlmv2.md +++ b/docs/source/en/model_doc/layoutlmv2.md @@ -310,6 +310,11 @@ print(encoding.keys()) [[autodoc]] LayoutLMv2ImageProcessor - preprocess +## LayoutLMv2ImageProcessorFast + +[[autodoc]] LayoutLMv2ImageProcessorFast + - preprocess + ## LayoutLMv2Tokenizer [[autodoc]] LayoutLMv2Tokenizer diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py index 5e8232d941..7e8ef1c750 100644 --- a/src/transformers/models/auto/image_processing_auto.py +++ b/src/transformers/models/auto/image_processing_auto.py @@ -102,7 +102,7 @@ else: ("instructblip", ("BlipImageProcessor", "BlipImageProcessorFast")), ("instructblipvideo", ("InstructBlipVideoImageProcessor",)), ("kosmos-2", ("CLIPImageProcessor", "CLIPImageProcessorFast")), - ("layoutlmv2", ("LayoutLMv2ImageProcessor",)), + ("layoutlmv2", ("LayoutLMv2ImageProcessor", "LayoutLMv2ImageProcessorFast")), ("layoutlmv3", ("LayoutLMv3ImageProcessor",)), ("levit", ("LevitImageProcessor",)), ("llama4", ("Llama4ImageProcessor", "Llama4ImageProcessorFast")), diff --git a/src/transformers/models/layoutlmv2/__init__.py b/src/transformers/models/layoutlmv2/__init__.py index 5b529b9b5a..b68a523c0b 100644 --- a/src/transformers/models/layoutlmv2/__init__.py +++ b/src/transformers/models/layoutlmv2/__init__.py @@ -21,6 +21,7 @@ if TYPE_CHECKING: from .configuration_layoutlmv2 import * from .feature_extraction_layoutlmv2 import * from .image_processing_layoutlmv2 import * + from .image_processing_layoutlmv2_fast import * from .modeling_layoutlmv2 import * from .processing_layoutlmv2 import * from .tokenization_layoutlmv2 import * diff --git a/src/transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py b/src/transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py new file mode 100644 index 0000000000..937ceed3a2 --- /dev/null +++ b/src/transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py @@ -0,0 +1,164 @@ +# coding=utf-8 +# Copyright 2025 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Fast Image processor class for LayoutLMv2.""" + +from typing import Optional, Union + +from ...image_processing_utils_fast import ( + BASE_IMAGE_PROCESSOR_FAST_DOCSTRING, + BASE_IMAGE_PROCESSOR_FAST_DOCSTRING_PREPROCESS, + BaseImageProcessorFast, + BatchFeature, + DefaultFastImageProcessorKwargs, +) +from ...image_transforms import ChannelDimension, group_images_by_shape, reorder_images +from ...image_utils import ImageInput, PILImageResampling, SizeDict +from ...processing_utils import Unpack +from ...utils import ( + TensorType, + add_start_docstrings, + is_torch_available, + is_torchvision_available, + is_torchvision_v2_available, + logging, + requires_backends, +) +from .image_processing_layoutlmv2 import apply_tesseract + + +logger = logging.get_logger(__name__) + +if is_torch_available(): + import torch + +if is_torchvision_available(): + if is_torchvision_v2_available(): + from torchvision.transforms.v2 import functional as F + else: + from torchvision.transforms import functional as F + + +class LayoutLMv2FastImageProcessorKwargs(DefaultFastImageProcessorKwargs): + apply_ocr: Optional[bool] + ocr_lang: Optional[str] + tesseract_config: Optional[str] + + +@add_start_docstrings( + "Constructs a fast LayoutLMv2 image processor.", + BASE_IMAGE_PROCESSOR_FAST_DOCSTRING, + """ + apply_ocr (`bool`, *optional*, defaults to `True`): + Whether to apply the Tesseract OCR engine to get words + normalized bounding boxes. Can be overridden by + the `apply_ocr` parameter in the `preprocess` method. + ocr_lang (`str`, *optional*): + The language, specified by its ISO code, to be used by the Tesseract OCR engine. By default, English is + used. Can be overridden by the `ocr_lang` parameter in the `preprocess` method. + tesseract_config (`str`, *optional*): + Any additional custom configuration flags that are forwarded to the `config` parameter when calling + Tesseract. For example: '--psm 6'. Can be overridden by the `tesseract_config` parameter in the + `preprocess` method. + """, +) +class LayoutLMv2ImageProcessorFast(BaseImageProcessorFast): + resample = PILImageResampling.BILINEAR + size = {"height": 224, "width": 224} + rescale_factor = None + do_resize = True + apply_ocr = True + ocr_lang = None + tesseract_config = "" + valid_kwargs = LayoutLMv2FastImageProcessorKwargs + + def __init__(self, **kwargs: Unpack[LayoutLMv2FastImageProcessorKwargs]): + super().__init__(**kwargs) + + @add_start_docstrings( + BASE_IMAGE_PROCESSOR_FAST_DOCSTRING_PREPROCESS, + """ + apply_ocr (`bool`, *optional*, defaults to `True`): + Whether to apply the Tesseract OCR engine to get words + normalized bounding boxes. Can be overridden by + the `apply_ocr` parameter in the `preprocess` method. + ocr_lang (`str`, *optional*): + The language, specified by its ISO code, to be used by the Tesseract OCR engine. By default, English is + used. Can be overridden by the `ocr_lang` parameter in the `preprocess` method. + tesseract_config (`str`, *optional*): + Any additional custom configuration flags that are forwarded to the `config` parameter when calling + Tesseract. For example: '--psm 6'. Can be overridden by the `tesseract_config` parameter in the + `preprocess` method. + """, + ) + def preprocess(self, images: ImageInput, **kwargs: Unpack[LayoutLMv2FastImageProcessorKwargs]) -> BatchFeature: + return super().preprocess(images, **kwargs) + + def _preprocess( + self, + images: list["torch.Tensor"], + do_resize: bool, + size: SizeDict, + interpolation: Optional["F.InterpolationMode"], + apply_ocr: bool, + ocr_lang: Optional[str], + tesseract_config: Optional[str], + return_tensors: Optional[Union[str, TensorType]], + **kwargs, + ) -> BatchFeature: + # Tesseract OCR to get words + normalized bounding boxes + if apply_ocr: + requires_backends(self, "pytesseract") + words_batch = [] + boxes_batch = [] + for image in images: + if image.is_cuda: + logger.warning_once( + "apply_ocr can only be performed on cpu. Tensors will be transferred to cpu before processing." + ) + words, boxes = apply_tesseract( + image.cpu(), ocr_lang, tesseract_config, input_data_format=ChannelDimension.FIRST + ) + words_batch.append(words) + boxes_batch.append(boxes) + + # Group images by size for batched resizing + grouped_images, grouped_images_index = group_images_by_shape(images) + resized_images_grouped = {} + for shape, stacked_images in grouped_images.items(): + if do_resize: + stacked_images = self.resize(image=stacked_images, size=size, interpolation=interpolation) + resized_images_grouped[shape] = stacked_images + resized_images = reorder_images(resized_images_grouped, grouped_images_index) + + # Group images by size for further processing + # Needed in case do_resize is False, or resize returns images with different sizes + grouped_images, grouped_images_index = group_images_by_shape(resized_images) + processed_images_grouped = {} + for shape, stacked_images in grouped_images.items(): + # flip color channels from RGB to BGR (as Detectron2 requires this) + stacked_images = stacked_images.flip(1) + processed_images_grouped[shape] = stacked_images + + processed_images = reorder_images(processed_images_grouped, grouped_images_index) + processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images + + data = BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors) + + if apply_ocr: + data["words"] = words_batch + data["boxes"] = boxes_batch + + return data + + +__all__ = ["LayoutLMv2ImageProcessorFast"] diff --git a/tests/models/layoutlmv2/test_image_processing_layoutlmv2.py b/tests/models/layoutlmv2/test_image_processing_layoutlmv2.py index 8bccb33217..01d0f45398 100644 --- a/tests/models/layoutlmv2/test_image_processing_layoutlmv2.py +++ b/tests/models/layoutlmv2/test_image_processing_layoutlmv2.py @@ -12,20 +12,27 @@ # See the License for the specific language governing permissions and # limitations under the License. - import unittest -from transformers.testing_utils import require_pytesseract, require_torch -from transformers.utils import is_pytesseract_available +import requests + +from transformers.testing_utils import require_pytesseract, require_torch, require_vision +from transformers.utils import is_pytesseract_available, is_torch_available, is_torchvision_available from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs +if is_torch_available(): + import torch + if is_pytesseract_available(): from PIL import Image from transformers import LayoutLMv2ImageProcessor + if is_torchvision_available(): + from transformers import LayoutLMv2ImageProcessorFast + class LayoutLMv2ImageProcessingTester: def __init__( @@ -73,6 +80,9 @@ class LayoutLMv2ImageProcessingTester: @require_pytesseract class LayoutLMv2ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): image_processing_class = LayoutLMv2ImageProcessor if is_pytesseract_available() else None + fast_image_processing_class = ( + LayoutLMv2ImageProcessorFast if (is_torchvision_available() and is_pytesseract_available()) else None + ) def setUp(self): super().setUp() @@ -83,46 +93,111 @@ class LayoutLMv2ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase) return self.image_processor_tester.prepare_image_processor_dict() def test_image_processor_properties(self): - image_processing = self.image_processing_class(**self.image_processor_dict) - self.assertTrue(hasattr(image_processing, "do_resize")) - self.assertTrue(hasattr(image_processing, "size")) - self.assertTrue(hasattr(image_processing, "apply_ocr")) + for image_processing_class in self.image_processor_list: + image_processing = image_processing_class(**self.image_processor_dict) + self.assertTrue(hasattr(image_processing, "do_resize")) + self.assertTrue(hasattr(image_processing, "size")) + self.assertTrue(hasattr(image_processing, "apply_ocr")) def test_image_processor_from_dict_with_kwargs(self): - image_processor = self.image_processing_class.from_dict(self.image_processor_dict) - self.assertEqual(image_processor.size, {"height": 18, "width": 18}) + for image_processing_class in self.image_processor_list: + image_processor = image_processing_class.from_dict(self.image_processor_dict) + self.assertEqual(image_processor.size, {"height": 18, "width": 18}) - image_processor = self.image_processing_class.from_dict(self.image_processor_dict, size=42) - self.assertEqual(image_processor.size, {"height": 42, "width": 42}) + image_processor = image_processing_class.from_dict(self.image_processor_dict, size=42) + self.assertEqual(image_processor.size, {"height": 42, "width": 42}) @unittest.skip(reason="Tesseract version is not correct in ci. @Arthur FIXME") def test_layoutlmv2_integration_test(self): - # with apply_OCR = True - image_processing = LayoutLMv2ImageProcessor() - from datasets import load_dataset ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test", trust_remote_code=True) - image = Image.open(ds[0]["file"]).convert("RGB") + for image_processing_class in self.image_processor_list: + # with apply_OCR = True + image_processing = image_processing_class() - encoding = image_processing(image, return_tensors="pt") + image = Image.open(ds[0]["file"]).convert("RGB") - self.assertEqual(encoding.pixel_values.shape, (1, 3, 224, 224)) - self.assertEqual(len(encoding.words), len(encoding.boxes)) + encoding = image_processing(image, return_tensors="pt") - # fmt: off - # the words and boxes were obtained with Tesseract 5.3.0 - expected_words = [['11:14', 'to', '11:39', 'a.m', '11:39', 'to', '11:44', 'a.m.', '11:44', 'a.m.', 'to', '12:25', 'p.m.', '12:25', 'to', '12:58', 'p.m.', '12:58', 'to', '4:00', 'p.m.', '2:00', 'to', '5:00', 'p.m.', 'Coffee', 'Break', 'Coffee', 'will', 'be', 'served', 'for', 'men', 'and', 'women', 'in', 'the', 'lobby', 'adjacent', 'to', 'exhibit', 'area.', 'Please', 'move', 'into', 'exhibit', 'area.', '(Exhibits', 'Open)', 'TRRF', 'GENERAL', 'SESSION', '(PART', '|)', 'Presiding:', 'Lee', 'A.', 'Waller', 'TRRF', 'Vice', 'President', '“Introductory', 'Remarks”', 'Lee', 'A.', 'Waller,', 'TRRF', 'Vice', 'Presi-', 'dent', 'Individual', 'Interviews', 'with', 'TRRF', 'Public', 'Board', 'Members', 'and', 'Sci-', 'entific', 'Advisory', 'Council', 'Mem-', 'bers', 'Conducted', 'by', 'TRRF', 'Treasurer', 'Philip', 'G.', 'Kuehn', 'to', 'get', 'answers', 'which', 'the', 'public', 'refrigerated', 'warehousing', 'industry', 'is', 'looking', 'for.', 'Plus', 'questions', 'from', 'the', 'floor.', 'Dr.', 'Emil', 'M.', 'Mrak,', 'University', 'of', 'Cal-', 'ifornia,', 'Chairman,', 'TRRF', 'Board;', 'Sam', 'R.', 'Cecil,', 'University', 'of', 'Georgia', 'College', 'of', 'Agriculture;', 'Dr.', 'Stanley', 'Charm,', 'Tufts', 'University', 'School', 'of', 'Medicine;', 'Dr.', 'Robert', 'H.', 'Cotton,', 'ITT', 'Continental', 'Baking', 'Company;', 'Dr.', 'Owen', 'Fennema,', 'University', 'of', 'Wis-', 'consin;', 'Dr.', 'Robert', 'E.', 'Hardenburg,', 'USDA.', 'Questions', 'and', 'Answers', 'Exhibits', 'Open', 'Capt.', 'Jack', 'Stoney', 'Room', 'TRRF', 'Scientific', 'Advisory', 'Council', 'Meeting', 'Ballroom', 'Foyer']] # noqa: E231 - expected_boxes = [[[141, 57, 210, 69], [228, 58, 252, 69], [141, 75, 216, 88], [230, 79, 280, 88], [142, 260, 218, 273], [230, 261, 255, 273], [143, 279, 218, 290], [231, 282, 290, 291], [143, 342, 218, 354], [231, 345, 289, 355], [202, 362, 227, 373], [143, 379, 220, 392], [231, 382, 291, 394], [144, 714, 220, 726], [231, 715, 256, 726], [144, 732, 220, 745], [232, 736, 291, 747], [144, 769, 218, 782], [231, 770, 256, 782], [141, 788, 202, 801], [215, 791, 274, 804], [143, 826, 204, 838], [215, 826, 240, 838], [142, 844, 202, 857], [215, 847, 274, 859], [334, 57, 427, 69], [440, 57, 522, 69], [369, 75, 461, 88], [469, 75, 516, 88], [528, 76, 562, 88], [570, 76, 667, 88], [675, 75, 711, 87], [721, 79, 778, 88], [789, 75, 840, 88], [369, 97, 470, 107], [484, 94, 507, 106], [518, 94, 562, 107], [576, 94, 655, 110], [668, 94, 792, 109], [804, 95, 829, 107], [369, 113, 465, 125], [477, 116, 547, 125], [562, 113, 658, 125], [671, 116, 748, 125], [761, 113, 811, 125], [369, 131, 465, 143], [477, 133, 548, 143], [563, 130, 698, 145], [710, 130, 802, 146], [336, 171, 412, 183], [423, 171, 572, 183], [582, 170, 716, 184], [728, 171, 817, 187], [829, 171, 844, 186], [338, 197, 482, 212], [507, 196, 557, 209], [569, 196, 595, 208], [610, 196, 702, 209], [505, 214, 583, 226], [595, 214, 656, 227], [670, 215, 807, 227], [335, 259, 543, 274], [556, 259, 708, 272], [372, 279, 422, 291], [435, 279, 460, 291], [474, 279, 574, 292], [587, 278, 664, 291], [676, 278, 738, 291], [751, 279, 834, 291], [372, 298, 434, 310], [335, 341, 483, 354], [497, 341, 655, 354], [667, 341, 728, 354], [740, 341, 825, 354], [335, 360, 430, 372], [442, 360, 534, 372], [545, 359, 687, 372], [697, 360, 754, 372], [765, 360, 823, 373], [334, 378, 428, 391], [440, 378, 577, 394], [590, 378, 705, 391], [720, 378, 801, 391], [334, 397, 400, 409], [370, 416, 529, 429], [544, 416, 576, 432], [587, 416, 665, 428], [677, 416, 814, 429], [372, 435, 452, 450], [465, 434, 495, 447], [511, 434, 600, 447], [611, 436, 637, 447], [649, 436, 694, 451], [705, 438, 824, 447], [369, 453, 452, 466], [464, 454, 509, 466], [522, 453, 611, 469], [625, 453, 792, 469], [370, 472, 556, 488], [570, 472, 684, 487], [697, 472, 718, 485], [732, 472, 835, 488], [369, 490, 411, 503], [425, 490, 484, 503], [496, 490, 635, 506], [645, 490, 707, 503], [718, 491, 761, 503], [771, 490, 840, 503], [336, 510, 374, 521], [388, 510, 447, 522], [460, 510, 489, 521], [503, 510, 580, 522], [592, 509, 736, 525], [745, 509, 770, 522], [781, 509, 840, 522], [338, 528, 434, 541], [448, 528, 596, 541], [609, 527, 687, 540], [700, 528, 792, 541], [336, 546, 397, 559], [407, 546, 431, 559], [443, 546, 525, 560], [537, 546, 680, 562], [695, 546, 714, 559], [722, 546, 837, 562], [336, 565, 449, 581], [461, 565, 485, 577], [497, 565, 665, 581], [681, 565, 718, 577], [732, 565, 837, 580], [337, 584, 438, 597], [452, 583, 521, 596], [535, 584, 677, 599], [690, 583, 787, 596], [801, 583, 825, 596], [338, 602, 478, 615], [492, 602, 530, 614], [543, 602, 638, 615], [650, 602, 676, 614], [688, 602, 788, 615], [802, 602, 843, 614], [337, 621, 502, 633], [516, 621, 615, 637], [629, 621, 774, 636], [789, 621, 827, 633], [337, 639, 418, 652], [432, 640, 571, 653], [587, 639, 731, 655], [743, 639, 769, 652], [780, 639, 841, 652], [338, 658, 440, 673], [455, 658, 491, 670], [508, 658, 602, 671], [616, 658, 638, 670], [654, 658, 835, 674], [337, 677, 429, 689], [337, 714, 482, 726], [495, 714, 548, 726], [561, 714, 683, 726], [338, 770, 461, 782], [474, 769, 554, 785], [489, 788, 562, 803], [576, 788, 643, 801], [656, 787, 751, 804], [764, 788, 844, 801], [334, 825, 421, 838], [430, 824, 574, 838], [584, 824, 723, 841], [335, 844, 450, 857], [464, 843, 583, 860], [628, 862, 755, 875], [769, 861, 848, 878]]] # noqa: E231 - # fmt: on + self.assertEqual(encoding.pixel_values.shape, (1, 3, 224, 224)) + self.assertEqual(len(encoding.words), len(encoding.boxes)) - self.assertListEqual(encoding.words, expected_words) - self.assertListEqual(encoding.boxes, expected_boxes) + # fmt: off + # the words and boxes were obtained with Tesseract 5.3.0 + expected_words = [['11:14', 'to', '11:39', 'a.m', '11:39', 'to', '11:44', 'a.m.', '11:44', 'a.m.', 'to', '12:25', 'p.m.', '12:25', 'to', '12:58', 'p.m.', '12:58', 'to', '4:00', 'p.m.', '2:00', 'to', '5:00', 'p.m.', 'Coffee', 'Break', 'Coffee', 'will', 'be', 'served', 'for', 'men', 'and', 'women', 'in', 'the', 'lobby', 'adjacent', 'to', 'exhibit', 'area.', 'Please', 'move', 'into', 'exhibit', 'area.', '(Exhibits', 'Open)', 'TRRF', 'GENERAL', 'SESSION', '(PART', '|)', 'Presiding:', 'Lee', 'A.', 'Waller', 'TRRF', 'Vice', 'President', '“Introductory', 'Remarks”', 'Lee', 'A.', 'Waller,', 'TRRF', 'Vice', 'Presi-', 'dent', 'Individual', 'Interviews', 'with', 'TRRF', 'Public', 'Board', 'Members', 'and', 'Sci-', 'entific', 'Advisory', 'Council', 'Mem-', 'bers', 'Conducted', 'by', 'TRRF', 'Treasurer', 'Philip', 'G.', 'Kuehn', 'to', 'get', 'answers', 'which', 'the', 'public', 'refrigerated', 'warehousing', 'industry', 'is', 'looking', 'for.', 'Plus', 'questions', 'from', 'the', 'floor.', 'Dr.', 'Emil', 'M.', 'Mrak,', 'University', 'of', 'Cal-', 'ifornia,', 'Chairman,', 'TRRF', 'Board;', 'Sam', 'R.', 'Cecil,', 'University', 'of', 'Georgia', 'College', 'of', 'Agriculture;', 'Dr.', 'Stanley', 'Charm,', 'Tufts', 'University', 'School', 'of', 'Medicine;', 'Dr.', 'Robert', 'H.', 'Cotton,', 'ITT', 'Continental', 'Baking', 'Company;', 'Dr.', 'Owen', 'Fennema,', 'University', 'of', 'Wis-', 'consin;', 'Dr.', 'Robert', 'E.', 'Hardenburg,', 'USDA.', 'Questions', 'and', 'Answers', 'Exhibits', 'Open', 'Capt.', 'Jack', 'Stoney', 'Room', 'TRRF', 'Scientific', 'Advisory', 'Council', 'Meeting', 'Ballroom', 'Foyer']] # noqa: E231 + expected_boxes = [[[141, 57, 210, 69], [228, 58, 252, 69], [141, 75, 216, 88], [230, 79, 280, 88], [142, 260, 218, 273], [230, 261, 255, 273], [143, 279, 218, 290], [231, 282, 290, 291], [143, 342, 218, 354], [231, 345, 289, 355], [202, 362, 227, 373], [143, 379, 220, 392], [231, 382, 291, 394], [144, 714, 220, 726], [231, 715, 256, 726], [144, 732, 220, 745], [232, 736, 291, 747], [144, 769, 218, 782], [231, 770, 256, 782], [141, 788, 202, 801], [215, 791, 274, 804], [143, 826, 204, 838], [215, 826, 240, 838], [142, 844, 202, 857], [215, 847, 274, 859], [334, 57, 427, 69], [440, 57, 522, 69], [369, 75, 461, 88], [469, 75, 516, 88], [528, 76, 562, 88], [570, 76, 667, 88], [675, 75, 711, 87], [721, 79, 778, 88], [789, 75, 840, 88], [369, 97, 470, 107], [484, 94, 507, 106], [518, 94, 562, 107], [576, 94, 655, 110], [668, 94, 792, 109], [804, 95, 829, 107], [369, 113, 465, 125], [477, 116, 547, 125], [562, 113, 658, 125], [671, 116, 748, 125], [761, 113, 811, 125], [369, 131, 465, 143], [477, 133, 548, 143], [563, 130, 698, 145], [710, 130, 802, 146], [336, 171, 412, 183], [423, 171, 572, 183], [582, 170, 716, 184], [728, 171, 817, 187], [829, 171, 844, 186], [338, 197, 482, 212], [507, 196, 557, 209], [569, 196, 595, 208], [610, 196, 702, 209], [505, 214, 583, 226], [595, 214, 656, 227], [670, 215, 807, 227], [335, 259, 543, 274], [556, 259, 708, 272], [372, 279, 422, 291], [435, 279, 460, 291], [474, 279, 574, 292], [587, 278, 664, 291], [676, 278, 738, 291], [751, 279, 834, 291], [372, 298, 434, 310], [335, 341, 483, 354], [497, 341, 655, 354], [667, 341, 728, 354], [740, 341, 825, 354], [335, 360, 430, 372], [442, 360, 534, 372], [545, 359, 687, 372], [697, 360, 754, 372], [765, 360, 823, 373], [334, 378, 428, 391], [440, 378, 577, 394], [590, 378, 705, 391], [720, 378, 801, 391], [334, 397, 400, 409], [370, 416, 529, 429], [544, 416, 576, 432], [587, 416, 665, 428], [677, 416, 814, 429], [372, 435, 452, 450], [465, 434, 495, 447], [511, 434, 600, 447], [611, 436, 637, 447], [649, 436, 694, 451], [705, 438, 824, 447], [369, 453, 452, 466], [464, 454, 509, 466], [522, 453, 611, 469], [625, 453, 792, 469], [370, 472, 556, 488], [570, 472, 684, 487], [697, 472, 718, 485], [732, 472, 835, 488], [369, 490, 411, 503], [425, 490, 484, 503], [496, 490, 635, 506], [645, 490, 707, 503], [718, 491, 761, 503], [771, 490, 840, 503], [336, 510, 374, 521], [388, 510, 447, 522], [460, 510, 489, 521], [503, 510, 580, 522], [592, 509, 736, 525], [745, 509, 770, 522], [781, 509, 840, 522], [338, 528, 434, 541], [448, 528, 596, 541], [609, 527, 687, 540], [700, 528, 792, 541], [336, 546, 397, 559], [407, 546, 431, 559], [443, 546, 525, 560], [537, 546, 680, 562], [695, 546, 714, 559], [722, 546, 837, 562], [336, 565, 449, 581], [461, 565, 485, 577], [497, 565, 665, 581], [681, 565, 718, 577], [732, 565, 837, 580], [337, 584, 438, 597], [452, 583, 521, 596], [535, 584, 677, 599], [690, 583, 787, 596], [801, 583, 825, 596], [338, 602, 478, 615], [492, 602, 530, 614], [543, 602, 638, 615], [650, 602, 676, 614], [688, 602, 788, 615], [802, 602, 843, 614], [337, 621, 502, 633], [516, 621, 615, 637], [629, 621, 774, 636], [789, 621, 827, 633], [337, 639, 418, 652], [432, 640, 571, 653], [587, 639, 731, 655], [743, 639, 769, 652], [780, 639, 841, 652], [338, 658, 440, 673], [455, 658, 491, 670], [508, 658, 602, 671], [616, 658, 638, 670], [654, 658, 835, 674], [337, 677, 429, 689], [337, 714, 482, 726], [495, 714, 548, 726], [561, 714, 683, 726], [338, 770, 461, 782], [474, 769, 554, 785], [489, 788, 562, 803], [576, 788, 643, 801], [656, 787, 751, 804], [764, 788, 844, 801], [334, 825, 421, 838], [430, 824, 574, 838], [584, 824, 723, 841], [335, 844, 450, 857], [464, 843, 583, 860], [628, 862, 755, 875], [769, 861, 848, 878]]] # noqa: E231 + # fmt: on - # with apply_OCR = False - image_processing = LayoutLMv2ImageProcessor(apply_ocr=False) + self.assertListEqual(encoding.words, expected_words) + self.assertListEqual(encoding.boxes, expected_boxes) - encoding = image_processing(image, return_tensors="pt") + # with apply_OCR = False + image_processing = image_processing_class(apply_ocr=False) - self.assertEqual(encoding.pixel_values.shape, (1, 3, 224, 224)) + encoding = image_processing(image, return_tensors="pt") + + self.assertEqual(encoding.pixel_values.shape, (1, 3, 224, 224)) + + @require_vision + @require_torch + def test_slow_fast_equivalence(self): + if not self.test_slow_image_processor or not self.test_fast_image_processor: + self.skipTest(reason="Skipping slow/fast equivalence test") + + if self.image_processing_class is None or self.fast_image_processing_class is None: + self.skipTest(reason="Skipping slow/fast equivalence test as one of the image processors is not defined") + + dummy_image = Image.open( + requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw + ) + image_processor_slow = self.image_processing_class(**self.image_processor_dict) + image_processor_fast = self.fast_image_processing_class(**self.image_processor_dict) + + encoding_slow = image_processor_slow(dummy_image, return_tensors="pt") + encoding_fast = image_processor_fast(dummy_image, return_tensors="pt") + self.assertTrue( + torch.allclose( + encoding_slow.pixel_values.float() / 255, encoding_fast.pixel_values.float() / 255, atol=1e-1 + ) + ) + self.assertLessEqual( + torch.mean( + torch.abs(encoding_slow.pixel_values.float() - encoding_fast.pixel_values.float()) / 255 + ).item(), + 1e-3, + ) + + @require_vision + @require_torch + def test_slow_fast_equivalence_batched(self): + if not self.test_slow_image_processor or not self.test_fast_image_processor: + self.skipTest(reason="Skipping slow/fast equivalence test") + + if self.image_processing_class is None or self.fast_image_processing_class is None: + self.skipTest(reason="Skipping slow/fast equivalence test as one of the image processors is not defined") + + if hasattr(self.image_processor_tester, "do_center_crop") and self.image_processor_tester.do_center_crop: + self.skipTest( + reason="Skipping as do_center_crop is True and center_crop functions are not equivalent for fast and slow processors" + ) + + dummy_images = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, torchify=True) + image_processor_slow = self.image_processing_class(**self.image_processor_dict) + image_processor_fast = self.fast_image_processing_class(**self.image_processor_dict) + + encoding_slow = image_processor_slow(dummy_images, return_tensors="pt") + encoding_fast = image_processor_fast(dummy_images, return_tensors="pt") + + self.assertTrue( + torch.allclose( + encoding_slow.pixel_values.float() / 255, encoding_fast.pixel_values.float() / 255, atol=1e-1 + ) + ) + self.assertLessEqual( + torch.mean( + torch.abs(encoding_slow.pixel_values.float() - encoding_fast.pixel_values.float()) / 255 + ).item(), + 1e-3, + )