From a98f6a1da012ca7847e4dceb3ffcedfd75a77b08 Mon Sep 17 00:00:00 2001
From: anthony2261
Date: Tue, 30 Aug 2022 15:43:14 +0300
Subject: [PATCH] LayoutXLMProcessor: ensure 1-to-1 mapping between samples and images, and add test for it (#18774)

---
 Reviewer notes (free-form text in this area is not part of the commit message or the diff):
   * processing_layoutxlm.py: raises ValueError when return_overflowing_tokens is True
     while return_offsets_mapping is False, before the feature extractor is applied.
   * test_processor_layoutxlm.py: adds the @slow test test_overflowing_tokens, which
     encodes the "train" split of nielsr/funsd with truncation, stride=50 and
     return_overflowing_tokens=True, then asserts that the number of returned images
     equals the number of returned input_ids sequences.

 .../models/layoutxlm/processing_layoutxlm.py  |  3 ++
 .../layoutxlm/test_processor_layoutxlm.py     | 34 +++++++++++++++++++
 2 files changed, 37 insertions(+)

diff --git a/src/transformers/models/layoutxlm/processing_layoutxlm.py b/src/transformers/models/layoutxlm/processing_layoutxlm.py
index 03423d17c2..da75398493 100644
--- a/src/transformers/models/layoutxlm/processing_layoutxlm.py
+++ b/src/transformers/models/layoutxlm/processing_layoutxlm.py
@@ -89,6 +89,9 @@ class LayoutXLMProcessor(ProcessorMixin):
                 "You cannot provide word labels if you initialized the feature extractor with apply_ocr set to True."
             )
 
+        if return_overflowing_tokens is True and return_offsets_mapping is False:
+            raise ValueError("You cannot return overflowing tokens without returning the offsets mapping.")
+
         # first, apply the feature extractor
         features = self.feature_extractor(images=images, return_tensors=return_tensors)
 
diff --git a/tests/models/layoutxlm/test_processor_layoutxlm.py b/tests/models/layoutxlm/test_processor_layoutxlm.py
index d0d7eec28a..2752bd16a8 100644
--- a/tests/models/layoutxlm/test_processor_layoutxlm.py
+++ b/tests/models/layoutxlm/test_processor_layoutxlm.py
@@ -126,6 +126,40 @@ class LayoutXLMProcessorTest(unittest.TestCase):
         self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor_add_kwargs.to_json_string())
         self.assertIsInstance(processor.feature_extractor, LayoutLMv2FeatureExtractor)
 
+    @slow
+    def test_overflowing_tokens(self):
+        # In the case of overflowing tokens, test that we still have 1-to-1 mapping between the images and input_ids (sequences that are too long are broken down into multiple sequences).
+
+        from datasets import load_dataset
+
+        # set up
+        datasets = load_dataset("nielsr/funsd")
+        processor = LayoutXLMProcessor.from_pretrained("microsoft/layoutxlm-base", apply_ocr=False)
+
+        def preprocess_data(examples):
+            images = [Image.open(path).convert("RGB") for path in examples["image_path"]]
+            words = examples["words"]
+            boxes = examples["bboxes"]
+            word_labels = examples["ner_tags"]
+            encoded_inputs = processor(
+                images,
+                words,
+                boxes=boxes,
+                word_labels=word_labels,
+                max_length=512,
+                padding="max_length",
+                truncation=True,
+                return_overflowing_tokens=True,
+                stride=50,
+                return_offsets_mapping=True,
+                return_tensors="pt",
+            )
+            return encoded_inputs
+
+        train_data = preprocess_data(datasets["train"])
+
+        self.assertEqual(len(train_data["image"]), len(train_data["input_ids"]))
+
     # different use cases tests
     @require_sentencepiece