LayoutLMv2Processor: ensure 1-to-1 mapping between images and samples in case of overflowing tokens (#17092)

* add get_overflowing_images function to ensure 1-to-1 mapping between samples and images in LayoutLMv2Processor

* make style

* add test for overflowing_tokens, change assert to ValueError, avoiding unrelated formatting changes

* change line length by passing --preview into black
This commit is contained in:
ghlai9665
2022-05-09 06:39:08 -05:00
committed by GitHub
parent 3212afa614
commit e9fd583ce0
2 changed files with 55 additions and 3 deletions

View File

@@ -133,6 +133,39 @@ class LayoutLMv2ProcessorTest(unittest.TestCase):
self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor_add_kwargs.to_json_string())
self.assertIsInstance(processor.feature_extractor, LayoutLMv2FeatureExtractor)
@slow
def test_overflowing_tokens(self):
    """Check that overflowing tokens keep images and sequences aligned 1-to-1.

    Sequences that are too long are broken down into multiple sequences, so
    the processor output must contain one image entry per resulting
    ``input_ids`` entry.
    """
    # `datasets` is imported inside the test body rather than at module level.
    from datasets import load_dataset

    # set up
    funsd = load_dataset("nielsr/funsd")
    processor = LayoutLMv2Processor.from_pretrained(
        "microsoft/layoutlmv2-base-uncased",
        revision="no_ocr",
    )

    def encode_split(split):
        # Open every page image as RGB before passing the batch on.
        pages = []
        for image_path in split["image_path"]:
            pages.append(Image.open(image_path).convert("RGB"))

        return processor(
            pages,
            split["words"],
            boxes=split["bboxes"],
            word_labels=split["ner_tags"],
            padding="max_length",
            truncation=True,
            return_overflowing_tokens=True,
            stride=50,
            return_offsets_mapping=True,
            return_tensors="pt",
        )

    encoded_train = encode_split(funsd["train"])

    # One image per (possibly overflowed) sequence.
    image_count = len(encoded_train["image"])
    sequence_count = len(encoded_train["input_ids"])
    self.assertEqual(image_count, sequence_count)
# different use cases tests
@require_torch