From 505854f78f61c5254c75e3ffbcf3d7c7fadf65fb Mon Sep 17 00:00:00 2001 From: NielsRogge <48327001+NielsRogge@users.noreply.github.com> Date: Wed, 10 Apr 2024 16:02:50 +0200 Subject: [PATCH] [UDOP] Improve docs, add resources (#29571) * Improve docs * Add more tips --- docs/source/en/model_doc/udop.md | 17 ++++++++++++++--- src/transformers/models/udop/modeling_udop.py | 17 +++++++++++++++++ 2 files changed, 31 insertions(+), 3 deletions(-) diff --git a/docs/source/en/model_doc/udop.md b/docs/source/en/model_doc/udop.md index b84ec160f7..614bd2ff4f 100644 --- a/docs/source/en/model_doc/udop.md +++ b/docs/source/en/model_doc/udop.md @@ -56,14 +56,25 @@ image = Image.open(name_of_your_document).convert("RGB") width, height = image.size ``` +One can use [`UdopProcessor`] to prepare images and text for the model, which takes care of all of this. By default, this class uses the Tesseract engine to extract a list of words and boxes (coordinates) from a given document. Its functionality is equivalent to that of [`LayoutLMv3Processor`], hence it supports passing either `apply_ocr=False` in case you prefer to use your own OCR engine or `apply_ocr=True` in case you want the default OCR engine to be used. Refer to the [usage guide of LayoutLMv2](layoutlmv2#usage-layoutlmv2processor) regarding all possible use cases (the functionality of `UdopProcessor` is identical). + +- If using an own OCR engine of choice, one recommendation is Azure's [Read API](https://learn.microsoft.com/en-us/azure/ai-services/computer-vision/how-to/call-read-api), which supports so-called line segments. Use of segment position embeddings typically results in better performance. - At inference time, it's recommended to use the `generate` method to autoregressively generate text given a document image. -- One can use [`UdopProcessor`] to prepare images and text for the model. By default, this class uses the Tesseract engine to extract a list of words -and boxes (coordinates) from a given document. Its functionality is equivalent to that of [`LayoutLMv3Processor`], hence it supports passing either -`apply_ocr=False` in case you prefer to use your own OCR engine or `apply_ocr=True` in case you want the default OCR engine to be used. +- The model has been pre-trained on both self-supervised and supervised objectives. One can use the various task prefixes (prompts) used during pre-training to test out the out-of-the-box capabilities. For instance, the model can be prompted with "Question answering. What is the date?", as "Question answering." is the task prefix used during pre-training for DocVQA. Refer to the [paper](https://arxiv.org/abs/2212.02623) (table 1) for all task prefixes. +- One can also fine-tune [`UdopEncoderModel`], which is the encoder-only part of UDOP, which can be seen as a LayoutLMv3-like Transformer encoder. For discriminative tasks, one can just add a linear classifier on top of it and fine-tune it on a labeled dataset. This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code can be found [here](https://github.com/microsoft/UDOP). +## Resources + +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with UDOP. If +you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll +review it! The resource should ideally demonstrate something new instead of duplicating an existing resource. + +- Demo notebooks regarding UDOP can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/UDOP) that show how +to fine-tune UDOP on a custom dataset as well as inference. 🌎 +- [Document question answering task guide](../tasks/document_question_answering) ## UdopConfig diff --git a/src/transformers/models/udop/modeling_udop.py b/src/transformers/models/udop/modeling_udop.py index 0d817a324b..9d12d9cc2e 100644 --- a/src/transformers/models/udop/modeling_udop.py +++ b/src/transformers/models/udop/modeling_udop.py @@ -1597,9 +1597,14 @@ class UdopModel(UdopPreTrainedModel): >>> from datasets import load_dataset >>> import torch + >>> # load model and processor + >>> # in this case, we already have performed OCR ourselves + >>> # so we initialize the processor with `apply_ocr=False` >>> processor = AutoProcessor.from_pretrained("microsoft/udop-large", apply_ocr=False) >>> model = AutoModel.from_pretrained("microsoft/udop-large") + >>> # load an example image, along with the words and coordinates + >>> # which were extracted using an OCR engine >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train") >>> example = dataset[0] >>> image = example["image"] @@ -1772,14 +1777,21 @@ class UdopForConditionalGeneration(UdopPreTrainedModel): >>> from datasets import load_dataset >>> # load model and processor + >>> # in this case, we already have performed OCR ourselves + >>> # so we initialize the processor with `apply_ocr=False` >>> processor = AutoProcessor.from_pretrained("microsoft/udop-large", apply_ocr=False) >>> model = UdopForConditionalGeneration.from_pretrained("microsoft/udop-large") + >>> # load an example image, along with the words and coordinates + >>> # which were extracted using an OCR engine >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train") >>> example = dataset[0] >>> image = example["image"] >>> words = example["tokens"] >>> boxes = example["bboxes"] + + >>> # one can use the various task prefixes (prompts) used during pre-training + >>> # e.g. the task prefix for DocVQA is "Question answering. " >>> question = "Question answering. What is the date on the form?" >>> encoding = processor(image, question, words, boxes=boxes, return_tensors="pt") @@ -1992,9 +2004,14 @@ class UdopEncoderModel(UdopPreTrainedModel): >>> from huggingface_hub import hf_hub_download >>> from datasets import load_dataset + >>> # load model and processor + >>> # in this case, we already have performed OCR ourselves + >>> # so we initialize the processor with `apply_ocr=False` >>> processor = AutoProcessor.from_pretrained("microsoft/udop-large", apply_ocr=False) >>> model = UdopEncoderModel.from_pretrained("microsoft/udop-large") + >>> # load an example image, along with the words and coordinates + >>> # which were extracted using an OCR engine >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train") >>> example = dataset[0] >>> image = example["image"]