From 505854f78f61c5254c75e3ffbcf3d7c7fadf65fb Mon Sep 17 00:00:00 2001
From: NielsRogge <48327001+NielsRogge@users.noreply.github.com>
Date: Wed, 10 Apr 2024 16:02:50 +0200
Subject: [PATCH] [UDOP] Improve docs, add resources (#29571)

* Improve docs

* Add more tips
---
 docs/source/en/model_doc/udop.md              | 17 ++++++++++++++---
 src/transformers/models/udop/modeling_udop.py | 17 +++++++++++++++++
 2 files changed, 31 insertions(+), 3 deletions(-)

diff --git a/docs/source/en/model_doc/udop.md b/docs/source/en/model_doc/udop.md
index b84ec160f7..614bd2ff4f 100644
--- a/docs/source/en/model_doc/udop.md
+++ b/docs/source/en/model_doc/udop.md
@@ -56,14 +56,25 @@ image = Image.open(name_of_your_document).convert("RGB")
 width, height = image.size
 ```
 
+One can use [`UdopProcessor`] to prepare images and text for the model, which takes care of all of this. By default, this class uses the Tesseract engine to extract a list of words and boxes (coordinates) from a given document. Its functionality is equivalent to that of [`LayoutLMv3Processor`], hence it supports passing either `apply_ocr=False` in case you prefer to use your own OCR engine or `apply_ocr=True` in case you want the default OCR engine to be used. Refer to the [usage guide of LayoutLMv2](layoutlmv2#usage-layoutlmv2processor) regarding all possible use cases (the functionality of `UdopProcessor` is identical).
+
+- If you use your own OCR engine, one recommendation is Azure's [Read API](https://learn.microsoft.com/en-us/azure/ai-services/computer-vision/how-to/call-read-api), which supports so-called line segments. Using segment position embeddings typically results in better performance.
 - At inference time, it's recommended to use the `generate` method to autoregressively generate text given a document image.
-- One can use [`UdopProcessor`] to prepare images and text for the model. By default, this class uses the Tesseract engine to extract a list of words
-and boxes (coordinates) from a given document. Its functionality is equivalent to that of [`LayoutLMv3Processor`], hence it supports passing either
-`apply_ocr=False` in case you prefer to use your own OCR engine or `apply_ocr=True` in case you want the default OCR engine to be used.
+- The model has been pre-trained on both self-supervised and supervised objectives. One can use the various task prefixes (prompts) used during pre-training to test out the out-of-the-box capabilities. For instance, the model can be prompted with "Question answering. What is the date?", as "Question answering." is the task prefix used during pre-training for DocVQA. Refer to the [paper](https://arxiv.org/abs/2212.02623) (table 1) for all task prefixes.
+- One can also fine-tune [`UdopEncoderModel`], the encoder-only part of UDOP, which can be seen as a LayoutLMv3-like Transformer encoder. For discriminative tasks, one can just add a linear classifier on top of it and fine-tune it on a labeled dataset.
 
 This model was contributed by [nielsr](https://huggingface.co/nielsr).
 The original code can be found [here](https://github.com/microsoft/UDOP).
 
+## Resources
+
+A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with UDOP. If
+you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll
+review it! The resource should ideally demonstrate something new instead of duplicating an existing resource.
+
+- Demo notebooks for UDOP can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/UDOP). They show how
+to fine-tune UDOP on a custom dataset as well as how to run inference. 🌎
+- [Document question answering task guide](../tasks/document_question_answering)
 
 ## UdopConfig
 
diff --git a/src/transformers/models/udop/modeling_udop.py b/src/transformers/models/udop/modeling_udop.py
index 0d817a324b..9d12d9cc2e 100644
--- a/src/transformers/models/udop/modeling_udop.py
+++ b/src/transformers/models/udop/modeling_udop.py
@@ -1597,9 +1597,14 @@ class UdopModel(UdopPreTrainedModel):
         >>> from datasets import load_dataset
         >>> import torch
 
+        >>> # load model and processor
+        >>> # in this case, we have already performed OCR ourselves
+        >>> # so we initialize the processor with `apply_ocr=False`
         >>> processor = AutoProcessor.from_pretrained("microsoft/udop-large", apply_ocr=False)
         >>> model = AutoModel.from_pretrained("microsoft/udop-large")
 
+        >>> # load an example image, along with the words and coordinates
+        >>> # which were extracted using an OCR engine
         >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
         >>> example = dataset[0]
         >>> image = example["image"]
@@ -1772,14 +1777,21 @@ class UdopForConditionalGeneration(UdopPreTrainedModel):
         >>> from datasets import load_dataset
 
         >>> # load model and processor
+        >>> # in this case, we have already performed OCR ourselves
+        >>> # so we initialize the processor with `apply_ocr=False`
         >>> processor = AutoProcessor.from_pretrained("microsoft/udop-large", apply_ocr=False)
         >>> model = UdopForConditionalGeneration.from_pretrained("microsoft/udop-large")
 
+        >>> # load an example image, along with the words and coordinates
+        >>> # which were extracted using an OCR engine
         >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
         >>> example = dataset[0]
         >>> image = example["image"]
         >>> words = example["tokens"]
         >>> boxes = example["bboxes"]
+
+        >>> # one can use the various task prefixes (prompts) used during pre-training
+        >>> # e.g. the task prefix for DocVQA is "Question answering. "
         >>> question = "Question answering. What is the date on the form?"
         >>> encoding = processor(image, question, words, boxes=boxes, return_tensors="pt")
 
@@ -1992,9 +2004,14 @@ class UdopEncoderModel(UdopPreTrainedModel):
         >>> from huggingface_hub import hf_hub_download
         >>> from datasets import load_dataset
 
+        >>> # load model and processor
+        >>> # in this case, we have already performed OCR ourselves
+        >>> # so we initialize the processor with `apply_ocr=False`
         >>> processor = AutoProcessor.from_pretrained("microsoft/udop-large", apply_ocr=False)
         >>> model = UdopEncoderModel.from_pretrained("microsoft/udop-large")
 
+        >>> # load an example image, along with the words and coordinates
+        >>> # which were extracted using an OCR engine
         >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
         >>> example = dataset[0]
         >>> image = example["image"]