Add UDOP (#22940)

* First draft * More improvements * More improvements * More fixes * Fix copies * More improvements * More fixes * More improvements * Convert checkpoint * More improvements, set up tests * Fix more tests * Add UdopModel * More improvements * Fix equivalence test * More fixes * Redesign model * Extend conversion script * Use real inputs for conversion script * Add image processor * Improve conversion script * Add UdopTokenizer * Add fast tokenizer * Add converter * Update README's * Add processor * Add fully fledged tokenizer * Add fast tokenizer * Use processor in conversion script * Add tokenizer tests * Fix one more test * Fix more tests * Fix tokenizer tests * Enable fast tokenizer tests * Fix more tests * Fix additional_special_tokens of fast tokenizer * Fix tokenizer tests * Fix more tests * Fix equivalence test * Rename image to pixel_values * Rename seg_data to bbox * More renamings * Remove vis_special_token * More improvements * Add docs * Fix copied from * Update slow tokenizer * Update fast tokenizer design * Make text input optional * Add first draft of processor tests * Fix more processor tests * Fix decoder_start_token_id * Fix test_initialization * Add integration test * More improvements * Improve processor, add test * Add more copied from * Add more copied from * Add more copied from * Add more copied from * Remove print statement * Update README and auto mapping * Delete files * Delete another file * Remove code * Fix test * Fix docs * Remove asserts * Add doc tests * Include UDOP in exotic model tests * Add expected tesseract decodings * Add sentencepiece * Use same design as T5 * Add UdopEncoderModel * Add UdopEncoderModel to tests * More fixes * Fix fast tokenizer * Fix one more test * Remove parallelisable attribute * Fix copies * Remove legacy file * Copy from T5Tokenizer * Fix rebase * More fixes, copy from T5 * More fixes * Fix init * Use ArthurZ/udop for tests * Make all model tests pass * Remove UdopForConditionalGeneration from auto 
mapping * Fix more tests * fixups * more fixups * fix the tokenizers * remove un-necessary changes * nits * nits * replace truncate_sequences_boxes with truncate_sequences for fix-copies * nit current path * add a test for input ids * ids that we should get taken from c9f7a32f57440d90ff79890270d376a1cc0acb68 * nits converting * nits * apply ruff * nits * nits * style * fix slow order of addition * fix udop fast range as well * fixup * nits * Add docstrings * Fix gradient checkpointing * Update code examples * Skip tests * Update integration test * Address comment * Make fixup * Remove extra ids from tokenizer * Skip test * Apply suggestions from code review Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> * Update year * Address comment * Address more comments * Address comments * Add copied from * Update CI * Rename script * Update model id * Add AddedToken, skip tests * Update CI * Fix doc tests * Do not use Tesseract for the doc tests * Remove kwargs * Add original inputs * Update casting * Fix doc test * Update question * Update question * Use LayoutLMv3ImageProcessor * Update organization * Improve docs * Update forward signature * Make images optional * Remove deprecated device argument * Add comment, add add_prefix_space * More improvements * Remove kwargs --------- Co-authored-by: ArthurZucker <arthur.zucker@gmail.com> Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>
2024-03-04 18:49:02 +01:00
parent ed74d97871
commit 836921fdeb
35 changed files with 8378 additions and 0 deletions
--- a/tests/models/udop/__init__.py
+++ b/tests/models/udop/__init__.py
--- a/tests/models/udop/test_modeling_udop.py
+++ b/tests/models/udop/test_modeling_udop.py
@@ -0,0 +1,567 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import copy
+import inspect
+import unittest
+
+from huggingface_hub import hf_hub_download
+
+from transformers import UdopConfig, is_torch_available, is_vision_available
+from transformers.testing_utils import (
+    require_sentencepiece,
+    require_tokenizers,
+    require_torch,
+    require_vision,
+    slow,
+    torch_device,
+)
+from transformers.utils import cached_property
+
+from ...test_configuration_common import ConfigTester
+from ...test_modeling_common import ModelTesterMixin, ids_tensor
+from ...test_pipeline_mixin import PipelineTesterMixin
+
+
+if is_torch_available():
+    import torch
+
+    from transformers import UdopEncoderModel, UdopForConditionalGeneration, UdopModel, UdopProcessor
+    from transformers.models.udop.modeling_udop import UDOP_PRETRAINED_MODEL_ARCHIVE_LIST
+
+
+if is_vision_available():
+    from PIL import Image
+
+
+class UdopModelTester:
+    """
+    Helper that builds a small `UdopConfig` together with matching inputs and runs basic checks on the
+    encoder-decoder UDOP models (`UdopModel` and `UdopForConditionalGeneration`).
+
+    `parent` is the `unittest.TestCase` whose assertion methods are used by the `create_and_check_*` helpers.
+    """
+
+    def __init__(
+        self,
+        parent,
+        vocab_size=99,
+        batch_size=13,
+        encoder_seq_length=7,
+        decoder_seq_length=9,
+        # For common tests
+        is_training=True,
+        use_attention_mask=True,
+        use_labels=True,
+        hidden_size=32,
+        num_hidden_layers=5,
+        num_attention_heads=4,
+        d_ff=37,
+        relative_attention_num_buckets=32,
+        dropout_rate=0.1,
+        initializer_factor=0.002,
+        eos_token_id=1,
+        pad_token_id=0,
+        scope=None,
+        decoder_layers=None,
+        range_bbox=1000,
+        decoder_start_token_id=0,
+    ):
+        self.parent = parent
+        self.batch_size = batch_size
+        self.encoder_seq_length = encoder_seq_length
+        self.decoder_seq_length = decoder_seq_length
+        # For common tests
+        self.seq_length = self.decoder_seq_length
+        self.is_training = is_training
+        self.use_attention_mask = use_attention_mask
+        self.use_labels = use_labels
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.d_ff = d_ff
+        self.relative_attention_num_buckets = relative_attention_num_buckets
+        self.dropout_rate = dropout_rate
+        self.initializer_factor = initializer_factor
+        self.eos_token_id = eos_token_id
+        self.pad_token_id = pad_token_id
+        # Keep the value passed by the caller (it used to be silently replaced by `None`).
+        self.scope = scope
+        self.decoder_layers = decoder_layers
+        self.range_bbox = range_bbox
+        self.decoder_start_token_id = decoder_start_token_id
+
+    def prepare_config_and_inputs(self):
+        """
+        Build the config and the input tensors used by the `create_and_check_*` helpers.
+
+        Returns:
+            A tuple `(config, input_ids, bbox, decoder_input_ids, attention_mask, decoder_attention_mask,
+            lm_labels)`. Both masks are `None` when `use_attention_mask` is `False`, and `lm_labels` is `None`
+            when `use_labels` is `False`.
+        """
+        input_ids = ids_tensor([self.batch_size, self.encoder_seq_length], self.vocab_size)
+        bbox = ids_tensor([self.batch_size, self.encoder_seq_length, 4], self.range_bbox).float()
+        # Ensure that bbox is legal, i.e. bbox[..., 0] <= bbox[..., 2] and bbox[..., 1] <= bbox[..., 3].
+        # This is done with elementwise min/max: swapping through `t = bbox[i, j, 3]` does not work on tensors
+        # because `t` is a view on the same storage, so the box collapses instead of being swapped.
+        x0, y0, x1, y1 = bbox.unbind(dim=-1)
+        bbox = torch.stack([torch.min(x0, x1), torch.min(y0, y1), torch.max(x0, x1), torch.max(y0, y1)], dim=-1)
+        decoder_input_ids = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size)
+
+        attention_mask = None
+        decoder_attention_mask = None
+        if self.use_attention_mask:
+            attention_mask = ids_tensor([self.batch_size, self.encoder_seq_length], vocab_size=2)
+            decoder_attention_mask = ids_tensor([self.batch_size, self.decoder_seq_length], vocab_size=2)
+
+        lm_labels = None
+        if self.use_labels:
+            lm_labels = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size)
+
+        config = self.get_config()
+
+        return (
+            config,
+            input_ids,
+            bbox,
+            decoder_input_ids,
+            attention_mask,
+            decoder_attention_mask,
+            lm_labels,
+        )
+
+    def get_config(self):
+        """Return a `UdopConfig` built from the sizes and token ids stored on this tester."""
+        return UdopConfig(
+            vocab_size=self.vocab_size,
+            d_model=self.hidden_size,
+            d_ff=self.d_ff,
+            d_kv=self.hidden_size // self.num_attention_heads,
+            num_layers=self.num_hidden_layers,
+            num_decoder_layers=self.decoder_layers,
+            num_heads=self.num_attention_heads,
+            relative_attention_num_buckets=self.relative_attention_num_buckets,
+            dropout_rate=self.dropout_rate,
+            initializer_factor=self.initializer_factor,
+            eos_token_id=self.eos_token_id,
+            bos_token_id=self.pad_token_id,
+            pad_token_id=self.pad_token_id,
+            decoder_start_token_id=self.decoder_start_token_id,
+        )
+
+    def create_and_check_model(
+        self,
+        config,
+        input_ids,
+        bbox,
+        decoder_input_ids,
+        attention_mask,
+        decoder_attention_mask,
+        lm_labels,
+    ):
+        """Run `UdopModel` with and without attention masks and check the output shapes and the cache layout."""
+        model = UdopModel(config=config)
+        model.to(torch_device)
+        model.eval()
+        # First make sure the forward pass also works when both attention masks are provided ...
+        result = model(
+            input_ids=input_ids,
+            bbox=bbox,
+            decoder_input_ids=decoder_input_ids,
+            attention_mask=attention_mask,
+            decoder_attention_mask=decoder_attention_mask,
+        )
+        # ... the checks below are made on the outputs of the call without masks.
+        result = model(input_ids=input_ids, bbox=bbox, decoder_input_ids=decoder_input_ids)
+        decoder_output = result.last_hidden_state
+        decoder_past = result.past_key_values
+        encoder_output = result.encoder_last_hidden_state
+
+        self.parent.assertEqual(encoder_output.size(), (self.batch_size, self.encoder_seq_length, self.hidden_size))
+        self.parent.assertEqual(decoder_output.size(), (self.batch_size, self.decoder_seq_length, self.hidden_size))
+        # There should be `num_layers` key value embeddings stored in decoder_past
+        self.parent.assertEqual(len(decoder_past), config.num_layers)
+        # There should be a self attn key, a self attn value, a cross attn key and a cross attn value stored in each decoder_past tuple
+        self.parent.assertEqual(len(decoder_past[0]), 4)
+
+    def create_and_check_with_lm_head(
+        self,
+        config,
+        input_ids,
+        bbox,
+        decoder_input_ids,
+        attention_mask,
+        decoder_attention_mask,
+        lm_labels,
+    ):
+        """Run `UdopForConditionalGeneration` with labels and check the number of outputs, logits and loss shapes."""
+        model = UdopForConditionalGeneration(config=config).to(torch_device).eval()
+        outputs = model(
+            input_ids=input_ids,
+            bbox=bbox,
+            decoder_input_ids=decoder_input_ids,
+            decoder_attention_mask=decoder_attention_mask,
+            labels=lm_labels,
+        )
+        self.parent.assertEqual(len(outputs), 4)
+        self.parent.assertEqual(outputs["logits"].size(), (self.batch_size, self.decoder_seq_length, self.vocab_size))
+        self.parent.assertEqual(outputs["loss"].size(), ())
+
+    def create_and_check_generate_with_past_key_values(
+        self,
+        config,
+        input_ids,
+        bbox,
+        decoder_input_ids,
+        attention_mask,
+        decoder_attention_mask,
+        lm_labels,
+    ):
+        """Check that sampling with and without `use_cache=False` gives the same tokens for the same seed."""
+        model = UdopForConditionalGeneration(config=config).to(torch_device).eval()
+        # The seed is reset before each call so that both sampling runs draw the same random numbers.
+        torch.manual_seed(0)
+        output_without_past_cache = model.generate(
+            input_ids[:1], bbox=bbox[:1, :, :], num_beams=2, max_length=5, do_sample=True, use_cache=False
+        )
+        torch.manual_seed(0)
+        output_with_past_cache = model.generate(
+            input_ids[:1], bbox=bbox[:1, :, :], num_beams=2, max_length=5, do_sample=True
+        )
+        self.parent.assertTrue(torch.all(output_with_past_cache == output_without_past_cache))
+
+    def create_and_check_model_fp16_forward(
+        self,
+        config,
+        input_ids,
+        bbox,
+        decoder_input_ids,
+        attention_mask,
+        decoder_attention_mask,
+        lm_labels,
+    ):
+        """Run a half-precision forward pass of `UdopForConditionalGeneration` and check the logits have no NaN."""
+        model = UdopForConditionalGeneration(config=config).to(torch_device).half().eval()
+        output = model(
+            input_ids, bbox=bbox, attention_mask=attention_mask, decoder_input_ids=decoder_input_ids
+        ).logits
+        self.parent.assertFalse(torch.isnan(output).any().item())
+
+    def prepare_config_and_inputs_for_common(self):
+        """Return `(config, inputs_dict)` where `inputs_dict` holds the forward keyword arguments (no labels)."""
+        config_and_inputs = self.prepare_config_and_inputs()
+        (
+            config,
+            input_ids,
+            bbox,
+            decoder_input_ids,
+            attention_mask,
+            decoder_attention_mask,
+            lm_labels,
+        ) = config_and_inputs
+
+        inputs_dict = {
+            "input_ids": input_ids,
+            "attention_mask": attention_mask,
+            "bbox": bbox,
+            "decoder_input_ids": decoder_input_ids,
+            "decoder_attention_mask": decoder_attention_mask,
+            "use_cache": False,
+        }
+        return config, inputs_dict
+
+
+@require_torch
+class UdopModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
+    """Common and UDOP-specific tests for `UdopModel` and `UdopForConditionalGeneration`."""
+
+    all_model_classes = (UdopModel, UdopForConditionalGeneration) if is_torch_available() else ()
+    all_generative_model_classes = (UdopForConditionalGeneration,) if is_torch_available() else ()
+    pipeline_model_mapping = {"feature-extraction": UdopModel} if is_torch_available() else {}
+    fx_compatible = False
+    test_pruning = False
+    test_torchscript = False
+    test_head_masking = False
+    test_resize_embeddings = True
+    test_model_parallel = False
+    is_encoder_decoder = True
+    # The small UDOP model needs higher percentages for CPU/MP tests
+    model_split_percents = [0.8, 0.9]
+
+    def setUp(self):
+        self.model_tester = UdopModelTester(self)
+        self.config_tester = ConfigTester(self, config_class=UdopConfig, d_model=37)
+
+    def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
+        """Return a deep copy of `inputs_dict`, with all-zero `labels` added for the generation model on request."""
+        prepared = copy.deepcopy(inputs_dict)
+        if return_labels and model_class.__name__ == "UdopForConditionalGeneration":
+            label_shape = (self.model_tester.batch_size, self.model_tester.seq_length)
+            prepared["labels"] = torch.zeros(label_shape, dtype=torch.long, device=torch_device)
+        return prepared
+
+    def test_config(self):
+        self.config_tester.run_common_tests()
+
+    def test_model(self):
+        self.model_tester.create_and_check_model(*self.model_tester.prepare_config_and_inputs())
+
+    def test_with_lm_head(self):
+        self.model_tester.create_and_check_with_lm_head(*self.model_tester.prepare_config_and_inputs())
+
+    def test_generate_with_past_key_values(self):
+        self.model_tester.create_and_check_generate_with_past_key_values(
+            *self.model_tester.prepare_config_and_inputs()
+        )
+
+    @unittest.skipIf(torch_device == "cpu", "Cant do half precision")
+    def test_model_fp16_forward(self):
+        self.model_tester.create_and_check_model_fp16_forward(*self.model_tester.prepare_config_and_inputs())
+
+    @unittest.skip(reason="Gradient checkpointing is not supported by this model")
+    def test_training_gradient_checkpointing(self):
+        pass
+
+    @unittest.skip(
+        reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
+    )
+    def test_training_gradient_checkpointing_use_reentrant(self):
+        pass
+
+    @unittest.skip(
+        reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
+    )
+    def test_training_gradient_checkpointing_use_reentrant_false(self):
+        pass
+
+    def test_forward_signature(self):
+        """Check that the alphabetically first `forward` argument names are the expected ones."""
+        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
+
+        # Already in alphabetical order; "labels" is only expected for the generative classes.
+        common_arg_names = [
+            "attention_mask",
+            "bbox",
+            "cross_attn_head_mask",
+            "decoder_attention_mask",
+            "decoder_head_mask",
+            "decoder_input_ids",
+            "decoder_inputs_embeds",
+            "encoder_outputs",
+            "head_mask",
+            "input_ids",
+            "inputs_embeds",
+        ]
+
+        for model_class in self.all_model_classes:
+            model = model_class(config)
+            # The names are sorted, so the comparison does not depend on the declaration order in `forward`
+            arg_names = sorted(inspect.signature(model.forward).parameters)
+
+            expected_arg_names = list(common_arg_names)
+            if model_class in self.all_generative_model_classes:
+                expected_arg_names = sorted([*expected_arg_names, "labels"])
+            self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names)
+
+    @unittest.skip(
+        reason="Not currently compatible. Fails with - NotImplementedError: Cannot copy out of meta tensor; no data!"
+    )
+    def test_save_load_low_cpu_mem_usage(self):
+        pass
+
+    @slow
+    def test_model_from_pretrained(self):
+        # Only the first checkpoint of the archive list is loaded
+        for checkpoint in UDOP_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
+            self.assertIsNotNone(UdopForConditionalGeneration.from_pretrained(checkpoint))
+
+
+class UdopEncoderOnlyModelTester:
+    """
+    Helper that builds a small `UdopConfig` (with `is_encoder_decoder=False`) together with matching inputs and
+    runs basic checks on `UdopEncoderModel`.
+
+    `parent` is the `unittest.TestCase` whose assertion methods are used by the `create_and_check_*` helpers.
+    """
+
+    def __init__(
+        self,
+        parent,
+        vocab_size=99,
+        batch_size=13,
+        seq_length=7,
+        # For common tests
+        is_training=False,
+        use_attention_mask=True,
+        hidden_size=32,
+        num_hidden_layers=5,
+        decoder_layers=2,
+        num_attention_heads=4,
+        d_ff=37,
+        relative_attention_num_buckets=32,
+        dropout_rate=0.1,
+        initializer_factor=0.002,
+        eos_token_id=1,
+        pad_token_id=0,
+        scope=None,
+        range_bbox=1000,
+    ):
+        self.parent = parent
+        self.batch_size = batch_size
+        # For common tests
+        self.seq_length = seq_length
+        self.is_training = is_training
+        self.use_attention_mask = use_attention_mask
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.decoder_layers = decoder_layers
+        self.num_attention_heads = num_attention_heads
+        self.d_ff = d_ff
+        self.relative_attention_num_buckets = relative_attention_num_buckets
+        self.dropout_rate = dropout_rate
+        self.initializer_factor = initializer_factor
+        self.eos_token_id = eos_token_id
+        self.pad_token_id = pad_token_id
+        # Keep the value passed by the caller (it used to be silently replaced by `None`).
+        self.scope = scope
+        self.range_bbox = range_bbox
+
+    def get_config(self):
+        """Return an encoder-only `UdopConfig` built from the sizes and token ids stored on this tester."""
+        return UdopConfig(
+            vocab_size=self.vocab_size,
+            d_model=self.hidden_size,
+            d_ff=self.d_ff,
+            d_kv=self.hidden_size // self.num_attention_heads,
+            num_layers=self.num_hidden_layers,
+            num_decoder_layers=self.decoder_layers,
+            num_heads=self.num_attention_heads,
+            relative_attention_num_buckets=self.relative_attention_num_buckets,
+            dropout_rate=self.dropout_rate,
+            initializer_factor=self.initializer_factor,
+            eos_token_id=self.eos_token_id,
+            bos_token_id=self.pad_token_id,
+            pad_token_id=self.pad_token_id,
+            is_encoder_decoder=False,
+        )
+
+    def prepare_config_and_inputs(self):
+        """
+        Build the config and the input tensors used by the `create_and_check_*` helpers.
+
+        Returns:
+            A tuple `(config, input_ids, bbox, attention_mask)`; `attention_mask` is `None` when
+            `use_attention_mask` is `False`.
+        """
+        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
+        bbox = ids_tensor([self.batch_size, self.seq_length, 4], self.range_bbox).float()
+        # Ensure that bbox is legal, i.e. bbox[..., 0] <= bbox[..., 2] and bbox[..., 1] <= bbox[..., 3].
+        # This is done with elementwise min/max: swapping through `t = bbox[i, j, 3]` does not work on tensors
+        # because `t` is a view on the same storage, so the box collapses instead of being swapped.
+        x0, y0, x1, y1 = bbox.unbind(dim=-1)
+        bbox = torch.stack([torch.min(x0, x1), torch.min(y0, y1), torch.max(x0, x1), torch.max(y0, y1)], dim=-1)
+
+        attention_mask = None
+        if self.use_attention_mask:
+            attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
+
+        config = self.get_config()
+
+        return (
+            config,
+            input_ids,
+            bbox,
+            attention_mask,
+        )
+
+    def prepare_config_and_inputs_for_common(self):
+        """Return `(config, inputs_dict)` where `inputs_dict` holds the forward keyword arguments."""
+        config_and_inputs = self.prepare_config_and_inputs()
+        (
+            config,
+            input_ids,
+            bbox,
+            attention_mask,
+        ) = config_and_inputs
+
+        inputs_dict = {
+            "input_ids": input_ids,
+            "bbox": bbox,
+            "attention_mask": attention_mask,
+        }
+        return config, inputs_dict
+
+    def create_and_check_model(
+        self,
+        config,
+        input_ids,
+        bbox,
+        attention_mask,
+    ):
+        """Run `UdopEncoderModel` and check the shape of its last hidden state."""
+        model = UdopEncoderModel(config=config)
+        model.to(torch_device)
+        model.eval()
+        result = model(
+            input_ids=input_ids,
+            bbox=bbox,
+            attention_mask=attention_mask,
+        )
+        encoder_output = result.last_hidden_state
+
+        self.parent.assertEqual(encoder_output.size(), (self.batch_size, self.seq_length, self.hidden_size))
+
+    def create_and_check_model_fp16_forward(
+        self,
+        config,
+        input_ids,
+        bbox,
+        attention_mask,
+    ):
+        """
+        Run a half-precision forward pass of `UdopEncoderModel` and check the output has no NaN.
+
+        The arguments mirror the tuple returned by `prepare_config_and_inputs`, so that the helper can be called
+        with `*config_and_inputs` like `create_and_check_model`.
+        """
+        model = UdopEncoderModel(config=config).to(torch_device).half().eval()
+        output = model(input_ids, bbox=bbox, attention_mask=attention_mask)["last_hidden_state"]
+        self.parent.assertFalse(torch.isnan(output).any().item())
+
+
+class UdopEncoderOnlyModelTest(ModelTesterMixin, unittest.TestCase):
+    """Common model tests for the encoder-only `UdopEncoderModel`."""
+
+    all_model_classes = (UdopEncoderModel,) if is_torch_available() else ()
+    all_parallelizable_model_classes = (UdopEncoderModel,) if is_torch_available() else ()
+    test_pruning = False
+    test_torchscript = False
+    test_head_masking = False
+    test_resize_embeddings = False
+    test_model_parallel = True
+
+    def setUp(self):
+        self.model_tester = UdopEncoderOnlyModelTester(self)
+        self.config_tester = ConfigTester(self, config_class=UdopConfig, d_model=37)
+
+    def test_config(self):
+        self.config_tester.run_common_tests()
+
+    def test_model(self):
+        self.model_tester.create_and_check_model(*self.model_tester.prepare_config_and_inputs())
+
+    @unittest.skipIf(torch_device == "cpu", "Cant do half precision")
+    def test_model_fp16_forward(self):
+        self.model_tester.create_and_check_model_fp16_forward(*self.model_tester.prepare_config_and_inputs())
+
+    @unittest.skip(
+        reason="Not currently compatible. Fails with - NotImplementedError: Cannot copy out of meta tensor; no data!"
+    )
+    def test_save_load_low_cpu_mem_usage(self):
+        pass
+        pass
+
+
+@require_torch
+@require_sentencepiece
+@require_tokenizers
+@require_vision
+@slow
+class UdopModelIntegrationTests(unittest.TestCase):
+    """Slow integration test running the `microsoft/udop-large` checkpoint on a document image."""
+
+    @cached_property
+    def image(self):
+        # Document image downloaded from the `hf-internal-testing/fixtures_docvqa` dataset repository
+        filepath = hf_hub_download(
+            repo_id="hf-internal-testing/fixtures_docvqa", filename="document_2.png", repo_type="dataset"
+        )
+        image = Image.open(filepath).convert("RGB")
+
+        return image
+
+    @cached_property
+    def processor(self):
+        return UdopProcessor.from_pretrained("microsoft/udop-large")
+
+    @cached_property
+    def model(self):
+        return UdopForConditionalGeneration.from_pretrained("microsoft/udop-large").to(torch_device)
+
+    def test_conditional_generation(self):
+        processor = self.processor
+        model = self.model
+
+        prompt = "Question answering. In which year is the report made?"
+        # The model lives on `torch_device`, so the inputs have to be moved there as well
+        encoding = processor(images=self.image, text=prompt, return_tensors="pt").to(torch_device)
+
+        predicted_ids = model.generate(**encoding)
+
+        predicted_text = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
+        # `assertEquals` is a deprecated alias that was removed in Python 3.12
+        self.assertEqual(predicted_text, "2013")
--- a/tests/models/udop/test_processor_udop.py
+++ b/tests/models/udop/test_processor_udop.py
@@ -0,0 +1,508 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import os
+import shutil
+import tempfile
+import unittest
+from typing import List
+
+import numpy as np
+
+from transformers import PreTrainedTokenizer, PreTrainedTokenizerBase, PreTrainedTokenizerFast
+from transformers.models.udop import UdopTokenizer, UdopTokenizerFast
+from transformers.testing_utils import (
+    require_pytesseract,
+    require_sentencepiece,
+    require_tokenizers,
+    require_torch,
+    slow,
+)
+from transformers.utils import FEATURE_EXTRACTOR_NAME, cached_property, is_pytesseract_available, is_torch_available
+
+
+if is_torch_available():
+    import torch
+
+
+if is_pytesseract_available():
+    from PIL import Image
+
+    from transformers import LayoutLMv3ImageProcessor, UdopProcessor
+
+
+@require_pytesseract
+@require_sentencepiece
+@require_tokenizers
+class UdopProcessorTest(unittest.TestCase):
+    """Tests for `UdopProcessor` built from a `LayoutLMv3ImageProcessor` and a slow or fast UDOP tokenizer."""
+
+    tokenizer_class = UdopTokenizer
+    rust_tokenizer_class = UdopTokenizerFast
+    maxDiff = None
+
+    def setUp(self):
+        image_processor_map = {
+            "do_resize": True,
+            "size": 224,
+            "apply_ocr": True,
+        }
+
+        # The image processor config is written to a temporary directory (removed in `tearDown`) from which
+        # `get_image_processor` loads it.
+        self.tmpdirname = tempfile.mkdtemp()
+        self.feature_extraction_file = os.path.join(self.tmpdirname, FEATURE_EXTRACTOR_NAME)
+        with open(self.feature_extraction_file, "w", encoding="utf-8") as fp:
+            fp.write(json.dumps(image_processor_map) + "\n")
+
+        self.tokenizer_pretrained_name = "microsoft/udop-large"
+
+    def get_tokenizer(self, **kwargs) -> PreTrainedTokenizer:
+        return self.tokenizer_class.from_pretrained(self.tokenizer_pretrained_name, **kwargs)
+
+    def get_rust_tokenizer(self, **kwargs) -> PreTrainedTokenizerFast:
+        return self.rust_tokenizer_class.from_pretrained(self.tokenizer_pretrained_name, **kwargs)
+
+    def get_tokenizers(self, **kwargs) -> List[PreTrainedTokenizerBase]:
+        return [self.get_tokenizer(**kwargs), self.get_rust_tokenizer(**kwargs)]
+
+    def get_image_processor(self, **kwargs):
+        return LayoutLMv3ImageProcessor.from_pretrained(self.tmpdirname, **kwargs)
+
+    def tearDown(self):
+        shutil.rmtree(self.tmpdirname)
+
+    def prepare_image_inputs(self):
+        """Return a list with a single random RGB PIL image of height 30 and width 400."""
+
+        image_inputs = [np.random.randint(255, size=(3, 30, 400), dtype=np.uint8)]
+
+        # Channels-first array -> channels-last, as expected by `Image.fromarray`
+        image_inputs = [Image.fromarray(np.moveaxis(x, 0, -1)) for x in image_inputs]
+
+        return image_inputs
+
+    def test_save_load_pretrained_default(self):
+        image_processor = self.get_image_processor()
+        tokenizers = self.get_tokenizers()
+        for tokenizer in tokenizers:
+            processor = UdopProcessor(image_processor=image_processor, tokenizer=tokenizer)
+
+            processor.save_pretrained(self.tmpdirname)
+            processor = UdopProcessor.from_pretrained(self.tmpdirname)
+
+            self.assertEqual(processor.tokenizer.get_vocab(), tokenizer.get_vocab())
+            self.assertIsInstance(processor.tokenizer, (UdopTokenizer, UdopTokenizerFast))
+
+            self.assertEqual(processor.image_processor.to_json_string(), image_processor.to_json_string())
+            self.assertIsInstance(processor.image_processor, LayoutLMv3ImageProcessor)
+
+    def test_save_load_pretrained_additional_features(self):
+        processor = UdopProcessor(image_processor=self.get_image_processor(), tokenizer=self.get_tokenizer())
+        processor.save_pretrained(self.tmpdirname)
+
+        # slow tokenizer
+        tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)")
+        image_processor_add_kwargs = self.get_image_processor(do_resize=False, size=30)
+
+        processor = UdopProcessor.from_pretrained(
+            self.tmpdirname,
+            use_fast=False,
+            bos_token="(BOS)",
+            eos_token="(EOS)",
+            do_resize=False,
+            size=30,
+        )
+
+        self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab())
+        self.assertIsInstance(processor.tokenizer, UdopTokenizer)
+
+        self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string())
+        self.assertIsInstance(processor.image_processor, LayoutLMv3ImageProcessor)
+
+        # fast tokenizer
+        tokenizer_add_kwargs = self.get_rust_tokenizer(bos_token="(BOS)", eos_token="(EOS)")
+        image_processor_add_kwargs = self.get_image_processor(do_resize=False, size=30)
+
+        # Request the fast tokenizer explicitly, mirroring `use_fast=False` above (this used to pass
+        # `use_xlm=True`, a keyword left over from the LayoutXLM processor tests).
+        processor = UdopProcessor.from_pretrained(
+            self.tmpdirname, use_fast=True, bos_token="(BOS)", eos_token="(EOS)", do_resize=False, size=30
+        )
+
+        self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab())
+        self.assertIsInstance(processor.tokenizer, UdopTokenizerFast)
+
+        self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string())
+        self.assertIsInstance(processor.image_processor, LayoutLMv3ImageProcessor)
+
+    def test_model_input_names(self):
+        image_processor = self.get_image_processor()
+        tokenizer = self.get_tokenizer()
+
+        processor = UdopProcessor(tokenizer=tokenizer, image_processor=image_processor)
+
+        input_str = "lower newer"
+        image_input = self.prepare_image_inputs()
+
+        inputs = processor(text=input_str, images=image_input)
+
+        self.assertListEqual(list(inputs.keys()), processor.model_input_names)
+
+    def test_text_target(self):
+        image_processor = self.get_image_processor()
+        tokenizer = self.get_tokenizer()
+
+        processor = UdopProcessor(tokenizer=tokenizer, image_processor=image_processor)
+
+        text = "hello world"
+        expected_decoding = "hello world</s>"
+
+        # With only `text_target`, the processor output is expected to match the tokenizer output
+        encoding_processor = processor(text_target=text)
+        encoding_tokenizer = tokenizer(text_target=text)
+
+        self.assertListEqual(encoding_processor["input_ids"], [21820, 296, 1])
+        self.assertListEqual(encoding_processor["attention_mask"], [1, 1, 1])
+        self.assertDictEqual(dict(encoding_processor), dict(encoding_tokenizer))
+        self.assertEqual(tokenizer.decode(encoding_processor["input_ids"]), expected_decoding)
+
+    @slow
+    def test_overflowing_tokens(self):
+        # In the case of overflowing tokens, test that we still have 1-to-1 mapping between the images and input_ids (sequences that are too long are broken down into multiple sequences).
+
+        from datasets import load_dataset
+
+        # set up
+        datasets = load_dataset("nielsr/funsd")
+        processor = UdopProcessor.from_pretrained("microsoft/udop-large", apply_ocr=False)
+
+        def preprocess_data(examples):
+            images = [Image.open(path).convert("RGB") for path in examples["image_path"]]
+            words = examples["words"]
+            boxes = examples["bboxes"]
+            word_labels = examples["ner_tags"]
+            encoded_inputs = processor(
+                images,
+                words,
+                boxes=boxes,
+                word_labels=word_labels,
+                max_length=512,
+                padding="max_length",
+                truncation=True,
+                return_overflowing_tokens=True,
+                stride=50,
+                return_offsets_mapping=True,
+                return_tensors="pt",
+            )
+            return encoded_inputs
+
+        train_data = preprocess_data(datasets["train"])
+
+        self.assertEqual(len(train_data["pixel_values"]), len(train_data["input_ids"]))
+
+
+# different use cases tests
+@require_sentencepiece
+@require_torch
+@require_pytesseract
+class UdopProcessorIntegrationTests(unittest.TestCase):
+    @cached_property
+    def get_images(self):
+        # we verify our implementation on 2 document images from the DocVQA dataset
+        from datasets import load_dataset
+
+        ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test")
+
+        image_1 = Image.open(ds[0]["file"]).convert("RGB")
+        image_2 = Image.open(ds[1]["file"]).convert("RGB")
+
+        return image_1, image_2
+
+    @cached_property
+    def get_tokenizers(self):
+        slow_tokenizer = UdopTokenizer.from_pretrained("microsoft/udop-large")
+        fast_tokenizer = UdopTokenizerFast.from_pretrained("microsoft/udop-large")
+        return [slow_tokenizer, fast_tokenizer]
+
+    @slow
+    def test_processor_case_1(self):
+        # case 1: document image classification (training, inference) + token classification (inference), apply_ocr = True
+
+        image_processor = LayoutLMv3ImageProcessor()
+        tokenizers = self.get_tokenizers
+        images = self.get_images
+
+        for tokenizer in tokenizers:
+            processor = UdopProcessor(image_processor=image_processor, tokenizer=tokenizer)
+
+            # not batched
+            input_image_processor = image_processor(images[0], return_tensors="pt")
+            input_processor = processor(images[0], return_tensors="pt")
+
+            # verify keys
+            expected_keys = ["attention_mask", "bbox", "input_ids", "pixel_values"]
+            actual_keys = sorted(input_processor.keys())
+            self.assertListEqual(actual_keys, expected_keys)
+
+            # verify pixel_values
+            self.assertTrue(
+                torch.allclose(input_image_processor["pixel_values"], input_processor["pixel_values"], atol=1e-2)
+            )
+
+            # verify input_ids
+            # this was obtained with Tesseract 4.1.1
+            # fmt: off
+            expected_decoding = "11:14 to 11:39 a.m 11:39 to 11:44 a.m. 11:44 a.m. to 12:25 p.m. 12:25 to 12:58 p.m. 12:58 to 4:00 p.m. 2:00 to 5:00 p.m. Coffee Break Coffee will be served for men and women in the lobby adjacent to exhibit area. Please move into exhibit area. (Exhibits Open) TRRF GENERAL SESSION (PART |) Presiding: Lee A. Waller TRRF Vice President “Introductory Remarks” Lee A. Waller, TRRF Vice Presi- dent Individual Interviews with TRRF Public Board Members and Sci- entific Advisory Council Mem- bers Conducted by TRRF Treasurer Philip G. Kuehn to get answers which the public refrigerated warehousing industry is looking for. Plus questions from the floor. Dr. Emil M. Mrak, University of Cal- ifornia, Chairman, TRRF Board; Sam R. Cecil, University of Georgia College of Agriculture; Dr. Stanley Charm, Tufts University School of Medicine; Dr. Robert H. Cotton, ITT Continental Baking Company; Dr. Owen Fennema, University of Wis- consin; Dr. Robert E. Hardenburg, USDA. Questions and Answers Exhibits Open Capt. Jack Stoney Room TRRF Scientific Advisory Council Meeting Ballroom Foyer</s>"  # noqa: E231
+            # fmt: on
+            decoding = processor.decode(input_processor.input_ids.squeeze().tolist())
+            self.assertSequenceEqual(decoding, expected_decoding)
+
+            # batched
+            input_image_processor = image_processor(images, return_tensors="pt")
+            input_processor = processor(images, padding=True, return_tensors="pt")
+
+            # verify keys
+            expected_keys = ["attention_mask", "bbox", "input_ids", "pixel_values"]
+            actual_keys = sorted(input_processor.keys())
+            self.assertListEqual(actual_keys, expected_keys)
+
+            # verify pixel_values
+            self.assertTrue(
+                torch.allclose(input_image_processor["pixel_values"], input_processor["pixel_values"], atol=1e-2)
+            )
+
+            # verify input_ids
+            # this was obtained with Tesseract 4.1.1
+            # fmt: off
+            expected_decoding = "7 ITC Limited REPORT AND ACCOUNTS 2013 ITC’s Brands: An Asset for the Nation The consumer needs and aspirations they fulfil, the benefit they generate for millions across ITC’s value chains, the future-ready capabilities that support them, and the value that they create for the country, have made ITC’s brands national assets, adding to India’s competitiveness. It is ITC’s aspiration to be the No 1 FMCG player in the country, driven by its new FMCG businesses. A recent Nielsen report has highlighted that ITC's new FMCG businesses are the fastest growing among the top consumer goods companies operating in India. ITC takes justifiable pride that, along with generating economic value, these celebrated Indian brands also drive the creation of larger societal capital through the virtuous cycle of sustainable and inclusive growth. DI WILLS * ; LOVE DELIGHTFULLY SOFT SKIN? aia Ans Source: https://www.industrydocuments.ucsf.edu/docs/snbx0223</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>"  # noqa: E231
+            # fmt: on
+            decoding = processor.decode(input_processor.input_ids[1].tolist())
+            self.assertSequenceEqual(decoding, expected_decoding)
+
+    @slow
+    def test_processor_case_2(self):
+        # case 2: document image classification (training, inference) + token classification (inference), apply_ocr=False
+
+        image_processor = LayoutLMv3ImageProcessor(apply_ocr=False)
+        tokenizers = self.get_tokenizers
+        images = self.get_images
+
+        for tokenizer in tokenizers:
+            processor = UdopProcessor(image_processor=image_processor, tokenizer=tokenizer)
+
+            # not batched
+            words = ["hello", "world"]
+            boxes = [[1, 2, 3, 4], [5, 6, 7, 8]]
+            input_processor = processor(images[0], words, boxes=boxes, return_tensors="pt")
+
+            # verify keys
+            expected_keys = ["attention_mask", "bbox", "input_ids", "pixel_values"]
+            actual_keys = list(input_processor.keys())
+            for key in expected_keys:
+                self.assertIn(key, actual_keys)
+
+            # verify input_ids
+            expected_decoding = "hello world</s>"
+            decoding = processor.decode(input_processor.input_ids.squeeze().tolist())
+            self.assertSequenceEqual(decoding, expected_decoding)
+
+            # batched
+            words = [["hello", "world"], ["my", "name", "is", "niels"]]
+            boxes = [[[1, 2, 3, 4], [5, 6, 7, 8]], [[3, 2, 5, 1], [6, 7, 4, 2], [3, 9, 2, 4], [1, 1, 2, 3]]]
+            input_processor = processor(images, words, boxes=boxes, padding=True, return_tensors="pt")
+
+            # verify keys
+            expected_keys = ["attention_mask", "bbox", "input_ids", "pixel_values"]
+            actual_keys = sorted(input_processor.keys())
+            self.assertListEqual(actual_keys, expected_keys)
+
+            # verify input_ids
+            expected_decoding = "hello world</s><pad><pad><pad><pad>"
+            decoding = processor.decode(input_processor.input_ids[0].tolist())
+            self.assertSequenceEqual(decoding, expected_decoding)
+
+            # verify bbox
+            expected_bbox = [
+                [3, 2, 5, 1],
+                [6, 7, 4, 2],
+                [3, 9, 2, 4],
+                [1, 1, 2, 3],
+                [1, 1, 2, 3],
+                [1, 1, 2, 3],
+                [1000, 1000, 1000, 1000],
+            ]
+            self.assertListEqual(input_processor.bbox[1].tolist(), expected_bbox)
+
+    @slow
+    def test_processor_case_3(self):
+        # case 3: token classification (training), apply_ocr=False
+
+        image_processor = LayoutLMv3ImageProcessor(apply_ocr=False)
+        tokenizers = self.get_tokenizers
+        images = self.get_images
+
+        for tokenizer in tokenizers:
+            processor = UdopProcessor(image_processor=image_processor, tokenizer=tokenizer)
+
+            # not batched
+            words = ["weirdly", "world"]
+            boxes = [[1, 2, 3, 4], [5, 6, 7, 8]]
+            word_labels = [1, 2]
+            input_processor = processor(images[0], words, boxes=boxes, word_labels=word_labels, return_tensors="pt")
+
+            # verify keys
+            expected_keys = ["attention_mask", "bbox", "input_ids", "labels", "pixel_values"]
+            actual_keys = sorted(input_processor.keys())
+            self.assertListEqual(actual_keys, expected_keys)
+
+            # verify input_ids
+            expected_decoding = "weirdly world</s>"
+            decoding = processor.decode(input_processor.input_ids.squeeze().tolist())
+            self.assertSequenceEqual(decoding, expected_decoding)
+
+            # verify labels
+            expected_labels = [1, -100, 2, -100]
+            self.assertListEqual(input_processor.labels.squeeze().tolist(), expected_labels)
+
+            # batched
+            words = [["hello", "world"], ["my", "name", "is", "niels"]]
+            boxes = [[[1, 2, 3, 4], [5, 6, 7, 8]], [[3, 2, 5, 1], [6, 7, 4, 2], [3, 9, 2, 4], [1, 1, 2, 3]]]
+            word_labels = [[1, 2], [6, 3, 10, 2]]
+            input_processor = processor(
+                images, words, boxes=boxes, word_labels=word_labels, padding=True, return_tensors="pt"
+            )
+
+            # verify keys
+            expected_keys = ["attention_mask", "bbox", "input_ids", "labels", "pixel_values"]
+            actual_keys = sorted(input_processor.keys())
+            self.assertListEqual(actual_keys, expected_keys)
+
+            # verify input_ids
+            expected_decoding = "my name is niels</s>"
+            decoding = processor.decode(input_processor.input_ids[1].tolist())
+            self.assertSequenceEqual(decoding, expected_decoding)
+
+            # verify bbox
+            expected_bbox = [
+                [3, 2, 5, 1],
+                [6, 7, 4, 2],
+                [3, 9, 2, 4],
+                [1, 1, 2, 3],
+                [1, 1, 2, 3],
+                [1, 1, 2, 3],
+                [1000, 1000, 1000, 1000],
+            ]
+            self.assertListEqual(input_processor.bbox[1].tolist(), expected_bbox)
+
+            # verify labels
+            expected_labels = [6, 3, 10, 2, -100, -100, -100]
+            self.assertListEqual(input_processor.labels[1].tolist(), expected_labels)
+
+    @slow
+    def test_processor_case_4(self):
+        # case 4: visual question answering (inference), apply_ocr=True
+
+        image_processor = LayoutLMv3ImageProcessor()
+        tokenizers = self.get_tokenizers
+        images = self.get_images
+
+        for tokenizer in tokenizers:
+            processor = UdopProcessor(image_processor=image_processor, tokenizer=tokenizer)
+
+            # not batched
+            question = "What's his name?"
+            input_processor = processor(images[0], question, return_tensors="pt")
+
+            # verify keys
+            expected_keys = ["attention_mask", "bbox", "input_ids", "pixel_values"]
+            actual_keys = sorted(input_processor.keys())
+            self.assertListEqual(actual_keys, expected_keys)
+
+            # verify input_ids
+            # this was obtained with Tesseract 4.1.1
+            # fmt: off
+            expected_decoding = "What's his name?</s> 11:14 to 11:39 a.m 11:39 to 11:44 a.m. 11:44 a.m. to 12:25 p.m. 12:25 to 12:58 p.m. 12:58 to 4:00 p.m. 2:00 to 5:00 p.m. Coffee Break Coffee will be served for men and women in the lobby adjacent to exhibit area. Please move into exhibit area. (Exhibits Open) TRRF GENERAL SESSION (PART |) Presiding: Lee A. Waller TRRF Vice President “Introductory Remarks” Lee A. Waller, TRRF Vice Presi- dent Individual Interviews with TRRF Public Board Members and Sci- entific Advisory Council Mem- bers Conducted by TRRF Treasurer Philip G. Kuehn to get answers which the public refrigerated warehousing industry is looking for. Plus questions from the floor. Dr. Emil M. Mrak, University of Cal- ifornia, Chairman, TRRF Board; Sam R. Cecil, University of Georgia College of Agriculture; Dr. Stanley Charm, Tufts University School of Medicine; Dr. Robert H. Cotton, ITT Continental Baking Company; Dr. Owen Fennema, University of Wis- consin; Dr. Robert E. Hardenburg, USDA. Questions and Answers Exhibits Open Capt. Jack Stoney Room TRRF Scientific Advisory Council Meeting Ballroom Foyer</s>"  # noqa: E231
+            # fmt: on
+            decoding = processor.decode(input_processor.input_ids.squeeze().tolist())
+            self.assertSequenceEqual(decoding, expected_decoding)
+
+            # batched
+            questions = ["How old is he?", "what's the time"]
+            input_processor = processor(
+                images, questions, padding="max_length", max_length=20, truncation=True, return_tensors="pt"
+            )
+
+            # verify keys
+            expected_keys = ["attention_mask", "bbox", "input_ids", "pixel_values"]
+            actual_keys = sorted(input_processor.keys())
+            self.assertListEqual(actual_keys, expected_keys)
+
+            # verify input_ids
+            # this was obtained with Tesseract 4.1.1
+            expected_decoding = "what's the time</s> 7 ITC Limited REPORT AND ACCOUNTS 2013 I</s>"
+            decoding = processor.decode(input_processor.input_ids[1].tolist())
+            self.assertSequenceEqual(decoding, expected_decoding)
+
+            # verify bbox
+            # fmt: off
+            expected_bbox = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [1000, 1000, 1000, 1000], [0, 45, 67, 80], [72, 56, 109, 67], [72, 56, 109, 67], [116, 56, 189, 67], [198, 59, 253, 66], [198, 59, 253, 66], [257, 59, 285, 66], [289, 59, 365, 66], [289, 59, 365, 66], [289, 59, 365, 66], [289, 59, 365, 66], [372, 59, 407, 66], [74, 136, 161, 158], [1000, 1000, 1000, 1000]]  # noqa: E231
+            # fmt: on
+            self.assertListEqual(input_processor.bbox[1].tolist(), expected_bbox)
+
+    @slow
+    def test_processor_case_5(self):
+        # case 5: visual question answering (inference), apply_ocr=False
+
+        image_processor = LayoutLMv3ImageProcessor(apply_ocr=False)
+        tokenizers = self.get_tokenizers
+        images = self.get_images
+
+        for tokenizer in tokenizers:
+            processor = UdopProcessor(image_processor=image_processor, tokenizer=tokenizer)
+
+            # not batched
+            question = "What's his name?"
+            words = ["hello", "world"]
+            boxes = [[1, 2, 3, 4], [5, 6, 7, 8]]
+            input_processor = processor(images[0], question, words, boxes, return_tensors="pt")
+
+            # verify keys
+            expected_keys = ["attention_mask", "bbox", "input_ids", "pixel_values"]
+            actual_keys = sorted(input_processor.keys())
+            self.assertListEqual(actual_keys, expected_keys)
+
+            # verify input_ids
+            expected_decoding = "What's his name?</s> hello world</s>"
+            decoding = processor.decode(input_processor.input_ids.squeeze().tolist())
+            self.assertSequenceEqual(decoding, expected_decoding)
+
+            # batched
+            questions = ["How old is he?", "what's the time"]
+            words = [["hello", "world"], ["my", "name", "is", "niels"]]
+            boxes = [[[1, 2, 3, 4], [5, 6, 7, 8]], [[3, 2, 5, 1], [6, 7, 4, 2], [3, 9, 2, 4], [1, 1, 2, 3]]]
+            input_processor = processor(images, questions, words, boxes, padding=True, return_tensors="pt")
+
+            # verify keys
+            expected_keys = ["attention_mask", "bbox", "input_ids", "pixel_values"]
+            actual_keys = sorted(input_processor.keys())
+            self.assertListEqual(actual_keys, expected_keys)
+
+            # verify input_ids
+            expected_decoding = "How old is he?</s> hello world</s><pad><pad><pad>"
+            decoding = processor.decode(input_processor.input_ids[0].tolist())
+            self.assertSequenceEqual(decoding, expected_decoding)
+
+            expected_decoding = "what's the time</s> my name is niels</s>"
+            decoding = processor.decode(input_processor.input_ids[1].tolist())
+            self.assertSequenceEqual(decoding, expected_decoding)
+
+            # verify bbox
+            expected_bbox = [[3, 9, 2, 4], [1, 1, 2, 3], [1, 1, 2, 3], [1, 1, 2, 3], [1000, 1000, 1000, 1000]]
+            self.assertListEqual(input_processor.bbox[1].tolist()[-5:], expected_bbox)
--- a/tests/models/udop/test_tokenization_udop.py
+++ b/tests/models/udop/test_tokenization_udop.py