Correct order of overflowing tokens for LayoutLmV2 tokenizer (#13495)

* correct order of overflowing tokens for LayoutLmV2 tokenizer * test to check order of overflowing_tokens for a seq of input_ids * fix up quality * added suggested changes * check that tests the bbox sequence * pair_input test added * pass quality test * check bbox sequence added * unittest method * comments added * add overflowing bbox test * improved "seq_1" Co-authored-by: SaulLu <55560583+SaulLu@users.noreply.github.com> * improve code quality Co-authored-by: SaulLu <lucilesaul.com@gmail.com> Co-authored-by: SaulLu <55560583+SaulLu@users.noreply.github.com>
2021-11-09 18:19:53 +05:30
parent 95b3ec3bc9
commit 6326aa4bf0
3 changed files with 551 additions and 35 deletions
--- a/tests/test_tokenization_layoutlmv2.py
+++ b/tests/test_tokenization_layoutlmv2.py
@@ -15,6 +15,7 @@

 import inspect
 import os
+import re
 import shutil
 import tempfile
 import unittest
@@ -1777,13 +1778,515 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
    def test_alignement_methods(self):
        pass

-    @unittest.skip("LayoutLMv2 tokenizer requires boxes besides sequences.")
-    def test_maximum_encoding_length_pair_input(self):
-        pass
+    def get_clean_sequence(self, tokenizer, with_prefix_space=False, max_length=20, min_length=5):
+        toks = [(i, tokenizer.decode([i], clean_up_tokenization_spaces=False)) for i in range(len(tokenizer))]
+        toks = list(filter(lambda t: re.match(r"^[ a-zA-Z]+$", t[1]), toks))
+        toks = list(
+            filter(
+                lambda t: [t[0]]
+                == tokenizer.encode(t[1].split(" "), boxes=len(t[1]) * [[1, 1, 1, 1]], add_special_tokens=False),
+                toks,
+            )
+        )
+        if max_length is not None and len(toks) > max_length:
+            toks = toks[:max_length]
+        if min_length is not None and len(toks) < min_length and len(toks) > 0:
+            while len(toks) < min_length:
+                toks = toks + toks
+        # toks_str = [t[1] for t in toks]
+        toks_ids = [t[0] for t in toks]

-    @unittest.skip("LayoutLMv2 tokenizer requires boxes besides sequences.")
+        # Ensure consistency
+        output_txt = tokenizer.decode(toks_ids, clean_up_tokenization_spaces=False)
+        if " " not in output_txt and len(toks_ids) > 1:
+            output_txt = (
+                tokenizer.decode([toks_ids[0]], clean_up_tokenization_spaces=False)
+                + " "
+                + tokenizer.decode(toks_ids[1:], clean_up_tokenization_spaces=False)
+            )
+        if with_prefix_space:
+            output_txt = " " + output_txt
+        words = output_txt.split(" ")
+        boxes = [[i, i, i, i] for i in range(len(words))]
+        output_ids = tokenizer.encode(words, boxes=boxes, add_special_tokens=False)
+
+        return words, boxes, output_ids
+
+    # @unittest.skip("LayoutLMv2 tokenizer requires boxes besides sequences.")
+    def test_maximum_encoding_length_pair_input(self):
+        tokenizers = self.get_tokenizers(do_lower_case=False, model_max_length=100)
+        for tokenizer in tokenizers:
+            with self.subTest(f"{tokenizer.__class__.__name__}"):
+                # Build a sequence from our model's vocabulary
+                stride = 2
+                seq_0, boxes_0, ids = self.get_clean_sequence(tokenizer, max_length=20)
+                question_0 = " ".join(map(str, seq_0))
+                if len(ids) <= 2 + stride:
+                    seq_0 = (seq_0 + " ") * (2 + stride)
+                    ids = None
+
+                seq0_tokens = tokenizer(seq_0, boxes=boxes_0, add_special_tokens=False)
+                self.assertGreater(len(seq0_tokens["input_ids"]), 2 + stride)
+                question_1 = "This is another sentence to be encoded."
+                seq_1 = ["what", "a", "weird", "test", "weirdly", "weird"]
+                boxes_1 = [[i, i, i, i] for i in range(len(seq_1))]
+                seq1_tokens = tokenizer(seq_1, boxes=boxes_1, add_special_tokens=False)
+                if abs(len(seq0_tokens["input_ids"]) - len(seq1_tokens["input_ids"])) <= 2:
+                    seq1_tokens_input_ids = seq1_tokens["input_ids"] + seq1_tokens["input_ids"]
+                    seq_1 = tokenizer.decode(seq1_tokens_input_ids, clean_up_tokenization_spaces=False)
+                    seq_1 = seq_1.split(" ")
+                    boxes_1 = [[i, i, i, i] for i in range(len(seq_1))]
+                seq1_tokens = tokenizer(seq_1, boxes=boxes_1, add_special_tokens=False)
+
+                self.assertGreater(len(seq1_tokens["input_ids"]), 2 + stride)
+
+                smallest = (
+                    seq1_tokens["input_ids"]
+                    if len(seq0_tokens["input_ids"]) > len(seq1_tokens["input_ids"])
+                    else seq0_tokens["input_ids"]
+                )
+
+                # We are not using the special tokens - a bit too hard to test all the tokenizers with this
+                # TODO try this again later
+                sequence = tokenizer(
+                    question_0, seq_1, boxes=boxes_1, add_special_tokens=False
+                )  # , add_prefix_space=False)
+
+                # Test with max model input length
+                model_max_length = tokenizer.model_max_length
+                self.assertEqual(model_max_length, 100)
+                seq_2 = seq_0 * model_max_length
+                question_2 = " ".join(map(str, seq_2))
+                boxes_2 = boxes_0 * model_max_length
+                self.assertGreater(len(seq_2), model_max_length)
+
+                sequence1 = tokenizer(seq_1, boxes=boxes_1, add_special_tokens=False)
+                total_length1 = len(sequence1["input_ids"])
+                sequence2 = tokenizer(question_2, seq_1, boxes=boxes_1, add_special_tokens=False)
+                total_length2 = len(sequence2["input_ids"])
+                self.assertLess(total_length1, model_max_length, "Issue with the testing sequence, please update it.")
+                self.assertGreater(
+                    total_length2, model_max_length, "Issue with the testing sequence, please update it."
+                )
+
+                # Simple
+                padding_strategies = (
+                    [False, True, "longest"] if tokenizer.pad_token and tokenizer.pad_token_id >= 0 else [False]
+                )
+                for padding_state in padding_strategies:
+                    with self.subTest(f"{tokenizer.__class__.__name__} Padding: {padding_state}"):
+                        for truncation_state in [True, "longest_first", "only_first"]:
+                            with self.subTest(f"{tokenizer.__class__.__name__} Truncation: {truncation_state}"):
+                                output = tokenizer(
+                                    question_2,
+                                    seq_1,
+                                    boxes=boxes_1,
+                                    padding=padding_state,
+                                    truncation=truncation_state,
+                                )
+                                self.assertEqual(len(output["input_ids"]), model_max_length)
+                                self.assertEqual(len(output["bbox"]), model_max_length)
+
+                                output = tokenizer(
+                                    [question_2],
+                                    [seq_1],
+                                    boxes=[boxes_1],
+                                    padding=padding_state,
+                                    truncation=truncation_state,
+                                )
+                                self.assertEqual(len(output["input_ids"][0]), model_max_length)
+                                self.assertEqual(len(output["bbox"][0]), model_max_length)
+
+                        # Simple
+                        output = tokenizer(
+                            question_1, seq_2, boxes=boxes_2, padding=padding_state, truncation="only_second"
+                        )
+                        self.assertEqual(len(output["input_ids"]), model_max_length)
+                        self.assertEqual(len(output["bbox"]), model_max_length)
+
+                        output = tokenizer(
+                            [question_1], [seq_2], boxes=[boxes_2], padding=padding_state, truncation="only_second"
+                        )
+                        self.assertEqual(len(output["input_ids"][0]), model_max_length)
+                        self.assertEqual(len(output["bbox"][0]), model_max_length)
+
+                        # Simple with no truncation
+                        # Reset warnings
+                        tokenizer.deprecation_warnings = {}
+                        with self.assertLogs("transformers", level="WARNING") as cm:
+                            output = tokenizer(
+                                question_1, seq_2, boxes=boxes_2, padding=padding_state, truncation=False
+                            )
+                            self.assertNotEqual(len(output["input_ids"]), model_max_length)
+                            self.assertNotEqual(len(output["bbox"]), model_max_length)
+                        self.assertEqual(len(cm.records), 1)
+                        self.assertTrue(
+                            cm.records[0].message.startswith(
+                                "Token indices sequence length is longer than the specified maximum sequence length for this model"
+                            )
+                        )
+
+                        tokenizer.deprecation_warnings = {}
+                        with self.assertLogs("transformers", level="WARNING") as cm:
+                            output = tokenizer(
+                                [question_1], [seq_2], boxes=[boxes_2], padding=padding_state, truncation=False
+                            )
+                            self.assertNotEqual(len(output["input_ids"][0]), model_max_length)
+                            self.assertNotEqual(len(output["bbox"][0]), model_max_length)
+                        self.assertEqual(len(cm.records), 1)
+                        self.assertTrue(
+                            cm.records[0].message.startswith(
+                                "Token indices sequence length is longer than the specified maximum sequence length for this model"
+                            )
+                        )
+                # Check the order of Sequence of input ids, overflowing tokens and bbox sequence with truncation
+                truncated_first_sequence = (
+                    tokenizer(seq_0, boxes=boxes_0, add_special_tokens=False)["input_ids"][:-2]
+                    + tokenizer(seq_1, boxes=boxes_1, add_special_tokens=False)["input_ids"]
+                )
+                truncated_second_sequence = (
+                    tokenizer(seq_0, boxes=boxes_0, add_special_tokens=False)["input_ids"]
+                    + tokenizer(seq_1, boxes=boxes_1, add_special_tokens=False)["input_ids"][:-2]
+                )
+                truncated_longest_sequence = (
+                    truncated_first_sequence if len(seq0_tokens) > len(seq1_tokens) else truncated_second_sequence
+                )
+
+                overflow_first_sequence = (
+                    tokenizer(seq_0, boxes=boxes_0, add_special_tokens=False)["input_ids"][-(2 + stride) :]
+                    + tokenizer(seq_1, boxes=boxes_1, add_special_tokens=False)["input_ids"]
+                )
+                overflow_second_sequence = (
+                    tokenizer(seq_0, boxes=boxes_0, add_special_tokens=False)["input_ids"]
+                    + tokenizer(seq_1, boxes=boxes_1, add_special_tokens=False)["input_ids"][-(2 + stride) :]
+                )
+                overflow_longest_sequence = (
+                    overflow_first_sequence if len(seq0_tokens) > len(seq1_tokens) else overflow_second_sequence
+                )
+
+                bbox_first = [[0, 0, 0, 0]] * (len(seq_0) - 2)
+                bbox_first_sequence = bbox_first + tokenizer(seq_1, boxes=boxes_1, add_special_tokens=False)["bbox"]
+                overflowing_token_bbox_first_sequence_slow = [[0, 0, 0, 0]] * (2 + stride)
+                overflowing_token_bbox_first_sequence_fast = [[0, 0, 0, 0]] * (2 + stride) + tokenizer(
+                    seq_1, boxes=boxes_1, add_special_tokens=False
+                )["bbox"]
+
+                bbox_second = [[0, 0, 0, 0]] * len(seq_0)
+                bbox_second_sequence = (
+                    bbox_second + tokenizer(seq_1, boxes=boxes_1, add_special_tokens=False)["bbox"][:-2]
+                )
+                overflowing_token_bbox_second_sequence_slow = tokenizer(
+                    seq_1, boxes=boxes_1, add_special_tokens=False
+                )["bbox"][-(2 + stride) :]
+                overflowing_token_bbox_second_sequence_fast = [[0, 0, 0, 0]] * len(seq_0) + tokenizer(
+                    seq_1, boxes=boxes_1, add_special_tokens=False
+                )["bbox"][-(2 + stride) :]
+
+                bbox_longest_sequence = (
+                    bbox_first_sequence if len(seq0_tokens) > len(seq1_tokens) else bbox_second_sequence
+                )
+                overflowing_token_bbox_longest_sequence_fast = (
+                    overflowing_token_bbox_first_sequence_fast
+                    if len(seq0_tokens) > len(seq1_tokens)
+                    else overflowing_token_bbox_second_sequence_fast
+                )
+
+                # Overflowing tokens are handled quite differently in slow and fast tokenizers
+                if isinstance(tokenizer, LayoutLMv2TokenizerFast):
+                    information = tokenizer(
+                        question_0,
+                        seq_1,
+                        boxes=boxes_1,
+                        max_length=len(sequence["input_ids"]) - 2,
+                        add_special_tokens=False,
+                        stride=stride,
+                        truncation="longest_first",
+                        return_overflowing_tokens=True,
+                        # add_prefix_space=False,
+                    )
+                    truncated_sequence = information["input_ids"][0]
+                    overflowing_tokens = information["input_ids"][1]
+                    bbox = information["bbox"][0]
+                    overflowing_bbox = information["bbox"][1]
+                    self.assertEqual(len(information["input_ids"]), 2)
+
+                    self.assertEqual(len(truncated_sequence), len(sequence["input_ids"]) - 2)
+                    self.assertEqual(truncated_sequence, truncated_longest_sequence)
+
+                    self.assertEqual(len(overflowing_tokens), 2 + stride + len(smallest))
+                    self.assertEqual(overflowing_tokens, overflow_longest_sequence)
+                    self.assertEqual(bbox, bbox_longest_sequence)
+
+                    self.assertEqual(len(overflowing_bbox), 2 + stride + len(smallest))
+                    self.assertEqual(overflowing_bbox, overflowing_token_bbox_longest_sequence_fast)
+                else:
+                    # No overflowing tokens when using 'longest' in python tokenizers
+                    with self.assertRaises(ValueError) as context:
+                        information = tokenizer(
+                            question_0,
+                            seq_1,
+                            boxes=boxes_1,
+                            max_length=len(sequence["input_ids"]) - 2,
+                            add_special_tokens=False,
+                            stride=stride,
+                            truncation="longest_first",
+                            return_overflowing_tokens=True,
+                            # add_prefix_space=False,
+                        )
+
+                    self.assertTrue(
+                        context.exception.args[0].startswith(
+                            "Not possible to return overflowing tokens for pair of sequences with the "
+                            "`longest_first`. Please select another truncation strategy than `longest_first`, "
+                            "for instance `only_second` or `only_first`."
+                        )
+                    )
+
+                # Overflowing tokens are handled quite differently in slow and fast tokenizers
+                if isinstance(tokenizer, LayoutLMv2TokenizerFast):
+                    information = tokenizer(
+                        question_0,
+                        seq_1,
+                        boxes=boxes_1,
+                        max_length=len(sequence["input_ids"]) - 2,
+                        add_special_tokens=False,
+                        stride=stride,
+                        truncation=True,
+                        return_overflowing_tokens=True,
+                        # add_prefix_space=False,
+                    )
+                    truncated_sequence = information["input_ids"][0]
+                    overflowing_tokens = information["input_ids"][1]
+                    bbox = information["bbox"][0]
+                    overflowing_bbox = information["bbox"][1]
+                    self.assertEqual(len(information["input_ids"]), 2)
+
+                    self.assertEqual(len(truncated_sequence), len(sequence["input_ids"]) - 2)
+                    self.assertEqual(truncated_sequence, truncated_longest_sequence)
+
+                    self.assertEqual(len(overflowing_tokens), 2 + stride + len(smallest))
+                    self.assertEqual(overflowing_tokens, overflow_longest_sequence)
+                    self.assertEqual(bbox, bbox_longest_sequence)
+                    self.assertEqual(overflowing_bbox, overflowing_token_bbox_longest_sequence_fast)
+                else:
+                    # No overflowing tokens when using 'longest' in python tokenizers
+                    with self.assertRaises(ValueError) as context:
+                        information = tokenizer(
+                            question_0,
+                            seq_1,
+                            boxes=boxes_1,
+                            max_length=len(sequence["input_ids"]) - 2,
+                            add_special_tokens=False,
+                            stride=stride,
+                            truncation=True,
+                            return_overflowing_tokens=True,
+                            # add_prefix_space=False,
+                        )
+
+                    self.assertTrue(
+                        context.exception.args[0].startswith(
+                            "Not possible to return overflowing tokens for pair of sequences with the "
+                            "`longest_first`. Please select another truncation strategy than `longest_first`, "
+                            "for instance `only_second` or `only_first`."
+                        )
+                    )
+
+                information_first_truncated = tokenizer(
+                    question_0,
+                    seq_1,
+                    boxes=boxes_1,
+                    max_length=len(sequence["input_ids"]) - 2,
+                    add_special_tokens=False,
+                    stride=stride,
+                    truncation="only_first",
+                    return_overflowing_tokens=True,
+                    # add_prefix_space=False,
+                )
+                # Overflowing tokens are handled quite differently in slow and fast tokenizers
+                if isinstance(tokenizer, LayoutLMv2TokenizerFast):
+                    truncated_sequence = information_first_truncated["input_ids"][0]
+                    overflowing_tokens = information_first_truncated["input_ids"][1]
+                    bbox = information_first_truncated["bbox"][0]
+                    overflowing_bbox = information_first_truncated["bbox"][1]
+                    self.assertEqual(len(information_first_truncated["input_ids"]), 2)
+
+                    self.assertEqual(len(truncated_sequence), len(sequence["input_ids"]) - 2)
+                    self.assertEqual(truncated_sequence, truncated_first_sequence)
+
+                    self.assertEqual(len(overflowing_tokens), 2 + stride + len(seq1_tokens["input_ids"]))
+                    self.assertEqual(overflowing_tokens, overflow_first_sequence)
+                    self.assertEqual(bbox, bbox_first_sequence)
+                    self.assertEqual(overflowing_bbox, overflowing_token_bbox_first_sequence_fast)
+                else:
+                    truncated_sequence = information_first_truncated["input_ids"]
+                    overflowing_tokens = information_first_truncated["overflowing_tokens"]
+                    overflowing_bbox = information_first_truncated["overflowing_token_boxes"]
+                    bbox = information_first_truncated["bbox"]
+
+                    self.assertEqual(len(truncated_sequence), len(sequence["input_ids"]) - 2)
+                    self.assertEqual(truncated_sequence, truncated_first_sequence)
+
+                    self.assertEqual(len(overflowing_tokens), 2 + stride)
+                    self.assertEqual(overflowing_tokens, seq0_tokens["input_ids"][-(2 + stride) :])
+                    self.assertEqual(bbox, bbox_first_sequence)
+                    self.assertEqual(overflowing_bbox, overflowing_token_bbox_first_sequence_slow)
+
+                information_second_truncated = tokenizer(
+                    question_0,
+                    seq_1,
+                    boxes=boxes_1,
+                    max_length=len(sequence["input_ids"]) - 2,
+                    add_special_tokens=False,
+                    stride=stride,
+                    truncation="only_second",
+                    return_overflowing_tokens=True,
+                    # add_prefix_space=False,
+                )
+                # Overflowing tokens are handled quite differently in slow and fast tokenizers
+                if isinstance(tokenizer, LayoutLMv2TokenizerFast):
+                    truncated_sequence = information_second_truncated["input_ids"][0]
+                    overflowing_tokens = information_second_truncated["input_ids"][1]
+                    bbox = information_second_truncated["bbox"][0]
+                    overflowing_bbox = information_second_truncated["bbox"][1]
+
+                    self.assertEqual(len(information_second_truncated["input_ids"]), 2)
+
+                    self.assertEqual(len(truncated_sequence), len(sequence["input_ids"]) - 2)
+                    self.assertEqual(truncated_sequence, truncated_second_sequence)
+
+                    self.assertEqual(len(overflowing_tokens), 2 + stride + len(seq0_tokens["input_ids"]))
+                    self.assertEqual(overflowing_tokens, overflow_second_sequence)
+                    self.assertEqual(bbox, bbox_second_sequence)
+                    self.assertEqual(overflowing_bbox, overflowing_token_bbox_second_sequence_fast)
+                else:
+                    truncated_sequence = information_second_truncated["input_ids"]
+                    overflowing_tokens = information_second_truncated["overflowing_tokens"]
+                    bbox = information_second_truncated["bbox"]
+                    overflowing_bbox = information_second_truncated["overflowing_token_boxes"]
+
+                    self.assertEqual(len(truncated_sequence), len(sequence["input_ids"]) - 2)
+                    self.assertEqual(truncated_sequence, truncated_second_sequence)
+
+                    self.assertEqual(len(overflowing_tokens), 2 + stride)
+                    self.assertEqual(overflowing_tokens, seq1_tokens["input_ids"][-(2 + stride) :])
+                    self.assertEqual(bbox, bbox_second_sequence)
+                    self.assertEqual(overflowing_bbox, overflowing_token_bbox_second_sequence_slow)
+
+    # @unittest.skip("LayoutLMv2 tokenizer requires boxes besides sequences.")
    def test_maximum_encoding_length_single_input(self):
-        pass
+        tokenizers = self.get_tokenizers(do_lower_case=False, model_max_length=100)
+        for tokenizer in tokenizers:
+            with self.subTest(f"{tokenizer.__class__.__name__}"):
+                seq_0, boxes_0, ids = self.get_clean_sequence(tokenizer, max_length=20)
+
+                sequence = tokenizer(seq_0, boxes=boxes_0, add_special_tokens=False)
+                total_length = len(sequence["input_ids"])
+
+                self.assertGreater(total_length, 4, "Issue with the testing sequence, please update it it's too short")
+
+                # Test with max model input length
+                model_max_length = tokenizer.model_max_length
+                self.assertEqual(model_max_length, 100)
+                seq_1 = seq_0 * model_max_length
+                boxes_1 = boxes_0 * model_max_length
+                sequence1 = tokenizer(seq_1, boxes=boxes_1, add_special_tokens=False)
+                total_length1 = len(sequence1["input_ids"])
+                self.assertGreater(
+                    total_length1, model_max_length, "Issue with the testing sequence, please update it it's too short"
+                )
+
+                # Simple
+                padding_strategies = (
+                    [False, True, "longest"] if tokenizer.pad_token and tokenizer.pad_token_id >= 0 else [False]
+                )
+                for padding_state in padding_strategies:
+                    with self.subTest(f"Padding: {padding_state}"):
+                        for truncation_state in [True, "longest_first", "only_first"]:
+                            with self.subTest(f"Truncation: {truncation_state}"):
+                                output = tokenizer(
+                                    seq_1,
+                                    boxes=boxes_1,
+                                    padding=padding_state,
+                                    truncation=truncation_state,
+                                )
+                                self.assertEqual(len(output["input_ids"]), model_max_length)
+                                self.assertEqual(len(output["bbox"]), model_max_length)
+
+                                output = tokenizer(
+                                    [seq_1],
+                                    boxes=[boxes_1],
+                                    padding=padding_state,
+                                    truncation=truncation_state,
+                                )
+                                self.assertEqual(len(output["input_ids"][0]), model_max_length)
+                                self.assertEqual(len(output["bbox"][0]), model_max_length)
+
+                        # Simple with no truncation
+                        # Reset warnings
+                        tokenizer.deprecation_warnings = {}
+                        with self.assertLogs("transformers", level="WARNING") as cm:
+                            output = tokenizer(seq_1, boxes=boxes_1, padding=padding_state, truncation=False)
+                            self.assertNotEqual(len(output["input_ids"]), model_max_length)
+                            self.assertNotEqual(len(output["bbox"]), model_max_length)
+                        self.assertEqual(len(cm.records), 1)
+                        self.assertTrue(
+                            cm.records[0].message.startswith(
+                                "Token indices sequence length is longer than the specified maximum sequence length for this model"
+                            )
+                        )
+
+                        tokenizer.deprecation_warnings = {}
+                        with self.assertLogs("transformers", level="WARNING") as cm:
+                            output = tokenizer([seq_1], boxes=[boxes_1], padding=padding_state, truncation=False)
+                            self.assertNotEqual(len(output["input_ids"][0]), model_max_length)
+                            self.assertNotEqual(len(output["bbox"][0]), model_max_length)
+                        self.assertEqual(len(cm.records), 1)
+                        self.assertTrue(
+                            cm.records[0].message.startswith(
+                                "Token indices sequence length is longer than the specified maximum sequence length for this model"
+                            )
+                        )
+                # Check the order of Sequence of input ids, overflowing tokens and bbox sequence with truncation
+                stride = 2
+                information = tokenizer(
+                    seq_0,
+                    boxes=boxes_0,
+                    max_length=total_length - 2,
+                    add_special_tokens=False,
+                    stride=stride,
+                    truncation=True,
+                    return_overflowing_tokens=True,
+                    # add_prefix_space=False,
+                )
+
+                # Overflowing tokens are handled quite differently in slow and fast tokenizers
+                if isinstance(tokenizer, LayoutLMv2TokenizerFast):
+                    truncated_sequence = information["input_ids"][0]
+                    overflowing_tokens = information["input_ids"][1]
+                    bbox = information["bbox"][0]
+                    overflowing_bbox = information["bbox"][1]
+                    self.assertEqual(len(information["input_ids"]), 2)
+
+                    self.assertEqual(len(truncated_sequence), total_length - 2)
+                    self.assertEqual(truncated_sequence, sequence["input_ids"][:-2])
+
+                    self.assertEqual(len(overflowing_tokens), 2 + stride)
+                    self.assertEqual(overflowing_tokens, sequence["input_ids"][-(2 + stride) :])
+
+                    self.assertEqual(bbox, sequence["bbox"][:-2])
+                    self.assertEqual(overflowing_bbox, sequence["bbox"][-(2 + stride) :])
+                else:
+                    truncated_sequence = information["input_ids"]
+                    overflowing_tokens = information["overflowing_tokens"]
+                    bbox = information["bbox"]
+                    overflowing_bbox = information["overflowing_token_boxes"]
+                    self.assertEqual(len(truncated_sequence), total_length - 2)
+                    self.assertEqual(truncated_sequence, sequence["input_ids"][:-2])
+
+                    self.assertEqual(len(overflowing_tokens), 2 + stride)
+                    self.assertEqual(overflowing_tokens, sequence["input_ids"][-(2 + stride) :])
+                    self.assertEqual(bbox, sequence["bbox"][:-2])
+                    self.assertEqual(overflowing_bbox, sequence["bbox"][-(2 + stride) :])

    @unittest.skip("LayoutLMv2 tokenizer requires boxes besides sequences.")
    def test_pretokenized_inputs(self):