fix LayoutLMv3TokenizerFast subword label after 'Ġ' token (#21695)

LayoutLMv3TokenizerFast produces an empty 'Ġ' token with `offset_mapping = (0, 0)`.
The next token is wrongly assumed to also be the beginning of a word, so it
isn't correctly assigned `pad_token_label`.
Modify the test to use text that produces a 'Ġ' token.
Remove copy check from LayoutLMv2TokenizerFast for `_batch_encode_plus`.

solves issue: #19978
This commit is contained in:
Thibault Douzon
2023-04-03 16:32:36 +02:00
committed by GitHub
parent a60010566a
commit 4e441e529c
2 changed files with 12 additions and 8 deletions

View File

@@ -2277,14 +2277,14 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
@slow
def test_only_label_first_subword(self):
words = ["hello", "niels"]
words = ["hello", "niels", "0000000000000000"]
boxes = [[1000, 1000, 1000, 1000] for _ in range(len(words))]
word_labels = [0, 1]
word_labels = [0, 1, 2]
# test slow tokenizer
tokenizer_p = LayoutLMv3Tokenizer.from_pretrained("microsoft/layoutlmv3-base", add_visual_labels=False)
encoding = tokenizer_p(words, boxes=boxes, word_labels=word_labels)
self.assertListEqual(encoding.labels, [-100, 0, 1, -100, -100])
self.assertListEqual(encoding.labels, [-100, 0, 1, -100, 2, -100, -100])
tokenizer_p = LayoutLMv3Tokenizer.from_pretrained(
"microsoft/layoutlmv3-base",
@@ -2292,12 +2292,12 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
add_visual_labels=False,
)
encoding = tokenizer_p(words, boxes=boxes, word_labels=word_labels)
self.assertListEqual(encoding.labels, [-100, 0, 1, 1, -100])
self.assertListEqual(encoding.labels, [-100, 0, 1, 1, 2, 2, -100])
# test fast tokenizer
tokenizer_r = LayoutLMv3TokenizerFast.from_pretrained("microsoft/layoutlmv3-base", add_visual_labels=False)
encoding = tokenizer_r(words, boxes=boxes, word_labels=word_labels)
self.assertListEqual(encoding.labels, [-100, 0, 1, -100, -100])
self.assertListEqual(encoding.labels, [-100, 0, 1, -100, 2, -100, -100])
tokenizer_r = LayoutLMv3Tokenizer.from_pretrained(
"microsoft/layoutlmv3-base",
@@ -2305,7 +2305,7 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
add_visual_labels=False,
)
encoding = tokenizer_r(words, boxes=boxes, word_labels=word_labels)
self.assertListEqual(encoding.labels, [-100, 0, 1, 1, -100])
self.assertListEqual(encoding.labels, [-100, 0, 1, 1, 2, 2, -100])
@slow
def test_layoutlmv3_integration_test(self):