fix LayoutLMv3TokenizerFast subword label after 'Ġ' token (#21695)
LayoutLMv3TokenizerFast produces an empty 'Ġ' token with `offset_mapping = (0, 0)`. The next token is then wrongly assumed to also be the beginning of a word and isn't correctly assigned `pad_token_label`. Modify the test to use text that produces a 'Ġ' token. Remove the `# Copied from` check against LayoutLMv2TokenizerFast for `_batch_encode_plus`, since the LayoutLMv3 implementation now differs. Solves issue: #19978
This commit is contained in:
@@ -508,7 +508,6 @@ class LayoutLMv3TokenizerFast(PreTrainedTokenizerFast):
|
|||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Copied from transformers.models.layoutlmv2.tokenization_layoutlmv2_fast.LayoutLMv2TokenizerFast._batch_encode_plus with LayoutLMv2->LayoutLMv3
|
|
||||||
def _batch_encode_plus(
|
def _batch_encode_plus(
|
||||||
self,
|
self,
|
||||||
batch_text_or_text_pairs: Union[
|
batch_text_or_text_pairs: Union[
|
||||||
@@ -640,6 +639,7 @@ class LayoutLMv3TokenizerFast(PreTrainedTokenizerFast):
|
|||||||
else:
|
else:
|
||||||
original_index = batch_index
|
original_index = batch_index
|
||||||
labels_example = []
|
labels_example = []
|
||||||
|
previous_token_empty = False
|
||||||
for id, offset, word_id in zip(
|
for id, offset, word_id in zip(
|
||||||
sanitized_tokens["input_ids"][batch_index],
|
sanitized_tokens["input_ids"][batch_index],
|
||||||
sanitized_tokens["offset_mapping"][batch_index],
|
sanitized_tokens["offset_mapping"][batch_index],
|
||||||
@@ -647,11 +647,15 @@ class LayoutLMv3TokenizerFast(PreTrainedTokenizerFast):
|
|||||||
):
|
):
|
||||||
if word_id is not None:
|
if word_id is not None:
|
||||||
if self.only_label_first_subword:
|
if self.only_label_first_subword:
|
||||||
if offset[0] == 0:
|
if offset[0] == 0 and not previous_token_empty:
|
||||||
# Use the real label id for the first token of the word, and padding ids for the remaining tokens
|
# Use the real label id for the first token of the word, and padding ids for the remaining tokens
|
||||||
labels_example.append(word_labels[original_index][word_id])
|
labels_example.append(word_labels[original_index][word_id])
|
||||||
else:
|
else:
|
||||||
labels_example.append(self.pad_token_label)
|
labels_example.append(self.pad_token_label)
|
||||||
|
if offset == (0, 0):
|
||||||
|
previous_token_empty = True
|
||||||
|
else:
|
||||||
|
previous_token_empty = False
|
||||||
else:
|
else:
|
||||||
labels_example.append(word_labels[original_index][word_id])
|
labels_example.append(word_labels[original_index][word_id])
|
||||||
else:
|
else:
|
||||||
|
|||||||
@@ -2277,14 +2277,14 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
|||||||
|
|
||||||
@slow
|
@slow
|
||||||
def test_only_label_first_subword(self):
|
def test_only_label_first_subword(self):
|
||||||
words = ["hello", "niels"]
|
words = ["hello", "niels", "0000000000000000"]
|
||||||
boxes = [[1000, 1000, 1000, 1000] for _ in range(len(words))]
|
boxes = [[1000, 1000, 1000, 1000] for _ in range(len(words))]
|
||||||
word_labels = [0, 1]
|
word_labels = [0, 1, 2]
|
||||||
|
|
||||||
# test slow tokenizer
|
# test slow tokenizer
|
||||||
tokenizer_p = LayoutLMv3Tokenizer.from_pretrained("microsoft/layoutlmv3-base", add_visual_labels=False)
|
tokenizer_p = LayoutLMv3Tokenizer.from_pretrained("microsoft/layoutlmv3-base", add_visual_labels=False)
|
||||||
encoding = tokenizer_p(words, boxes=boxes, word_labels=word_labels)
|
encoding = tokenizer_p(words, boxes=boxes, word_labels=word_labels)
|
||||||
self.assertListEqual(encoding.labels, [-100, 0, 1, -100, -100])
|
self.assertListEqual(encoding.labels, [-100, 0, 1, -100, 2, -100, -100])
|
||||||
|
|
||||||
tokenizer_p = LayoutLMv3Tokenizer.from_pretrained(
|
tokenizer_p = LayoutLMv3Tokenizer.from_pretrained(
|
||||||
"microsoft/layoutlmv3-base",
|
"microsoft/layoutlmv3-base",
|
||||||
@@ -2292,12 +2292,12 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
|||||||
add_visual_labels=False,
|
add_visual_labels=False,
|
||||||
)
|
)
|
||||||
encoding = tokenizer_p(words, boxes=boxes, word_labels=word_labels)
|
encoding = tokenizer_p(words, boxes=boxes, word_labels=word_labels)
|
||||||
self.assertListEqual(encoding.labels, [-100, 0, 1, 1, -100])
|
self.assertListEqual(encoding.labels, [-100, 0, 1, 1, 2, 2, -100])
|
||||||
|
|
||||||
# test fast tokenizer
|
# test fast tokenizer
|
||||||
tokenizer_r = LayoutLMv3TokenizerFast.from_pretrained("microsoft/layoutlmv3-base", add_visual_labels=False)
|
tokenizer_r = LayoutLMv3TokenizerFast.from_pretrained("microsoft/layoutlmv3-base", add_visual_labels=False)
|
||||||
encoding = tokenizer_r(words, boxes=boxes, word_labels=word_labels)
|
encoding = tokenizer_r(words, boxes=boxes, word_labels=word_labels)
|
||||||
self.assertListEqual(encoding.labels, [-100, 0, 1, -100, -100])
|
self.assertListEqual(encoding.labels, [-100, 0, 1, -100, 2, -100, -100])
|
||||||
|
|
||||||
tokenizer_r = LayoutLMv3Tokenizer.from_pretrained(
|
tokenizer_r = LayoutLMv3Tokenizer.from_pretrained(
|
||||||
"microsoft/layoutlmv3-base",
|
"microsoft/layoutlmv3-base",
|
||||||
@@ -2305,7 +2305,7 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
|||||||
add_visual_labels=False,
|
add_visual_labels=False,
|
||||||
)
|
)
|
||||||
encoding = tokenizer_r(words, boxes=boxes, word_labels=word_labels)
|
encoding = tokenizer_r(words, boxes=boxes, word_labels=word_labels)
|
||||||
self.assertListEqual(encoding.labels, [-100, 0, 1, 1, -100])
|
self.assertListEqual(encoding.labels, [-100, 0, 1, 1, 2, 2, -100])
|
||||||
|
|
||||||
@slow
|
@slow
|
||||||
def test_layoutlmv3_integration_test(self):
|
def test_layoutlmv3_integration_test(self):
|
||||||
|
|||||||
Reference in New Issue
Block a user