Enable `padding_side` as a call-time kwarg (#33385)

* fix

* add `padding_side` kwarg

* add `padding_side` in all models & fix tests

* fix copies

* fix tests
This commit is contained in:
Raushan Turganbay
2024-09-13 12:58:38 +02:00
committed by GitHub
parent 1027a532c5
commit 4b0418df11
26 changed files with 528 additions and 149 deletions

View File

@@ -21,6 +21,8 @@ import tempfile
import unittest
from typing import List
from parameterized import parameterized
from transformers import (
AddedToken,
LayoutLMv2TokenizerFast,
@@ -393,7 +395,8 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
def test_split_special_tokens(self):
pass
def test_encode_plus_with_padding(self):
@parameterized.expand([(True,), (False,)])
def test_encode_plus_with_padding(self, use_padding_as_call_kwarg: bool):
tokenizers = self.get_tokenizers(do_lower_case=False)
for tokenizer in tokenizers:
with self.subTest(f"{tokenizer.__class__.__name__}"):
@@ -444,15 +447,18 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
self.assertTrue(special_tokens_mask == not_padded_special_tokens_mask)
# Test right padding
tokenizer.padding_side = "right"
tokenizer_kwargs_right = {
"max_length": sequence_length + padding_size,
"padding": "max_length",
"return_special_tokens_mask": True,
}
right_padded_sequence = tokenizer.encode_plus(
words,
boxes=boxes,
max_length=sequence_length + padding_size,
padding="max_length",
return_special_tokens_mask=True,
)
if not use_padding_as_call_kwarg:
tokenizer.padding_side = "right"
else:
tokenizer_kwargs_right["padding_side"] = "right"
right_padded_sequence = tokenizer.encode_plus(words, boxes=boxes, **tokenizer_kwargs_right)
right_padded_input_ids = right_padded_sequence["input_ids"]
right_padded_special_tokens_mask = right_padded_sequence["special_tokens_mask"]
@@ -463,14 +469,18 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
self.assertTrue(special_tokens_mask + [1] * padding_size == right_padded_special_tokens_mask)
# Test left padding
tokenizer.padding_side = "left"
left_padded_sequence = tokenizer.encode_plus(
words,
boxes=boxes,
max_length=sequence_length + padding_size,
padding="max_length",
return_special_tokens_mask=True,
)
tokenizer_kwargs_left = {
"max_length": sequence_length + padding_size,
"padding": "max_length",
"return_special_tokens_mask": True,
}
if not use_padding_as_call_kwarg:
tokenizer.padding_side = "left"
else:
tokenizer_kwargs_left["padding_side"] = "left"
left_padded_sequence = tokenizer.encode_plus(words, boxes=boxes, **tokenizer_kwargs_left)
left_padded_input_ids = left_padded_sequence["input_ids"]
left_padded_special_tokens_mask = left_padded_sequence["special_tokens_mask"]
left_padded_sequence_length = len(left_padded_input_ids)