Enable padding_side as a call-time kwarg (#33385)
* fix
* add padding-side kwarg
* add padding side in all models & fix tests
* fix copies
* fix tests
This commit is contained in:
committed by
GitHub
parent
1027a532c5
commit
4b0418df11
@@ -19,6 +19,8 @@ import tempfile
|
||||
import unittest
|
||||
from typing import List
|
||||
|
||||
from parameterized import parameterized
|
||||
|
||||
from transformers import (
|
||||
AddedToken,
|
||||
LayoutXLMTokenizerFast,
|
||||
@@ -324,7 +326,8 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
decoded = tokenizer.decode(encoded, spaces_between_special_tokens=self.space_between_special_tokens)
|
||||
self.assertIn(decoded, [output, output.lower()])
|
||||
|
||||
def test_encode_plus_with_padding(self):
|
||||
@parameterized.expand([(True,), (False,)])
|
||||
def test_encode_plus_with_padding(self, use_padding_as_call_kwarg: bool):
|
||||
tokenizers = self.get_tokenizers(do_lower_case=False)
|
||||
for tokenizer in tokenizers:
|
||||
with self.subTest(f"{tokenizer.__class__.__name__}"):
|
||||
@@ -375,15 +378,18 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
self.assertTrue(special_tokens_mask == not_padded_special_tokens_mask)
|
||||
|
||||
# Test right padding
|
||||
tokenizer.padding_side = "right"
|
||||
tokenizer_kwargs_right = {
|
||||
"max_length": sequence_length + padding_size,
|
||||
"padding": "max_length",
|
||||
"return_special_tokens_mask": True,
|
||||
}
|
||||
|
||||
right_padded_sequence = tokenizer.encode_plus(
|
||||
words,
|
||||
boxes=boxes,
|
||||
max_length=sequence_length + padding_size,
|
||||
padding="max_length",
|
||||
return_special_tokens_mask=True,
|
||||
)
|
||||
if not use_padding_as_call_kwarg:
|
||||
tokenizer.padding_side = "right"
|
||||
else:
|
||||
tokenizer_kwargs_right["padding_side"] = "right"
|
||||
|
||||
right_padded_sequence = tokenizer.encode_plus(words, boxes=boxes, **tokenizer_kwargs_right)
|
||||
right_padded_input_ids = right_padded_sequence["input_ids"]
|
||||
|
||||
right_padded_special_tokens_mask = right_padded_sequence["special_tokens_mask"]
|
||||
@@ -394,14 +400,18 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
self.assertTrue(special_tokens_mask + [1] * padding_size == right_padded_special_tokens_mask)
|
||||
|
||||
# Test left padding
|
||||
tokenizer.padding_side = "left"
|
||||
left_padded_sequence = tokenizer.encode_plus(
|
||||
words,
|
||||
boxes=boxes,
|
||||
max_length=sequence_length + padding_size,
|
||||
padding="max_length",
|
||||
return_special_tokens_mask=True,
|
||||
)
|
||||
tokenizer_kwargs_left = {
|
||||
"max_length": sequence_length + padding_size,
|
||||
"padding": "max_length",
|
||||
"return_special_tokens_mask": True,
|
||||
}
|
||||
|
||||
if not use_padding_as_call_kwarg:
|
||||
tokenizer.padding_side = "left"
|
||||
else:
|
||||
tokenizer_kwargs_left["padding_side"] = "left"
|
||||
|
||||
left_padded_sequence = tokenizer.encode_plus(words, boxes=boxes, **tokenizer_kwargs_left)
|
||||
left_padded_input_ids = left_padded_sequence["input_ids"]
|
||||
left_padded_special_tokens_mask = left_padded_sequence["special_tokens_mask"]
|
||||
left_padded_sequence_length = len(left_padded_input_ids)
|
||||
|
||||
Reference in New Issue
Block a user