Honor existing attention mask in tokenizer.pad (#13926)
* Honor existing attention mask in tokenizer.pad * Fix initialization of attention mask * Roll out the implementation to all subclasses * Fix tests
This commit is contained in:
@@ -1460,6 +1460,25 @@ class TokenizerTesterMixin:
|
||||
pad_to_multiple_of=8,
|
||||
)
|
||||
|
||||
def test_padding_with_attention_mask(self):
|
||||
tokenizers = self.get_tokenizers()
|
||||
for tokenizer in tokenizers:
|
||||
with self.subTest(f"{tokenizer.__class__.__name__}"):
|
||||
if tokenizer.pad_token is None:
|
||||
self.skipTest("No padding token.")
|
||||
if "attention_mask" not in tokenizer.model_input_names:
|
||||
self.skipTest("This model does not use attention mask.")
|
||||
|
||||
features = [
|
||||
{"input_ids": [1, 2, 3, 4, 5, 6], "attention_mask": [1, 1, 1, 1, 1, 0]},
|
||||
{"input_ids": [1, 2, 3], "attention_mask": [1, 1, 0]},
|
||||
]
|
||||
padded_features = tokenizer.pad(features)
|
||||
if tokenizer.padding_side == "right":
|
||||
self.assertListEqual(padded_features["attention_mask"], [[1, 1, 1, 1, 1, 0], [1, 1, 0, 0, 0, 0]])
|
||||
else:
|
||||
self.assertListEqual(padded_features["attention_mask"], [[1, 1, 1, 1, 1, 0], [0, 0, 0, 1, 1, 0]])
|
||||
|
||||
def test_encode_plus_with_padding(self):
|
||||
tokenizers = self.get_tokenizers(do_lower_case=False)
|
||||
for tokenizer in tokenizers:
|
||||
|
||||
Reference in New Issue
Block a user