Skipping outputs (#3116)
* Minimal example
* Proposal 2
* Proposal 2 for fast tokenizers
* Typings
* Docs
* Revert "Docs" for easier review
  This reverts commit eaf0f97062e809887704a542144c537f769d5223.
* Remove unnecessary assignments
* Tests
* Fix faulty type
* Remove prints
* return_outputs -> model_input_names
* Revert "Revert "Docs" for easier review"
  This reverts commit 6fdc69408102bf695797f2dfddbb6350c6b9e722.
* code quality
This commit is contained in:
@@ -48,7 +48,7 @@ class TokenizerTesterMixin:
|
||||
# to the concatenated encode_plus format: [{'input_ids': [...], ...}, {'input_ids': [...], ...}]
|
||||
return [
|
||||
{value: batch_encode_plus_sequences[value][i] for value in batch_encode_plus_sequences.keys()}
|
||||
for i in range(len(batch_encode_plus_sequences))
|
||||
for i in range(len(batch_encode_plus_sequences["input_ids"]))
|
||||
]
|
||||
|
||||
def test_tokenizers_common_properties(self):
|
||||
@@ -261,7 +261,10 @@ class TokenizerTesterMixin:
|
||||
def test_mask_output(self):
|
||||
tokenizer = self.get_tokenizer()
|
||||
|
||||
if tokenizer.build_inputs_with_special_tokens.__qualname__.split(".")[0] != "PreTrainedTokenizer":
|
||||
if (
|
||||
tokenizer.build_inputs_with_special_tokens.__qualname__.split(".")[0] != "PreTrainedTokenizer"
|
||||
and "token_type_ids" in tokenizer.model_input_names
|
||||
):
|
||||
seq_0 = "Test this method."
|
||||
seq_1 = "With these inputs."
|
||||
information = tokenizer.encode_plus(seq_0, seq_1, add_special_tokens=True)
|
||||
@@ -504,51 +507,58 @@ class TokenizerTesterMixin:
|
||||
|
||||
encoded_sequence = tokenizer.encode_plus(sequence, return_special_tokens_mask=True)
|
||||
input_ids = encoded_sequence["input_ids"]
|
||||
token_type_ids = encoded_sequence["token_type_ids"]
|
||||
attention_mask = encoded_sequence["attention_mask"]
|
||||
special_tokens_mask = encoded_sequence["special_tokens_mask"]
|
||||
sequence_length = len(input_ids)
|
||||
|
||||
# Test right padding
|
||||
tokenizer.padding_side = "right"
|
||||
|
||||
padded_sequence = tokenizer.encode_plus(
|
||||
right_padded_sequence = tokenizer.encode_plus(
|
||||
sequence,
|
||||
max_length=sequence_length + padding_size,
|
||||
pad_to_max_length=True,
|
||||
return_special_tokens_mask=True,
|
||||
)
|
||||
padded_input_ids = padded_sequence["input_ids"]
|
||||
padded_token_type_ids = padded_sequence["token_type_ids"]
|
||||
padded_attention_mask = padded_sequence["attention_mask"]
|
||||
padded_special_tokens_mask = padded_sequence["special_tokens_mask"]
|
||||
padded_sequence_length = len(padded_input_ids)
|
||||
right_padded_input_ids = right_padded_sequence["input_ids"]
|
||||
|
||||
assert sequence_length + padding_size == padded_sequence_length
|
||||
assert input_ids + [padding_idx] * padding_size == padded_input_ids
|
||||
assert token_type_ids + [token_type_padding_idx] * padding_size == padded_token_type_ids
|
||||
assert attention_mask + [0] * padding_size == padded_attention_mask
|
||||
assert special_tokens_mask + [1] * padding_size == padded_special_tokens_mask
|
||||
right_padded_special_tokens_mask = right_padded_sequence["special_tokens_mask"]
|
||||
right_padded_sequence_length = len(right_padded_input_ids)
|
||||
|
||||
assert sequence_length + padding_size == right_padded_sequence_length
|
||||
assert input_ids + [padding_idx] * padding_size == right_padded_input_ids
|
||||
assert special_tokens_mask + [1] * padding_size == right_padded_special_tokens_mask
|
||||
|
||||
# Test left padding
|
||||
tokenizer.padding_side = "left"
|
||||
padded_sequence = tokenizer.encode_plus(
|
||||
left_padded_sequence = tokenizer.encode_plus(
|
||||
sequence,
|
||||
max_length=sequence_length + padding_size,
|
||||
pad_to_max_length=True,
|
||||
return_special_tokens_mask=True,
|
||||
)
|
||||
padded_input_ids = padded_sequence["input_ids"]
|
||||
padded_token_type_ids = padded_sequence["token_type_ids"]
|
||||
padded_attention_mask = padded_sequence["attention_mask"]
|
||||
padded_special_tokens_mask = padded_sequence["special_tokens_mask"]
|
||||
padded_sequence_length = len(padded_input_ids)
|
||||
left_padded_input_ids = left_padded_sequence["input_ids"]
|
||||
left_padded_special_tokens_mask = left_padded_sequence["special_tokens_mask"]
|
||||
left_padded_sequence_length = len(left_padded_input_ids)
|
||||
|
||||
assert sequence_length + padding_size == padded_sequence_length
|
||||
assert [padding_idx] * padding_size + input_ids == padded_input_ids
|
||||
assert [token_type_padding_idx] * padding_size + token_type_ids == padded_token_type_ids
|
||||
assert [0] * padding_size + attention_mask == padded_attention_mask
|
||||
assert [1] * padding_size + special_tokens_mask == padded_special_tokens_mask
|
||||
assert sequence_length + padding_size == left_padded_sequence_length
|
||||
assert [padding_idx] * padding_size + input_ids == left_padded_input_ids
|
||||
assert [1] * padding_size + special_tokens_mask == left_padded_special_tokens_mask
|
||||
|
||||
if "token_type_ids" in tokenizer.model_input_names:
|
||||
token_type_ids = encoded_sequence["token_type_ids"]
|
||||
left_padded_token_type_ids = left_padded_sequence["token_type_ids"]
|
||||
right_padded_token_type_ids = right_padded_sequence["token_type_ids"]
|
||||
|
||||
assert token_type_ids + [token_type_padding_idx] * padding_size == right_padded_token_type_ids
|
||||
assert [token_type_padding_idx] * padding_size + token_type_ids == left_padded_token_type_ids
|
||||
|
||||
if "attention_mask" in tokenizer.model_input_names:
|
||||
attention_mask = encoded_sequence["attention_mask"]
|
||||
right_padded_attention_mask = right_padded_sequence["attention_mask"]
|
||||
left_padded_attention_mask = left_padded_sequence["attention_mask"]
|
||||
|
||||
assert attention_mask + [0] * padding_size == right_padded_attention_mask
|
||||
assert [0] * padding_size + attention_mask == left_padded_attention_mask
|
||||
|
||||
def test_separate_tokenizers(self):
|
||||
# This tests that tokenizers don't impact others. Unfortunately the case where it fails is when
|
||||
|
||||
Reference in New Issue
Block a user