is_pretokenized -> is_split_into_words (#7236)
* is_pretokenized -> is_split_into_words
* Fix tests
This commit is contained in:
@@ -743,7 +743,7 @@ class TokenizerTesterMixin:
|
||||
# formatted_input = tokenizer.encode(sequence, add_special_tokens=True, add_prefix_space=False)
|
||||
|
||||
# self.assertEqual(
|
||||
# tokenizer.encode(tokens, is_pretokenized=True, add_special_tokens=True), formatted_input
|
||||
# tokenizer.encode(tokens, is_split_into_words=True, add_special_tokens=True), formatted_input
|
||||
# )
|
||||
# # This is not supported with the Rust tokenizers
|
||||
# # self.assertEqual(tokenizer.encode(input_ids, add_special_tokens=True), formatted_input)
|
||||
@@ -1250,20 +1250,20 @@ class TokenizerTesterMixin:
|
||||
# sequence_no_prefix_space = sequence.strip()
|
||||
|
||||
# Test encode for pretokenized inputs
|
||||
output = tokenizer.encode(token_sequence, is_pretokenized=True, add_special_tokens=False)
|
||||
output = tokenizer.encode(token_sequence, is_split_into_words=True, add_special_tokens=False)
|
||||
output_sequence = tokenizer.encode(sequence, add_special_tokens=False)
|
||||
self.assertEqual(output, output_sequence)
|
||||
|
||||
output = tokenizer.encode(token_sequence, is_pretokenized=True, add_special_tokens=True)
|
||||
output = tokenizer.encode(token_sequence, is_split_into_words=True, add_special_tokens=True)
|
||||
output_sequence = tokenizer.encode(sequence, add_special_tokens=True)
|
||||
self.assertEqual(output, output_sequence)
|
||||
|
||||
# Test encode_plus for pretokenized inputs
|
||||
output = tokenizer.encode_plus(token_sequence, is_pretokenized=True, add_special_tokens=False)
|
||||
output = tokenizer.encode_plus(token_sequence, is_split_into_words=True, add_special_tokens=False)
|
||||
output_sequence = tokenizer.encode_plus(sequence, add_special_tokens=False)
|
||||
for key in output.keys():
|
||||
self.assertEqual(output[key], output_sequence[key])
|
||||
output = tokenizer.encode_plus(token_sequence, is_pretokenized=True, add_special_tokens=True)
|
||||
output = tokenizer.encode_plus(token_sequence, is_split_into_words=True, add_special_tokens=True)
|
||||
output_sequence = tokenizer.encode_plus(sequence, add_special_tokens=True)
|
||||
for key in output.keys():
|
||||
self.assertEqual(output[key], output_sequence[key])
|
||||
@@ -1274,7 +1274,7 @@ class TokenizerTesterMixin:
|
||||
sequence_batch_cleaned_up_spaces = [" " + " ".join(s) for s in token_sequence_batch]
|
||||
|
||||
output = tokenizer.batch_encode_plus(
|
||||
token_sequence_batch, is_pretokenized=True, add_special_tokens=False
|
||||
token_sequence_batch, is_split_into_words=True, add_special_tokens=False
|
||||
)
|
||||
output_sequence = tokenizer.batch_encode_plus(
|
||||
sequence_batch_cleaned_up_spaces, add_special_tokens=False
|
||||
@@ -1282,7 +1282,7 @@ class TokenizerTesterMixin:
|
||||
for key in output.keys():
|
||||
self.assertEqual(output[key], output_sequence[key])
|
||||
output = tokenizer.batch_encode_plus(
|
||||
token_sequence_batch, is_pretokenized=True, add_special_tokens=True
|
||||
token_sequence_batch, is_split_into_words=True, add_special_tokens=True
|
||||
)
|
||||
output_sequence = tokenizer.batch_encode_plus(
|
||||
sequence_batch_cleaned_up_spaces, add_special_tokens=True
|
||||
@@ -1292,25 +1292,25 @@ class TokenizerTesterMixin:
|
||||
|
||||
# Test encode for pretokenized inputs pairs
|
||||
output = tokenizer.encode(
|
||||
token_sequence, token_sequence, is_pretokenized=True, add_special_tokens=False
|
||||
token_sequence, token_sequence, is_split_into_words=True, add_special_tokens=False
|
||||
)
|
||||
output_sequence = tokenizer.encode(sequence, sequence, add_special_tokens=False)
|
||||
self.assertEqual(output, output_sequence)
|
||||
output = tokenizer.encode(
|
||||
token_sequence, token_sequence, is_pretokenized=True, add_special_tokens=True
|
||||
token_sequence, token_sequence, is_split_into_words=True, add_special_tokens=True
|
||||
)
|
||||
output_sequence = tokenizer.encode(sequence, sequence, add_special_tokens=True)
|
||||
self.assertEqual(output, output_sequence)
|
||||
|
||||
# Test encode_plus for pretokenized inputs pairs
|
||||
output = tokenizer.encode_plus(
|
||||
token_sequence, token_sequence, is_pretokenized=True, add_special_tokens=False
|
||||
token_sequence, token_sequence, is_split_into_words=True, add_special_tokens=False
|
||||
)
|
||||
output_sequence = tokenizer.encode_plus(sequence, sequence, add_special_tokens=False)
|
||||
for key in output.keys():
|
||||
self.assertEqual(output[key], output_sequence[key])
|
||||
output = tokenizer.encode_plus(
|
||||
token_sequence, token_sequence, is_pretokenized=True, add_special_tokens=True
|
||||
token_sequence, token_sequence, is_split_into_words=True, add_special_tokens=True
|
||||
)
|
||||
output_sequence = tokenizer.encode_plus(sequence, sequence, add_special_tokens=True)
|
||||
for key in output.keys():
|
||||
@@ -1326,7 +1326,7 @@ class TokenizerTesterMixin:
|
||||
]
|
||||
|
||||
output = tokenizer.batch_encode_plus(
|
||||
token_sequence_pair_batch, is_pretokenized=True, add_special_tokens=False
|
||||
token_sequence_pair_batch, is_split_into_words=True, add_special_tokens=False
|
||||
)
|
||||
output_sequence = tokenizer.batch_encode_plus(
|
||||
sequence_pair_batch_cleaned_up_spaces, add_special_tokens=False
|
||||
@@ -1334,7 +1334,7 @@ class TokenizerTesterMixin:
|
||||
for key in output.keys():
|
||||
self.assertEqual(output[key], output_sequence[key])
|
||||
output = tokenizer.batch_encode_plus(
|
||||
token_sequence_pair_batch, is_pretokenized=True, add_special_tokens=True
|
||||
token_sequence_pair_batch, is_split_into_words=True, add_special_tokens=True
|
||||
)
|
||||
output_sequence = tokenizer.batch_encode_plus(
|
||||
sequence_pair_batch_cleaned_up_spaces, add_special_tokens=True
|
||||
|
||||
Reference in New Issue
Block a user