Rename is_pretokenized to is_split_into_words (#7236)

* Rename the is_pretokenized tokenizer argument to is_split_into_words

* Fix tests
This commit is contained in:
Sylvain Gugger
2020-09-22 09:34:35 -04:00
committed by GitHub
parent 324f361e91
commit 21ca148090
9 changed files with 142 additions and 72 deletions

View File

@@ -743,7 +743,7 @@ class TokenizerTesterMixin:
# formatted_input = tokenizer.encode(sequence, add_special_tokens=True, add_prefix_space=False)
# self.assertEqual(
# tokenizer.encode(tokens, is_pretokenized=True, add_special_tokens=True), formatted_input
# tokenizer.encode(tokens, is_split_into_words=True, add_special_tokens=True), formatted_input
# )
# # This is not supported with the Rust tokenizers
# # self.assertEqual(tokenizer.encode(input_ids, add_special_tokens=True), formatted_input)
@@ -1250,20 +1250,20 @@ class TokenizerTesterMixin:
# sequence_no_prefix_space = sequence.strip()
# Test encode for pretokenized inputs
output = tokenizer.encode(token_sequence, is_pretokenized=True, add_special_tokens=False)
output = tokenizer.encode(token_sequence, is_split_into_words=True, add_special_tokens=False)
output_sequence = tokenizer.encode(sequence, add_special_tokens=False)
self.assertEqual(output, output_sequence)
output = tokenizer.encode(token_sequence, is_pretokenized=True, add_special_tokens=True)
output = tokenizer.encode(token_sequence, is_split_into_words=True, add_special_tokens=True)
output_sequence = tokenizer.encode(sequence, add_special_tokens=True)
self.assertEqual(output, output_sequence)
# Test encode_plus for pretokenized inputs
output = tokenizer.encode_plus(token_sequence, is_pretokenized=True, add_special_tokens=False)
output = tokenizer.encode_plus(token_sequence, is_split_into_words=True, add_special_tokens=False)
output_sequence = tokenizer.encode_plus(sequence, add_special_tokens=False)
for key in output.keys():
self.assertEqual(output[key], output_sequence[key])
output = tokenizer.encode_plus(token_sequence, is_pretokenized=True, add_special_tokens=True)
output = tokenizer.encode_plus(token_sequence, is_split_into_words=True, add_special_tokens=True)
output_sequence = tokenizer.encode_plus(sequence, add_special_tokens=True)
for key in output.keys():
self.assertEqual(output[key], output_sequence[key])
@@ -1274,7 +1274,7 @@ class TokenizerTesterMixin:
sequence_batch_cleaned_up_spaces = [" " + " ".join(s) for s in token_sequence_batch]
output = tokenizer.batch_encode_plus(
token_sequence_batch, is_pretokenized=True, add_special_tokens=False
token_sequence_batch, is_split_into_words=True, add_special_tokens=False
)
output_sequence = tokenizer.batch_encode_plus(
sequence_batch_cleaned_up_spaces, add_special_tokens=False
@@ -1282,7 +1282,7 @@ class TokenizerTesterMixin:
for key in output.keys():
self.assertEqual(output[key], output_sequence[key])
output = tokenizer.batch_encode_plus(
token_sequence_batch, is_pretokenized=True, add_special_tokens=True
token_sequence_batch, is_split_into_words=True, add_special_tokens=True
)
output_sequence = tokenizer.batch_encode_plus(
sequence_batch_cleaned_up_spaces, add_special_tokens=True
@@ -1292,25 +1292,25 @@ class TokenizerTesterMixin:
# Test encode for pretokenized inputs pairs
output = tokenizer.encode(
token_sequence, token_sequence, is_pretokenized=True, add_special_tokens=False
token_sequence, token_sequence, is_split_into_words=True, add_special_tokens=False
)
output_sequence = tokenizer.encode(sequence, sequence, add_special_tokens=False)
self.assertEqual(output, output_sequence)
output = tokenizer.encode(
token_sequence, token_sequence, is_pretokenized=True, add_special_tokens=True
token_sequence, token_sequence, is_split_into_words=True, add_special_tokens=True
)
output_sequence = tokenizer.encode(sequence, sequence, add_special_tokens=True)
self.assertEqual(output, output_sequence)
# Test encode_plus for pretokenized inputs pairs
output = tokenizer.encode_plus(
token_sequence, token_sequence, is_pretokenized=True, add_special_tokens=False
token_sequence, token_sequence, is_split_into_words=True, add_special_tokens=False
)
output_sequence = tokenizer.encode_plus(sequence, sequence, add_special_tokens=False)
for key in output.keys():
self.assertEqual(output[key], output_sequence[key])
output = tokenizer.encode_plus(
token_sequence, token_sequence, is_pretokenized=True, add_special_tokens=True
token_sequence, token_sequence, is_split_into_words=True, add_special_tokens=True
)
output_sequence = tokenizer.encode_plus(sequence, sequence, add_special_tokens=True)
for key in output.keys():
@@ -1326,7 +1326,7 @@ class TokenizerTesterMixin:
]
output = tokenizer.batch_encode_plus(
token_sequence_pair_batch, is_pretokenized=True, add_special_tokens=False
token_sequence_pair_batch, is_split_into_words=True, add_special_tokens=False
)
output_sequence = tokenizer.batch_encode_plus(
sequence_pair_batch_cleaned_up_spaces, add_special_tokens=False
@@ -1334,7 +1334,7 @@ class TokenizerTesterMixin:
for key in output.keys():
self.assertEqual(output[key], output_sequence[key])
output = tokenizer.batch_encode_plus(
token_sequence_pair_batch, is_pretokenized=True, add_special_tokens=True
token_sequence_pair_batch, is_split_into_words=True, add_special_tokens=True
)
output_sequence = tokenizer.batch_encode_plus(
sequence_pair_batch_cleaned_up_spaces, add_special_tokens=True

View File

@@ -340,12 +340,12 @@ class CommonFastTokenizerTest(unittest.TestCase):
pretokenized_input_pair = "This is a sample pair".split()
# Test encode for pretokenized inputs
output_r = tokenizer_r.encode(pretokenized_input_simple, is_pretokenized=True)
output_p = tokenizer_p.encode(pretokenized_input_simple, is_pretokenized=True)
output_r = tokenizer_r.encode(pretokenized_input_simple, is_split_into_words=True)
output_p = tokenizer_p.encode(pretokenized_input_simple, is_split_into_words=True)
self.assertEqual(output_p, output_r)
kwargs = {
"is_pretokenized": True,
"is_split_into_words": True,
"return_token_type_ids": True,
"return_attention_mask": True,
"return_overflowing_tokens": False,
@@ -353,7 +353,7 @@ class CommonFastTokenizerTest(unittest.TestCase):
"return_offsets_mapping": False, # Not implemented in python tokenizers
}
batch_kwargs = {
"is_pretokenized": True,
"is_split_into_words": True,
"return_token_type_ids": True,
"return_attention_mask": True, # we have an 's' here
"return_overflowing_tokens": False,
@@ -374,8 +374,8 @@ class CommonFastTokenizerTest(unittest.TestCase):
self.assertEqual(output_p[key], output_r[key])
# Test encode for pretokenized inputs pairs
output_r = tokenizer_r.encode(pretokenized_input_simple, pretokenized_input_pair, is_pretokenized=True)
output_p = tokenizer_p.encode(pretokenized_input_simple, pretokenized_input_pair, is_pretokenized=True)
output_r = tokenizer_r.encode(pretokenized_input_simple, pretokenized_input_pair, is_split_into_words=True)
output_p = tokenizer_p.encode(pretokenized_input_simple, pretokenized_input_pair, is_split_into_words=True)
self.assertEqual(output_p, output_r)
# Test encode_plus for pretokenized inputs