Add split special tokens (#30772)
* seems like `split_special_tokens` is used here
* split special token
* add new line at end of file
* moving split special token test to common tests
* added assertions
* test
* fixup
* add co-author
* passing rest of args to gptsan_japanese, fixing tests
* removing direct comparison of fast and slow models
* adding test support for UDOP and LayoutXLM
* ruff fix
* readd check if slow tokenizer
* modify test to handle bos tokens
* removing commented function
* trigger build
* applying review feedback - updated docstrings, var names, and simplified tests
* ruff fixes
* Update tests/test_tokenization_common.py
  Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>
* applying feedback, comments
* shutil temp directory fix

---------

Co-authored-by: Arthur Zucker <arthur.zucker@gmail.com>
Co-authored-by: Ita Zaporozhets <itazaporozhets@Itas-MBP.localdomain>
Co-authored-by: itazap <itazap@users.noreply.github.com>
Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>
Co-authored-by: Ita Zaporozhets <itazaporozhets@Itas-MacBook-Pro.local>
This commit is contained in:
@@ -13,7 +13,6 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
|
||||
import inspect
|
||||
import itertools
|
||||
import json
|
||||
@@ -4168,34 +4167,59 @@ class TokenizerTesterMixin:
|
||||
def test_split_special_tokens(self):
|
||||
if not self.test_slow_tokenizer:
|
||||
return
|
||||
|
||||
# Tests the expected appearance (or absence) of special token in encoded output,
|
||||
# explicit values are not tested because tokenization is model dependent and can change
|
||||
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
|
||||
special_token = "[SPECIAL_TOKEN]"
|
||||
special_token = "<my_new_token>"
|
||||
special_sentence = f"Hey this is a {special_token} token"
|
||||
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
|
||||
tokenizer = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
|
||||
tokenizer_rust = self.rust_tokenizer_class.from_pretrained(
|
||||
pretrained_name, additional_special_tokens=[special_token], split_special_tokens=True, **kwargs
|
||||
)
|
||||
tokenizer_py = self.tokenizer_class.from_pretrained(
|
||||
pretrained_name, additional_special_tokens=[special_token], split_special_tokens=True, **kwargs
|
||||
)
|
||||
|
||||
if not tokenizer.is_fast:
|
||||
# bloom, gptneox etc only have a fast
|
||||
tokenizer.add_special_tokens(
|
||||
{
|
||||
"additional_special_tokens": [
|
||||
AddedToken(special_token, rstrip=True, lstrip=True, normalized=True, special=True)
|
||||
]
|
||||
}
|
||||
)
|
||||
encoded_special_token = tokenizer.encode(special_token, add_special_tokens=False)
|
||||
self.assertEqual(len(encoded_special_token), 1)
|
||||
special_token_id = tokenizer_py.convert_tokens_to_ids(special_token)
|
||||
encoded_special_token_unsplit = tokenizer_py.encode(
|
||||
special_token, add_special_tokens=False, split_special_tokens=False
|
||||
)
|
||||
self.assertTrue(special_token_id in encoded_special_token_unsplit)
|
||||
|
||||
encoded_split_special_token = tokenizer.encode(
|
||||
special_token, add_special_tokens=False, split_special_tokens=True
|
||||
)
|
||||
if len(encoded_split_special_token) == 1:
|
||||
# if we have subword tokenization or special vocab
|
||||
self.assertTrue(
|
||||
encoded_split_special_token[0] != tokenizer.convert_tokens_to_ids(special_token)
|
||||
)
|
||||
else:
|
||||
self.assertTrue(len(encoded_split_special_token) > 1)
|
||||
encoded_special_token_split = tokenizer_py.encode(special_token, add_special_tokens=False)
|
||||
self.assertTrue(special_token_id not in encoded_special_token_split)
|
||||
|
||||
py_tokens_output = tokenizer_py.tokenize(special_sentence)
|
||||
rust_tokens_output = tokenizer_rust.tokenize(special_sentence)
|
||||
|
||||
self.assertTrue(special_token not in py_tokens_output)
|
||||
self.assertTrue(special_token not in rust_tokens_output)
|
||||
|
||||
py_tokens_output_unsplit = tokenizer_py.tokenize(special_sentence, split_special_tokens=False)
|
||||
rust_tokens_output_unsplit = tokenizer_rust.tokenize(special_sentence, split_special_tokens=False)
|
||||
|
||||
self.assertTrue(special_token in py_tokens_output_unsplit)
|
||||
self.assertTrue(special_token in rust_tokens_output_unsplit)
|
||||
|
||||
py_tokens_output = tokenizer_py(special_sentence)
|
||||
rust_tokens_output = tokenizer_rust(special_sentence)
|
||||
|
||||
self.assertTrue(special_token_id not in py_tokens_output)
|
||||
self.assertTrue(special_token_id not in rust_tokens_output)
|
||||
|
||||
tmp_dir = tempfile.mkdtemp()
|
||||
|
||||
try:
|
||||
tokenizer_py.save_pretrained(tmp_dir)
|
||||
fast_from_saved = self.tokenizer_class.from_pretrained(tmp_dir)
|
||||
finally:
|
||||
shutil.rmtree(tmp_dir)
|
||||
|
||||
output_tokens_reloaded_split = fast_from_saved.tokenize(special_sentence)
|
||||
self.assertTrue(special_token not in output_tokens_reloaded_split)
|
||||
|
||||
output_tokens_reloaded_unsplit = fast_from_saved.tokenize(special_sentence, split_special_tokens=False)
|
||||
self.assertTrue(special_token in output_tokens_reloaded_unsplit)
|
||||
|
||||
def test_added_tokens_serialization(self):
|
||||
# Utility to test the added vocab
|
||||
|
||||
Reference in New Issue
Block a user