VLM: special multimodal Tokenizer (#34461)

* kinda works

* update

* add tests

* update

* use special tokens in processors

* typo

* fix copies

* fix

* fix moshi after rebase

* update

* fix tests

* update

* Update docs/source/en/main_classes/tokenizer.md

Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>

* update docs

* test for load time adding tokens

* fix some more tests which are now fetched better

* one more fix

---------

Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>
This commit is contained in:
Raushan Turganbay
2024-11-04 16:37:51 +01:00
committed by GitHub
parent ef976a7e18
commit 187439c3fa
35 changed files with 248 additions and 335 deletions

View File

@@ -4156,8 +4156,7 @@ class TokenizerTesterMixin:
special_tokens_list.remove("additional_special_tokens")
special_tokens_map = {}
for token in special_tokens_list:
# Get the private one to avoid unnecessary warnings.
if getattr(tokenizer, f"_{token}") is not None:
if getattr(tokenizer, token) is not None:
special_token = getattr(tokenizer, token)
special_tokens_map[special_token] = f"{special_token}a"
@@ -4169,7 +4168,7 @@ class TokenizerTesterMixin:
# Check the changes
for token in special_tokens_list:
# Get the private one to avoid unnecessary warnings.
if getattr(tokenizer, f"_{token}") is None:
if getattr(tokenizer, token) is None:
continue
special_token = getattr(tokenizer, token)
if special_token in special_tokens_map:
@@ -4411,7 +4410,7 @@ class TokenizerTesterMixin:
tokenizer = self.tokenizer_class.from_pretrained(pretrained_name, eos_token=new_eos)
EXPECTED_ADDED_TOKENS_DECODER = tokenizer.added_tokens_decoder
with self.subTest("Hub -> Slow: Test loading a slow tokenizer from the hub)"):
self.assertEqual(tokenizer._eos_token, new_eos)
self.assertEqual(tokenizer._special_tokens_map["eos_token"], new_eos)
self.assertIn(new_eos, list(tokenizer.added_tokens_decoder.values()))
with tempfile.TemporaryDirectory() as tmp_dir_2:
@@ -4449,7 +4448,7 @@ class TokenizerTesterMixin:
with self.subTest("Hub -> Fast: Test loading a fast tokenizer from the hub)"):
if self.rust_tokenizer_class is not None:
tokenizer_fast = self.rust_tokenizer_class.from_pretrained(pretrained_name, eos_token=new_eos)
self.assertEqual(tokenizer_fast._eos_token, new_eos)
self.assertEqual(tokenizer_fast._special_tokens_map["eos_token"], new_eos)
self.assertIn(new_eos, list(tokenizer_fast.added_tokens_decoder.values()))
# We can't test the following because, for backward compatibility (BC), we kept the default rstrip/lstrip in the slow tokenizer but not in the fast one. Will comment once normalization is alright
with self.subTest("Hub -> Fast == Hub -> Slow: make sure slow and fast tokenizer match"):