VLM: special multimodal Tokenizer (#34461)

* kinda works

* update

* add tests

* update

* use special tokens in processors

* typo

* fix copies

* fix

* fix moshi after rebase

* update

* fix tests

* update

* Update docs/source/en/main_classes/tokenizer.md

Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>

* update docs

* test for load time adding tokens

* fix some more tests which are now fetched better

* one more fix

---------

Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>
This commit is contained in:
Raushan Turganbay
2024-11-04 16:37:51 +01:00
committed by GitHub
parent ef976a7e18
commit 187439c3fa
35 changed files with 248 additions and 335 deletions

View File

@@ -1659,7 +1659,7 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
special_tokens_map = {}
for token in special_tokens_list:
# Get the private one to avoid unnecessary warnings.
if getattr(tokenizer, f"_{token}") is not None:
if getattr(tokenizer, token) is not None:
special_token = getattr(tokenizer, token)
special_tokens_map[special_token] = f"{special_token}a"
@@ -1671,7 +1671,7 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
# Check the changes
for token in special_tokens_list:
# Get the private one to avoid unnecessary warnings.
if getattr(tokenizer, f"_{token}") is None:
if getattr(tokenizer, token) is None:
continue
special_token = getattr(tokenizer, token)
if special_token in special_tokens_map: