VLM: special multimodal Tokenizer (#34461)
* kinda works
* update
* add tests
* update
* use special tokens in processors
* typo
* fix copies
* fix
* fix moshi after rebase
* update
* fix tests
* update
* Update docs/source/en/main_classes/tokenizer.md

  Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>
* update docs
* test for load time adding tokens
* fix some more tests which are now fetched better
* one more fix

---------

Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>
This commit is contained in:
committed by
GitHub
parent
ef976a7e18
commit
187439c3fa
@@ -154,7 +154,7 @@ class CamembertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
tokenizer = self.tokenizer_class.from_pretrained(pretrained_name, eos_token=new_eos)
|
||||
EXPECTED_ADDED_TOKENS_DECODER = tokenizer.added_tokens_decoder
|
||||
with self.subTest("Hub -> Slow: Test loading a slow tokenizer from the hub)"):
|
||||
self.assertEqual(tokenizer._eos_token, new_eos)
|
||||
self.assertEqual(tokenizer._special_tokens_map["eos_token"], new_eos)
|
||||
self.assertIn(new_eos, list(tokenizer.added_tokens_decoder.values()))
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmp_dir_2:
|
||||
@@ -194,7 +194,7 @@ class CamembertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
tokenizer_fast = self.rust_tokenizer_class.from_pretrained(
|
||||
pretrained_name, eos_token=new_eos, from_slow=True
|
||||
)
|
||||
self.assertEqual(tokenizer_fast._eos_token, new_eos)
|
||||
self.assertEqual(tokenizer_fast._special_tokens_map["eos_token"], new_eos)
|
||||
self.assertIn(new_eos, list(tokenizer_fast.added_tokens_decoder.values()))
|
||||
# We can't test the following because, for BC, we kept the default rstrip/lstrip in the slow tokenizer but not in the fast one. Will comment once normalization is alright
|
||||
with self.subTest("Hub -> Fast == Hub -> Slow: make sure slow and fast tokenizer match"):
|
||||
|
||||
@@ -1659,7 +1659,7 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
special_tokens_map = {}
|
||||
for token in special_tokens_list:
|
||||
# Get the private one to avoid unnecessary warnings.
|
||||
if getattr(tokenizer, f"_{token}") is not None:
|
||||
if getattr(tokenizer, token) is not None:
|
||||
special_token = getattr(tokenizer, token)
|
||||
special_tokens_map[special_token] = f"{special_token}a"
|
||||
|
||||
@@ -1671,7 +1671,7 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
# Check the changes
|
||||
for token in special_tokens_list:
|
||||
# Get the private one to avoid unnecessary warnings.
|
||||
if getattr(tokenizer, f"_{token}") is None:
|
||||
if getattr(tokenizer, token) is None:
|
||||
continue
|
||||
special_token = getattr(tokenizer, token)
|
||||
if special_token in special_tokens_map:
|
||||
|
||||
@@ -1537,7 +1537,7 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
special_tokens_map = {}
|
||||
for token in special_tokens_list:
|
||||
# Get the private one to avoid unnecessary warnings.
|
||||
if getattr(tokenizer, f"_{token}") is not None:
|
||||
if getattr(tokenizer, token) is not None:
|
||||
special_token = getattr(tokenizer, token)
|
||||
special_tokens_map[special_token] = f"{special_token}a"
|
||||
|
||||
@@ -1549,7 +1549,7 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
# Check the changes
|
||||
for token in special_tokens_list:
|
||||
# Get the private one to avoid unnecessary warnings.
|
||||
if getattr(tokenizer, f"_{token}") is None:
|
||||
if getattr(tokenizer, token) is None:
|
||||
continue
|
||||
special_token = getattr(tokenizer, token)
|
||||
if special_token in special_tokens_map:
|
||||
|
||||
@@ -1588,7 +1588,7 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
special_tokens_map = {}
|
||||
for token in special_tokens_list:
|
||||
# Get the private one to avoid unnecessary warnings.
|
||||
if getattr(tokenizer, f"_{token}") is not None:
|
||||
if getattr(tokenizer, token) is not None:
|
||||
special_token = getattr(tokenizer, token)
|
||||
special_tokens_map[special_token] = f"{special_token}a"
|
||||
|
||||
@@ -1600,7 +1600,7 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
# Check the changes
|
||||
for token in special_tokens_list:
|
||||
# Get the private one to avoid unnecessary warnings.
|
||||
if getattr(tokenizer, f"_{token}") is None:
|
||||
if getattr(tokenizer, token) is None:
|
||||
continue
|
||||
special_token = getattr(tokenizer, token)
|
||||
if special_token in special_tokens_map:
|
||||
|
||||
@@ -385,6 +385,7 @@ class LlamaIntegrationTest(unittest.TestCase):
|
||||
assert fast == [1, 319, 4559, 1243]
|
||||
|
||||
fast_tokenizer.add_eos_token = True
|
||||
print(fast_tokenizer.add_eos_token)
|
||||
fast = fast_tokenizer.encode("A sample test", add_special_tokens=True)
|
||||
assert fast == [1, 319, 4559, 1243, 2]
|
||||
|
||||
|
||||
@@ -1435,7 +1435,7 @@ class MarkupLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
special_tokens_map = {}
|
||||
for token in special_tokens_list:
|
||||
# Get the private one to avoid unnecessary warnings.
|
||||
if getattr(tokenizer, f"_{token}") is not None:
|
||||
if getattr(tokenizer, token) is not None:
|
||||
special_token = getattr(tokenizer, token)
|
||||
special_tokens_map[special_token] = f"{special_token}a"
|
||||
|
||||
@@ -1447,7 +1447,7 @@ class MarkupLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
# Check the changes
|
||||
for token in special_tokens_list:
|
||||
# Get the private one to avoid unnecessary warnings.
|
||||
if getattr(tokenizer, f"_{token}") is None:
|
||||
if getattr(tokenizer, token) is None:
|
||||
continue
|
||||
special_token = getattr(tokenizer, token)
|
||||
if special_token in special_tokens_map:
|
||||
|
||||
@@ -237,7 +237,7 @@ class MoshiTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
special_tokens_map = {}
|
||||
for token in special_tokens_list:
|
||||
# Get the private one to avoid unnecessary warnings.
|
||||
if getattr(tokenizer, f"_{token}") is not None:
|
||||
if getattr(tokenizer, token) is not None:
|
||||
special_token = getattr(tokenizer, token)
|
||||
special_tokens_map[special_token] = f"{special_token}a"
|
||||
|
||||
@@ -249,7 +249,7 @@ class MoshiTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
# Check the changes
|
||||
for token in special_tokens_list:
|
||||
# Get the private one to avoid unnecessary warnings.
|
||||
if getattr(tokenizer, f"_{token}") is None:
|
||||
if getattr(tokenizer, token) is None:
|
||||
continue
|
||||
special_token = getattr(tokenizer, token)
|
||||
if special_token in special_tokens_map:
|
||||
|
||||
@@ -185,7 +185,7 @@ class RemBertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
)
|
||||
EXPECTED_ADDED_TOKENS_DECODER = tokenizer.added_tokens_decoder
|
||||
with self.subTest("Hub -> Slow: Test loading a slow tokenizer from the hub)"):
|
||||
self.assertEqual(tokenizer._eos_token, new_eos)
|
||||
self.assertEqual(tokenizer._special_tokens_map["eos_token"], new_eos)
|
||||
self.assertIn(new_eos, list(tokenizer.added_tokens_decoder.values()))
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmp_dir_2:
|
||||
@@ -223,7 +223,7 @@ class RemBertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
with self.subTest("Hub -> Fast: Test loading a fast tokenizer from the hub)"):
|
||||
if self.rust_tokenizer_class is not None:
|
||||
tokenizer_fast = self.rust_tokenizer_class.from_pretrained(pretrained_name, eos_token=new_eos)
|
||||
self.assertEqual(tokenizer_fast._eos_token, new_eos)
|
||||
self.assertEqual(tokenizer_fast._special_tokens_map["eos_token"], new_eos)
|
||||
self.assertIn(new_eos, list(tokenizer_fast.added_tokens_decoder.values()))
|
||||
# We can't test the following because, for BC, we kept the default rstrip/lstrip in the slow tokenizer but not in the fast one. Will comment once normalization is alright
|
||||
with self.subTest("Hub -> Fast == Hub -> Slow: make sure slow and fast tokenizer match"):
|
||||
|
||||
@@ -1538,7 +1538,7 @@ class UdopTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
special_tokens_map = {}
|
||||
for token in special_tokens_list:
|
||||
# Get the private one to avoid unnecessary warnings.
|
||||
if getattr(tokenizer, f"_{token}") is not None:
|
||||
if getattr(tokenizer, token) is not None:
|
||||
special_token = getattr(tokenizer, token)
|
||||
special_tokens_map[special_token] = f"{special_token}a"
|
||||
|
||||
@@ -1550,7 +1550,7 @@ class UdopTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
# Check the changes
|
||||
for token in special_tokens_list:
|
||||
# Get the private one to avoid unnecessary warnings.
|
||||
if getattr(tokenizer, f"_{token}") is None:
|
||||
if getattr(tokenizer, token) is None:
|
||||
continue
|
||||
special_token = getattr(tokenizer, token)
|
||||
if special_token in special_tokens_map:
|
||||
|
||||
Reference in New Issue
Block a user