VLM: special multimodal Tokenizer (#34461)

* kinda works * update * add tests * update * use special tokens in processors * typo * fix copies * fix * fix moshi after rebase * update * fix tests * update * Update docs/source/en/main_classes/tokenizer.md Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> * update docs * test for load time adding tokens * fix some more tests which are now fetched better * one more fix --------- Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>
2024-11-04 16:37:51 +01:00
parent ef976a7e18
commit 187439c3fa
35 changed files with 248 additions and 335 deletions
--- a/tests/models/camembert/test_tokenization_camembert.py
+++ b/tests/models/camembert/test_tokenization_camembert.py
@@ -154,7 +154,7 @@ class CamembertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
                tokenizer = self.tokenizer_class.from_pretrained(pretrained_name, eos_token=new_eos)
                EXPECTED_ADDED_TOKENS_DECODER = tokenizer.added_tokens_decoder
                with self.subTest("Hub -> Slow: Test loading a slow tokenizer from the hub)"):
-                    self.assertEqual(tokenizer._eos_token, new_eos)
+                    self.assertEqual(tokenizer._special_tokens_map["eos_token"], new_eos)
                    self.assertIn(new_eos, list(tokenizer.added_tokens_decoder.values()))

                with tempfile.TemporaryDirectory() as tmp_dir_2:
@@ -194,7 +194,7 @@ class CamembertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
                        tokenizer_fast = self.rust_tokenizer_class.from_pretrained(
                            pretrained_name, eos_token=new_eos, from_slow=True
                        )
-                        self.assertEqual(tokenizer_fast._eos_token, new_eos)
+                        self.assertEqual(tokenizer_fast._special_tokens_map["eos_token"], new_eos)
                        self.assertIn(new_eos, list(tokenizer_fast.added_tokens_decoder.values()))
                        # We can't test the following because for BC we kept the default rstrip lstrip in slow not fast. Will comment once normalization is alright
                        with self.subTest("Hub -> Fast == Hub -> Slow: make sure slow and fast tokenizer match"):
--- a/tests/models/layoutlmv2/test_tokenization_layoutlmv2.py
+++ b/tests/models/layoutlmv2/test_tokenization_layoutlmv2.py
@@ -1659,7 +1659,7 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
        special_tokens_map = {}
        for token in special_tokens_list:
            # Get the private one to avoid unnecessary warnings.
-            if getattr(tokenizer, f"_{token}") is not None:
+            if getattr(tokenizer, token) is not None:
                special_token = getattr(tokenizer, token)
                special_tokens_map[special_token] = f"{special_token}a"

@@ -1671,7 +1671,7 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
        # Check the changes
        for token in special_tokens_list:
            # Get the private one to avoid unnecessary warnings.
-            if getattr(tokenizer, f"_{token}") is None:
+            if getattr(tokenizer, token) is None:
                continue
            special_token = getattr(tokenizer, token)
            if special_token in special_tokens_map:
--- a/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py
+++ b/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py
@@ -1537,7 +1537,7 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
        special_tokens_map = {}
        for token in special_tokens_list:
            # Get the private one to avoid unnecessary warnings.
-            if getattr(tokenizer, f"_{token}") is not None:
+            if getattr(tokenizer, token) is not None:
                special_token = getattr(tokenizer, token)
                special_tokens_map[special_token] = f"{special_token}a"

@@ -1549,7 +1549,7 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
        # Check the changes
        for token in special_tokens_list:
            # Get the private one to avoid unnecessary warnings.
-            if getattr(tokenizer, f"_{token}") is None:
+            if getattr(tokenizer, token) is None:
                continue
            special_token = getattr(tokenizer, token)
            if special_token in special_tokens_map:
--- a/tests/models/layoutxlm/test_tokenization_layoutxlm.py
+++ b/tests/models/layoutxlm/test_tokenization_layoutxlm.py
@@ -1588,7 +1588,7 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
        special_tokens_map = {}
        for token in special_tokens_list:
            # Get the private one to avoid unnecessary warnings.
-            if getattr(tokenizer, f"_{token}") is not None:
+            if getattr(tokenizer, token) is not None:
                special_token = getattr(tokenizer, token)
                special_tokens_map[special_token] = f"{special_token}a"

@@ -1600,7 +1600,7 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
        # Check the changes
        for token in special_tokens_list:
            # Get the private one to avoid unnecessary warnings.
-            if getattr(tokenizer, f"_{token}") is None:
+            if getattr(tokenizer, token) is None:
                continue
            special_token = getattr(tokenizer, token)
            if special_token in special_tokens_map:
--- a/tests/models/llama/test_tokenization_llama.py
+++ b/tests/models/llama/test_tokenization_llama.py
@@ -385,6 +385,7 @@ class LlamaIntegrationTest(unittest.TestCase):
        assert fast == [1, 319, 4559, 1243]

        fast_tokenizer.add_eos_token = True
+        print(fast_tokenizer.add_eos_token)
        fast = fast_tokenizer.encode("A sample test", add_special_tokens=True)
        assert fast == [1, 319, 4559, 1243, 2]

--- a/tests/models/markuplm/test_tokenization_markuplm.py
+++ b/tests/models/markuplm/test_tokenization_markuplm.py
@@ -1435,7 +1435,7 @@ class MarkupLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
        special_tokens_map = {}
        for token in special_tokens_list:
            # Get the private one to avoid unnecessary warnings.
-            if getattr(tokenizer, f"_{token}") is not None:
+            if getattr(tokenizer, token) is not None:
                special_token = getattr(tokenizer, token)
                special_tokens_map[special_token] = f"{special_token}a"

@@ -1447,7 +1447,7 @@ class MarkupLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
        # Check the changes
        for token in special_tokens_list:
            # Get the private one to avoid unnecessary warnings.
-            if getattr(tokenizer, f"_{token}") is None:
+            if getattr(tokenizer, token) is None:
                continue
            special_token = getattr(tokenizer, token)
            if special_token in special_tokens_map:
--- a/tests/models/moshi/test_tokenization_moshi.py
+++ b/tests/models/moshi/test_tokenization_moshi.py
@@ -237,7 +237,7 @@ class MoshiTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
        special_tokens_map = {}
        for token in special_tokens_list:
            # Get the private one to avoid unnecessary warnings.
-            if getattr(tokenizer, f"_{token}") is not None:
+            if getattr(tokenizer, token) is not None:
                special_token = getattr(tokenizer, token)
                special_tokens_map[special_token] = f"{special_token}a"

@@ -249,7 +249,7 @@ class MoshiTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
        # Check the changes
        for token in special_tokens_list:
            # Get the private one to avoid unnecessary warnings.
-            if getattr(tokenizer, f"_{token}") is None:
+            if getattr(tokenizer, token) is None:
                continue
            special_token = getattr(tokenizer, token)
            if special_token in special_tokens_map:
--- a/tests/models/rembert/test_tokenization_rembert.py
+++ b/tests/models/rembert/test_tokenization_rembert.py
@@ -185,7 +185,7 @@ class RemBertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
                )
                EXPECTED_ADDED_TOKENS_DECODER = tokenizer.added_tokens_decoder
                with self.subTest("Hub -> Slow: Test loading a slow tokenizer from the hub)"):
-                    self.assertEqual(tokenizer._eos_token, new_eos)
+                    self.assertEqual(tokenizer._special_tokens_map["eos_token"], new_eos)
                    self.assertIn(new_eos, list(tokenizer.added_tokens_decoder.values()))

                with tempfile.TemporaryDirectory() as tmp_dir_2:
@@ -223,7 +223,7 @@ class RemBertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
                with self.subTest("Hub -> Fast: Test loading a fast tokenizer from the hub)"):
                    if self.rust_tokenizer_class is not None:
                        tokenizer_fast = self.rust_tokenizer_class.from_pretrained(pretrained_name, eos_token=new_eos)
-                        self.assertEqual(tokenizer_fast._eos_token, new_eos)
+                        self.assertEqual(tokenizer_fast._special_tokens_map["eos_token"], new_eos)
                        self.assertIn(new_eos, list(tokenizer_fast.added_tokens_decoder.values()))
                        # We can't test the following because for BC we kept the default rstrip lstrip in slow not fast. Will comment once normalization is alright
                        with self.subTest("Hub -> Fast == Hub -> Slow: make sure slow and fast tokenizer match"):
--- a/tests/models/udop/test_tokenization_udop.py
+++ b/tests/models/udop/test_tokenization_udop.py
@@ -1538,7 +1538,7 @@ class UdopTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
        special_tokens_map = {}
        for token in special_tokens_list:
            # Get the private one to avoid unnecessary warnings.
-            if getattr(tokenizer, f"_{token}") is not None:
+            if getattr(tokenizer, token) is not None:
                special_token = getattr(tokenizer, token)
                special_tokens_map[special_token] = f"{special_token}a"

@@ -1550,7 +1550,7 @@ class UdopTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
        # Check the changes
        for token in special_tokens_list:
            # Get the private one to avoid unnecessary warnings.
-            if getattr(tokenizer, f"_{token}") is None:
+            if getattr(tokenizer, token) is None:
                continue
            special_token = getattr(tokenizer, token)
            if special_token in special_tokens_map:
--- a/tests/test_tokenization_common.py
+++ b/tests/test_tokenization_common.py
@@ -4156,8 +4156,7 @@ class TokenizerTesterMixin:
        special_tokens_list.remove("additional_special_tokens")
        special_tokens_map = {}
        for token in special_tokens_list:
-            # Get the private one to avoid unnecessary warnings.
-            if getattr(tokenizer, f"_{token}") is not None:
+            if getattr(tokenizer, token) is not None:
                special_token = getattr(tokenizer, token)
                special_tokens_map[special_token] = f"{special_token}a"

@@ -4169,7 +4168,7 @@ class TokenizerTesterMixin:
        # Check the changes
        for token in special_tokens_list:
            # Get the private one to avoid unnecessary warnings.
-            if getattr(tokenizer, f"_{token}") is None:
+            if getattr(tokenizer, token) is None:
                continue
            special_token = getattr(tokenizer, token)
            if special_token in special_tokens_map:
@@ -4411,7 +4410,7 @@ class TokenizerTesterMixin:
                tokenizer = self.tokenizer_class.from_pretrained(pretrained_name, eos_token=new_eos)
                EXPECTED_ADDED_TOKENS_DECODER = tokenizer.added_tokens_decoder
                with self.subTest("Hub -> Slow: Test loading a slow tokenizer from the hub)"):
-                    self.assertEqual(tokenizer._eos_token, new_eos)
+                    self.assertEqual(tokenizer._special_tokens_map["eos_token"], new_eos)
                    self.assertIn(new_eos, list(tokenizer.added_tokens_decoder.values()))

                with tempfile.TemporaryDirectory() as tmp_dir_2:
@@ -4449,7 +4448,7 @@ class TokenizerTesterMixin:
                with self.subTest("Hub -> Fast: Test loading a fast tokenizer from the hub)"):
                    if self.rust_tokenizer_class is not None:
                        tokenizer_fast = self.rust_tokenizer_class.from_pretrained(pretrained_name, eos_token=new_eos)
-                        self.assertEqual(tokenizer_fast._eos_token, new_eos)
+                        self.assertEqual(tokenizer_fast._special_tokens_map["eos_token"], new_eos)
                        self.assertIn(new_eos, list(tokenizer_fast.added_tokens_decoder.values()))
                        # We can't test the following because for BC we kept the default rstrip lstrip in slow not fast. Will comment once normalization is alright
                        with self.subTest("Hub -> Fast == Hub -> Slow: make sure slow and fast tokenizer match"):
--- a/tests/tokenization/test_tokenization_utils.py
+++ b/tests/tokenization/test_tokenization_utils.py
@@ -28,6 +28,7 @@ from transformers import (
    BatchEncoding,
    BertTokenizer,
    BertTokenizerFast,
+    LlamaTokenizerFast,
    PreTrainedTokenizer,
    PreTrainedTokenizerFast,
    TensorType,
@@ -280,6 +281,54 @@ class TokenizerUtilsTest(unittest.TestCase):
                self.assertEqual(decoded_flat, "##：")
                self.assertEqual(decoded_list, "##：")

+    @require_tokenizers
+    def test_extra_special_tokens_multimodal(self):
+        """Check that extra (multimodal) special tokens survive save/load and can be passed at load time."""
+        special_tokens_list = [
+            "bos_token",
+            "eos_token",
+            "unk_token",
+            "sep_token",
+            "pad_token",
+            "cls_token",
+            "mask_token",
+            "additional_special_tokens",
+        ]
+        extra_special_tokens = {
+            "boi_token": "<image_start>",
+            "eoi_token": "<image_end>",
+            "image_token": "<image>",
+        }
+        multimodal_special_tokens_list = special_tokens_list + list(extra_special_tokens)
+        llama_tokenizer = LlamaTokenizerFast.from_pretrained("huggyllama/llama-7b")
+        llama_tokenizer.extra_special_tokens = dict(extra_special_tokens)
+        # right after assignment the attribute list is still the default one; it is extended on reload
+        self.assertListEqual(llama_tokenizer.SPECIAL_TOKENS_ATTRIBUTES, special_tokens_list)
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            llama_tokenizer.save_pretrained(tmpdirname)
+
+            # load back and check we have extra special tokens set
+            loaded_tokenizer = LlamaTokenizerFast.from_pretrained(tmpdirname)
+            self.assertListEqual(loaded_tokenizer.SPECIAL_TOKENS_ATTRIBUTES, multimodal_special_tokens_list)
+
+            # the reloaded tokenizer exposes `image_token` as str and an `image_token_id` that matches it
+            self.assertEqual(loaded_tokenizer.image_token, "<image>")
+            self.assertEqual(loaded_tokenizer.image_token_id, loaded_tokenizer.convert_tokens_to_ids("<image>"))
+
+        # save one more time and make sure the image token can get loaded back
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            loaded_tokenizer.save_pretrained(tmpdirname)
+            loaded_tokenizer_with_extra_tokens = LlamaTokenizerFast.from_pretrained(tmpdirname)
+            self.assertEqual(loaded_tokenizer_with_extra_tokens.image_token, "<image>")
+
+        # test that we can also indicate extra tokens during load time
+        tokenizer = LlamaTokenizerFast.from_pretrained(
+            "huggyllama/llama-7b", extra_special_tokens=dict(extra_special_tokens)
+        )
+        self.assertEqual(tokenizer.image_token, "<image>")
+        self.assertEqual(tokenizer.image_token_id, tokenizer.convert_tokens_to_ids("<image>"))
+        self.assertEqual(tokenizer.image_token_id, loaded_tokenizer.convert_tokens_to_ids("<image>"))
+
    @require_tokenizers
    def test_decoding_skip_special_tokens(self):
        for tokenizer_class in [BertTokenizer, BertTokenizerFast]:
--- a/tests/trainer/test_data_collator.py
+++ b/tests/trainer/test_data_collator.py
@@ -299,7 +299,7 @@ class DataCollatorIntegrationTest(unittest.TestCase):
        self.assertEqual(batch["input_ids"].shape, torch.Size((2, 16)))
        self.assertEqual(batch["labels"].shape, torch.Size((2, 16)))

-        tokenizer._pad_token = None
+        tokenizer.pad_token = None
        data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
        with self.assertRaises(ValueError):
            # Expect error due to padding token missing
@@ -978,7 +978,7 @@ class TFDataCollatorIntegrationTest(unittest.TestCase):
        self.assertEqual(batch["input_ids"].shape.as_list(), [2, 16])
        self.assertEqual(batch["labels"].shape.as_list(), [2, 16])

-        tokenizer._pad_token = None
+        tokenizer.pad_token = None
        data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False, return_tensors="tf")
        with self.assertRaises(ValueError):
            # Expect error due to padding token missing
@@ -1673,7 +1673,7 @@ class NumpyDataCollatorIntegrationTest(unittest.TestCase):
        self.assertEqual(batch["input_ids"].shape, (2, 16))
        self.assertEqual(batch["labels"].shape, (2, 16))

-        tokenizer._pad_token = None
+        tokenizer.pad_token = None
        data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False, return_tensors="np")
        with self.assertRaises(ValueError):
            # Expect error due to padding token missing