VLM: special multimodal Tokenizer (#34461)

* kinda works

* update

* add tests

* update

* use special tokens in processors

* typo

* fix copies

* fix

* fix moshi after rebase

* update

* fix tests

* update

* Update docs/source/en/main_classes/tokenizer.md

Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>

* update docs

* test for load time adding tokens

* fix some more tests which are now fetched better

* one more fix

---------

Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>
This commit is contained in:
Raushan Turganbay
2024-11-04 16:37:51 +01:00
committed by GitHub
parent ef976a7e18
commit 187439c3fa
35 changed files with 248 additions and 335 deletions

View File

@@ -299,7 +299,7 @@ class DataCollatorIntegrationTest(unittest.TestCase):
self.assertEqual(batch["input_ids"].shape, torch.Size((2, 16)))
self.assertEqual(batch["labels"].shape, torch.Size((2, 16)))
tokenizer._pad_token = None
tokenizer.pad_token = None
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
with self.assertRaises(ValueError):
# Expect error due to padding token missing
@@ -978,7 +978,7 @@ class TFDataCollatorIntegrationTest(unittest.TestCase):
self.assertEqual(batch["input_ids"].shape.as_list(), [2, 16])
self.assertEqual(batch["labels"].shape.as_list(), [2, 16])
tokenizer._pad_token = None
tokenizer.pad_token = None
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False, return_tensors="tf")
with self.assertRaises(ValueError):
# Expect error due to padding token missing
@@ -1673,7 +1673,7 @@ class NumpyDataCollatorIntegrationTest(unittest.TestCase):
self.assertEqual(batch["input_ids"].shape, (2, 16))
self.assertEqual(batch["labels"].shape, (2, 16))
tokenizer._pad_token = None
tokenizer.pad_token = None
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False, return_tensors="np")
with self.assertRaises(ValueError):
# Expect error due to padding token missing