Fix bloom add prefix space (#25652)

* properly support Sequence of pretokenizers

* actual fix

* make sure the fix works. Tests are not working for sure!

* hacky way

* add TODO

* update

* add a todo

* nits

* rename test

* nits

* rename test
This commit is contained in:
Arthur
2023-08-22 14:50:12 +02:00
committed by GitHub
parent 62396cff46
commit e20fab0bbe
2 changed files with 8 additions and 1 deletions

View File

@@ -133,3 +133,10 @@ class BloomTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
# maximum sequence length of the positoonal embeddings.
self.assertGreaterEqual(len(self.tokenizer_class.pretrained_vocab_files_map), 1)
self.assertGreaterEqual(len(list(self.tokenizer_class.pretrained_vocab_files_map.values())[0]), 1)
def test_add_prefix_space_fast(self):
tokenizer_w_prefix = self.get_rust_tokenizer(add_prefix_space=True)
tokenizer_wo_prefix = self.get_rust_tokenizer(add_prefix_space=False)
tokens_w_prefix = tokenizer_w_prefix.tokenize("Hey")
tokens_wo_prefix = tokenizer_wo_prefix.tokenize("Hey")
self.assertNotEqual(tokens_w_prefix, tokens_wo_prefix)