clean_up_tokenization_spaces=False if unset (#31938)

* clean_up_tokenization_spaces=False if unset

* deprecate warning

* updating param for old models

* update models

* make fix-copies

* fix-copies and update bert models

* warning msg

* update prophet and clvp

* updating test since space before is arbitrarily removed

* remove warning for 4.45
This commit is contained in:
Ita Zaporozhets
2024-09-26 13:38:20 -04:00
committed by GitHub
parent 3557f9a14a
commit 6730485b02
16 changed files with 67 additions and 7 deletions

View File

@@ -147,8 +147,8 @@ class Wav2Vec2TokenizerTest(unittest.TestCase):
batch_tokens = tokenizer.batch_decode(sample_ids)
batch_tokens_2 = tokenizer.batch_decode(sample_ids, skip_special_tokens=True)
self.assertEqual(batch_tokens, ["HELLO<unk>!?!?$$$", "BYE BYE<unk>$$$"])
self.assertEqual(batch_tokens_2, ["HELO!?!?", "BYE BYE"])
self.assertEqual(batch_tokens, ["HELLO<unk>!? !?$$$", "BYE BYE<unk>$$$"])
self.assertEqual(batch_tokens_2, ["HELO!? !?", "BYE BYE"])
def test_call(self):
# Tests that all call wrap to encode_plus and batch_encode_plus
@@ -467,8 +467,8 @@ class Wav2Vec2CTCTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
batch_tokens = tokenizer.batch_decode(sample_ids)
batch_tokens_2 = tokenizer.batch_decode(sample_ids, skip_special_tokens=True)
self.assertEqual(batch_tokens, ["HELLO<unk>!?!?<new_tokens>$$$", "BYE BYE<unk><new_tokens>$$$"])
self.assertEqual(batch_tokens_2, ["HELO!?!?<new_tokens>", "BYE BYE<new_tokens>"])
self.assertEqual(batch_tokens, ["HELLO<unk>!? !?<new_tokens>$$$", "BYE BYE<unk><new_tokens>$$$"])
self.assertEqual(batch_tokens_2, ["HELO!? !?<new_tokens>", "BYE BYE<new_tokens>"])
def test_special_characters_in_vocab(self):
sent = "ʈʰ æ æ̃ ˧ kʰ"