From 851f253f4d3fa2414451eeaac82b7a9ad6084675 Mon Sep 17 00:00:00 2001 From: Ren Xuancheng Date: Wed, 3 Apr 2024 23:42:43 +0800 Subject: [PATCH] Fix Qwen2Tokenizer (#29929) qwen2: fixed tokens starting with # in slow tokenizer; add tests Co-authored-by: jklj077 <17811943+jklj077@users.noreply.github.com> --- .../models/qwen2/tokenization_qwen2.py | 4 ++-- tests/models/qwen2/test_tokenization_qwen2.py | 23 +++++++++++++++++-- 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/qwen2/tokenization_qwen2.py b/src/transformers/models/qwen2/tokenization_qwen2.py index 22cffcb608..be2685430f 100644 --- a/src/transformers/models/qwen2/tokenization_qwen2.py +++ b/src/transformers/models/qwen2/tokenization_qwen2.py @@ -177,9 +177,9 @@ class Qwen2Tokenizer(PreTrainedTokenizer): self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} bpe_merges = [] with open(merges_file, encoding="utf-8") as merges_handle: - for line in merges_handle: + for i, line in enumerate(merges_handle): line = line.strip() - if not line or line.startswith("#"): + if (i == 0 and line.startswith("#version:")) or not line: continue bpe_merges.append(tuple(line.split())) self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges)))) diff --git a/tests/models/qwen2/test_tokenization_qwen2.py b/tests/models/qwen2/test_tokenization_qwen2.py index 3193141b84..fba44c6dc8 100644 --- a/tests/models/qwen2/test_tokenization_qwen2.py +++ b/tests/models/qwen2/test_tokenization_qwen2.py @@ -59,6 +59,8 @@ class Qwen2TokenizationTest(TokenizerTesterMixin, unittest.TestCase): ";}", ";}\u010a", "\u00cf\u0135", + "\u0120#", + "##", ] ) @@ -75,6 +77,8 @@ class Qwen2TokenizationTest(TokenizerTesterMixin, unittest.TestCase): "; }", ";} \u010a", "\u00cf \u0135", + "\u0120 #", + "# #", ] self.special_tokens_map = {"eos_token": "<|endoftext|>"} @@ -129,7 +133,7 @@ class Qwen2TokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.assertListEqual(tokens, bpe_tokens) input_tokens = tokens - input_bpe_tokens = [75, 78, 86, 260, 259, 260, 220, 77, 68, 86, 260, 220, 15, 16, 15, 266, 268, 267] + input_bpe_tokens = [75, 78, 86, 260, 259, 260, 220, 77, 68, 86, 260, 220, 15, 16, 15, 266, 270, 267] self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) @unittest.skip("We disable the test of pretokenization as it is not reversible.") @@ -139,6 +143,11 @@ class Qwen2TokenizationTest(TokenizerTesterMixin, unittest.TestCase): # the results, by nature, should be different. pass + @unittest.skip("We disable the test of clean up tokenization spaces as it is not applicable.") + def test_clean_up_tokenization_spaces(self): + # it only tests bert-base-uncased and clean_up_tokenization_spaces is not applicable to this tokenizer + pass + def test_nfc_normalization(self): # per https://unicode.org/faq/normalization.html, there are three characters whose normalization forms # under NFC, NFD, NFKC, and NFKD are all different @@ -158,6 +167,16 @@ class Qwen2TokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer_output_string = tokenizer.backend_tokenizer.normalizer.normalize_str(input_string) self.assertEqual(tokenizer_output_string, output_string) + def test_slow_tokenizer_token_with_number_sign(self): + if not self.test_slow_tokenizer: + return + + sequence = " ###" + token_ids = [268, 269] + + tokenizer = self.get_tokenizer() + self.assertListEqual(tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sequence)), token_ids) + def test_slow_tokenizer_decode_spaces_between_special_tokens_default(self): # Qwen2Tokenizer changes the default `spaces_between_special_tokens` in `decode` to False if not self.test_slow_tokenizer: @@ -166,7 +185,7 @@ class Qwen2TokenizationTest(TokenizerTesterMixin, unittest.TestCase): # tokenizer has a special token: `"<|endfotext|>"` as eos, but it is not `legacy_added_tokens` # special tokens in `spaces_between_special_tokens` means spaces between `legacy_added_tokens` # that would be `"<|im_start|>"` and `"<|im_end|>"` in Qwen/Qwen2 Models - token_ids = [259, 260, 268, 269, 26] + token_ids = [259, 260, 270, 271, 26] sequence = " lower<|endoftext|><|im_start|>;" sequence_with_space = " lower<|endoftext|> <|im_start|> ;"