using raw string for regex to search <extra_id> (#21162)
* using raw string for regex to search <extra_id>
* fix the same issue in the test file: `tokenization_t5.py`
This commit is contained in:
@@ -214,7 +214,7 @@ class T5Tokenizer(PreTrainedTokenizer):
|
|||||||
|
|
||||||
def get_sentinel_tokens(self):
|
def get_sentinel_tokens(self):
|
||||||
return list(
|
return list(
|
||||||
set(filter(lambda x: bool(re.search("<extra_id_\d+>", x)) is not None, self.additional_special_tokens))
|
set(filter(lambda x: bool(re.search(r"<extra_id_\d+>", x)) is not None, self.additional_special_tokens))
|
||||||
)
|
)
|
||||||
|
|
||||||
def get_sentinel_token_ids(self):
|
def get_sentinel_token_ids(self):
|
||||||
|
|||||||
@@ -386,7 +386,7 @@ class T5TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
|||||||
sentinel_tokens = tokenizer.get_sentinel_tokens()
|
sentinel_tokens = tokenizer.get_sentinel_tokens()
|
||||||
self.assertEquals(len(sentinel_tokens), 10)
|
self.assertEquals(len(sentinel_tokens), 10)
|
||||||
self.assertListEqual(sorted(sentinel_tokens), sorted([f"<extra_id_{str(i)}>" for i in range(0, 10)]))
|
self.assertListEqual(sorted(sentinel_tokens), sorted([f"<extra_id_{str(i)}>" for i in range(0, 10)]))
|
||||||
self.assertTrue([re.search("<extra_id_\d+>", token) is not None for token in sentinel_tokens])
|
self.assertTrue([re.search(r"<extra_id_\d+>", token) is not None for token in sentinel_tokens])
|
||||||
|
|
||||||
def test_get_sentinel_token_ids(self):
|
def test_get_sentinel_token_ids(self):
|
||||||
tokenizer = T5Tokenizer(SAMPLE_VOCAB, extra_ids=10)
|
tokenizer = T5Tokenizer(SAMPLE_VOCAB, extra_ids=10)
|
||||||
@@ -397,7 +397,7 @@ class T5TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
|||||||
sentinel_tokens = tokenizer.get_sentinel_tokens()
|
sentinel_tokens = tokenizer.get_sentinel_tokens()
|
||||||
self.assertEquals(len(sentinel_tokens), 10)
|
self.assertEquals(len(sentinel_tokens), 10)
|
||||||
self.assertListEqual(sorted(sentinel_tokens), sorted([f"<extra_id_{str(i)}>" for i in range(0, 10)]))
|
self.assertListEqual(sorted(sentinel_tokens), sorted([f"<extra_id_{str(i)}>" for i in range(0, 10)]))
|
||||||
self.assertTrue([re.search("<extra_id_\d+>", token) is not None for token in sentinel_tokens])
|
self.assertTrue([re.search(r"<extra_id_\d+>", token) is not None for token in sentinel_tokens])
|
||||||
|
|
||||||
def test_get_sentinel_token_ids_for_fasttokenizer(self):
|
def test_get_sentinel_token_ids_for_fasttokenizer(self):
|
||||||
tokenizer = T5TokenizerFast(SAMPLE_VOCAB, extra_ids=10)
|
tokenizer = T5TokenizerFast(SAMPLE_VOCAB, extra_ids=10)
|
||||||
|
|||||||
Reference in New Issue
Block a user