From 8ad06b7c13871dc08e985a61ef35d69c0a23bd6d Mon Sep 17 00:00:00 2001 From: Pengfei Liu <59123869+pfliu-nlp@users.noreply.github.com> Date: Wed, 18 Jan 2023 09:43:54 -0500 Subject: [PATCH] using raw string for regex to search (#21162) * using raw string for regex to search * fix the same issue in test file:`tokenization_t5.py` --- src/transformers/models/t5/tokenization_t5.py | 2 +- tests/models/t5/test_tokenization_t5.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/t5/tokenization_t5.py b/src/transformers/models/t5/tokenization_t5.py index 44fc58251c..4bdd3a9077 100644 --- a/src/transformers/models/t5/tokenization_t5.py +++ b/src/transformers/models/t5/tokenization_t5.py @@ -214,7 +214,7 @@ class T5Tokenizer(PreTrainedTokenizer): def get_sentinel_tokens(self): return list( - set(filter(lambda x: bool(re.search("<extra_id_\d+>", x)) is not None, self.additional_special_tokens)) + set(filter(lambda x: bool(re.search(r"<extra_id_\d+>", x)) is not None, self.additional_special_tokens)) ) def get_sentinel_token_ids(self): diff --git a/tests/models/t5/test_tokenization_t5.py b/tests/models/t5/test_tokenization_t5.py index 4a8ffb1ced..eb429f750a 100644 --- a/tests/models/t5/test_tokenization_t5.py +++ b/tests/models/t5/test_tokenization_t5.py @@ -386,7 +386,7 @@ class T5TokenizationTest(TokenizerTesterMixin, unittest.TestCase): sentinel_tokens = tokenizer.get_sentinel_tokens() self.assertEquals(len(sentinel_tokens), 10) self.assertListEqual(sorted(sentinel_tokens), sorted([f"<extra_id_{str(i)}>" for i in range(0, 10)])) - self.assertTrue([re.search("<extra_id_\d+>", token) is not None for token in sentinel_tokens]) + self.assertTrue([re.search(r"<extra_id_\d+>", token) is not None for token in sentinel_tokens]) def test_get_sentinel_token_ids(self): tokenizer = T5Tokenizer(SAMPLE_VOCAB, extra_ids=10) @@ -397,7 +397,7 @@ class T5TokenizationTest(TokenizerTesterMixin, unittest.TestCase): sentinel_tokens = tokenizer.get_sentinel_tokens() self.assertEquals(len(sentinel_tokens), 10) 
self.assertListEqual(sorted(sentinel_tokens), sorted([f"<extra_id_{str(i)}>" for i in range(0, 10)])) - self.assertTrue([re.search("<extra_id_\d+>", token) is not None for token in sentinel_tokens]) + self.assertTrue([re.search(r"<extra_id_\d+>", token) is not None for token in sentinel_tokens]) def test_get_sentinel_token_ids_for_fasttokenizer(self): tokenizer = T5TokenizerFast(SAMPLE_VOCAB, extra_ids=10)