Add strip_accents to basic BertTokenizer. (#6280)
* Add strip_accents to basic tokenizer
* Add tests for strip_accents.
* fix style with black
* Fix strip_accents test
* empty commit to trigger CI
* Improved strip_accents check
* Add code quality with is not False
This commit is contained in:
@@ -130,6 +130,30 @@ class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
)
|
||||
self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["hello"])
|
||||
|
||||
def test_basic_tokenizer_lower_strip_accents_false(self):
    """Lowercasing with ``strip_accents=False`` must leave diacritics in place."""
    basic = BasicTokenizer(do_lower_case=True, strip_accents=False)

    # Sentence mixing irregular whitespace, casing and punctuation.
    sentence_tokens = basic.tokenize(" \tHäLLo!how \n Are yoU? ")
    self.assertListEqual(sentence_tokens, ["hällo", "!", "how", "are", "you", "?"])

    # Same check with the accent spelled as an explicit escape (U+00E9).
    escaped_tokens = basic.tokenize("H\u00E9llo")
    self.assertListEqual(escaped_tokens, ["h\u00E9llo"])
|
||||
|
||||
def test_basic_tokenizer_lower_strip_accents_true(self):
    """Lowercasing with ``strip_accents=True`` must remove diacritics."""
    basic = BasicTokenizer(do_lower_case=True, strip_accents=True)

    # Sentence mixing irregular whitespace, casing and punctuation.
    sentence_tokens = basic.tokenize(" \tHäLLo!how \n Are yoU? ")
    self.assertListEqual(sentence_tokens, ["hallo", "!", "how", "are", "you", "?"])

    # Same check with the accent spelled as an explicit escape (U+00E9).
    escaped_tokens = basic.tokenize("H\u00E9llo")
    self.assertListEqual(escaped_tokens, ["hello"])
|
||||
|
||||
def test_basic_tokenizer_lower_strip_accents_default(self):
    """With ``strip_accents`` left unset, lowercasing is expected to strip accents too."""
    basic = BasicTokenizer(do_lower_case=True)

    # Sentence mixing irregular whitespace, casing and punctuation.
    sentence_tokens = basic.tokenize(" \tHäLLo!how \n Are yoU? ")
    self.assertListEqual(sentence_tokens, ["hallo", "!", "how", "are", "you", "?"])

    # Same check with the accent spelled as an explicit escape (U+00E9).
    escaped_tokens = basic.tokenize("H\u00E9llo")
    self.assertListEqual(escaped_tokens, ["hello"])
|
||||
|
||||
def test_basic_tokenizer_no_lower(self):
|
||||
tokenizer = BasicTokenizer(do_lower_case=False)
|
||||
|
||||
@@ -137,6 +161,20 @@ class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
tokenizer.tokenize(" \tHeLLo!how \n Are yoU? "), ["HeLLo", "!", "how", "Are", "yoU", "?"]
|
||||
)
|
||||
|
||||
def test_basic_tokenizer_no_lower_strip_accents_false(self):
    """Case-preserving tokenization with ``strip_accents=False`` keeps diacritics.

    Mirrors the lowercasing variants by checking both a literal accented
    sentence and an input whose accent is written as an explicit escape.
    """
    tokenizer = BasicTokenizer(do_lower_case=False, strip_accents=False)

    self.assertListEqual(
        tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "), ["HäLLo", "!", "how", "Are", "yoU", "?"]
    )
    # Escaped U+00E9 makes the check independent of how the literal "ä"
    # above happens to be encoded in the file; casing and accent must survive.
    self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["H\u00E9llo"])
|
||||
|
||||
def test_basic_tokenizer_no_lower_strip_accents_true(self):
    """Case-preserving tokenization with ``strip_accents=True`` removes diacritics.

    Mirrors the lowercasing variants by checking both a literal accented
    sentence and an input whose accent is written as an explicit escape.
    """
    tokenizer = BasicTokenizer(do_lower_case=False, strip_accents=True)

    self.assertListEqual(
        tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "), ["HaLLo", "!", "how", "Are", "yoU", "?"]
    )
    # Escaped U+00E9 makes the check independent of how the literal "ä"
    # above happens to be encoded in the file; the accent goes, the casing stays.
    self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["Hello"])
|
||||
|
||||
def test_basic_tokenizer_respects_never_split_tokens(self):
|
||||
tokenizer = BasicTokenizer(do_lower_case=False, never_split=["[UNK]"])
|
||||
|
||||
|
||||
Reference in New Issue
Block a user