Fix BasicTokenizer to respect never_split parameters (#2557)
* add failing test
* fix call to _run_split_on_punc
* format with black
This commit is contained in:
committed by
Julien Chaumond
parent
6d5049a24d
commit
65a89a8976
@@ -119,6 +119,13 @@ class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
tokenizer.tokenize(" \tHeLLo!how \n Are yoU? "), ["HeLLo", "!", "how", "Are", "yoU", "?"]
|
||||
)
|
||||
|
||||
def test_basic_tokenizer_respects_never_split_tokens(self):
    """Check that a token listed in ``never_split`` is emitted as one piece.

    The input ends with ``[UNK]``, which is passed to the tokenizer via
    ``never_split``; the expected output keeps it as a single token while
    the rest of the text is still split on whitespace and punctuation.
    """
    basic_tokenizer = BasicTokenizer(do_lower_case=False, never_split=["[UNK]"])

    expected_tokens = ["HeLLo", "!", "how", "Are", "yoU", "?", "[UNK]"]
    produced_tokens = basic_tokenizer.tokenize(" \tHeLLo!how \n Are yoU? [UNK]")

    self.assertListEqual(produced_tokens, expected_tokens)
|
||||
|
||||
def test_wordpiece_tokenizer(self):
|
||||
vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", "##ing"]
|
||||
|
||||
|
||||
Reference in New Issue
Block a user