Tokens in never_split should not be lowercased

This commit is contained in:
WrRan
2019-01-08 13:33:57 +08:00
parent 751beb9e73
commit 3f60a60eed

View File

@@ -182,7 +182,7 @@ class BasicTokenizer(object):
orig_tokens = whitespace_tokenize(text)
split_tokens = []
for token in orig_tokens:
if self.do_lower_case:
if self.do_lower_case and token not in self.never_split:
token = token.lower()
token = self._run_strip_accents(token)
split_tokens.extend(self._run_split_on_punc(token))