Fix BasicTokenizer to respect never_split parameters (#2557)
* add failing test
* fix call to _run_split_on_punc
* format with black
This commit is contained in:
committed by
Julien Chaumond
parent
6d5049a24d
commit
65a89a8976
@@ -341,7 +341,7 @@ class BasicTokenizer(object):
|
||||
if self.do_lower_case and token not in never_split:
|
||||
token = token.lower()
|
||||
token = self._run_strip_accents(token)
|
||||
- split_tokens.extend(self._run_split_on_punc(token))
|
||||
+ split_tokens.extend(self._run_split_on_punc(token, never_split))
|
||||
|
||||
output_tokens = whitespace_tokenize(" ".join(split_tokens))
|
||||
return output_tokens
|
||||
|
||||
Reference in New Issue
Block a user