fix #1920
This commit is contained in:
@@ -192,9 +192,9 @@ class CTRLTokenizer(PreTrainedTokenizer):
|
|||||||
"""
|
"""
|
||||||
split_tokens = []
|
split_tokens = []
|
||||||
|
|
||||||
text = text.split(' ')
|
words = re.findall(r'\S+\n?', text)
|
||||||
|
|
||||||
for token in text:
|
for token in words:
|
||||||
split_tokens.extend([t for t in self.bpe(token).split(' ')])
|
split_tokens.extend([t for t in self.bpe(token).split(' ')])
|
||||||
return split_tokens
|
return split_tokens
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user