This commit is contained in:
thomwolf
2019-12-05 11:18:43 +01:00
parent d425a4d60b
commit 8b388827b5

View File

@@ -192,9 +192,9 @@ class CTRLTokenizer(PreTrainedTokenizer):
"""
split_tokens = []
text = text.split(' ')
words = re.findall(r'\S+\n?', text)
for token in text:
for token in words:
split_tokens.extend([t for t in self.bpe(token).split(' ')])
return split_tokens