This commit is contained in:
thomwolf
2019-12-05 11:18:43 +01:00
parent d425a4d60b
commit 8b388827b5

View File

@@ -192,9 +192,9 @@ class CTRLTokenizer(PreTrainedTokenizer):
""" """
split_tokens = [] split_tokens = []
text = text.split(' ') words = re.findall(r'\S+\n?', text)
for token in text: for token in words:
split_tokens.extend([t for t in self.bpe(token).split(' ')]) split_tokens.extend([t for t in self.bpe(token).split(' ')])
return split_tokens return split_tokens