fix #1920

2019-12-05 11:18:43 +01:00
parent d425a4d60b
commit 8b388827b5
1 changed file with 2 additions and 2 deletions
--- a/transformers/tokenization_ctrl.py
+++ b/transformers/tokenization_ctrl.py
@@ -192,9 +192,9 @@ class CTRLTokenizer(PreTrainedTokenizer):
        """
        split_tokens = []
-        text = text.split(' ')
+        words = re.findall(r'\S+\n?', text)
-        for token in text:
+        for token in words:
            split_tokens.extend([t for t in self.bpe(token).split(' ')])
        return split_tokens