From 8b388827b509e0c117c53803f2ee030ead0e5a81 Mon Sep 17 00:00:00 2001
From: thomwolf
Date: Thu, 5 Dec 2019 11:18:43 +0100
Subject: [PATCH] fix #1920

---
 transformers/tokenization_ctrl.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/transformers/tokenization_ctrl.py b/transformers/tokenization_ctrl.py
index 3d67fa2c5b..9454cbbaf3 100644
--- a/transformers/tokenization_ctrl.py
+++ b/transformers/tokenization_ctrl.py
@@ -192,9 +192,9 @@ class CTRLTokenizer(PreTrainedTokenizer):
         """
         split_tokens = []
 
-        text = text.split(' ')
+        words = re.findall(r'\S+\n?', text)
 
-        for token in text:
+        for token in words:
             split_tokens.extend([t for t in self.bpe(token).split(' ')])
         return split_tokens
 