From 248314772f76a2d8d7aa2d6d51983a956f8aad69 Mon Sep 17 00:00:00 2001
From: thomwolf
Date: Tue, 8 Oct 2019 17:19:28 +0200
Subject: [PATCH] fix tokenization

---
 transformers/tests/tokenization_ctrl_test.py | 2 +-
 transformers/tokenization_ctrl.py            | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/transformers/tests/tokenization_ctrl_test.py b/transformers/tests/tokenization_ctrl_test.py
index fbd99af7bb..ad16cf07fa 100644
--- a/transformers/tests/tokenization_ctrl_test.py
+++ b/transformers/tests/tokenization_ctrl_test.py
@@ -55,7 +55,7 @@ class CTRLTokenizationTest(CommonTestCases.CommonTokenizerTester):
         tokenizer = CTRLTokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map)
         text = "adapt react readapt apt"
         bpe_tokens = 'adapt re@@ a@@ c@@ t re@@ adapt apt'.split()
-        tokens = tokenizer.tokenize(text, add_prefix_space=True)
+        tokens = tokenizer.tokenize(text)
         self.assertListEqual(tokens, bpe_tokens)
 
         input_tokens = tokens + [tokenizer.unk_token]
diff --git a/transformers/tokenization_ctrl.py b/transformers/tokenization_ctrl.py
index a57fae7b1f..afe8fa70e3 100644
--- a/transformers/tokenization_ctrl.py
+++ b/transformers/tokenization_ctrl.py
@@ -205,7 +205,7 @@ class CTRLTokenizer(PreTrainedTokenizer):
 
     def convert_tokens_to_string(self, tokens):
         """ Converts a sequence of tokens (string) in a single string. """
-        out_string = ''.join(tokens).replace('@@', ' ').strip()
+        out_string = ' '.join(tokens).replace('@@ ', '').strip()
         return out_string
 
     def save_vocabulary(self, save_directory):