fix tokenization

This commit is contained in:
thomwolf
2019-10-08 17:19:28 +02:00
parent 03c2c762a6
commit 248314772f
2 changed files with 2 additions and 2 deletions

View File

@@ -55,7 +55,7 @@ class CTRLTokenizationTest(CommonTestCases.CommonTokenizerTester):
tokenizer = CTRLTokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map)
text = "adapt react readapt apt"
bpe_tokens = 'adapt re@@ a@@ c@@ t re@@ adapt apt'.split()
tokens = tokenizer.tokenize(text, add_prefix_space=True)
tokens = tokenizer.tokenize(text)
self.assertListEqual(tokens, bpe_tokens)
input_tokens = tokens + [tokenizer.unk_token]