fix tokenization
This commit is contained in:
@@ -55,7 +55,7 @@ class CTRLTokenizationTest(CommonTestCases.CommonTokenizerTester):
|
|||||||
tokenizer = CTRLTokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map)
|
tokenizer = CTRLTokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map)
|
||||||
text = "adapt react readapt apt"
|
text = "adapt react readapt apt"
|
||||||
bpe_tokens = 'adapt re@@ a@@ c@@ t re@@ adapt apt'.split()
|
bpe_tokens = 'adapt re@@ a@@ c@@ t re@@ adapt apt'.split()
|
||||||
tokens = tokenizer.tokenize(text, add_prefix_space=True)
|
tokens = tokenizer.tokenize(text)
|
||||||
self.assertListEqual(tokens, bpe_tokens)
|
self.assertListEqual(tokens, bpe_tokens)
|
||||||
|
|
||||||
input_tokens = tokens + [tokenizer.unk_token]
|
input_tokens = tokens + [tokenizer.unk_token]
|
||||||
|
|||||||
@@ -205,7 +205,7 @@ class CTRLTokenizer(PreTrainedTokenizer):
|
|||||||
|
|
||||||
def convert_tokens_to_string(self, tokens):
|
def convert_tokens_to_string(self, tokens):
|
||||||
""" Converts a sequence of tokens (string) in a single string. """
|
""" Converts a sequence of tokens (string) in a single string. """
|
||||||
out_string = ''.join(tokens).replace('@@', ' ').strip()
|
out_string = ' '.join(tokens).replace('@@ ', '').strip()
|
||||||
return out_string
|
return out_string
|
||||||
|
|
||||||
def save_vocabulary(self, save_directory):
|
def save_vocabulary(self, save_directory):
|
||||||
|
|||||||
Reference in New Issue
Block a user