Fix GPT-2 tokenization to also work on Python 3...

This commit is contained in:
thomwolf
2019-04-17 11:04:41 +02:00
parent bc70779bf0
commit 5afa497cbf

View File

@@ -220,7 +220,7 @@ class GPT2Tokenizer(object):
""" Tokenize a string. """
bpe_tokens = []
for token in re.findall(self.pat, text):
token = ''.join(self.byte_encoder[ord(b)] for b in token.encode('utf-8'))
token = ''.join(self.byte_encoder[ord(b)] for b in token)
bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(' '))
return bpe_tokens