Fix GPT-2 tokenization to also work on Python 3...
This commit is contained in:
@@ -220,7 +220,7 @@ class GPT2Tokenizer(object):
|
||||
""" Tokenize a string. """
|
||||
bpe_tokens = []
|
||||
for token in re.findall(self.pat, text):
|
||||
token = ''.join(self.byte_encoder[ord(b)] for b in token.encode('utf-8'))
|
||||
token = ''.join(self.byte_encoder[ord(b)] for b in token)
|
||||
bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(' '))
|
||||
return bpe_tokens
|
||||
|
||||
|
||||
Reference in New Issue
Block a user