Fix GPT-2 tokenization to also work on Python 3...
This commit is contained in:
@@ -220,7 +220,7 @@ class GPT2Tokenizer(object):
|
|||||||
""" Tokenize a string. """
|
""" Tokenize a string. """
|
||||||
bpe_tokens = []
|
bpe_tokens = []
|
||||||
for token in re.findall(self.pat, text):
|
for token in re.findall(self.pat, text):
|
||||||
token = ''.join(self.byte_encoder[ord(b)] for b in token.encode('utf-8'))
|
token = ''.join(self.byte_encoder[ord(b)] for b in token)
|
||||||
bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(' '))
|
bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(' '))
|
||||||
return bpe_tokens
|
return bpe_tokens
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user