This commit is contained in:
Ben Mann
2019-04-30 19:48:22 -07:00
committed by GitHub
parent 2dee86319d
commit 74f7906db4

View File

@@ -221,7 +221,10 @@ class GPT2Tokenizer(object):
""" Tokenize a string. """
bpe_tokens = []
for token in re.findall(self.pat, text):
if sys.version_info[0] == 2:
token = ''.join(self.byte_encoder[ord(b)] for b in token)
else:
token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(' '))
return bpe_tokens