gpt-2 tokenizer
This commit is contained in:
@@ -221,7 +221,10 @@ class GPT2Tokenizer(object):
|
|||||||
""" Tokenize a string. """
|
""" Tokenize a string. """
|
||||||
bpe_tokens = []
|
bpe_tokens = []
|
||||||
for token in re.findall(self.pat, text):
|
for token in re.findall(self.pat, text):
|
||||||
|
if sys.version_info[0] == 2:
|
||||||
token = ''.join(self.byte_encoder[ord(b)] for b in token)
|
token = ''.join(self.byte_encoder[ord(b)] for b in token)
|
||||||
|
else:
|
||||||
|
token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
|
||||||
bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(' '))
|
bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(' '))
|
||||||
return bpe_tokens
|
return bpe_tokens
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user