From 74f7906db460d81e4807249aaf73290db2b7d43c Mon Sep 17 00:00:00 2001
From: Ben Mann <8enmann@gmail.com>
Date: Tue, 30 Apr 2019 19:48:22 -0700
Subject: [PATCH] Fix #537

---
 pytorch_pretrained_bert/tokenization_gpt2.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/pytorch_pretrained_bert/tokenization_gpt2.py b/pytorch_pretrained_bert/tokenization_gpt2.py
index 07777292a3..8ffd7a68e2 100644
--- a/pytorch_pretrained_bert/tokenization_gpt2.py
+++ b/pytorch_pretrained_bert/tokenization_gpt2.py
@@ -221,7 +221,10 @@ class GPT2Tokenizer(object):
         """ Tokenize a string. """
         bpe_tokens = []
         for token in re.findall(self.pat, text):
-            token = ''.join(self.byte_encoder[ord(b)] for b in token)
+            if sys.version_info[0] == 2:
+                token = ''.join(self.byte_encoder[ord(b)] for b in token)
+            else:
+                token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
             bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(' '))
         return bpe_tokens
 