From 5afa497cbfc53c679a9b22997b6312fad57ee2f8 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Wed, 17 Apr 2019 11:04:41 +0200 Subject: [PATCH] fix GPT-2 tokenization to work also on python 3... --- pytorch_pretrained_bert/tokenization_gpt2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_pretrained_bert/tokenization_gpt2.py b/pytorch_pretrained_bert/tokenization_gpt2.py index 0e91498f22..80be4435df 100644 --- a/pytorch_pretrained_bert/tokenization_gpt2.py +++ b/pytorch_pretrained_bert/tokenization_gpt2.py @@ -220,7 +220,7 @@ class GPT2Tokenizer(object): """ Tokenize a string. """ bpe_tokens = [] for token in re.findall(self.pat, text): - token = ''.join(self.byte_encoder[ord(b)] for b in token.encode('utf-8')) + token = ''.join(self.byte_encoder[ord(b)] for b in token) bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(' ')) return bpe_tokens