fixed GPT-2 tokenization on python 2

2019-04-17 10:56:15 +02:00
parent bdaba1897c
commit bc70779bf0
4 changed files with 7 additions and 5 deletions
--- a/pytorch_pretrained_bert/file_utils.py
+++ b/pytorch_pretrained_bert/file_utils.py
@@ -227,7 +227,7 @@ def get_from_cache(url, cache_dir=None):
            meta = {'url': url, 'etag': etag}
            meta_path = cache_path + '.json'
            with open(meta_path, 'w', encoding="utf-8") as meta_file:
-                json.dump(meta, meta_file)
+                meta_file.write(json.dumps(meta))

            logger.info("removing temp file %s", temp_file.name)

--- a/pytorch_pretrained_bert/tokenization_gpt2.py
+++ b/pytorch_pretrained_bert/tokenization_gpt2.py
@@ -59,6 +59,7 @@ def bytes_to_unicode():
    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
    And avoids mapping to whitespace/control characters the bpe code barfs on.
    """
+    _chr = unichr if sys.version_info[0] == 2 else chr
    bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
    cs = bs[:]
    n = 0
@@ -67,7 +68,7 @@ def bytes_to_unicode():
            bs.append(b)
            cs.append(2**8+n)
            n += 1
-    cs = [chr(n) for n in cs]
+    cs = [_chr(n) for n in cs]
    return dict(zip(bs, cs))

 def get_pairs(word):
@@ -219,7 +220,7 @@ class GPT2Tokenizer(object):
        """ Tokenize a string. """
        bpe_tokens = []
        for token in re.findall(self.pat, text):
-            token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
+            token = ''.join(self.byte_encoder[ord(b)] for b in token.encode('utf-8'))
            bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(' '))
        return bpe_tokens