From 982339d82984466fde3b1466f657a03200aa2ffb Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Fri, 23 Nov 2018 12:22:12 +0100
Subject: [PATCH] fixing unicode error

---
 pytorch_pretrained_bert/tokenization.py | 17 ++---------------
 1 file changed, 2 insertions(+), 15 deletions(-)

diff --git a/pytorch_pretrained_bert/tokenization.py b/pytorch_pretrained_bert/tokenization.py
index 5c9369eb4f..ab37539792 100644
--- a/pytorch_pretrained_bert/tokenization.py
+++ b/pytorch_pretrained_bert/tokenization.py
@@ -38,16 +38,6 @@ PRETRAINED_VOCAB_ARCHIVE_MAP = {
     'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt",
 }
 
-def convert_to_unicode(text):
-    """Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
-    if isinstance(text, str):
-        return text
-    elif isinstance(text, bytes):
-        return text.decode("utf-8", "ignore")
-    else:
-        raise ValueError("Unsupported string type: %s" % (type(text)))
-
-
 def printable_text(text):
     """Returns text encoded in a way suitable for print or `tf.logging`."""
 
@@ -65,9 +55,9 @@ def load_vocab(vocab_file):
     """Loads a vocabulary file into a dictionary."""
     vocab = collections.OrderedDict()
     index = 0
-    with open(vocab_file, "r", encoding="utf8") as reader:
+    with open(vocab_file, "r", encoding="utf-8") as reader:
         while True:
-            token = convert_to_unicode(reader.readline())
+            token = reader.readline()
             if not token:
                 break
             token = token.strip()
@@ -164,7 +154,6 @@ class BasicTokenizer(object):
 
     def tokenize(self, text):
         """Tokenizes a piece of text."""
-        text = convert_to_unicode(text)
         text = self._clean_text(text)
         # This was added on November 1st, 2018 for the multilingual and Chinese
         # models. This is also applied to the English models now, but it doesn't
@@ -290,8 +279,6 @@ class WordpieceTokenizer(object):
           A list of wordpiece tokens.
         """
 
-        text = convert_to_unicode(text)
-
         output_tokens = []
         for token in whitespace_tokenize(text):
             chars = list(token)