From 982339d82984466fde3b1466f657a03200aa2ffb Mon Sep 17 00:00:00 2001
From: thomwolf
Date: Fri, 23 Nov 2018 12:22:12 +0100
Subject: [PATCH] fixing unicode error

---
 pytorch_pretrained_bert/tokenization.py | 17 ++---------------
 1 file changed, 2 insertions(+), 15 deletions(-)

diff --git a/pytorch_pretrained_bert/tokenization.py b/pytorch_pretrained_bert/tokenization.py
index 5c9369eb4f..ab37539792 100644
--- a/pytorch_pretrained_bert/tokenization.py
+++ b/pytorch_pretrained_bert/tokenization.py
@@ -38,16 +38,6 @@ PRETRAINED_VOCAB_ARCHIVE_MAP = {
     'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt",
 }
 
-def convert_to_unicode(text):
-    """Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
-    if isinstance(text, str):
-        return text
-    elif isinstance(text, bytes):
-        return text.decode("utf-8", "ignore")
-    else:
-        raise ValueError("Unsupported string type: %s" % (type(text)))
-
-
 def printable_text(text):
     """Returns text encoded in a way suitable for print or `tf.logging`."""
 
@@ -65,9 +55,9 @@ def load_vocab(vocab_file):
     """Loads a vocabulary file into a dictionary."""
     vocab = collections.OrderedDict()
     index = 0
-    with open(vocab_file, "r", encoding="utf8") as reader:
+    with open(vocab_file, "r", encoding="utf-8") as reader:
         while True:
-            token = convert_to_unicode(reader.readline())
+            token = reader.readline()
             if not token:
                 break
             token = token.strip()
@@ -164,7 +154,6 @@ class BasicTokenizer(object):
 
     def tokenize(self, text):
         """Tokenizes a piece of text."""
-        text = convert_to_unicode(text)
         text = self._clean_text(text)
         # This was added on November 1st, 2018 for the multilingual and Chinese
         # models. This is also applied to the English models now, but it doesn't
@@ -290,8 +279,6 @@ class WordpieceTokenizer(object):
           A list of wordpiece tokens.
         """
 
-        text = convert_to_unicode(text)
-
         output_tokens = []
         for token in whitespace_tokenize(text):
             chars = list(token)