diff --git a/pytorch_pretrained_bert/tokenization.py b/pytorch_pretrained_bert/tokenization.py index fab7b0049c..5c9369eb4f 100644 --- a/pytorch_pretrained_bert/tokenization.py +++ b/pytorch_pretrained_bert/tokenization.py @@ -65,7 +65,7 @@ def load_vocab(vocab_file): """Loads a vocabulary file into a dictionary.""" vocab = collections.OrderedDict() index = 0 - with open(vocab_file, "r") as reader: + with open(vocab_file, "r", encoding="utf8") as reader: while True: token = convert_to_unicode(reader.readline()) if not token: