Merge pull request #42 from weiyumou/master
Fixed UnicodeDecodeError: 'ascii' codec can't decode byte 0xc2
This commit is contained in:
@@ -99,7 +99,7 @@ from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM
 tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
 
 # Tokenized input
-tokenized_text = "Who was Jim Henson ? Jim Henson was a puppeteer"
+text = "Who was Jim Henson ? Jim Henson was a puppeteer"
 tokenized_text = tokenizer.tokenize(text)
 
 # Mask a token that we will try to predict back with `BertForMaskedLM`
@@ -65,7 +65,7 @@ def load_vocab(vocab_file):
     """Loads a vocabulary file into a dictionary."""
     vocab = collections.OrderedDict()
     index = 0
-    with open(vocab_file, "r") as reader:
+    with open(vocab_file, "r", encoding="utf8") as reader:
         while True:
             token = convert_to_unicode(reader.readline())
             if not token:
Reference in New Issue
Block a user