read().splitlines() -> readlines()
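splitlines() does not behave as expected here for bert-base-chinese, because the vocab file contains a '\u2028' (Unicode line separator) token. str.splitlines() treats '\u2028' as a line boundary, so the vocab line '\u2028\n' is split into two empty entries ('\u2028\n'.splitlines() is ['', '']), which drops the token and shifts the indices of every token after it. readlines() on a file opened in text mode only splits on '\n', '\r' and '\r\n', so we should use readlines() instead.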
This commit is contained in:
@@ -67,10 +67,9 @@ def load_vocab(vocab_file):
|
|||||||
"""Loads a vocabulary file into a dictionary."""
|
"""Loads a vocabulary file into a dictionary."""
|
||||||
vocab = collections.OrderedDict()
|
vocab = collections.OrderedDict()
|
||||||
with open(vocab_file, "r", encoding="utf-8") as reader:
|
with open(vocab_file, "r", encoding="utf-8") as reader:
|
||||||
tokens = reader.read().splitlines()
|
tokens = reader.readlines()
|
||||||
for index, token in enumerate(tokens):
|
for index, token in enumerate(tokens):
|
||||||
vocab[token] = index
|
vocab[token] = index
|
||||||
index += 1
|
|
||||||
return vocab
|
return vocab
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user