From 897d0841bed5e0637aca7dec7744bedc06b54fae Mon Sep 17 00:00:00 2001 From: Yiqing-Zhou <40547184+Yiqing-Zhou@users.noreply.github.com> Date: Mon, 22 Jul 2019 20:49:09 +0800 Subject: [PATCH] read().splitlines() -> readlines() splitlines() does not work as we expect here for bert-base-chinese because there is a '\u2028' (Unicode line separator) token in the vocab file. The value of '\u2028\n'.splitlines() is ['', '']. Perhaps we should use readlines() instead. --- pytorch_transformers/tokenization_bert.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pytorch_transformers/tokenization_bert.py b/pytorch_transformers/tokenization_bert.py index f1e900caaf..1ca758eda5 100644 --- a/pytorch_transformers/tokenization_bert.py +++ b/pytorch_transformers/tokenization_bert.py @@ -67,10 +67,9 @@ def load_vocab(vocab_file): """Loads a vocabulary file into a dictionary.""" vocab = collections.OrderedDict() with open(vocab_file, "r", encoding="utf-8") as reader: - tokens = reader.read().splitlines() + tokens = reader.readlines() for index, token in enumerate(tokens): vocab[token] = index - index += 1 return vocab