From bef0c629cae56734a5acb38720aea2bdd9d738bd Mon Sep 17 00:00:00 2001 From: Yiqing-Zhou <40547184+Yiqing-Zhou@users.noreply.github.com> Date: Mon, 22 Jul 2019 22:30:49 +0800 Subject: [PATCH] Fix: remove the trailing '\n' before adding a token to the vocab --- pytorch_transformers/tokenization_bert.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pytorch_transformers/tokenization_bert.py b/pytorch_transformers/tokenization_bert.py index 1ca758eda5..acf89b6984 100644 --- a/pytorch_transformers/tokenization_bert.py +++ b/pytorch_transformers/tokenization_bert.py @@ -69,6 +69,7 @@ def load_vocab(vocab_file): with open(vocab_file, "r", encoding="utf-8") as reader: tokens = reader.readlines() for index, token in enumerate(tokens): + token = token[:-1] vocab[token] = index return vocab