From 897d0841bed5e0637aca7dec7744bedc06b54fae Mon Sep 17 00:00:00 2001
From: Yiqing-Zhou <40547184+Yiqing-Zhou@users.noreply.github.com>
Date: Mon, 22 Jul 2019 20:49:09 +0800
Subject: [PATCH 1/3] read().splitlines() -> readlines()

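splitlines() does not work as expected here for bert-base-chinese because the vocab file contains a '\u2028' (Unicode line separator) token. str.splitlines() treats '\u2028' as a line boundary, so '\u2028\n'.splitlines() is ['', ''] and the token is lost while every later index shifts by one.
Use readlines() instead, which only splits on '\n', '\r' and '\r\n'.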
---
 pytorch_transformers/tokenization_bert.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/pytorch_transformers/tokenization_bert.py b/pytorch_transformers/tokenization_bert.py
index f1e900caaf..1ca758eda5 100644
--- a/pytorch_transformers/tokenization_bert.py
+++ b/pytorch_transformers/tokenization_bert.py
@@ -67,10 +67,9 @@ def load_vocab(vocab_file):
     """Loads a vocabulary file into a dictionary."""
     vocab = collections.OrderedDict()
     with open(vocab_file, "r", encoding="utf-8") as reader:
-        tokens = reader.read().splitlines()
+        tokens = reader.readlines()
     for index, token in enumerate(tokens):
         vocab[token] = index
-        index += 1
     return vocab
 
 

From bef0c629cae56734a5acb38720aea2bdd9d738bd Mon Sep 17 00:00:00 2001
From: Yiqing-Zhou <40547184+Yiqing-Zhou@users.noreply.github.com>
Date: Mon, 22 Jul 2019 22:30:49 +0800
Subject: [PATCH 2/3] fix

Remove the trailing '\n' that readlines() keeps on each line before adding the token to the vocab
---
 pytorch_transformers/tokenization_bert.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pytorch_transformers/tokenization_bert.py b/pytorch_transformers/tokenization_bert.py
index 1ca758eda5..acf89b6984 100644
--- a/pytorch_transformers/tokenization_bert.py
+++ b/pytorch_transformers/tokenization_bert.py
@@ -69,6 +69,7 @@ def load_vocab(vocab_file):
     with open(vocab_file, "r", encoding="utf-8") as reader:
         tokens = reader.readlines()
     for index, token in enumerate(tokens):
+        token = token[:-1]
         vocab[token] = index
     return vocab
 

From b1019d2a8e5725f4f72fc8abb4085fef8a60c7e4 Mon Sep 17 00:00:00 2001
From: Yiqing-Zhou <40547184+Yiqing-Zhou@users.noreply.github.com>
Date: Tue, 23 Jul 2019 20:41:26 +0800
Subject: [PATCH 3/3] token[:-1] -> token.rstrip('\n')

---
 pytorch_transformers/tokenization_bert.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_transformers/tokenization_bert.py b/pytorch_transformers/tokenization_bert.py
index acf89b6984..f9c97b7d12 100644
--- a/pytorch_transformers/tokenization_bert.py
+++ b/pytorch_transformers/tokenization_bert.py
@@ -69,7 +69,7 @@ def load_vocab(vocab_file):
     with open(vocab_file, "r", encoding="utf-8") as reader:
         tokens = reader.readlines()
     for index, token in enumerate(tokens):
-        token = token[:-1]
+        token = token.rstrip('\n')
         vocab[token] = index
     return vocab