fix #328

2019-07-15 12:42:12 +02:00
parent f7cd7392fd
commit a9ab15174c
1 changed file with 7 additions and 10 deletions
--- a/pytorch_transformers/tokenization_bert.py
+++ b/pytorch_transformers/tokenization_bert.py
@@ -66,13 +66,9 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
 def load_vocab(vocab_file):
    """Loads a vocabulary file into a dictionary."""
    vocab = collections.OrderedDict()
    index = 0
    with open(vocab_file, "r", encoding="utf-8") as reader:
-        while True:
+        tokens = reader.read().splitlines()
-            token = reader.readline()
+    for index, token in enumerate(tokens):
            if not token:
                break
            token = token.strip()
        vocab[token] = index
        index += 1
    return vocab
@@ -213,7 +209,7 @@ class BasicTokenizer(object):
        self.do_lower_case = do_lower_case
        self.never_split = never_split
-    def tokenize(self, text, never_split=None):
+    def tokenize(self, text, never_split=None, tokenize_chinese_chars=True):
        """Tokenizes a piece of text."""
        never_split = self.never_split + (never_split if never_split is not None else [])
        text = self._clean_text(text)
@@ -223,6 +219,7 @@ class BasicTokenizer(object):
        # and generally don't have any Chinese data in them (there are Chinese
        # characters in the vocabulary because Wikipedia does have some Chinese
        # words in the English Wikipedia.).
        if tokenize_chinese_chars:
            text = self._tokenize_chinese_chars(text)
        orig_tokens = whitespace_tokenize(text)
        split_tokens = []