From 3f60a60eede2129f01a31cea73f77b3338e5e894 Mon Sep 17 00:00:00 2001
From: WrRan
Date: Tue, 8 Jan 2019 13:33:57 +0800
Subject: [PATCH] text in never_split should not lowercase

---
 pytorch_pretrained_bert/tokenization.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_pretrained_bert/tokenization.py b/pytorch_pretrained_bert/tokenization.py
index 9cb36a1b46..595eb8fdaa 100644
--- a/pytorch_pretrained_bert/tokenization.py
+++ b/pytorch_pretrained_bert/tokenization.py
@@ -182,7 +182,7 @@ class BasicTokenizer(object):
         orig_tokens = whitespace_tokenize(text)
         split_tokens = []
         for token in orig_tokens:
-            if self.do_lower_case:
+            if self.do_lower_case and token not in self.never_split:
                 token = token.lower()
                 token = self._run_strip_accents(token)
             split_tokens.extend(self._run_split_on_punc(token))