Add get_vocab method to PretrainedTokenizer

This commit is contained in:
Joe Davison
2020-02-20 15:25:46 -05:00
parent ea8eba35e2
commit 197d74f988
12 changed files with 62 additions and 0 deletions

View File

@@ -195,6 +195,9 @@ class BertTokenizer(PreTrainedTokenizer):
def vocab_size(self):
    """Return the number of entries in ``self.vocab``.

    Only ``self.vocab`` is measured; this method does not look at
    ``self.added_tokens_encoder``, so tokens stored there are not counted.
    """
    return len(self.vocab)
def get_vocab(self):
    """Return a new dict merging ``self.vocab`` and ``self.added_tokens_encoder``.

    Entries from ``self.vocab`` come first; entries from
    ``self.added_tokens_encoder`` are applied on top, so on a duplicate key
    the value from ``self.added_tokens_encoder`` wins.

    The result is a fresh ``dict``: neither source mapping is modified, and
    changing the returned dict does not affect them.
    """
    # Copy-then-update instead of ``dict(a, **b)``: keyword expansion only
    # accepts ``str`` keys and raises TypeError for anything else, whereas
    # ``update`` accepts any hashable key.
    merged = dict(self.vocab)
    merged.update(self.added_tokens_encoder)
    return merged
def _tokenize(self, text):
split_tokens = []
if self.do_basic_tokenize: