Add get_vocab method to PreTrainedTokenizer

This commit is contained in:
Joe Davison
2020-02-20 15:25:46 -05:00
parent ea8eba35e2
commit 197d74f988
12 changed files with 62 additions and 0 deletions

View File

@@ -119,6 +119,11 @@ class T5Tokenizer(PreTrainedTokenizer):
def vocab_size(self):
    """Return the total vocabulary size.

    The result is the number of pieces reported by the SentencePiece model
    (``self.sp_model.get_piece_size()``) plus ``self._extra_ids``.

    NOTE(review): ``get_vocab`` reads ``self.vocab_size`` without calling it,
    so this is presumably wrapped by a ``@property`` decorator that sits just
    above this excerpt -- confirm against the full file.
    """
    piece_count = self.sp_model.get_piece_size()
    return piece_count + self._extra_ids
def get_vocab(self):
    """Return a dict mapping every token string to its integer id.

    Ids ``0 .. self.vocab_size - 1`` are translated to token strings with
    ``self.convert_ids_to_tokens``. The entries of
    ``self.added_tokens_encoder`` are merged in afterwards, so an added token
    wins if its string clashes with a base-vocabulary token.
    """
    token_to_id = {}
    for token_id in range(self.vocab_size):
        token_to_id[self.convert_ids_to_tokens(token_id)] = token_id
    # Merge added tokens last so they take precedence over base entries.
    token_to_id.update(self.added_tokens_encoder)
    return token_to_id
def __getstate__(self):
state = self.__dict__.copy()
state["sp_model"] = None