From 835b76a46f37ecaa00eebf6d7f190297b44d6103 Mon Sep 17 00:00:00 2001
From: Anthony MOI
Date: Thu, 26 Dec 2019 14:42:55 -0500
Subject: [PATCH] Handle unk_token

As we discussed, this is handled here directly
cc @thomwolf
---
Reviewer note: relative to the original commit, the local variable `id`
(which shadowed the Python builtin) is renamed to `token_id` and a one-line
docstring is added. The diffstat and the hunk header below are adjusted to
match the extra added line; the post-image blob id on the `index` line still
refers to the original commit and is therefore stale.

 src/transformers/tokenization_utils.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/transformers/tokenization_utils.py b/src/transformers/tokenization_utils.py
index 535322be10..210e47e752 100644
--- a/src/transformers/tokenization_utils.py
+++ b/src/transformers/tokenization_utils.py
@@ -1508,7 +1508,11 @@ class PreTrainedTokenizerFast(PreTrainedTokenizer):
         return self.tokenizer.encode(text).tokens
 
     def _convert_token_to_id_with_added_voc(self, token):
-        return self.tokenizer.token_to_id(token)
+        """Return the id for ``token``, or ``self.unk_token_id`` if the lookup yields ``None``."""
+        token_id = self.tokenizer.token_to_id(token)
+        if token_id is None:
+            return self.unk_token_id
+        return token_id
 
     def _convert_id_to_token(self, index):
         return self.tokenizer.id_to_token(int(index))