From a95158518d65fe640ecb35813280609e27ba3ab7 Mon Sep 17 00:00:00 2001 From: danai-antoniou Date: Wed, 2 Oct 2019 07:44:15 +0100 Subject: [PATCH] Moved duplicate token check --- transformers/tokenization_utils.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/transformers/tokenization_utils.py b/transformers/tokenization_utils.py index d8b3c0c74b..de3f48f4c3 100644 --- a/transformers/tokenization_utils.py +++ b/transformers/tokenization_utils.py @@ -508,14 +508,12 @@ class PreTrainedTokenizer(object): if not new_tokens: return 0 - if len(new_tokens) != len(set(new_tokens)): - raise ValueError("The provided list of tokens contains duplicates.") - to_add_tokens = [] for token in new_tokens: assert isinstance(token, str) or (six.PY2 and isinstance(token, unicode)) if token != self.unk_token and \ - self.convert_tokens_to_ids(token) == self.convert_tokens_to_ids(self.unk_token): + self.convert_tokens_to_ids(token) == self.convert_tokens_to_ids(self.unk_token) and \ + token not in to_add_tokens: to_add_tokens.append(token) logger.info("Adding %s to the vocabulary", token)