From 2e6797cc7d467bc2242c54fe61ae61891d19677f Mon Sep 17 00:00:00 2001 From: danai-antoniou Date: Thu, 19 Sep 2019 15:40:42 +0100 Subject: [PATCH 1/2] Added ValueError for duplicate added tokens --- pytorch_transformers/tokenization_utils.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pytorch_transformers/tokenization_utils.py b/pytorch_transformers/tokenization_utils.py index 1e2cd59648..bdc0ec7d3c 100644 --- a/pytorch_transformers/tokenization_utils.py +++ b/pytorch_transformers/tokenization_utils.py @@ -503,6 +503,9 @@ class PreTrainedTokenizer(object): if not new_tokens: return 0 + if len(new_tokens) != len(set(new_tokens)): + raise ValueError("The provided list of tokens contains duplicates.") + to_add_tokens = [] for token in new_tokens: assert isinstance(token, str) or (six.PY2 and isinstance(token, unicode)) From a95158518d65fe640ecb35813280609e27ba3ab7 Mon Sep 17 00:00:00 2001 From: danai-antoniou Date: Wed, 2 Oct 2019 07:44:15 +0100 Subject: [PATCH 2/2] Moved duplicate token check --- transformers/tokenization_utils.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/transformers/tokenization_utils.py b/transformers/tokenization_utils.py index d8b3c0c74b..de3f48f4c3 100644 --- a/transformers/tokenization_utils.py +++ b/transformers/tokenization_utils.py @@ -508,14 +508,12 @@ class PreTrainedTokenizer(object): if not new_tokens: return 0 - if len(new_tokens) != len(set(new_tokens)): - raise ValueError("The provided list of tokens contains duplicates.") - to_add_tokens = [] for token in new_tokens: assert isinstance(token, str) or (six.PY2 and isinstance(token, unicode)) if token != self.unk_token and \ - self.convert_tokens_to_ids(token) == self.convert_tokens_to_ids(self.unk_token): + self.convert_tokens_to_ids(token) == self.convert_tokens_to_ids(self.unk_token) and \ + token not in to_add_tokens: to_add_tokens.append(token) logger.info("Adding %s to the vocabulary", token)