From 2e6797cc7d467bc2242c54fe61ae61891d19677f Mon Sep 17 00:00:00 2001
From: danai-antoniou <danaiantoniou@monzo.com>
Date: Thu, 19 Sep 2019 15:40:42 +0100
Subject: [PATCH 1/2] Added ValueError for duplicate added tokens

---
 pytorch_transformers/tokenization_utils.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/pytorch_transformers/tokenization_utils.py b/pytorch_transformers/tokenization_utils.py
index 1e2cd59648..bdc0ec7d3c 100644
--- a/pytorch_transformers/tokenization_utils.py
+++ b/pytorch_transformers/tokenization_utils.py
@@ -503,6 +503,9 @@ class PreTrainedTokenizer(object):
         if not new_tokens:
             return 0
 
+        if len(new_tokens) != len(set(new_tokens)):
+            raise ValueError("The provided list of tokens contains duplicates.")
+
         to_add_tokens = []
         for token in new_tokens:
             assert isinstance(token, str) or (six.PY2 and isinstance(token, unicode))

From a95158518d65fe640ecb35813280609e27ba3ab7 Mon Sep 17 00:00:00 2001
From: danai-antoniou <danaiantoniou@monzo.com>
Date: Wed, 2 Oct 2019 07:44:15 +0100
Subject: [PATCH 2/2] Moved duplicate token check

---
 transformers/tokenization_utils.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/transformers/tokenization_utils.py b/transformers/tokenization_utils.py
index d8b3c0c74b..de3f48f4c3 100644
--- a/transformers/tokenization_utils.py
+++ b/transformers/tokenization_utils.py
@@ -508,14 +508,12 @@ class PreTrainedTokenizer(object):
         if not new_tokens:
             return 0
 
-        if len(new_tokens) != len(set(new_tokens)):
-            raise ValueError("The provided list of tokens contains duplicates.")
-
         to_add_tokens = []
         for token in new_tokens:
             assert isinstance(token, str) or (six.PY2 and isinstance(token, unicode))
             if token != self.unk_token and \
-                    self.convert_tokens_to_ids(token) == self.convert_tokens_to_ids(self.unk_token):
+                    self.convert_tokens_to_ids(token) == self.convert_tokens_to_ids(self.unk_token) and \
+                    token not in to_add_tokens:
                 to_add_tokens.append(token)
                 logger.info("Adding %s to the vocabulary", token)