From 21ed3a6b993eba06e7f4cf7720f4a07cc8a0d4c2 Mon Sep 17 00:00:00 2001
From: Funtowicz Morgan <mfuntowicz@users.noreply.github.com>
Date: Fri, 9 Oct 2020 14:07:28 +0200
Subject: [PATCH] Reintroduce clean_text on BertTokenizer call which was
 removed by mistake in #4723 (#5749)

* Reintroduce clean_text call which was removed by mistake in #4723

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Added unittest for clean_text parameter on Bert tokenizer.

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Better unittest name.

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Adapt unittest to use untrained tokenizer.

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Code quality + update test

Co-authored-by: Lysandre <lysandre.debut@reseau.eseo.fr>
---
 src/transformers/tokenization_bert.py |  1 +
 tests/test_tokenization_bert.py       | 11 +++++++++++
 2 files changed, 12 insertions(+)

diff --git a/src/transformers/tokenization_bert.py b/src/transformers/tokenization_bert.py
index 3b620865dc..3e646f8774 100644
--- a/src/transformers/tokenization_bert.py
+++ b/src/transformers/tokenization_bert.py
@@ -398,6 +398,7 @@ class BasicTokenizer(object):
         """
         # union() returns a new set by concatenating the two sets.
         never_split = self.never_split.union(set(never_split)) if never_split else self.never_split
+        text = self._clean_text(text)
 
         # This was added on November 1st, 2018 for the multilingual and Chinese
         # models. This is also applied to the English models now, but it doesn't
diff --git a/tests/test_tokenization_bert.py b/tests/test_tokenization_bert.py
index 015e534678..04117d8b3b 100644
--- a/tests/test_tokenization_bert.py
+++ b/tests/test_tokenization_bert.py
@@ -222,6 +222,17 @@ class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         self.assertFalse(_is_punctuation("A"))
         self.assertFalse(_is_punctuation(" "))
 
+    def test_clean_text(self):
+        tokenizer = self.get_tokenizer()
+        rust_tokenizer = self.get_rust_tokenizer()
+
+        # Example taken from the issue https://github.com/huggingface/tokenizers/issues/340
+        self.assertListEqual([tokenizer.tokenize(t) for t in ["Test", "\xad", "test"]], [["[UNK]"], [], ["[UNK]"]])
+
+        self.assertListEqual(
+            [rust_tokenizer.tokenize(t) for t in ["Test", "\xad", "test"]], [["[UNK]"], [], ["[UNK]"]]
+        )
+
     @slow
     def test_sequence_builders(self):
         tokenizer = self.tokenizer_class.from_pretrained("bert-base-uncased")