Reintroduce clean_text on BertTokenizer call which was removed by mistake in #4723 (#5749)

* Reintroduce clean_text call which was removed by mistake in #4723
  Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>
* Added unittest for clean_text parameter on Bert tokenizer.
  Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>
* Better unittest name.
  Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>
* Adapt unittest to use untrained tokenizer.
  Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>
* Code quality + update test
  Co-authored-by: Lysandre <lysandre.debut@reseau.eseo.fr>
2020-10-09 14:07:28 +02:00
parent 5668fdb09e
commit 21ed3a6b99
2 changed files with 12 additions and 0 deletions
--- a/tests/test_tokenization_bert.py
+++ b/tests/test_tokenization_bert.py
@@ -222,6 +222,17 @@ class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
        self.assertFalse(_is_punctuation("A"))
        self.assertFalse(_is_punctuation(" "))

+    def test_clean_text(self):
+        tokenizer = self.get_tokenizer()
+        rust_tokenizer = self.get_rust_tokenizer()
+
+        # A lone soft hyphen ("\xad") must yield no tokens; see https://github.com/huggingface/tokenizers/issues/340
+        self.assertListEqual([tokenizer.tokenize(t) for t in ["Test", "\xad", "test"]], [["[UNK]"], [], ["[UNK]"]])
+        # The Rust-backed tokenizer is held to exactly the same expected output as the Python one.
+        self.assertListEqual(
+            [rust_tokenizer.tokenize(t) for t in ["Test", "\xad", "test"]], [["[UNK]"], [], ["[UNK]"]]
+        )
+
    @slow
    def test_sequence_builders(self):
        tokenizer = self.tokenizer_class.from_pretrained("bert-base-uncased")