From 21ed3a6b993eba06e7f4cf7720f4a07cc8a0d4c2 Mon Sep 17 00:00:00 2001
From: Funtowicz Morgan
Date: Fri, 9 Oct 2020 14:07:28 +0200
Subject: [PATCH] Reintroduce clean_text on BertTokenizer call which was removed by mistake in #4723 (#5749)

* Reintroduce clean_text call which was removed by mistake in #4723

Signed-off-by: Morgan Funtowicz

* Added unittest for clean_text parameter on Bert tokenizer.

Signed-off-by: Morgan Funtowicz

* Better unittest name.

Signed-off-by: Morgan Funtowicz

* Adapt unittest to use untrained tokenizer.

Signed-off-by: Morgan Funtowicz

* Code quality + update test

Co-authored-by: Lysandre
---
 src/transformers/tokenization_bert.py |  1 +
 tests/test_tokenization_bert.py       | 11 +++++++++++
 2 files changed, 12 insertions(+)

diff --git a/src/transformers/tokenization_bert.py b/src/transformers/tokenization_bert.py
index 3b620865dc..3e646f8774 100644
--- a/src/transformers/tokenization_bert.py
+++ b/src/transformers/tokenization_bert.py
@@ -398,6 +398,7 @@ class BasicTokenizer(object):
         """
         # union() returns a new set by concatenating the two sets.
         never_split = self.never_split.union(set(never_split)) if never_split else self.never_split
+        text = self._clean_text(text)
 
         # This was added on November 1st, 2018 for the multilingual and Chinese
         # models. This is also applied to the English models now, but it doesn't
diff --git a/tests/test_tokenization_bert.py b/tests/test_tokenization_bert.py
index 015e534678..04117d8b3b 100644
--- a/tests/test_tokenization_bert.py
+++ b/tests/test_tokenization_bert.py
@@ -222,6 +222,17 @@ class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         self.assertFalse(_is_punctuation("A"))
         self.assertFalse(_is_punctuation(" "))
 
+    def test_clean_text(self):
+        tokenizer = self.get_tokenizer()
+        rust_tokenizer = self.get_rust_tokenizer()
+
+        # Example taken from the issue https://github.com/huggingface/tokenizers/issues/340
+        self.assertListEqual([tokenizer.tokenize(t) for t in ["Test", "\xad", "test"]], [["[UNK]"], [], ["[UNK]"]])
+
+        self.assertListEqual(
+            [rust_tokenizer.tokenize(t) for t in ["Test", "\xad", "test"]], [["[UNK]"], [], ["[UNK]"]]
+        )
+
     @slow
     def test_sequence_builders(self):
         tokenizer = self.tokenizer_class.from_pretrained("bert-base-uncased")