Add Number Normalisation for SpeechT5 (#25447)

* add: NumberNormalizer works for integers, floats, common currencies, negative numbers and percentages * fix: renamed number normalizer class and added normalization to SpeechT5Processor * fix: restyled with black and ruff, should pass code quality tests * fix: moved normalization to tokenizer and other small changes to normalizer * add: test for normalization and changed the existing full tokenizer test * fix: tokenization tests now pass, made changes to existing tokenization where normalization is covered; added normalize arg to func signature * fix: changed default normalize setting to False, modified the tests a bit * fix: added support for comma separated numbers, tokenization on the fly with kwargs and normalizer getter setter funcs
2023-08-22 11:42:57 +05:30
parent 58c36bea74
commit 182b83749a
3 changed files with 233 additions and 4 deletions
--- a/tests/models/speecht5/test_tokenization_speecht5.py
+++ b/tests/models/speecht5/test_tokenization_speecht5.py
@@ -52,12 +52,24 @@ class SpeechT5TokenizerTest(TokenizerTesterMixin, unittest.TestCase):
        output_text = "this is a test"
        return input_text, output_text

+    def get_numeric_input_output_texts(self):
+        input_text = "I have $123.45 and owe €59.78. My balance is -₴876.90 and have 73% stocks in my company which equals to ₦72649201"
+        output_text = "I have one hundred and twenty three point four five dollars and owe fifty nine point seven eight euros. My balance is minus eight hundred and seventy six point nine zero ukrainian hryvnia and have seventy three percent stocks in my company which equals to seventy two million six hundred and forty nine thousand two hundred and one nigerian naira"
+        return input_text, output_text
+
    def get_clean_sequence(self, tokenizer, with_prefix_space=False, max_length=20, min_length=5):
        input_text, output_text = self.get_input_output_texts(tokenizer)
        ids = tokenizer.encode(output_text, add_special_tokens=False)
        text = tokenizer.decode(ids, clean_up_tokenization_spaces=False)
        return text, ids

+    def test_tokenizer_normalization(self):
+        tokenizer = self.get_tokenizer(normalize=True)
+        input_text, expected_text = self.get_numeric_input_output_texts()
+        input_ids = tokenizer.encode(input_text)
+        output_text = tokenizer.decode(input_ids, skip_special_tokens=True)
+        self.assertEqual(output_text, expected_text)
+
    def test_convert_token_and_id(self):
        """Test ``_convert_token_to_id`` and ``_convert_id_to_token``."""
        token = "<pad>"
@@ -137,7 +149,7 @@ class SpeechT5TokenizerTest(TokenizerTesterMixin, unittest.TestCase):
        pass

    def test_full_tokenizer(self):
-        tokenizer = self.get_tokenizer()
+        tokenizer = self.get_tokenizer(normalize=True)

        tokens = tokenizer.tokenize("This is a test")
        # fmt: off
@@ -153,20 +165,20 @@ class SpeechT5TokenizerTest(TokenizerTesterMixin, unittest.TestCase):
        self.assertListEqual(
            tokens,
            # fmt: off
-            [SPIECE_UNDERLINE, 'I', SPIECE_UNDERLINE, 'w', 'a', 's', SPIECE_UNDERLINE, 'b', 'o', 'r', 'n', SPIECE_UNDERLINE, 'i', 'n', SPIECE_UNDERLINE, '92000', ',', SPIECE_UNDERLINE, 'a', 'n', 'd', SPIECE_UNDERLINE, 't', 'h', 'i', 's', SPIECE_UNDERLINE, 'i', 's', SPIECE_UNDERLINE, 'f', 'a', 'l', 's', 'é', '.']
+            [SPIECE_UNDERLINE, 'I', SPIECE_UNDERLINE, 'w', 'a', 's', SPIECE_UNDERLINE, 'b', 'o', 'r', 'n', SPIECE_UNDERLINE, 'i', 'n', SPIECE_UNDERLINE, 'n', 'i', 'n', 'e', 't', 'y', SPIECE_UNDERLINE, 't', 'w', 'o', SPIECE_UNDERLINE, 't', 'h', 'o', 'u', 's', 'a', 'n', 'd', ',', SPIECE_UNDERLINE, 'a', 'n', 'd', SPIECE_UNDERLINE, 't', 'h', 'i', 's', SPIECE_UNDERLINE, 'i', 's', SPIECE_UNDERLINE, 'f', 'a', 'l', 's', 'é', '.']
            # fmt: on
        )

        ids = tokenizer.convert_tokens_to_ids(tokens)
        # fmt: off
-        self.assertListEqual(ids, [4, 30, 4, 20, 7, 12, 4, 25, 8, 13, 9, 4, 10, 9, 4, 3, 23, 4, 7, 9, 14, 4, 6, 11, 10, 12, 4, 10, 12, 4, 19, 7, 15, 12, 73, 26])
+        self.assertListEqual(ids, [4, 30, 4, 20, 7, 12, 4, 25, 8, 13, 9, 4, 10, 9, 4, 9, 10, 9, 5, 6, 22, 4, 6, 20, 8, 4, 6, 11, 8, 16, 12, 7, 9, 14, 23, 4, 7, 9, 14, 4, 6, 11, 10, 12, 4, 10, 12, 4, 19, 7, 15, 12, 73, 26])
        # fmt: on

        back_tokens = tokenizer.convert_ids_to_tokens(ids)
        self.assertListEqual(
            back_tokens,
            # fmt: off
-            [SPIECE_UNDERLINE, 'I', SPIECE_UNDERLINE, 'w', 'a', 's', SPIECE_UNDERLINE, 'b', 'o', 'r', 'n', SPIECE_UNDERLINE, 'i', 'n', SPIECE_UNDERLINE, '<unk>', ',', SPIECE_UNDERLINE, 'a', 'n', 'd', SPIECE_UNDERLINE, 't', 'h', 'i', 's', SPIECE_UNDERLINE, 'i', 's', SPIECE_UNDERLINE, 'f', 'a', 'l', 's', 'é', '.']
+            [SPIECE_UNDERLINE, 'I', SPIECE_UNDERLINE, 'w', 'a', 's', SPIECE_UNDERLINE, 'b', 'o', 'r', 'n', SPIECE_UNDERLINE, 'i', 'n', SPIECE_UNDERLINE, 'n', 'i', 'n', 'e', 't', 'y', SPIECE_UNDERLINE, 't', 'w', 'o', SPIECE_UNDERLINE, 't', 'h', 'o', 'u', 's', 'a', 'n', 'd', ',', SPIECE_UNDERLINE, 'a', 'n', 'd', SPIECE_UNDERLINE, 't', 'h', 'i', 's', SPIECE_UNDERLINE, 'i', 's', SPIECE_UNDERLINE, 'f', 'a', 'l', 's', 'é', '.']
            # fmt: on
        )