Enable option for subword regularization in XLMRobertaTokenizer (#11149)

* enable subword regularization. * fix tokenizer storage * fix docstring formatting * Update src/transformers/models/xlm_roberta/tokenization_xlm_roberta.py Co-authored-by: Stefan Schweter <stefan@schweter.it> * fix docstring formatting * add test for subword regularization tokenizer * improve comments of test * add sp_model_kwargs * reformat docstring to match the style * add some more documentation * Update src/transformers/models/xlm_roberta/tokenization_xlm_roberta.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * improve docstring * empty commit to trigger CI * Update src/transformers/models/xlm_roberta/tokenization_xlm_roberta.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * fix docstring formatting for sphinx Co-authored-by: Stefan Schweter <stefan@schweter.it> Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2021-04-23 23:52:31 +02:00
parent 1ef152eb48
commit 195bfd118a
2 changed files with 44 additions and 2 deletions
--- a/tests/test_tokenization_xlm_roberta.py
+++ b/tests/test_tokenization_xlm_roberta.py
@@ -14,6 +14,7 @@
 # limitations under the License.


+import itertools
 import os
 import unittest

@@ -118,6 +119,29 @@ class XLMRobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
            ],
        )

+    def test_subword_regularization_tokenizer(self):
+        # Subword regularization is only available for the slow tokenizer.
+        tokenizer = XLMRobertaTokenizer(
+            SAMPLE_VOCAB, keep_accents=True, sp_model_kwargs={"enable_sampling": True, "alpha": 0.1, "nbest_size": -1}
+        )
+
+        # Subword regularization augments training data with subword sampling.
+        # This has a random component. We test if the tokenizer generates different
+        # results when subword regularization is enabled.
+        tokens_list = []
+        for _ in range(5):
+            tokens_list.append(tokenizer.tokenize("This is a test for subword regularization."))
+
+        # the list of different pairs of tokens_list
+        combinations = itertools.combinations(tokens_list, 2)
+
+        all_equal = True
+        for combination in combinations:
+            if combination[0] != combination[1]:
+                all_equal = False
+
+        self.assertFalse(all_equal)
+
    @cached_property
    def big_tokenizer(self):
        return XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")