Enable option for subword regularization in more tokenizers. (#11417)

* improve slow class tok usage at xlm rob * add subword regularization for barthez * improve barthez tok. test * fix tokenizer tests * add subword regularization for camembert * add subword regularization for deberta v2 tokenizer * add more doc to deberta v2 tokenizer * add subword regularization for speech to text tok. * fix sp_model_kwargs type in speech 2 text tok. * add subword regularization for M2M100 tok. * add more concrete type hints * fix tests for m2m100 and s2t tok. * add missing Any import * fix syntax error in m2m100 tok. * fix unpickle of m2m100 and s2t tok. * fix test of m2m100 and s2t tok. * improve unpickle of deberta v2 tok. * add test for pickle of barthez & camembert * fix pickle of barthez & camembert * add test for deberta v2 tok. pickle * fix m2m100 tok. pickle * fix s2t tok. pickle * add subword regularization to albert tok. * refactor subword reg. test into TokenizerTesterMixin improve albert tok. test remove sample argument form albert tok. check subword reg. using TokenizerTesterMixin improve tok. tests improve xlm roberta tok. tests improve xlm roberta tok. tests * add subword regularization for big bird t. * improve xlm roberta tok. test * add subword regularization for mbart50 tok. * add subword regularization for pegasus tok. * add subword regularization for reformer tok. * add subword regularization for T5 tok. * fix t5 tok. test formatting * add subword regularization for xlm_proph. tok. * add subword regularization for xlnet tok. * add subword regularization for gert_gen tok. * add typing to tokenizers * add typing to xlm rob. tok * add subword regularization for marian tok. * add reverse tok. test * fix marian tok test * fix marian tok test * fix casing in tok. tests * fix style of tok. common test * fix deberta v2 tok test * add type annotations to tok. tests * add type annotations to tok. __init__ * add typing to kokenizer * add type annotations to tok. __init__ * don't specify the default when it's None * fix barthez tok. doc * move sentencepiece tok. tests to TokenizerTesterMixin * fix unused imports * fix albert tok. test * add comment to sentencepiece test options * fix Any import at big bird tok. * fix Any import at xlm prophetnet tok. * empty commit to trigger CI
2021-05-13 08:44:55 +02:00
parent fa84540e98
commit 37ed3ab719
33 changed files with 578 additions and 181 deletions
--- a/tests/test_tokenization_common.py
+++ b/tests/test_tokenization_common.py
@@ -15,6 +15,7 @@


 import inspect
+import itertools
 import os
 import pickle
 import re
@@ -100,6 +101,13 @@ class TokenizerTesterMixin:
    from_pretrained_vocab_key = "vocab_file"
    test_seq2seq = True

+    # set to True to test a sentencepiece tokenizer
+    test_sentencepiece = False
+
+    # set to True to ignore casing when testing a sentencepiece tokenizer
+    # test_sentencepiece must also be set to True
+    test_sentencepiece_ignore_case = False
+
    def setUp(self) -> None:
        # Tokenizer.filter makes it possible to filter which Tokenizer to case based on all the
        # information available in Tokenizer (name, rust class, python class, vocab key name)
@@ -216,6 +224,38 @@ class TokenizerTesterMixin:
            for i in range(len(batch_encode_plus_sequences["input_ids"]))
        ]

+    def test_subword_regularization_tokenizer(self) -> None:
+        if not self.test_sentencepiece:
+            return
+
+        # Subword regularization is only available for the slow tokenizer.
+        sp_model_kwargs = {"enable_sampling": True, "alpha": 0.1, "nbest_size": -1}
+        tokenizer = self.get_tokenizer(sp_model_kwargs=sp_model_kwargs)
+
+        self.assertTrue(hasattr(tokenizer, "sp_model_kwargs"))
+        self.assertIsNotNone(tokenizer.sp_model_kwargs)
+        self.assertTrue(isinstance(tokenizer.sp_model_kwargs, dict))
+        self.assertEqual(tokenizer.sp_model_kwargs, sp_model_kwargs)
+        self.check_subword_sampling(tokenizer)
+
+    def test_pickle_subword_regularization_tokenizer(self) -> None:
+        if not self.test_sentencepiece:
+            return
+
+        """Google pickle __getstate__ __setstate__ if you are struggling with this."""
+        # Subword regularization is only available for the slow tokenizer.
+        sp_model_kwargs = {"enable_sampling": True, "alpha": 0.1, "nbest_size": -1}
+        tokenizer = self.get_tokenizer(sp_model_kwargs=sp_model_kwargs)
+        tokenizer_bin = pickle.dumps(tokenizer)
+        del tokenizer
+        tokenizer_new = pickle.loads(tokenizer_bin)
+
+        self.assertTrue(hasattr(tokenizer_new, "sp_model_kwargs"))
+        self.assertIsNotNone(tokenizer_new.sp_model_kwargs)
+        self.assertTrue(isinstance(tokenizer_new.sp_model_kwargs, dict))
+        self.assertEqual(tokenizer_new.sp_model_kwargs, sp_model_kwargs)
+        self.check_subword_sampling(tokenizer_new)
+
    def test_model_input_names_signature(self):
        accepted_model_main_input_names = [
            "input_ids",  # nlp models
@@ -1727,6 +1767,46 @@ class TokenizerTesterMixin:
            # add pad_token_id to pass subsequent tests
            tokenizer.add_special_tokens({"pad_token": "<PAD>"})

+    def check_subword_sampling(
+        self,
+        tokenizer: PreTrainedTokenizer,
+        text: str = None,
+    ) -> None:
+        """
+        Check if the tokenizer generates different results when subword regularization is enabled.
+
+        Subword regularization augments training data with subword sampling.
+        This has a random component.
+
+        Args:
+            tokenizer: The tokenizer to check.
+            text: The text to use for the checks.
+        """
+        text = "This is a test for subword regularization." if text is None else text
+        if self.test_sentencepiece_ignore_case:
+            text = text.lower()
+
+        tokens_list = []
+        for _ in range(5):
+            tokens_list.append(tokenizer.tokenize(text))
+
+        # the list of different pairs of tokens_list
+        combinations = itertools.combinations(tokens_list, 2)
+
+        # check of sampling is done
+        subword_sampling_found = False
+        for combination in combinations:
+            if combination[0] != combination[1]:
+                subword_sampling_found = True
+        self.assertTrue(subword_sampling_found)
+
+        # check if converting back to original text works
+        for tokens in tokens_list:
+            if self.test_sentencepiece_ignore_case:
+                self.assertEqual(text, tokenizer.convert_tokens_to_string(tokens).lower())
+            else:
+                self.assertEqual(text, tokenizer.convert_tokens_to_string(tokens))
+
    @require_torch
    @slow
    def test_torch_encode_plus_sent_to_model(self):