Exposing prepare_for_model for both slow & fast tokenizers (#5479)

* Exposing prepare_for_model for both slow & fast tokenizers
* Update method signature
* The traditional style commit
* Hide the warnings behind the verbose flag
* update default truncation strategy and prepare_for_model
* fix tests and prepare_for_models methods

Co-authored-by: Thomas Wolf <thomwolf@users.noreply.github.com>
2020-07-03 10:51:21 -04:00
parent 814ed7ee76
commit 17ade127b9
4 changed files with 285 additions and 205 deletions
--- a/tests/test_tokenization_fast.py
+++ b/tests/test_tokenization_fast.py
@@ -90,6 +90,7 @@ class CommonFastTokenizerTest(unittest.TestCase):
        self.assert_embeded_special_tokens(tokenizer_r, tokenizer_p)
        self.assert_padding(tokenizer_r, tokenizer_p)
        self.assert_create_token_type_ids(tokenizer_r, tokenizer_p)
+        self.assert_prepare_for_model(tokenizer_r, tokenizer_p)
        # TODO: enable for v3.0.0
        # self.assert_empty_output_no_special_tokens(tokenizer_r, tokenizer_p)

@@ -709,6 +710,12 @@ class CommonFastTokenizerTest(unittest.TestCase):
                for i_no, i_with in zip(no_special_tokens[key], with_special_tokens[key]):
                    self.assertEqual(len(i_no), len(i_with) - simple_num_special_tokens_to_add)

+    def assert_prepare_for_model(self, tokenizer_r, tokenizer_p):
+        string_sequence = "Asserting that both tokenizers are equal"
+        python_output = tokenizer_p.prepare_for_model(tokenizer_p.encode(string_sequence))
+        rust_output = tokenizer_r.prepare_for_model(tokenizer_r.encode(string_sequence))
+        self.assertEqual(python_output, rust_output)
+

 class WordPieceFastTokenizerTest(CommonFastTokenizerTest):
    """