chore: fix typos in the tests directory (#36813)

* chore: fix typos in the tests * chore: fix typos in the tests * chore: fix typos in the tests * chore: fix typos in the tests * chore: fix typos in the tests * chore: fix typos in the tests * chore: fix typos in the tests * chore: fix typos in the tests * chore: fix typos in the tests * chore: fix typos in the tests * chore: fix typos in the tests * chore: fix typos in the tests * chore: fix typos in the tests * fix: format codes * chore: fix copy mismatch issue * fix: format codes * chore: fix copy mismatch issue * chore: fix copy mismatch issue * chore: fix copy mismatch issue * chore: restore previous words * chore: revert unexpected changes
2025-03-21 17:20:05 +08:00
parent 0adbc873d0
commit 26c83490d2
78 changed files with 181 additions and 148 deletions
--- a/tests/test_tokenization_common.py
+++ b/tests/test_tokenization_common.py
@@ -2065,21 +2065,21 @@ class TokenizerTesterMixin:

                for chunk in range(0, len(input_full_vocab_string) - 1024, 1024):
                    string_to_check = input_full_vocab_string[chunk : chunk + 1024]
-                    with self.subTest(f"{(chunk/len(input_full_vocab_string))*100}%"):
+                    with self.subTest(f"{(chunk / len(input_full_vocab_string)) * 100}%"):
                        slow_encode = slow_tokenizer.encode(string_to_check)
                        fast_encode = rust_tokenizer.encode(string_to_check)
                        self.assertEqual(
                            slow_encode,
                            fast_encode,
                            "Hint: the following tokenization diff were obtained for slow vs fast:\n "
-                            f"elements in slow: {set(slow_tokenizer.tokenize(string_to_check))-set(rust_tokenizer.tokenize(string_to_check))} \nvs\n "
-                            f"elements in fast: {set(rust_tokenizer.tokenize(string_to_check))-set(slow_tokenizer.tokenize(string_to_check))} \n"
+                            f"elements in slow: {set(slow_tokenizer.tokenize(string_to_check)) - set(rust_tokenizer.tokenize(string_to_check))} \nvs\n "
+                            f"elements in fast: {set(rust_tokenizer.tokenize(string_to_check)) - set(slow_tokenizer.tokenize(string_to_check))} \n"
                            f"string used     : {string_to_check}",
                        )
                print(f"Length of the input ids that is tested: {len(input_full_vocab_ids)}")
                for chunk in range(0, len(input_full_vocab_ids) - 100, 100):
                    ids_to_decode = input_full_vocab_ids[chunk : chunk + 100]
-                    with self.subTest(f"{(chunk/len(input_full_vocab_string))*100}%"):
+                    with self.subTest(f"{(chunk / len(input_full_vocab_string)) * 100}%"):
                        self.assertEqual(
                            slow_tokenizer.decode(
                                ids_to_decode,
@@ -4423,7 +4423,7 @@ class TokenizerTesterMixin:
                self.assertTrue(
                    find,
                    f"'{special_token.__repr__()}' should appear as an `AddedToken` in the all_special_tokens_extended = "
-                    f"{[k for k in new_tokenizer.all_special_tokens_extended if str(k)==new_special_token_str]} but it is missing"
+                    f"{[k for k in new_tokenizer.all_special_tokens_extended if str(k) == new_special_token_str]} but it is missing"
                    ", this means that the new tokenizers did not keep the `rstrip`, `lstrip`, `normalized` etc attributes.",
                )
            elif special_token not in special_tokens_map: