chore: fix typos in the tests directory (#36813)

* chore: fix typos in the tests

* chore: fix typos in the tests

* chore: fix typos in the tests

* chore: fix typos in the tests

* chore: fix typos in the tests

* chore: fix typos in the tests

* chore: fix typos in the tests

* chore: fix typos in the tests

* chore: fix typos in the tests

* chore: fix typos in the tests

* chore: fix typos in the tests

* chore: fix typos in the tests

* chore: fix typos in the tests

* fix: format codes

* chore: fix copy mismatch issue

* fix: format codes

* chore: fix copy mismatch issue

* chore: fix copy mismatch issue

* chore: fix copy mismatch issue

* chore: restore previous words

* chore: revert unexpected changes
This commit is contained in:
Afanti
2025-03-21 17:20:05 +08:00
committed by GitHub
parent 0adbc873d0
commit 26c83490d2
78 changed files with 181 additions and 148 deletions

View File

@@ -2065,21 +2065,21 @@ class TokenizerTesterMixin:
for chunk in range(0, len(input_full_vocab_string) - 1024, 1024):
string_to_check = input_full_vocab_string[chunk : chunk + 1024]
with self.subTest(f"{(chunk/len(input_full_vocab_string))*100}%"):
with self.subTest(f"{(chunk / len(input_full_vocab_string)) * 100}%"):
slow_encode = slow_tokenizer.encode(string_to_check)
fast_encode = rust_tokenizer.encode(string_to_check)
self.assertEqual(
slow_encode,
fast_encode,
"Hint: the following tokenization diff were obtained for slow vs fast:\n "
f"elements in slow: {set(slow_tokenizer.tokenize(string_to_check))-set(rust_tokenizer.tokenize(string_to_check))} \nvs\n "
f"elements in fast: {set(rust_tokenizer.tokenize(string_to_check))-set(slow_tokenizer.tokenize(string_to_check))} \n"
f"elements in slow: {set(slow_tokenizer.tokenize(string_to_check)) - set(rust_tokenizer.tokenize(string_to_check))} \nvs\n "
f"elements in fast: {set(rust_tokenizer.tokenize(string_to_check)) - set(slow_tokenizer.tokenize(string_to_check))} \n"
f"string used : {string_to_check}",
)
print(f"Length of the input ids that is tested: {len(input_full_vocab_ids)}")
for chunk in range(0, len(input_full_vocab_ids) - 100, 100):
ids_to_decode = input_full_vocab_ids[chunk : chunk + 100]
with self.subTest(f"{(chunk/len(input_full_vocab_string))*100}%"):
with self.subTest(f"{(chunk / len(input_full_vocab_string)) * 100}%"):
self.assertEqual(
slow_tokenizer.decode(
ids_to_decode,
@@ -4423,7 +4423,7 @@ class TokenizerTesterMixin:
self.assertTrue(
find,
f"'{special_token.__repr__()}' should appear as an `AddedToken` in the all_special_tokens_extended = "
f"{[k for k in new_tokenizer.all_special_tokens_extended if str(k)==new_special_token_str]} but it is missing"
f"{[k for k in new_tokenizer.all_special_tokens_extended if str(k) == new_special_token_str]} but it is missing"
", this means that the new tokenizers did not keep the `rstrip`, `lstrip`, `normalized` etc attributes.",
)
elif special_token not in special_tokens_map: