chore: fix typos in the tests directory (#36813)
* chore: fix typos in the tests
* chore: fix typos in the tests
* chore: fix typos in the tests
* chore: fix typos in the tests
* chore: fix typos in the tests
* chore: fix typos in the tests
* chore: fix typos in the tests
* chore: fix typos in the tests
* chore: fix typos in the tests
* chore: fix typos in the tests
* chore: fix typos in the tests
* chore: fix typos in the tests
* chore: fix typos in the tests
* fix: format codes
* chore: fix copy mismatch issue
* fix: format codes
* chore: fix copy mismatch issue
* chore: fix copy mismatch issue
* chore: fix copy mismatch issue
* chore: restore previous words
* chore: revert unexpected changes
This commit is contained in:
@@ -2065,21 +2065,21 @@ class TokenizerTesterMixin:
|
||||
|
||||
for chunk in range(0, len(input_full_vocab_string) - 1024, 1024):
|
||||
string_to_check = input_full_vocab_string[chunk : chunk + 1024]
|
||||
with self.subTest(f"{(chunk/len(input_full_vocab_string))*100}%"):
|
||||
with self.subTest(f"{(chunk / len(input_full_vocab_string)) * 100}%"):
|
||||
slow_encode = slow_tokenizer.encode(string_to_check)
|
||||
fast_encode = rust_tokenizer.encode(string_to_check)
|
||||
self.assertEqual(
|
||||
slow_encode,
|
||||
fast_encode,
|
||||
"Hint: the following tokenization diff were obtained for slow vs fast:\n "
|
||||
f"elements in slow: {set(slow_tokenizer.tokenize(string_to_check))-set(rust_tokenizer.tokenize(string_to_check))} \nvs\n "
|
||||
f"elements in fast: {set(rust_tokenizer.tokenize(string_to_check))-set(slow_tokenizer.tokenize(string_to_check))} \n"
|
||||
f"elements in slow: {set(slow_tokenizer.tokenize(string_to_check)) - set(rust_tokenizer.tokenize(string_to_check))} \nvs\n "
|
||||
f"elements in fast: {set(rust_tokenizer.tokenize(string_to_check)) - set(slow_tokenizer.tokenize(string_to_check))} \n"
|
||||
f"string used : {string_to_check}",
|
||||
)
|
||||
print(f"Length of the input ids that is tested: {len(input_full_vocab_ids)}")
|
||||
for chunk in range(0, len(input_full_vocab_ids) - 100, 100):
|
||||
ids_to_decode = input_full_vocab_ids[chunk : chunk + 100]
|
||||
with self.subTest(f"{(chunk/len(input_full_vocab_string))*100}%"):
|
||||
with self.subTest(f"{(chunk / len(input_full_vocab_string)) * 100}%"):
|
||||
self.assertEqual(
|
||||
slow_tokenizer.decode(
|
||||
ids_to_decode,
|
||||
@@ -4423,7 +4423,7 @@ class TokenizerTesterMixin:
|
||||
self.assertTrue(
|
||||
find,
|
||||
f"'{special_token.__repr__()}' should appear as an `AddedToken` in the all_special_tokens_extended = "
|
||||
f"{[k for k in new_tokenizer.all_special_tokens_extended if str(k)==new_special_token_str]} but it is missing"
|
||||
f"{[k for k in new_tokenizer.all_special_tokens_extended if str(k) == new_special_token_str]} but it is missing"
|
||||
", this means that the new tokenizers did not keep the `rstrip`, `lstrip`, `normalized` etc attributes.",
|
||||
)
|
||||
elif special_token not in special_tokens_map:
|
||||
|
||||
Reference in New Issue
Block a user