[tokenizers] Ensure that add_prefix_space is propagated to backend_tokenizer.pre_tokenizer (#35593)
* Ensure that add_prefix_space is propagated to backend_tokenizer.pre_tokenizer
in PreTrainedTokenizerFast, rather than relying on subclasses to take care of this.
* Simplify setting self.add_prefix_space, ensure pre_tok exists
* Wrap in try-except to catch 'Custom PreTokenizer cannot be serialized'
The code at 862d1a346a/bindings/python/src/pre_tokenizers.rs (L672) raises this exception. It is triggered by the RoFormer tests, because RoFormerTokenizerFast uses a custom PreTokenizer.
* Propagate add_prefix_space in T5TokenizerFast to superclass
This commit is contained in:
@@ -4684,3 +4684,15 @@ class TokenizerTesterMixin:
|
||||
|
||||
with self.assertRaises(AttributeError, msg="conflicts with the method"):
|
||||
get_tokenizer_func(get_vocab=True)
|
||||
|
||||
@parameterized.expand([(True,), (False,)])
def test_rust_tokenizer_add_prefix_space(self, add_prefix_space):
    """Check that `add_prefix_space` passed to `from_pretrained` is honoured.

    Runs once for each parameterized value (`True` and `False`). For every
    entry of `self.tokenizers_list`, the tokenizer is loaded with the given
    `add_prefix_space` and the test asserts that:

    * the loaded tokenizer's own `add_prefix_space` attribute equals it, and
    * when the backend pre-tokenizer exposes an `add_prefix_space`
      attribute, that attribute equals it as well.

    The test is skipped when `self.test_rust_tokenizer` is falsy.
    """
    if not self.test_rust_tokenizer:
        self.skipTest(reason="test_rust_tokenizer is set to False")

    # Sentinel used to tell "attribute absent" apart from any real value.
    attr_missing = object()

    for tokenizer_cls, checkpoint, _ in self.tokenizers_list:
        rust_tokenizer = tokenizer_cls.from_pretrained(checkpoint, add_prefix_space=add_prefix_space)
        self.assertEqual(rust_tokenizer.add_prefix_space, add_prefix_space)

        # Only the ByteLevel pre-tokenizer has the `add_prefix_space` attribute,
        # so the backend check is applied only when the attribute is present.
        backend_value = getattr(
            rust_tokenizer.backend_tokenizer.pre_tokenizer, "add_prefix_space", attr_missing
        )
        if backend_value is not attr_missing:
            self.assertEqual(backend_value, add_prefix_space)
|
||||
|
||||
Reference in New Issue
Block a user