Mitigate a conflict when using sentencepiece (#33327)
* test(tokenizers): add a test showing conflict with sentencepiece

  This is due to the fact that the protobuf C implementation uses a global pool for all added descriptors, so if two different files add descriptors, they will end up conflicting.

* fix(tokenizers): mitigate sentencepiece/protobuf conflict

  When sentencepiece is available, use that protobuf instead of the internal one.

* chore(style): fix with ruff
This commit is contained in:
@@ -35,7 +35,15 @@ from transformers import (
|
||||
is_tokenizers_available,
|
||||
)
|
||||
from transformers.models.gpt2.tokenization_gpt2 import GPT2Tokenizer
|
||||
from transformers.testing_utils import CaptureStderr, require_flax, require_tf, require_tokenizers, require_torch, slow
|
||||
from transformers.testing_utils import (
|
||||
CaptureStderr,
|
||||
require_flax,
|
||||
require_sentencepiece,
|
||||
require_tf,
|
||||
require_tokenizers,
|
||||
require_torch,
|
||||
slow,
|
||||
)
|
||||
|
||||
|
||||
if is_tokenizers_available():
|
||||
@@ -296,3 +304,13 @@ class TokenizerUtilsTest(unittest.TestCase):
|
||||
self.assertEqual(len(tokenizer), tokenizer.vocab_size + 1)
|
||||
self.assertEqual(len(tokenizer.added_tokens_decoder), added_tokens_size + 1)
|
||||
self.assertEqual(len(tokenizer.added_tokens_encoder), added_tokens_size + 1)
|
||||
|
||||
@require_sentencepiece
|
||||
def test_sentencepiece_cohabitation(self):
|
||||
from sentencepiece import sentencepiece_model_pb2 as _original_protobuf # noqa: F401
|
||||
|
||||
from transformers.convert_slow_tokenizer import import_protobuf # noqa: F401
|
||||
|
||||
# Now this will try to import sentencepiece_model_pb2_new.py. This should not fail even if the protobuf
|
||||
# was already imported.
|
||||
import_protobuf()
|
||||
|
||||
Reference in New Issue
Block a user