From 7a5659872a68ce9939c975b5727e5ac61136f256 Mon Sep 17 00:00:00 2001 From: Alvaro Moran <6949769+tengomucho@users.noreply.github.com> Date: Fri, 13 Sep 2024 13:19:06 +0200 Subject: [PATCH] Mitigate a conflict when using sentencepiece (#33327) * test(tokenizers): add a test showing conflict with sentencepiece This is due to the fact that protobuf C implementation uses a global pool for all added descriptors, so if two different files add descriptors, they will end up conflicting. * fix(tokenizers): mitigate sentencepiece/protobuf conflict When sentencepiece is available, use that protobuf instead of the internal one. * chore(style): fix with ruff --- src/transformers/convert_slow_tokenizer.py | 6 +++++- tests/tokenization/test_tokenization_utils.py | 20 ++++++++++++++++++- 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/src/transformers/convert_slow_tokenizer.py b/src/transformers/convert_slow_tokenizer.py index f2064a131d..eb75a46a6d 100644 --- a/src/transformers/convert_slow_tokenizer.py +++ b/src/transformers/convert_slow_tokenizer.py @@ -26,7 +26,7 @@ from packaging import version from tokenizers import AddedToken, Regex, Tokenizer, decoders, normalizers, pre_tokenizers, processors from tokenizers.models import BPE, Unigram, WordPiece -from .utils import is_protobuf_available, logging, requires_backends +from .utils import is_protobuf_available, is_sentencepiece_available, logging, requires_backends from .utils.import_utils import PROTOBUF_IMPORT_ERROR @@ -34,6 +34,10 @@ logger = logging.get_logger(__name__) def import_protobuf(error_message=""): + if is_sentencepiece_available(): + from sentencepiece import sentencepiece_model_pb2 + + return sentencepiece_model_pb2 if is_protobuf_available(): import google.protobuf diff --git a/tests/tokenization/test_tokenization_utils.py b/tests/tokenization/test_tokenization_utils.py index f97ef6a630..b43923df84 100644 --- a/tests/tokenization/test_tokenization_utils.py +++ 
b/tests/tokenization/test_tokenization_utils.py @@ -35,7 +35,15 @@ from transformers import ( is_tokenizers_available, ) from transformers.models.gpt2.tokenization_gpt2 import GPT2Tokenizer -from transformers.testing_utils import CaptureStderr, require_flax, require_tf, require_tokenizers, require_torch, slow +from transformers.testing_utils import ( + CaptureStderr, + require_flax, + require_sentencepiece, + require_tf, + require_tokenizers, + require_torch, + slow, +) if is_tokenizers_available(): @@ -296,3 +304,13 @@ class TokenizerUtilsTest(unittest.TestCase): self.assertEqual(len(tokenizer), tokenizer.vocab_size + 1) self.assertEqual(len(tokenizer.added_tokens_decoder), added_tokens_size + 1) self.assertEqual(len(tokenizer.added_tokens_encoder), added_tokens_size + 1) + + @require_sentencepiece + def test_sentencepiece_cohabitation(self): + from sentencepiece import sentencepiece_model_pb2 as _original_protobuf # noqa: F401 + + from transformers.convert_slow_tokenizer import import_protobuf # noqa: F401 + + # With sentencepiece installed, import_protobuf() returns sentencepiece's own sentencepiece_model_pb2 + # rather than the bundled copy, so this must not fail even though that module was already imported. + import_protobuf()