Mitigate a conflict when using sentencepiece (#33327)

* test(tokenizers): add a test showing conflict with sentencepiece

This happens because the protobuf C implementation uses a single global
pool for all added descriptors, so descriptors added by two different
files end up conflicting.

* fix(tokenizers): mitigate sentencepiece/protobuf conflict

When sentencepiece is available, use the `sentencepiece_model_pb2` module
it ships instead of the internal one.

* chore(style): fix with ruff
This commit is contained in:
Alvaro Moran
2024-09-13 13:19:06 +02:00
committed by GitHub
parent 4b0418df11
commit 7a5659872a
2 changed files with 24 additions and 2 deletions

View File

@@ -26,7 +26,7 @@ from packaging import version
from tokenizers import AddedToken, Regex, Tokenizer, decoders, normalizers, pre_tokenizers, processors
from tokenizers.models import BPE, Unigram, WordPiece
from .utils import is_protobuf_available, logging, requires_backends
from .utils import is_protobuf_available, is_sentencepiece_available, logging, requires_backends
from .utils.import_utils import PROTOBUF_IMPORT_ERROR
@@ -34,6 +34,10 @@ logger = logging.get_logger(__name__)
def import_protobuf(error_message=""):
if is_sentencepiece_available():
from sentencepiece import sentencepiece_model_pb2
return sentencepiece_model_pb2
if is_protobuf_available():
import google.protobuf

View File

@@ -35,7 +35,15 @@ from transformers import (
is_tokenizers_available,
)
from transformers.models.gpt2.tokenization_gpt2 import GPT2Tokenizer
from transformers.testing_utils import CaptureStderr, require_flax, require_tf, require_tokenizers, require_torch, slow
from transformers.testing_utils import (
CaptureStderr,
require_flax,
require_sentencepiece,
require_tf,
require_tokenizers,
require_torch,
slow,
)
if is_tokenizers_available():
@@ -296,3 +304,13 @@ class TokenizerUtilsTest(unittest.TestCase):
self.assertEqual(len(tokenizer), tokenizer.vocab_size + 1)
self.assertEqual(len(tokenizer.added_tokens_decoder), added_tokens_size + 1)
self.assertEqual(len(tokenizer.added_tokens_encoder), added_tokens_size + 1)
@require_sentencepiece
def test_sentencepiece_cohabitation(self):
# Regression check: import_protobuf() must work after sentencepiece's own protobuf module has been imported.
from sentencepiece import sentencepiece_model_pb2 as _original_protobuf # noqa: F401
from transformers.convert_slow_tokenizer import import_protobuf # noqa: F401
# sentencepiece's sentencepiece_model_pb2 is already imported at this point. Calling import_protobuf() must
# not raise even though that protobuf module was loaded first.
# NOTE(review): the previous comment said this call imports sentencepiece_model_pb2_new.py; which module is
# actually loaded depends on import_protobuf's branches -- confirm.
import_protobuf()