From 8e5f76f51196bb3b537a8d58c986819bb103f0a8 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Wed, 17 Apr 2024 17:17:50 +0200 Subject: [PATCH] Upgrading to tokenizers 0.19.0 (#30289) * [DO NOT MERGE] Testing tokenizers 0.19.0rc0 * Accounting for the breaking change. * Ruff. * Upgrading to tokenizers `0.19` (new release with preprend_scheme fixed and new surface for BPE tiktoken bug). --- .../language-modeling/t5_tokenizer_model.py | 8 ++++-- setup.py | 2 +- src/transformers/convert_slow_tokenizer.py | 27 ++++++++++++------- src/transformers/dependency_versions_table.py | 2 +- 4 files changed, 26 insertions(+), 13 deletions(-) diff --git a/examples/flax/language-modeling/t5_tokenizer_model.py b/examples/flax/language-modeling/t5_tokenizer_model.py index fbccd52bd8..b55c2c95d9 100755 --- a/examples/flax/language-modeling/t5_tokenizer_model.py +++ b/examples/flax/language-modeling/t5_tokenizer_model.py @@ -46,12 +46,16 @@ class SentencePieceUnigramTokenizer(BaseTokenizer): ) tokenizer.pre_tokenizer = pre_tokenizers.Sequence( [ - pre_tokenizers.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space), + pre_tokenizers.Metaspace( + replacement=replacement, add_prefix_space="always" if add_prefix_space else "never" + ), pre_tokenizers.Digits(individual_digits=True), pre_tokenizers.Punctuation(), ] ) - tokenizer.decoder = decoders.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space) + tokenizer.decoder = decoders.Metaspace( + replacement=replacement, add_prefix_space="always" if add_prefix_space else "never" + ) tokenizer.post_processor = TemplateProcessing( single=f"$A {self.special_tokens['eos']['token']}", diff --git a/setup.py b/setup.py index 95aeea6d3a..9278a8465c 100644 --- a/setup.py +++ b/setup.py @@ -174,7 +174,7 @@ _deps = [ "tf2onnx", "timeout-decorator", "timm", - "tokenizers>=0.14,<0.19", + "tokenizers>=0.19,<0.20", "torch", "torchaudio", "torchvision", diff --git a/src/transformers/convert_slow_tokenizer.py b/src/transformers/convert_slow_tokenizer.py index 1980ba643a..88f9e5f19a 100644 --- a/src/transformers/convert_slow_tokenizer.py +++ b/src/transformers/convert_slow_tokenizer.py @@ -43,6 +43,16 @@ def import_protobuf(error_message=""): raise ImportError(PROTOBUF_IMPORT_ERROR.format(error_message)) +def _get_prepend_scheme(add_prefix_space: bool, original_tokenizer) -> str: + if add_prefix_space: + prepend_scheme = "always" + if hasattr(original_tokenizer, "legacy") and not original_tokenizer.legacy: + prepend_scheme = "first" + else: + prepend_scheme = "never" + return prepend_scheme + + class SentencePieceExtractor: """ Extractor implementation for SentencePiece trained models. https://github.com/google/sentencepiece @@ -597,18 +607,15 @@ class SpmConverter(Converter): return normalizers.Sequence([normalizers.Precompiled(precompiled_charsmap)] + _normalizers) def pre_tokenizer(self, replacement, add_prefix_space): - prepend_scheme = "always" - if hasattr(self.original_tokenizer, "legacy") and not self.original_tokenizer.legacy: - prepend_scheme = "first" - return pre_tokenizers.Metaspace( - replacement=replacement, add_prefix_space=add_prefix_space, prepend_scheme=prepend_scheme - ) + prepend_scheme = _get_prepend_scheme(add_prefix_space, self.original_tokenizer) + return pre_tokenizers.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme) def post_processor(self): return None def decoder(self, replacement, add_prefix_space): - return decoders.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space) + prepend_scheme = _get_prepend_scheme(add_prefix_space, self.original_tokenizer) + return decoders.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme) def converted(self) -> Tokenizer: tokenizer = self.tokenizer(self.proto) @@ -722,7 +729,8 @@ class DebertaV2Converter(SpmConverter): list_pretokenizers = [] if self.original_tokenizer.split_by_punct: list_pretokenizers.append(pre_tokenizers.Punctuation(behavior="isolated")) - list_pretokenizers.append(pre_tokenizers.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space)) + prepend_scheme = _get_prepend_scheme(add_prefix_space, self.original_tokenizer) + list_pretokenizers.append(pre_tokenizers.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme)) return pre_tokenizers.Sequence(list_pretokenizers) def normalizer(self, proto): @@ -1007,10 +1015,11 @@ class PegasusConverter(SpmConverter): return proto.trainer_spec.unk_id + self.original_tokenizer.offset def pre_tokenizer(self, replacement, add_prefix_space): + prepend_scheme = _get_prepend_scheme(add_prefix_space, self.original_tokenizer) return pre_tokenizers.Sequence( [ pre_tokenizers.WhitespaceSplit(), - pre_tokenizers.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space), + pre_tokenizers.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme), ] ) diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py index d70b717f0d..d40cae189a 100644 --- a/src/transformers/dependency_versions_table.py +++ b/src/transformers/dependency_versions_table.py @@ -79,7 +79,7 @@ deps = { "tf2onnx": "tf2onnx", "timeout-decorator": "timeout-decorator", "timm": "timm", - "tokenizers": "tokenizers>=0.14,<0.19", + "tokenizers": "tokenizers>=0.19,<0.20", "torch": "torch", "torchaudio": "torchaudio", "torchvision": "torchvision",