From c6379858f39b64a475e3400f184c2651edcea968 Mon Sep 17 00:00:00 2001 From: Arthur <48595927+ArthurZucker@users.noreply.github.com> Date: Wed, 25 Sep 2024 13:47:20 +0200 Subject: [PATCH] bump tokenizers, fix added tokens fast (#32535) * update based on tokenizers release * update * nits * update * revert re addition * don't break that yet * fmt * revert unwanted * update tokenizers version * update dep table * update * update in conversion script as well * some fix * revert * fully revert * fix training * remove set trace * fixup * update * update --- setup.py | 2 +- src/transformers/convert_slow_tokenizer.py | 33 ++++--------------- src/transformers/dependency_versions_table.py | 2 +- src/transformers/tokenization_utils_fast.py | 33 +++++++++---------- 4 files changed, 24 insertions(+), 46 deletions(-) diff --git a/setup.py b/setup.py index 6ea9b19261..b563b11b62 100644 --- a/setup.py +++ b/setup.py @@ -181,7 +181,7 @@ _deps = [ "timeout-decorator", "tiktoken", "timm<=0.9.16", - "tokenizers>=0.19,<0.20", + "tokenizers>=0.20,<0.21", "torch", "torchaudio", "torchvision", diff --git a/src/transformers/convert_slow_tokenizer.py b/src/transformers/convert_slow_tokenizer.py index eb75a46a6d..21876c7f61 100644 --- a/src/transformers/convert_slow_tokenizer.py +++ b/src/transformers/convert_slow_tokenizer.py @@ -609,33 +609,12 @@ class SpmConverter(Converter): for id, p in enumerate(proto.pieces) if p.type in [3, 4] ] - tokens_to_add = [ - AddedToken(token, normalized=False, special=special) - for id, token, special in sorted(spm_added_tokens, key=lambda x: x[0]) - ] - - if len(tokens_to_add) > 0: - # super hack: if a token.special is set, tokenizer ignores it for now so FIXME @ArthurZ - # Accumulate added tokens into batches of special/non-special tokens, because calling add_tokens() for - # individual tokens would repeatedly rebuild a trie, which can be slow. - is_last_special = None - tokens = [] - for token in tokens_to_add: - is_special = token.special - if is_last_special is None or is_last_special == is_special: - tokens.append(token) - else: - if is_last_special: - tokenizer.add_special_tokens(tokens) - else: - tokenizer.add_tokens(tokens) - tokens = [token] - is_last_special = is_special - if tokens: - if is_last_special: - tokenizer.add_special_tokens(tokens) - else: - tokenizer.add_tokens(tokens) + tokenizer.add_tokens( + [ + AddedToken(token, normalized=False, special=special) + for id, token, special in sorted(spm_added_tokens, key=lambda x: x[0]) + ] + ) return tokenizer diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py index 2634a7b6b3..6564e07903 100644 --- a/src/transformers/dependency_versions_table.py +++ b/src/transformers/dependency_versions_table.py @@ -86,7 +86,7 @@ deps = { "timeout-decorator": "timeout-decorator", "tiktoken": "tiktoken", "timm": "timm<=0.9.16", - "tokenizers": "tokenizers>=0.19,<0.20", + "tokenizers": "tokenizers>=0.20,<0.21", "torch": "torch", "torchaudio": "torchaudio", "torchvision": "torchvision", diff --git a/src/transformers/tokenization_utils_fast.py b/src/transformers/tokenization_utils_fast.py index 94815caf35..cec91e038d 100644 --- a/src/transformers/tokenization_utils_fast.py +++ b/src/transformers/tokenization_utils_fast.py @@ -175,15 +175,8 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase): # We call this after having initialized the backend tokenizer because we update it. super().__init__(**kwargs) - - # Set the splitting mode for special tokens for the tokenizer to be used throughout the class. self._tokenizer.encode_special_tokens = self.split_special_tokens - # The following logic will be replace with a single add_tokens once a fix is pushed to tokenizers - # allows converting a slow -> fast, non-legacy: if the `tokenizer.json` does not have all the added tokens - # uses the information stored in `added_tokens_decoder`. - # this is costly for fast tokenizers as we re-compute the regex again. But not all tokens are added tokens - # Use hash to speed up the very slow operation `token not in added_tokens_decoder`. added_tokens_decoder_hash = {hash(repr(token)) for token in self.added_tokens_decoder} tokens_to_add = [ token @@ -197,10 +190,6 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase): ] if len(tokens_to_add) > 0: - # super hack: if a token.special is set, tokenizer ignores it for now so FIXME @ArthurZ - # Accumulate added tokens into batches of special/non-special tokens, because calling add_tokens() for - # individual tokens would repeatedly rebuild a trie, which can be slow. - is_last_special = None tokens = [] special_tokens = self.all_special_tokens for token in tokens_to_add: @@ -209,14 +198,13 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase): if isinstance(token, AddedToken) else str(token) in special_tokens ) - if is_last_special is None or is_last_special == is_special: - tokens.append(token) + if isinstance(token, str): + token = AddedToken(token, special=is_special) else: - self._add_tokens(tokens, special_tokens=is_last_special) - tokens = [token] - is_last_special = is_special + token.special = is_special + tokens.append(token) if tokens: - self._add_tokens(tokens, special_tokens=is_last_special) + self.add_tokens(tokens) @property def is_fast(self) -> bool: @@ -849,6 +837,13 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase): if special_tokens_map is not None: tokens = [special_tokens_map.get(token, token) for token in tokens] post_processor["special_tokens"][key]["tokens"] = tokens + for token in tokens: + token_id = tokenizer.token_to_id(token) + if token_id is None: + raise ValueError( + "Attempted to set a token in the post processor that does not exist in the mapping" + ) + post_processor["special_tokens"][key]["ids"] = [tokenizer.token_to_id(token) for token in tokens] for special_token in ["cls", "sep"]: @@ -857,6 +852,10 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase): if special_tokens_map is not None and token in special_tokens_map: token = special_tokens_map[token] token_id = tokenizer.token_to_id(token) + if token_id is None: + raise ValueError( + "Attempted to set a token in the post processor that does not exist in the mapping" + ) post_processor[special_token] = [token, token_id] trained_tokenizer_json["post_processor"] = post_processor