From 95c10fedb338617abd7df959719d517b95b623d4 Mon Sep 17 00:00:00 2001 From: Viktor Scherbakov Date: Mon, 25 Nov 2024 18:44:09 +0100 Subject: [PATCH] Updated documentation and added conversion utility (#34319) * Updated documentation and added conversion utility * Update docs/source/en/tiktoken.md Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> * Update docs/source/en/tiktoken.md Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> * Moved util function to integration folder + allow for str * Update formatting Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> * Updated formatting * style changes --------- Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> --- docs/source/en/tiktoken.md | 22 +++++++++++ src/transformers/integrations/tiktoken.py | 45 +++++++++++++++++++++++ 2 files changed, 67 insertions(+) create mode 100644 src/transformers/integrations/tiktoken.py diff --git a/docs/source/en/tiktoken.md b/docs/source/en/tiktoken.md index 528ff4f76d..aac81e24fd 100644 --- a/docs/source/en/tiktoken.md +++ b/docs/source/en/tiktoken.md @@ -36,3 +36,25 @@ from transformers import AutoTokenizer model_id = "meta-llama/Meta-Llama-3-8B-Instruct" tokenizer = AutoTokenizer.from_pretrained(model_id, subfolder="original") ``` +## Create tiktoken tokenizer + +The `tokenizer.model` file contains no information about additional tokens or pattern strings. If these are important, convert the tokenizer to `tokenizer.json`, the appropriate format for [`PreTrainedTokenizerFast`]. + +Generate the `tokenizer.model` file with [tiktoken.get_encoding](https://github.com/openai/tiktoken/blob/63527649963def8c759b0f91f2eb69a40934e468/tiktoken/registry.py#L63) and then convert it to `tokenizer.json` with [`convert_tiktoken_to_fast`]. 
def convert_tiktoken_to_fast(encoding: Any, output_dir: str):
    """
    Converts given `tiktoken` encoding to `PreTrainedTokenizerFast` and saves the configuration of converted tokenizer
    on disk.

    Two files are written below `output_dir`: the mergeable BPE ranks of the encoding are dumped to
    `tiktoken/<TIKTOKEN_VOCAB_FILE>`, and the converted tokenizer is saved to `<TOKENIZER_FILE>`.

    Args:
        encoding (`str` or `tiktoken.Encoding`):
            Tokenizer from `tiktoken` library. If `encoding` is `str`, the tokenizer will be loaded with
            `tiktoken.get_encoding(encoding)`.
        output_dir (`str`):
            Save path for converted tokenizer configuration file. The directory (including missing parent
            directories) is created if it does not exist yet.

    Raises:
        ValueError: If `tiktoken` cannot be imported, or if dumping the vocabulary file fails because of a missing
            import. The original `ImportError` is attached as `__cause__`.
    """
    # Resolve the optional dependency first, so that a missing install fails before anything is written to disk.
    # Only the imports are guarded here: the "install tiktoken" hint must not mask unrelated import failures.
    try:
        from tiktoken import get_encoding
        from tiktoken.load import dump_tiktoken_bpe
    except ImportError as err:
        raise ValueError(
            "`tiktoken` is required to save a `tiktoken` file. Install it with `pip install tiktoken`."
        ) from err

    if isinstance(encoding, str):
        encoding = get_encoding(encoding)

    output_dir = Path(output_dir)
    save_file = output_dir / "tiktoken" / TIKTOKEN_VOCAB_FILE
    tokenizer_file = output_dir / TOKENIZER_FILE

    # Creating the parent of the vocab file also creates `output_dir` itself; `parents=True` makes nested output
    # paths work, and the `tiktoken` subdirectory no longer depends on the writer creating it implicitly.
    save_file.parent.mkdir(parents=True, exist_ok=True)

    save_file_absolute = str(save_file.absolute())
    output_file_absolute = str(tokenizer_file.absolute())

    try:
        dump_tiktoken_bpe(encoding._mergeable_ranks, save_file_absolute)
    except ImportError as err:
        # NOTE(review): `dump_tiktoken_bpe` may raise `ImportError` itself (e.g. for a dependency it imports
        # lazily). Keep the `ValueError` type callers already handle, but report the actual cause.
        raise ValueError(f"Could not write the `tiktoken` vocabulary file: {err}") from err

    tokenizer = TikTokenConverter(
        vocab_file=save_file_absolute,
        pattern=encoding._pat_str,
        additional_special_tokens=encoding._special_tokens,
    ).tokenizer()
    tokenizer.save(output_file_absolute)