From 95c10fedb338617abd7df959719d517b95b623d4 Mon Sep 17 00:00:00 2001
From: Viktor Scherbakov <viktoroo.sch@gmail.com>
Date: Mon, 25 Nov 2024 18:44:09 +0100
Subject: [PATCH] Updated documentation and added conversion utility (#34319)

* Updated documentation and added conversion utility

* Update docs/source/en/tiktoken.md

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>

* Update docs/source/en/tiktoken.md

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>

* Moved util function to integration folder + allow for str

* Update formatting

Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>

* Updated formatting

* style changes

---------

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>
Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>
---
 docs/source/en/tiktoken.md                | 22 +++++++++++
 src/transformers/integrations/tiktoken.py | 45 +++++++++++++++++++++++
 2 files changed, 67 insertions(+)
 create mode 100644 src/transformers/integrations/tiktoken.py

diff --git a/docs/source/en/tiktoken.md b/docs/source/en/tiktoken.md
index 528ff4f76d..aac81e24fd 100644
--- a/docs/source/en/tiktoken.md
+++ b/docs/source/en/tiktoken.md
@@ -36,3 +36,25 @@ from transformers import AutoTokenizer
 model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
 tokenizer = AutoTokenizer.from_pretrained(model_id, subfolder="original") 
 ```
+## Create tiktoken tokenizer
+
+The `tokenizer.model` file contains no information about additional tokens or pattern strings. If these are important, convert the tokenizer to `tokenizer.json`, the appropriate format for [`PreTrainedTokenizerFast`].
+
+Generate the `tokenizer.model` file with [tiktoken.get_encoding](https://github.com/openai/tiktoken/blob/63527649963def8c759b0f91f2eb69a40934e468/tiktoken/registry.py#L63) and then convert it to `tokenizer.json` with [`convert_tiktoken_to_fast`].
+
+```py
+
+from transformers.integrations.tiktoken import convert_tiktoken_to_fast
+from tiktoken import get_encoding
+
+# You can load your custom encoding or the one provided by OpenAI
+encoding = get_encoding("gpt2")
+convert_tiktoken_to_fast(encoding, "config/save/dir")
+```
+
+The resulting `tokenizer.json` file is saved to the specified directory and can be loaded with [`PreTrainedTokenizerFast`].
+
+```py
+tokenizer = PreTrainedTokenizerFast.from_pretrained("config/save/dir")
+```
+
diff --git a/src/transformers/integrations/tiktoken.py b/src/transformers/integrations/tiktoken.py
new file mode 100644
index 0000000000..60f7339284
--- /dev/null
+++ b/src/transformers/integrations/tiktoken.py
@@ -0,0 +1,45 @@
+from pathlib import Path
+from typing import Any
+
+from transformers.convert_slow_tokenizer import TikTokenConverter
+from transformers.tokenization_utils_fast import TIKTOKEN_VOCAB_FILE, TOKENIZER_FILE
+
+
+def convert_tiktoken_to_fast(encoding: Any, output_dir: str):
+    """
+    Converts given `tiktoken` encoding to `PretrainedTokenizerFast` and saves the configuration of converted tokenizer
+    on disk.
+
+    Args:
+        encoding (`str` or `tiktoken.Encoding`):
+            Tokenizer from `tiktoken` library. If `encoding` is `str`, the tokenizer will be loaded with
+            `tiktoken.get_encoding(encoding)`.
+        output_dir (`str`):
+            Save path for converted tokenizer configuration file.
+    """
+    output_dir = Path(output_dir)
+    output_dir.mkdir(exist_ok=True)
+
+    save_file = output_dir / "tiktoken" / TIKTOKEN_VOCAB_FILE
+    tokenizer_file = output_dir / TOKENIZER_FILE
+
+    save_file_absolute = str(save_file.absolute())
+    output_file_absolute = str(tokenizer_file.absolute())
+
+    try:
+        from tiktoken import get_encoding
+        from tiktoken.load import dump_tiktoken_bpe
+
+        if isinstance(encoding, str):
+            encoding = get_encoding(encoding)
+
+        dump_tiktoken_bpe(encoding._mergeable_ranks, save_file_absolute)
+    except ImportError:
+        raise ValueError(
+            "`tiktoken` is required to save a `tiktoken` file. Install it with " "`pip install tiktoken`."
+        )
+
+    tokenizer = TikTokenConverter(
+        vocab_file=save_file_absolute, pattern=encoding._pat_str, additional_special_tokens=encoding._special_tokens
+    ).tokenizer()
+    tokenizer.save(output_file_absolute)