diff --git a/docker/consistency.dockerfile b/docker/consistency.dockerfile index 21e0de8830..1f09626d89 100644 --- a/docker/consistency.dockerfile +++ b/docker/consistency.dockerfile @@ -13,4 +13,4 @@ RUN uv pip install --no-cache-dir "git+https://github.com/huggingface/transforme RUN git lfs install RUN pip uninstall -y transformers -RUN apt-get clean && rm -rf /var/lib/apt/lists/* && apt-get autoremove && apt-get autoclean +RUN apt-get clean && rm -rf /var/lib/apt/lists/* && apt-get autoremove && apt-get autoclean \ No newline at end of file diff --git a/docker/torch-light.dockerfile b/docker/torch-light.dockerfile index 524a68fd55..710a599abb 100644 --- a/docker/torch-light.dockerfile +++ b/docker/torch-light.dockerfile @@ -6,6 +6,6 @@ RUN apt-get update && apt-get install -y --no-install-recommends libsndfile1-de ENV UV_PYTHON=/usr/local/bin/python RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools RUN pip install --no-cache-dir 'torch' 'torchvision' 'torchaudio' --index-url https://download.pytorch.org/whl/cpu -RUN uv pip install --no-deps timm accelerate --extra-index-url https://download.pytorch.org/whl/cpu -RUN uv pip install --no-cache-dir librosa "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,sentencepiece,vision,testing]" +RUN uv pip install --no-deps timm accelerate --extra-index-url https://download.pytorch.org/whl/cpu +RUN uv pip install --no-cache-dir librosa "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,sentencepiece,vision,testing,tiktoken]" RUN pip uninstall -y transformers \ No newline at end of file diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index bf8c20186d..abcfac64d0 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -145,6 +145,8 @@ title: Troubleshoot - local: gguf title: Interoperability with GGUF files + - local: tiktoken + title: Interoperability with TikToken files title: Developer guides - sections: - local: quantization/overview diff --git a/docs/source/en/tiktoken.md b/docs/source/en/tiktoken.md new file mode 100644 index 0000000000..528ff4f76d --- /dev/null +++ b/docs/source/en/tiktoken.md @@ -0,0 +1,38 @@ + + +# Tiktoken and interaction with Transformers + +Support for tiktoken model files is seamlessly integrated in 🤗 transformers when loading models +`from_pretrained` with a `tokenizer.model` tiktoken file on the Hub, which is automatically converted into our +[fast tokenizer](https://huggingface.co/docs/transformers/main/en/main_classes/tokenizer#transformers.PreTrainedTokenizerFast). + +### Known models that were released with a `tiktoken.model`: + - gpt2 + - llama3 + +## Example usage + +In order to load `tiktoken` files in `transformers`, ensure that the `tokenizer.model` file is a tiktoken file and it +will automatically be loaded when loading `from_pretrained`. Here is how one would load a tokenizer and a model, which + can be loaded from the exact same file: + +```py +from transformers import AutoTokenizer + +model_id = "meta-llama/Meta-Llama-3-8B-Instruct" +tokenizer = AutoTokenizer.from_pretrained(model_id, subfolder="original") +``` diff --git a/setup.py b/setup.py index 2b736d5ddd..43d051df8b 100644 --- a/setup.py +++ b/setup.py @@ -99,6 +99,7 @@ _deps = [ "accelerate>=0.26.0", "av==9.2.0", # Latest version of PyAV (10.0.0) has issues with audio stream. "beautifulsoup4", + "blobfile", "codecarbon==1.2.0", "cookiecutter==1.7.3", "dataclasses", @@ -177,6 +178,7 @@ _deps = [ "tensorflow-probability<0.24", "tf2onnx", "timeout-decorator", + "tiktoken", "timm<=0.9.16", "tokenizers>=0.19,<0.20", "torch", @@ -311,6 +313,7 @@ extras["codecarbon"] = deps_list("codecarbon") extras["video"] = deps_list("decord", "av") extras["sentencepiece"] = deps_list("sentencepiece", "protobuf") +extras["tiktoken"] = deps_list("tiktoken", "blobfile") extras["testing"] = ( deps_list( "pytest", diff --git a/src/transformers/convert_slow_tokenizer.py b/src/transformers/convert_slow_tokenizer.py index 2d0302d3f6..297da4f57c 100644 --- a/src/transformers/convert_slow_tokenizer.py +++ b/src/transformers/convert_slow_tokenizer.py @@ -26,10 +26,13 @@ from packaging import version from tokenizers import AddedToken, Regex, Tokenizer, decoders, normalizers, pre_tokenizers, processors from tokenizers.models import BPE, Unigram, WordPiece -from .utils import is_protobuf_available, requires_backends +from .utils import is_protobuf_available, logging, requires_backends from .utils.import_utils import PROTOBUF_IMPORT_ERROR +logger = logging.get_logger(__name__) + + def import_protobuf(error_message=""): if is_protobuf_available(): import google.protobuf @@ -1451,12 +1454,15 @@ class TikTokenConverter: vocab_file=None, pattern=r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+""", add_prefix_space=False, + additional_special_tokens=None, *args, + **kwargs, ): super().__init__(*args) self.vocab_file = vocab_file self.pattern = pattern self.add_prefix_space = add_prefix_space + self.additional_special_tokens = additional_special_tokens def extract_vocab_merges_from_model(self, tiktoken_url: str): try: @@ -1505,7 +1511,10 @@ class TikTokenConverter: ] ) tokenizer.decoder = decoders.ByteLevel() + tokenizer.add_special_tokens(self.additional_special_tokens) + tokenizer.post_processor = processors.ByteLevel(trim_offsets=False) + return tokenizer @@ -1569,7 +1578,7 @@ SLOW_TO_FAST_CONVERTERS = { } -def convert_slow_tokenizer(transformer_tokenizer) -> Tokenizer: +def convert_slow_tokenizer(transformer_tokenizer, from_tiktoken=False) -> Tokenizer: """ Utilities to convert a slow tokenizer instance in a fast tokenizer instance. @@ -1577,6 +1586,8 @@ def convert_slow_tokenizer(transformer_tokenizer) -> Tokenizer: transformer_tokenizer ([`~tokenization_utils_base.PreTrainedTokenizer`]): Instance of a slow tokenizer to convert in the backend tokenizer for [`~tokenization_utils_base.PreTrainedTokenizerFast`]. + from_tiktoken (bool, optional): Whether to use the `tiktoken` library to convert the tokenizer instead of sentencepiece. + Defaults to False. Return: A instance of [`~tokenizers.Tokenizer`] to be used as the backend tokenizer of a @@ -1584,14 +1595,20 @@ def convert_slow_tokenizer(transformer_tokenizer) -> Tokenizer: """ tokenizer_class_name = transformer_tokenizer.__class__.__name__ + if tokenizer_class_name in SLOW_TO_FAST_CONVERTERS and not from_tiktoken: + converter_class = SLOW_TO_FAST_CONVERTERS[tokenizer_class_name] + return converter_class(transformer_tokenizer).converted() - if tokenizer_class_name not in SLOW_TO_FAST_CONVERTERS: - raise ValueError( - f"An instance of tokenizer class {tokenizer_class_name} cannot be converted in a Fast tokenizer instance." - " No converter was found. Currently available slow->fast convertors:" - f" {list(SLOW_TO_FAST_CONVERTERS.keys())}" - ) - - converter_class = SLOW_TO_FAST_CONVERTERS[tokenizer_class_name] - - return converter_class(transformer_tokenizer).converted() + else: + try: + logger.info("Converting from Tiktoken") + return TikTokenConverter( + vocab_file=transformer_tokenizer.vocab_file, + additional_special_tokens=transformer_tokenizer.additional_special_tokens, + ).converted() + except Exception: + raise ValueError( + f"Converting from Tiktoken failed, if a converter for SentencePiece is available, provide a model path " + f"with a SentencePiece tokenizer.model file." + f"Currently available slow->fast convertors: {list(SLOW_TO_FAST_CONVERTERS.keys())}" + ) diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py index 83fb6797f8..23d686efd5 100644 --- a/src/transformers/dependency_versions_table.py +++ b/src/transformers/dependency_versions_table.py @@ -6,6 +6,7 @@ deps = { "accelerate": "accelerate>=0.26.0", "av": "av==9.2.0", "beautifulsoup4": "beautifulsoup4", + "blobfile": "blobfile", "codecarbon": "codecarbon==1.2.0", "cookiecutter": "cookiecutter==1.7.3", "dataclasses": "dataclasses", @@ -82,6 +83,7 @@ deps = { "tensorflow-probability": "tensorflow-probability<0.24", "tf2onnx": "tf2onnx", "timeout-decorator": "timeout-decorator", + "tiktoken": "tiktoken", "timm": "timm<=0.9.16", "tokenizers": "tokenizers>=0.19,<0.20", "torch": "torch", diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py index b6dfa85b1d..46f0c2f356 100644 --- a/src/transformers/testing_utils.py +++ b/src/transformers/testing_utils.py @@ -114,6 +114,7 @@ from .utils import ( is_tensorflow_text_available, is_tf2onnx_available, is_tf_available, + is_tiktoken_available, is_timm_available, is_tokenizers_available, is_torch_available, @@ -1228,6 +1229,13 @@ def require_cython(test_case): return unittest.skipUnless(is_cython_available(), "test requires cython")(test_case) +def require_tiktoken(test_case): + """ + Decorator marking a test that requires TikToken. These tests are skipped when TikToken isn't installed. + """ + return unittest.skipUnless(is_tiktoken_available(), "test requires TikToken")(test_case) + + def get_gpu_count(): """ Return the number of available gpus (regardless of whether torch, tf or jax is used) diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index 3b2704498c..5e9170456a 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -53,6 +53,7 @@ from .utils import ( is_mlx_available, is_numpy_array, is_offline_mode, + is_protobuf_available, is_remote_url, is_tf_available, is_tf_tensor, @@ -65,6 +66,7 @@ from .utils import ( to_py_obj, ) from .utils.chat_template_utils import _compile_jinja_template, _render_with_assistant_indices +from .utils.import_utils import PROTOBUF_IMPORT_ERROR if TYPE_CHECKING: @@ -75,6 +77,16 @@ if TYPE_CHECKING: if is_flax_available(): import jax.numpy as jnp # noqa: F401 + +def import_protobuf_decode_error(error_message=""): + if is_protobuf_available(): + from google.protobuf.message import DecodeError + + return DecodeError + else: + raise ImportError(PROTOBUF_IMPORT_ERROR.format(error_message)) + + if is_tokenizers_available(): from tokenizers import AddedToken from tokenizers import Encoding as EncodingFast @@ -2434,6 +2446,19 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin): # Instantiate the tokenizer. try: tokenizer = cls(*init_inputs, **init_kwargs) + except import_protobuf_decode_error(): + logger.info( + "Unable to load tokenizer model from SPM, loading from TikToken will be attempted instead." + "(Google protobuf error: Tried to load SPM model with non-SPM vocab file).", + ) + return False + except RuntimeError as e: + if "sentencepiece_processor.cc" in str(e): + logger.info( + "Unable to load tokenizer model from SPM, loading from TikToken will be attempted instead." + "(SentencePiece RuntimeError: Tried to load SPM model with non-SPM vocab file).", + ) + return False except OSError: raise OSError( "Unable to load vocabulary from file. " diff --git a/src/transformers/tokenization_utils_fast.py b/src/transformers/tokenization_utils_fast.py index dec3ccd343..7d5446d7cb 100644 --- a/src/transformers/tokenization_utils_fast.py +++ b/src/transformers/tokenization_utils_fast.py @@ -54,6 +54,7 @@ logger = logging.get_logger(__name__) TOKENIZER_FILE = "tokenizer.json" SPECIAL_TOKENS_MAP_FILE = "special_tokens_map.json" TOKENIZER_CONFIG_FILE = "tokenizer_config.json" +TIKTOKEN_VOCAB_FILE = "tokenizer.model" # Slow tokenizers have an additional added tokens files ADDED_TOKENS_FILE = "added_tokens.json" @@ -74,7 +75,7 @@ MODEL_TO_TRAINER_MAPPING = { "WordPiece": WordPieceTrainer, } -VOCAB_FILES_NAMES = {"tokenizer_file": TOKENIZER_FILE} +VOCAB_FILES_NAMES = {"tokenizer_file": TOKENIZER_FILE, "vocab_file": TIKTOKEN_VOCAB_FILE} @add_end_docstrings(INIT_TOKENIZER_DOCSTRING) @@ -113,7 +114,7 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase): elif fast_tokenizer_file is not None and not from_slow: # We have a serialization from tokenizers which let us directly build the backend fast_tokenizer = TokenizerFast.from_file(fast_tokenizer_file) - elif slow_tokenizer is not None: + elif slow_tokenizer: # We need to convert a slow tokenizer to build the backend fast_tokenizer = convert_slow_tokenizer(slow_tokenizer) elif gguf_file is not None: @@ -123,22 +124,26 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase): tokenizer_dict = gguf_param["tokenizer"] tokenizer_config = gguf_param["tokenizer_config"] fast_tokenizer, additional_kwargs = convert_gguf_tokenizer(architecture, tokenizer_dict) - kwargs.update(tokenizer_config) if len(additional_kwargs) > 0: kwargs.update(additional_kwargs) - - elif self.slow_tokenizer_class is not None: + elif self.slow_tokenizer_class is not None and slow_tokenizer is not False: # We need to create and convert a slow tokenizer to build the backend slow_tokenizer = self.slow_tokenizer_class(*args, **kwargs) fast_tokenizer = convert_slow_tokenizer(slow_tokenizer) + elif not slow_tokenizer: + # We tried loading a slow_tokenizer with spm and failed, try to load with tiktoken + self.vocab_file = kwargs.get("vocab_file", None) + self.additional_special_tokens = kwargs.get("additional_special_tokens", []) + fast_tokenizer = convert_slow_tokenizer(self, from_tiktoken=True) + slow_tokenizer = None else: raise ValueError( "Couldn't instantiate the backend tokenizer from one of: \n" "(1) a `tokenizers` library serialization file, \n" "(2) a slow tokenizer instance to convert or \n" "(3) an equivalent slow tokenizer class to instantiate and convert. \n" - "You need to have sentencepiece installed to convert a slow tokenizer to a fast one." + "You need to have sentencepiece or tiktoken installed to convert a slow tokenizer to a fast one." ) self._tokenizer = fast_tokenizer diff --git a/src/transformers/utils/__init__.py b/src/transformers/utils/__init__.py index 27d102a9fd..dc8e8c88f2 100755 --- a/src/transformers/utils/__init__.py +++ b/src/transformers/utils/__init__.py @@ -188,6 +188,7 @@ from .import_utils import ( is_tensorflow_text_available, is_tf2onnx_available, is_tf_available, + is_tiktoken_available, is_timm_available, is_tokenizers_available, is_torch_available, diff --git a/src/transformers/utils/import_utils.py b/src/transformers/utils/import_utils.py index 8ae133d0ff..3d03c15894 100755 --- a/src/transformers/utils/import_utils.py +++ b/src/transformers/utils/import_utils.py @@ -179,6 +179,8 @@ _torchdistx_available = _is_package_available("torchdistx") _torchvision_available = _is_package_available("torchvision") _mlx_available = _is_package_available("mlx") _hqq_available = _is_package_available("hqq") +_tiktoken_available = _is_package_available("tiktoken") +_blobfile_available = _is_package_available("blobfile") _liger_kernel_available = _is_package_available("liger_kernel") @@ -1171,6 +1173,10 @@ def is_mlx_available(): return _mlx_available +def is_tiktoken_available(): + return _tiktoken_available and _blobfile_available + + def is_liger_kernel_available(): if not _liger_kernel_available: return False diff --git a/tests/models/llama/test_tokenization_llama.py b/tests/models/llama/test_tokenization_llama.py index 094a511616..a4b6c8ebeb 100644 --- a/tests/models/llama/test_tokenization_llama.py +++ b/tests/models/llama/test_tokenization_llama.py @@ -25,15 +25,19 @@ from huggingface_hub import hf_hub_download from transformers import ( SPIECE_UNDERLINE, AddedToken, + AutoTokenizer, LlamaTokenizer, LlamaTokenizerFast, + PreTrainedTokenizerFast, ) from transformers.convert_slow_tokenizer import convert_slow_tokenizer from transformers.testing_utils import ( get_tests_dir, nested_simplify, require_jinja, + require_read_token, require_sentencepiece, + require_tiktoken, require_tokenizers, require_torch, slow, @@ -832,3 +836,66 @@ class CommonSpmIntegrationTests(unittest.TestCase): self.assertEqual(input_ids, [284, 1, 156]) tokens = self.tokenizer.tokenize("No ▁He") self.assertEqual(tokens, ["▁No", "", "▁He"]) # spaces are eaten by rstrip / lstrip + + +@require_tiktoken +@require_read_token +class TikTokenIntegrationTests(unittest.TestCase): + """ + A class that regroups important test to make sure that we properly handle the special tokens. + """ + + def test_tiktoken_llama(self): + model_path = "hf-internal-testing/Llama3-Instruct-Internal" + test_text = "This is a test sentence." + test_tokens = [128000, 2028, 374, 264, 1296, 11914, 13, 128001] + num_reserved_special_tokens = 256 + special_tokens = [ + "<|begin_of_text|>", + "<|end_of_text|>", + "<|reserved_special_token_0|>", + "<|reserved_special_token_1|>", + "<|reserved_special_token_2|>", + "<|reserved_special_token_3|>", + "<|start_header_id|>", + "<|end_header_id|>", + "<|reserved_special_token_4|>", + "<|eot_id|>", + "<|python_tag|>", # end of turn + ] + [f"<|reserved_special_token_{i}|>" for i in range(5, num_reserved_special_tokens - 5)] + + tiktoken_tokenizer = PreTrainedTokenizerFast.from_pretrained( + model_path, + additional_special_tokens=special_tokens, + bos_token="<|begin_of_text|>", + eos_token="<|end_of_text|>", + ) + tokens = tiktoken_tokenizer.tokenize("<|begin_of_text|> " + test_text) + self.assertEqual(tokens[0], "<|begin_of_text|>") + + tiktoken_tokenizer = AutoTokenizer.from_pretrained( + model_path, legacy=False, additional_special_tokens=special_tokens, add_bos_token=True, add_eos_token=True + ) + self.assertTrue(isinstance(tiktoken_tokenizer, PreTrainedTokenizerFast)) + + tokens = tiktoken_tokenizer.encode(test_text, add_special_tokens=True) + self.assertEqual(tokens, test_tokens) + + tmpdirname = tempfile.mkdtemp() + tiktoken_tokenizer.save_pretrained(tmpdirname) + tokenizer_reload = AutoTokenizer.from_pretrained(tmpdirname) + + self.assertTrue(isinstance(tokenizer_reload, PreTrainedTokenizerFast)) + tokens = tokenizer_reload.encode(test_text, add_special_tokens=True) + self.assertEqual(tokens, test_tokens) + shutil.rmtree(tmpdirname) + + tiktoken_tokenizer = AutoTokenizer.from_pretrained( + model_path, + additional_special_tokens=special_tokens, + from_slow=True, + add_bos_token=True, + add_eos_token=True, + ) + tokens = tiktoken_tokenizer.encode(test_text, add_special_tokens=True) + self.assertEqual(tokens, test_tokens)