diff --git a/conftest.py b/conftest.py index 6b3100c84e..67e6eddfb8 100644 --- a/conftest.py +++ b/conftest.py @@ -28,6 +28,7 @@ from transformers.testing_utils import HfDoctestModule, HfDocTestParser NOT_DEVICE_TESTS = { "test_tokenization", + "test_tokenization_mistral_common", "test_processor", "test_processing", "test_beam_constraints", diff --git a/docs/source/en/model_doc/mistral.md b/docs/source/en/model_doc/mistral.md index f41a486dbb..ba60eda429 100644 --- a/docs/source/en/model_doc/mistral.md +++ b/docs/source/en/model_doc/mistral.md @@ -139,6 +139,10 @@ Use the [AttentionMaskVisualizer](https://github.com/huggingface/transformers/bl [[autodoc]] MistralConfig +## MistralCommonTokenizer + +[[autodoc]] MistralCommonTokenizer + ## MistralModel [[autodoc]] MistralModel diff --git a/docs/source/en/model_doc/mistral3.md b/docs/source/en/model_doc/mistral3.md index 8eedb5de6b..37cf3e8b20 100644 --- a/docs/source/en/model_doc/mistral3.md +++ b/docs/source/en/model_doc/mistral3.md @@ -227,6 +227,10 @@ This example also how to use `BitsAndBytes` to load the model in 4bit quantizati [[autodoc]] Mistral3Config +## MistralCommonTokenizer + +[[autodoc]] MistralCommonTokenizer + ## Mistral3Model [[autodoc]] Mistral3Model diff --git a/docs/source/en/model_doc/mixtral.md b/docs/source/en/model_doc/mixtral.md index e0688f35be..8b07aff7fa 100644 --- a/docs/source/en/model_doc/mixtral.md +++ b/docs/source/en/model_doc/mixtral.md @@ -197,6 +197,10 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h [[autodoc]] MixtralConfig +## MistralCommonTokenizer + +[[autodoc]] MistralCommonTokenizer + ## MixtralModel [[autodoc]] MixtralModel diff --git a/docs/source/en/model_doc/pixtral.md b/docs/source/en/model_doc/pixtral.md index f287170a0e..6adac0277f 100644 --- a/docs/source/en/model_doc/pixtral.md +++ b/docs/source/en/model_doc/pixtral.md @@ -86,6 +86,10 @@ output = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up [[autodoc]] 
PixtralVisionConfig +## MistralCommonTokenizer + +[[autodoc]] MistralCommonTokenizer + ## PixtralVisionModel [[autodoc]] PixtralVisionModel diff --git a/setup.py b/setup.py index 11e81ec4ee..ff84d79364 100644 --- a/setup.py +++ b/setup.py @@ -204,6 +204,7 @@ _deps = [ "opentelemetry-api", "opentelemetry-exporter-otlp", "opentelemetry-sdk", + "mistral-common[opencv]>=1.6.3", ] @@ -334,6 +335,7 @@ extras["video"] = deps_list("av") extras["num2words"] = deps_list("num2words") extras["sentencepiece"] = deps_list("sentencepiece", "protobuf") extras["tiktoken"] = deps_list("tiktoken", "blobfile") +extras["mistral-common"] = deps_list("mistral-common[opencv]") extras["testing"] = ( deps_list( "pytest", @@ -363,6 +365,7 @@ extras["testing"] = ( ) + extras["retrieval"] + extras["modelcreation"] + + extras["mistral-common"] ) extras["deepspeed-testing"] = extras["deepspeed"] + extras["testing"] + extras["optuna"] + extras["sentencepiece"] @@ -384,6 +387,7 @@ extras["all"] = ( + extras["accelerate"] + extras["video"] + extras["num2words"] + + extras["mistral-common"] ) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 892acd32ea..9120d32017 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -34,6 +34,7 @@ from .utils import ( is_g2p_en_available, is_keras_nlp_available, is_librosa_available, + is_mistral_common_available, is_pretty_midi_available, is_scipy_available, is_sentencepiece_available, @@ -310,6 +311,18 @@ else: "convert_slow_tokenizer", ] +try: + if not (is_mistral_common_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from .utils import dummy_mistral_common_objects + + _import_structure["utils.dummy_mistral_common_objects"] = [ + name for name in dir(dummy_mistral_common_objects) if not name.startswith("_") + ] +else: + _import_structure["tokenization_mistral_common"] = ["MistralCommonTokenizer"] + # Vision-specific objects try: if not is_vision_available(): 
diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py index 5e8b2b6598..acd8ec7fb4 100644 --- a/src/transformers/dependency_versions_table.py +++ b/src/transformers/dependency_versions_table.py @@ -106,4 +106,5 @@ deps = { "opentelemetry-api": "opentelemetry-api", "opentelemetry-exporter-otlp": "opentelemetry-exporter-otlp", "opentelemetry-sdk": "opentelemetry-sdk", + "mistral-common[opencv]": "mistral-common[opencv]>=1.6.3", } diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index 7c3a8b7c53..0543bfd062 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -21,6 +21,8 @@ import warnings from collections import OrderedDict from typing import Any, Optional, Union +from transformers.utils.import_utils import is_mistral_common_available + from ...configuration_utils import PretrainedConfig from ...dynamic_module_utils import get_class_from_dynamic_module, resolve_trust_remote_code from ...modeling_gguf_pytorch_utils import load_gguf_checkpoint @@ -387,15 +389,19 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, tuple[Optional[str], Optional[str]]]( ( "mistral", ( - "LlamaTokenizer" if is_sentencepiece_available() else None, - "LlamaTokenizerFast" if is_tokenizers_available() else None, + "MistralCommonTokenizer" + if is_mistral_common_available() + else ("LlamaTokenizer" if is_sentencepiece_available() else None), + "LlamaTokenizerFast" if is_tokenizers_available() and not is_mistral_common_available() else None, ), ), ( "mixtral", ( - "LlamaTokenizer" if is_sentencepiece_available() else None, - "LlamaTokenizerFast" if is_tokenizers_available() else None, + "MistralCommonTokenizer" + if is_mistral_common_available() + else ("LlamaTokenizer" if is_sentencepiece_available() else None), + "LlamaTokenizerFast" if is_tokenizers_available() and not is_mistral_common_available() else None, ), 
), ("mllama", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), @@ -490,7 +496,15 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, tuple[Optional[str], Optional[str]]]( ("phimoe", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), ("phobert", ("PhobertTokenizer", None)), ("pix2struct", ("T5Tokenizer", "T5TokenizerFast" if is_tokenizers_available() else None)), - ("pixtral", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)), + ( + "pixtral", + ( + None, + "MistralCommonTokenizer" + if is_mistral_common_available() + else ("PreTrainedTokenizerFast" if is_tokenizers_available() else None), + ), + ), ("plbart", ("PLBartTokenizer" if is_sentencepiece_available() else None, None)), ("prophetnet", ("ProphetNetTokenizer", None)), ("qdqbert", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)), @@ -721,8 +735,10 @@ def tokenizer_class_from_name(class_name: str) -> Union[type[Any], None]: for module_name, tokenizers in TOKENIZER_MAPPING_NAMES.items(): if class_name in tokenizers: module_name = model_type_to_module_name(module_name) - - module = importlib.import_module(f".{module_name}", "transformers.models") + if module_name in ["mistral", "mixtral"] and class_name == "MistralCommonTokenizer": + module = importlib.import_module(".tokenization_mistral_common", "transformers") + else: + module = importlib.import_module(f".{module_name}", "transformers.models") try: return getattr(module, class_name) except AttributeError: diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py index 3dc29371ba..38b4905b37 100644 --- a/src/transformers/testing_utils.py +++ b/src/transformers/testing_utils.py @@ -108,6 +108,7 @@ from .utils import ( is_librosa_available, is_liger_kernel_available, is_lomo_available, + is_mistral_common_available, is_natten_available, is_nltk_available, is_onnx_available, @@ -1526,6 +1527,13 @@ def require_speech(test_case): 
return unittest.skipUnless(is_speech_available(), "test requires torchaudio")(test_case) +def require_mistral_common(test_case): + """ + Decorator marking a test that requires mistral-common. These tests are skipped when mistral-common isn't available. + """ + return unittest.skipUnless(is_mistral_common_available(), "test requires mistral-common")(test_case) + + def get_gpu_count(): """ Return the number of available gpus (regardless of whether torch, tf or jax is used) diff --git a/src/transformers/tokenization_mistral_common.py b/src/transformers/tokenization_mistral_common.py new file mode 100644 index 0000000000..bf5f61ae00 --- /dev/null +++ b/src/transformers/tokenization_mistral_common.py @@ -0,0 +1,1830 @@ +# Copyright 2025 Mistral AI and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import shutil +import warnings +from collections.abc import Mapping, Sized +from enum import Enum +from pathlib import Path +from typing import Any, Callable, Optional, Union, overload + +import numpy as np + +from transformers.tokenization_utils_base import ( + LARGE_INTEGER, + VERY_LARGE_INTEGER, + BatchEncoding, + EncodedInput, + PreTokenizedInput, + PreTrainedTokenizerBase, + TextInput, + TruncationStrategy, +) +from transformers.utils import PaddingStrategy, TensorType, add_end_docstrings, logging, to_py_obj +from transformers.utils.generic import is_torch_tensor +from transformers.utils.hub import PushToHubMixin +from transformers.utils.import_utils import is_mistral_common_available, is_torch_available, requires + + +if is_mistral_common_available(): + from mistral_common.protocol.instruct.request import ChatCompletionRequest + from mistral_common.protocol.instruct.validator import ValidationMode + from mistral_common.tokens.tokenizers.base import SpecialTokenPolicy + from mistral_common.tokens.tokenizers.mistral import MistralTokenizer + from mistral_common.tokens.tokenizers.tekken import Tekkenizer + from mistral_common.tokens.tokenizers.utils import download_tokenizer_from_hf_hub + +if is_torch_available(): + import torch + + +logger = logging.get_logger(__name__) + + +ENCODE_KWARGS_DOCSTRING = r""" + add_special_tokens (`bool`, *optional*, defaults to `True`): + Whether or not to add special tokens when encoding the sequences. This will use the underlying + `PretrainedTokenizerBase.build_inputs_with_special_tokens` function, which defines which tokens are + automatically added to the input ids. This is useful if you want to add `bos` or `eos` tokens + automatically. + padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`): + Activates and controls padding. 
Accepts the following values: + + - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single + sequence is provided). + - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum + acceptable input length for the model if that argument is not provided. + - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different + lengths). + truncation (`bool`, `str` or [`~tokenization_utils_base.TruncationStrategy`], *optional*, defaults to `False`): + Activates and controls truncation. Accepts the following values: + + - `True` or `'longest_first'`: Truncate to a maximum length specified with the argument `max_length` or + to the maximum acceptable input length for the model if that argument is not provided. + - `False` or `'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence lengths + greater than the model maximum admissible input size). + max_length (`int`, *optional*): + Controls the maximum length to use by one of the truncation/padding parameters. + + If left unset or set to `None`, this will use the predefined model maximum length if a maximum length + is required by one of the truncation/padding parameters. If the model has no specific maximum input + length (like XLNet) truncation/padding to a maximum length will be deactivated. + stride (`int`, *optional*, defaults to 0): + If set to a number along with `max_length`, the overflowing tokens returned when + `return_overflowing_tokens=True` will contain some tokens from the end of the truncated sequence + returned to provide some overlap between truncated and overflowing sequences. The value of this + argument defines the number of overlapping tokens. + pad_to_multiple_of (`int`, *optional*): + If set will pad the sequence to a multiple of the provided value. Requires `padding` to be activated. 
+ This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability + `>= 7.5` (Volta). + padding_side (`str`, *optional*): + The side on which the model should have padding applied. Should be selected between ['right', 'left']. + Default value is picked from the class attribute of the same name. + return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors instead of list of python integers. Acceptable values are: + + - `'pt'`: Return PyTorch `torch.Tensor` objects. +""" + +ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING = r""" + return_attention_mask (`bool`, *optional*): + Whether to return the attention mask. If left to the default, will return the attention mask according + to the specific tokenizer's default, defined by the `return_outputs` attribute. + + [What are attention masks?](../glossary#attention-mask) + return_overflowing_tokens (`bool`, *optional*, defaults to `False`): + Whether or not to return overflowing token sequences. If a pair of sequences of input ids (or a batch + of pairs) is provided with `truncation_strategy = longest_first` or `True`, an error is raised instead + of returning overflowing tokens. + return_special_tokens_mask (`bool`, *optional*, defaults to `False`): + Whether or not to return special tokens mask information. + return_offsets_mapping (`bool`, *optional*, defaults to `False`): + Whether or not to return `(char_start, char_end)` for each token. + + This is only available on fast tokenizers inheriting from [`PreTrainedTokenizerFast`], if using + Python's tokenizer, this method will raise `NotImplementedError`. + return_length (`bool`, *optional*, defaults to `False`): + Whether or not to return the lengths of the encoded inputs. + verbose (`bool`, *optional*, defaults to `True`): + Whether or not to print more information and warnings. 
+ **kwargs: passed to the `self.tokenize()` method + + Return: + [`BatchEncoding`]: A [`BatchEncoding`] with the following fields: + + - **input_ids** -- List of token ids to be fed to a model. + + [What are input IDs?](../glossary#input-ids) + + - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when + `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names`). + + [What are attention masks?](../glossary#attention-mask) + + - **overflowing_tokens** -- List of overflowing tokens sequences (when a `max_length` is specified and + `return_overflowing_tokens=True`). + - **num_truncated_tokens** -- Number of tokens truncated (when a `max_length` is specified and + `return_overflowing_tokens=True`). + - **special_tokens_mask** -- List of 0s and 1s, with 1 specifying added special tokens and 0 specifying + regular sequence tokens (when `add_special_tokens=True` and `return_special_tokens_mask=True`). + - **length** -- The length of the inputs (when `return_length=True`) +""" + + +class MistralTokenizerType(str, Enum): + """Enum for the different type of tokenizer.""" + + spm = "spm" + tekken = "tekken" + + +@requires(backends=("mistral-common",)) +class MistralCommonTokenizer(PushToHubMixin): + """ + Class to wrap `mistral-common` tokenizers. + + `mistral-common` is the official tokenizer library for Mistral AI models. To use it, you need to install it with: + + ```bash + pip install transformers[mistral-common] + ``` + + Otherwise the tokenizer falls back to the Transformers implementation of the tokenizer. + + For more info on `mistral-common`, see [mistral-common](https://github.com/mistralai/mistral-common). + + This class is a wrapper around a `mistral_common.tokens.tokenizers.mistral.MistralTokenizer`. + It provides a Hugging Face compatible interface to tokenize using the official mistral-common tokenizer. 
+ + Supports the following methods from the `PreTrainedTokenizerBase` class: + + - [`~MistralCommonTokenizer.get_vocab`]: Returns the vocabulary as a dictionary of token to index. + - [`~MistralCommonTokenizer.encode`]: Encode a string to a list of integers. + - [`~MistralCommonTokenizer.decode`]: Decode a list of integers to a string. + - [`~MistralCommonTokenizer.batch_decode`]: Decode a batch of list of integers to a list of strings. + - [`~MistralCommonTokenizer.convert_tokens_to_ids`]: Convert a list of tokens to a list of integers. + - [`~MistralCommonTokenizer.convert_ids_to_tokens`]: Convert a list of integers to a list of tokens. + - [`~MistralCommonTokenizer.tokenize`]: Tokenize a string. + - [`~MistralCommonTokenizer.get_special_tokens_mask`]: Get the special tokens mask for a list of tokens. + - [`~MistralCommonTokenizer.prepare_for_model`]: Prepare a list of inputs for the model. + - [`~MistralCommonTokenizer.pad`]: Pad a list of inputs to the same length. + - [`~MistralCommonTokenizer.truncate_sequences`]: Truncate a list of sequences to the same length. + - [`~MistralCommonTokenizer.apply_chat_template`]: Apply a chat template to a list of messages. + - [`~MistralCommonTokenizer.__call__`]: Tokenize a string or a list of strings. + - [`~MistralCommonTokenizer.from_pretrained`]: Download and cache a pretrained tokenizer from the Hugging Face model hub or local directory. + - [`~MistralCommonTokenizer.save_pretrained`]: Save a tokenizer to a directory, so it can be reloaded using the `from_pretrained` class method. + - [`~MistralCommonTokenizer.push_to_hub`]: Upload tokenizer to the Hugging Face model hub. + + Here are the key differences with the `PreTrainedTokenizerBase` class: + + - Pair of sequences are not supported. The signature have been kept for compatibility but all arguments related to pair of sequences are ignored. The return values of pairs are returned as `None`. + - The `is_split_into_words` argument is not supported. 
def __init__(
    self,
    tokenizer_path: Union[str, os.PathLike, Path],
    mode: ValidationMode = ValidationMode.test,
    model_max_length: int = VERY_LARGE_INTEGER,
    padding_side: str = "left",
    truncation_side: str = "right",
    model_input_names: Optional[list[str]] = None,
    clean_up_tokenization_spaces: bool = False,
    **kwargs,
):
    """
    Constructs a `MistralCommonTokenizer`.

    - **model_input_names** (`List[str]`) -- A list of inputs expected in the forward pass of the model.
    - **padding_side** (`str`) -- The default value for the side on which the model should have padding applied.
        Should be `'right'` or `'left'`.
    - **truncation_side** (`str`) -- The default value for the side on which the model should have truncation
        applied. Should be `'right'` or `'left'`.

    Args:
        tokenizer_path (`str` or `os.PathLike` or `Path`):
            Path to the tokenizer file to load the `MistralTokenizer`.
        mode (`ValidationMode`, *optional*, defaults to `ValidationMode.test`):
            The mode to use for the tokenizer. It is forwarded to `MistralTokenizer.from_file`.
        model_max_length (`int`, *optional*, defaults to `VERY_LARGE_INTEGER`):
            The maximum length (in number of tokens) for the inputs to the transformer model.
        padding_side (`str`, *optional*, defaults to `"left"`):
            The side on which the model should have padding applied. Should be selected between
            ['right', 'left'].
        truncation_side (`str`, *optional*, defaults to `"right"`):
            The side on which the model should have truncation applied. Should be selected between
            ['right', 'left'].
        model_input_names (`List[str]`, *optional*):
            The list of inputs accepted by the forward pass of the model (like `"token_type_ids"` or
            `"attention_mask"`). When `None`, the class attribute of the same name is left untouched.
        clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
            Whether or not the model should cleanup the spaces that were added when splitting the input text
            during the tokenization process. Stored as `self.cleanup_tokenization_spaces`.
        **kwargs:
            Not supported. Will raise an error if used.

    Raises:
        ValueError: If any extra keyword argument is passed, or if `model_input_names` is given but is not a
            non-empty list or tuple made only of `str`.
    """
    if kwargs:
        raise ValueError(f"Kwargs {list(kwargs.keys())} are not supported to init `MistralCommonTokenizer`.")

    self._tokenizer_path = Path(tokenizer_path)
    self.tokenizer: MistralTokenizer = MistralTokenizer.from_file(str(self._tokenizer_path), mode=mode)
    # The wrapped tokenizer is either Tekken-based or (otherwise) treated as SentencePiece-based.
    self._tokenizer_type = (
        MistralTokenizerType.tekken
        if isinstance(self.tokenizer.instruct_tokenizer.tokenizer, Tekkenizer)
        else MistralTokenizerType.spm
    )
    self.truncation_side = truncation_side
    self.padding_side = padding_side
    self.model_max_length = model_max_length
    self.cleanup_tokenization_spaces = clean_up_tokenization_spaces
    self.deprecation_warnings = {}  # Use to store when we have already noticed a deprecation warning (avoid overlogging).

    if model_input_names is not None:
        # Each condition is independently disqualifying, so they are combined with `or`.
        # (Chained with `and`, the check could never be true and invalid values went through.)
        if (
            not isinstance(model_input_names, (list, tuple))
            or len(model_input_names) == 0
            or not all(isinstance(i, str) for i in model_input_names)
        ):
            raise ValueError(
                "`model_input_names` should be a non-empty list or tuple of str but got "
                f"{model_input_names!r}."
            )
        self.model_input_names = model_input_names

    # Lazily filled by `get_vocab`.
    self._cache_get_vocab: Optional[dict[str, int]] = None
+ """ + return self.convert_ids_to_tokens(self.bos_token_id) + + @property + def eos_token(self) -> str: + """ + String associated to the end of sentence token in the vocabulary. + """ + return self.convert_ids_to_tokens(self.eos_token_id) + + @property + def unk_token(self) -> str: + """ + String associated to the unknown token in the vocabulary. + """ + return self.convert_ids_to_tokens(self.unk_token_id) + + @property + def pad_token(self) -> str: + """ + String associated to the padding token in the vocabulary. + """ + return self.convert_ids_to_tokens(self.pad_token_id) + + @property + def vocab_size(self) -> int: + """ + Returns the size of the vocabulary. + + `int`: Size of the vocabulary. + """ + return self.tokenizer.instruct_tokenizer.tokenizer.n_words + + def get_vocab(self) -> dict[str, int]: + """ + Returns the vocabulary as a dictionary of token to index. + + This is a lossy conversion. There may be multiple token ids that decode to the same + string due to partial UTF-8 byte sequences being converted to �. + + Returns: + `Dict[str, int]`: The vocabulary. + """ + if self._cache_get_vocab is None: + self._cache_get_vocab = { + token: idx for idx, token in enumerate(self.tokenizer.instruct_tokenizer.tokenizer.vocab()) + } + return self._cache_get_vocab + + def __len__(self): + """ + Size of the full vocabulary with the added tokens. + """ + return self.vocab_size + + @add_end_docstrings( + ENCODE_KWARGS_DOCSTRING, + """ + **kwargs: Not supported by `MistralCommonTokenizer.encode`. + Will raise an error if used. + """, + """ + Returns: + `List[int]`, `torch.Tensor`: The tokenized ids of the text. 
def decode(
    self,
    token_ids: Union[int, list[int], "np.ndarray", "torch.Tensor"],
    skip_special_tokens: bool = False,
    clean_up_tokenization_spaces: Optional[bool] = None,
    **kwargs,
) -> str:
    """
    Converts a sequence of ids in a string, using the tokenizer and vocabulary with options to remove special
    tokens and clean up tokenization spaces.

    Args:
        token_ids (`Union[int, List[int], np.ndarray, torch.Tensor]`):
            List of tokenized input ids. Can be obtained using the `__call__` method. A single id is decoded
            as a one-element sequence.
        skip_special_tokens (`bool`, *optional*, defaults to `False`):
            Whether or not to remove special tokens in the decoding.
        clean_up_tokenization_spaces (`bool`, *optional*):
            Whether or not to clean up the tokenization spaces. If `None`, will default to
            `self.cleanup_tokenization_spaces`. An explicit `False` always disables the clean-up.
        kwargs (additional keyword arguments, *optional*):
            Not supported by `MistralCommonTokenizer.decode`.
            Will raise an error if used.

    Returns:
        `str`: The decoded sentence.

    Raises:
        ValueError: If any extra keyword argument is passed.
    """
    if kwargs:
        raise ValueError(f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonTokenizer.decode`.")

    # Only `None` means "use the instance default"; `x or default` would silently turn an
    # explicit `False` into the default when that default is `True`.
    if clean_up_tokenization_spaces is None:
        clean_up_tokenization_spaces = self.cleanup_tokenization_spaces

    # Convert inputs to python lists
    token_ids = to_py_obj(token_ids)
    if isinstance(token_ids, int):
        # The signature accepts a bare id (and `batch_decode` forwards one per element when given a flat
        # list of ints), while the wrapped tokenizer is handed a sequence of ids.
        token_ids = [token_ids]

    special_token_policy = SpecialTokenPolicy.IGNORE if skip_special_tokens else SpecialTokenPolicy.KEEP

    decoded_string = self.tokenizer.decode(token_ids, special_token_policy=special_token_policy)
    if clean_up_tokenization_spaces:
        decoded_string = PreTrainedTokenizerBase.clean_up_tokenization(decoded_string)

    return decoded_string
@overload
def convert_ids_to_tokens(self, ids: int, skip_special_tokens: bool = False) -> str: ...
@overload
def convert_ids_to_tokens(self, ids: list[int], skip_special_tokens: bool = False) -> list[str]: ...
def convert_ids_to_tokens(
    self, ids: Union[int, list[int]], skip_special_tokens: bool = False
) -> Union[str, list[str]]:
    """
    Converts a single index or a sequence of indices in a token or a sequence of tokens, using the vocabulary and
    added tokens.

    Args:
        ids (`int` or `List[int]`):
            The token id (or token ids) to convert to tokens. A NumPy integer scalar is treated as a single id.
        skip_special_tokens (`bool`, *optional*, defaults to `False`):
            Whether or not to remove special tokens in the decoding.

    Returns:
        `str` or `List[str]`: The decoded token(s). A single id yields a `str`, a sequence yields a list.

    Raises:
        ValueError: If a single id is given and it is dropped because it is a control token while
            `skip_special_tokens=True`.
    """
    # NumPy integer scalars are not `int` instances; without this they would reach the loop below and fail
    # with "object is not iterable".
    one_token = isinstance(ids, (int, np.integer))
    if one_token:
        ids = [int(ids)]

    id_to_piece = self.tokenizer.instruct_tokenizer.tokenizer.id_to_piece
    tokens: list[str] = []
    for token_id in ids:
        # Check the flag first so the control-token lookup is skipped entirely when nothing is filtered.
        if skip_special_tokens and self._is_control_token(token_id):
            continue
        tokens.append(id_to_piece(token_id))

    if one_token:
        if not tokens:
            raise ValueError(f"Invalid token id {ids}.")

        return tokens[0]
    return tokens
def tokenize(self, text: TextInput, **kwargs) -> list[str]:
    """
    Split a string into its token pieces.

    The text is first encoded to ids without adding special tokens, then every id is mapped back to its
    piece (no id is skipped).

    Args:
        text (`str`):
            The sequence to be encoded.
        **kwargs (additional keyword arguments):
            Not supported by `MistralCommonTokenizer.tokenize`.
            Will raise an error if used.

    Returns:
        `List[str]`: The list of tokens.
    """
    if kwargs:
        raise ValueError(f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonTokenizer.tokenize`.")

    token_ids = self._text_to_ids(text, add_special_tokens=False)
    pieces = self.convert_ids_to_tokens(token_ids, skip_special_tokens=False)
    return pieces
Should be a string, or a list/tuple of integers.") + + ids = get_input_ids(text) + + return self.prepare_for_model( + ids, + add_special_tokens=add_special_tokens, + padding=padding_strategy.value, + truncation=truncation_strategy.value, + max_length=max_length, + stride=stride, + pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, + return_tensors=return_tensors, + prepend_batch_axis=True, + return_attention_mask=return_attention_mask, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_length=return_length, + verbose=verbose, + ) + + def _batch_encode_plus( + self, + batch_text: Union[ + list[TextInput], + list[EncodedInput], + ], + add_special_tokens: bool = True, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE, + max_length: Optional[int] = None, + stride: int = 0, + pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[str] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + **kwargs, + ) -> BatchEncoding: + def get_input_ids(text): + if isinstance(text, str): + return self._text_to_ids(text, add_special_tokens) + elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int): + return text + else: + raise ValueError("Input is not valid. Should be a string or a list/tuple of integers.") + + if return_offsets_mapping: + raise NotImplementedError( + "return_offset_mapping is not available when using Python tokenizers. " + "To use this feature, change your tokenizer to one deriving from " + "transformers.PreTrainedTokenizerFast." 
def get_special_tokens_mask(
    self, token_ids_0: list, token_ids_1: None = None, already_has_special_tokens: bool = False
) -> list[int]:
    """
    Build a mask flagging which ids of `token_ids_0` are special tokens of the tokenizer.

    Args:
        token_ids_0 (`List[int]`):
            List of ids of the sequence.
        token_ids_1 (`List[int]`, *optional*):
            Not supported by `MistralCommonTokenizer`. Kept to match the interface of `PreTrainedTokenizerBase`.
        already_has_special_tokens (`bool`, *optional*, defaults to `False`):
            Not supported by `MistralCommonTokenizer`, must be left to `False`.

    Returns:
        `List[int]`: One entry per id of `token_ids_0`: 1 when the id is returned by
        `self._all_special_ids()`, 0 otherwise.

    Raises:
        ValueError: If `token_ids_1` is not `None` or if `already_has_special_tokens` is set.
    """
    if token_ids_1 is not None:
        raise ValueError(
            "`token_ids_1` is not supported by `MistralCommonTokenizer` and should be `None`, kept for compatibility."
        )
    if already_has_special_tokens:
        raise ValueError(
            "`already_has_special_tokens` is not supported by `MistralCommonTokenizer` and should be `False`."
        )

    # Fetch the special ids once instead of once per token.
    special_ids = self._all_special_ids()
    return [int(token_id in special_ids) for token_id in token_ids_0]
@add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
def prepare_for_model(
    self,
    ids: list[int],
    pair_ids: None = None,
    add_special_tokens: bool = True,
    padding: Union[bool, str, PaddingStrategy] = False,
    truncation: Union[bool, str, TruncationStrategy, None] = None,
    max_length: Optional[int] = None,
    stride: int = 0,
    pad_to_multiple_of: Optional[int] = None,
    padding_side: Optional[str] = None,
    return_tensors: Optional[Union[str, TensorType]] = None,
    return_attention_mask: Optional[bool] = None,
    return_overflowing_tokens: bool = False,
    return_special_tokens_mask: bool = False,
    return_length: bool = False,
    verbose: bool = True,
    prepend_batch_axis: bool = False,
    **kwargs,
) -> BatchEncoding:
    """
    Prepares a sequence of input ids so that it can be used by the model. It truncates the sequence when it is
    longer than `max_length` (managing a moving window, with user defined stride, for the overflowing tokens),
    optionally builds the special tokens mask, pads the result and wraps it in a [`BatchEncoding`].

    No token is inserted by this method: `add_special_tokens` only selects whether the special tokens mask is
    computed with `get_special_tokens_mask` (`True`) or filled with zeros (`False`).

    A `ValueError` is raised if `pair_ids` or any additional keyword argument is provided.

    Args:
        ids (`List[int]`):
            Tokenized input ids of the first sequence.
        pair_ids (`None`, *optional*):
            Not supported by `MistralCommonTokenizer`. Kept to match the interface of `PreTrainedTokenizerBase`.
    """
    if pair_ids is not None:
        raise ValueError(
            "`pair_ids` is not supported by `MistralCommonTokenizer` and should be `None`, kept for compatibility."
        )
    if kwargs:
        raise ValueError(
            f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonTokenizer.prepare_for_model`."
        )

    padding_strategy, truncation_strategy, max_length, _ = self._get_padding_truncation_strategies(
        padding=padding,
        truncation=truncation,
        max_length=max_length,
        pad_to_multiple_of=pad_to_multiple_of,
        verbose=verbose,
    )

    len_ids = len(ids)
    main_input_name = self.model_input_names[0]

    # Load from model defaults
    if return_attention_mask is None:
        return_attention_mask = "attention_mask" in self.model_input_names

    encoded_inputs = {}

    # Truncation: Handle max sequence length
    overflowing_tokens = []
    if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE and max_length and len_ids > max_length:
        ids, _, overflowing_tokens = self.truncate_sequences(
            ids,
            num_tokens_to_remove=len_ids - max_length,
            truncation_strategy=truncation_strategy,
            stride=stride,
        )

    if return_overflowing_tokens:
        encoded_inputs["overflowing_tokens"] = overflowing_tokens
        # `max_length` is still `None` when no maximum length was given or resolved (e.g. overflowing tokens
        # requested without truncation). Nothing was truncated then, so report 0 instead of failing on
        # `int - None`.
        encoded_inputs["num_truncated_tokens"] = len_ids - max_length if max_length is not None else 0

    # Build output dictionary
    encoded_inputs[main_input_name] = ids
    if return_special_tokens_mask:
        if add_special_tokens:
            encoded_inputs["special_tokens_mask"] = self.get_special_tokens_mask(ids, None)
        else:
            encoded_inputs["special_tokens_mask"] = [0] * len(ids)

    # Padding (also used to create the attention mask when one is requested without padding)
    if padding_strategy != PaddingStrategy.DO_NOT_PAD or return_attention_mask:
        encoded_inputs = self.pad(
            encoded_inputs,
            max_length=max_length,
            padding=padding_strategy.value,
            pad_to_multiple_of=pad_to_multiple_of,
            padding_side=padding_side,
            return_attention_mask=return_attention_mask,
        )

    if return_length:
        # Measure the entry the ids were stored under rather than assuming it is named "input_ids".
        encoded_inputs["length"] = len(encoded_inputs[main_input_name])

    return BatchEncoding(encoded_inputs, tensor_type=return_tensors, prepend_batch_axis=prepend_batch_axis)
+ ) + self.deprecation_warnings["Truncation-not-explicitly-activated"] = True + truncation = "longest_first" + + # Get padding strategy + if padding is not False: + if padding is True: + if verbose: + if max_length is not None and ( + truncation is None or truncation is False or truncation == "do_not_truncate" + ): + warnings.warn( + "`max_length` is ignored when `padding`=`True` and there is no truncation strategy. " + "To pad to max length, use `padding='max_length'`." + ) + padding_strategy = PaddingStrategy.LONGEST # Default to pad to the longest sequence in the batch + elif not isinstance(padding, PaddingStrategy): + padding_strategy = PaddingStrategy(padding) + elif isinstance(padding, PaddingStrategy): + padding_strategy = padding + else: + padding_strategy = PaddingStrategy.DO_NOT_PAD + + # Get truncation strategy + if truncation is not False and truncation is not None: + if truncation is True: + truncation_strategy = ( + TruncationStrategy.LONGEST_FIRST + ) # Default to truncate the longest sequences in pairs of inputs + elif not isinstance(truncation, TruncationStrategy): + truncation_strategy = TruncationStrategy(truncation) + elif isinstance(truncation, TruncationStrategy): + truncation_strategy = truncation + if truncation in [TruncationStrategy.ONLY_FIRST, TruncationStrategy.ONLY_SECOND]: + raise ValueError( + "Truncation strategy `only_first` and `only_second` are not supported by `MistralCommonTokenizer`." + ) + else: + truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE + + # Set max length if needed + if max_length is None: + if padding_strategy == PaddingStrategy.MAX_LENGTH: + if self.model_max_length > LARGE_INTEGER: + if verbose: + if not self.deprecation_warnings.get("Asking-to-pad-to-max_length", False): + logger.warning( + "Asking to pad to max_length but no maximum length is provided and the model has no" + " predefined maximum length. Default to no padding." 
+ ) + self.deprecation_warnings["Asking-to-pad-to-max_length"] = True + padding_strategy = PaddingStrategy.DO_NOT_PAD + else: + max_length = self.model_max_length + + if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE: + if self.model_max_length > LARGE_INTEGER: + if verbose: + if not self.deprecation_warnings.get("Asking-to-truncate-to-max_length", False): + logger.warning( + "Asking to truncate to max_length but no maximum length is provided and the model has" + " no predefined maximum length. Default to no truncation." + ) + self.deprecation_warnings["Asking-to-truncate-to-max_length"] = True + truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE + else: + max_length = self.model_max_length + + # Test if we have a padding token + if padding_strategy != PaddingStrategy.DO_NOT_PAD and (self.pad_token is None or self.pad_token_id < 0): + raise ValueError( + "Asking to pad but the tokenizer does not have a padding token. " + "Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` " + "or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`." + ) + + # Check that we will truncate to a multiple of pad_to_multiple_of if both are provided + if ( + truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE + and padding_strategy != PaddingStrategy.DO_NOT_PAD + and pad_to_multiple_of is not None + and max_length is not None + and (max_length % pad_to_multiple_of != 0) + ): + raise ValueError( + "Truncation and padding are both activated but " + f"truncation length ({max_length}) is not a multiple of pad_to_multiple_of ({pad_to_multiple_of})." 
def _pad(
    self,
    encoded_inputs: Union[dict[str, EncodedInput], BatchEncoding],
    max_length: Optional[int] = None,
    padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
    pad_to_multiple_of: Optional[int] = None,
    padding_side: Optional[str] = None,
    return_attention_mask: Optional[bool] = None,
) -> dict:
    """
    Pad a single encoded input (on left/right and up to a predefined length).

    `encoded_inputs` is updated in place and returned.

    Args:
        encoded_inputs:
            Dictionary of tokenized inputs (`List[int]`) for one sequence.
        max_length: Target length of the padded sequence. Ignored with `PaddingStrategy.LONGEST`, where the
            length of the sequence itself is used.
        padding_strategy: PaddingStrategy to use for padding.

            - PaddingStrategy.LONGEST: Pad to the length of the sequence (rounded up by `pad_to_multiple_of`)
            - PaddingStrategy.MAX_LENGTH: Pad to `max_length`
            - PaddingStrategy.DO_NOT_PAD: Do not pad (default)
        pad_to_multiple_of: (optional) Integer, if set the target length is rounded up to the next multiple of
            the provided value.
        padding_side:
            The side on which the padding is applied. Should be selected between ['right', 'left'].
            Default value is picked from the attribute of the same name on the tokenizer.
        return_attention_mask:
            (optional) Set to False to avoid returning attention mask (default: returned when
            `"attention_mask"` is part of `self.model_input_names`)

    Returns:
        `dict`: `encoded_inputs`, where padded positions hold `self.pad_token_id` in the main input, 0 in the
        attention mask and 1 in the special tokens mask (when these masks are present).

    Raises:
        ValueError: If padding is needed and the padding side is neither `'right'` nor `'left'`.
    """
    # Load from model defaults
    if return_attention_mask is None:
        return_attention_mask = "attention_mask" in self.model_input_names

    required_input = encoded_inputs[self.model_input_names[0]]

    if padding_strategy == PaddingStrategy.LONGEST:
        max_length = len(required_input)

    # Round the target length up to the next multiple of `pad_to_multiple_of`.
    if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
        max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of

    needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length

    # Initialize attention mask if not present.
    if return_attention_mask and "attention_mask" not in encoded_inputs:
        encoded_inputs["attention_mask"] = [1] * len(required_input)

    if needs_to_be_padded:
        # A negative difference (input longer than `max_length`) yields empty paddings below, so a too long
        # input is left untouched rather than truncated.
        difference = max_length - len(required_input)
        padding_side = padding_side if padding_side is not None else self.padding_side

        if padding_side == "right":
            if return_attention_mask:
                encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference
            if "special_tokens_mask" in encoded_inputs:
                encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
            encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference
        elif padding_side == "left":
            if return_attention_mask:
                encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
            if "special_tokens_mask" in encoded_inputs:
                encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
            encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
        else:
            # The offending value is a padding *side*, not a padding strategy: report it as such.
            raise ValueError(f"Invalid padding side: {padding_side}, use 'left' or 'right'.")

    return encoded_inputs
list[BatchEncoding], + dict[str, EncodedInput], + dict[str, list[EncodedInput]], + list[dict[str, EncodedInput]], + ], + padding: Union[bool, str, PaddingStrategy] = True, + max_length: Optional[int] = None, + pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[str] = None, + return_attention_mask: Optional[bool] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + verbose: bool = True, + ) -> BatchEncoding: + """ + Pad a single encoded input or a batch of encoded inputs up to predefined length or to the max sequence length + in the batch. + + Padding side (left/right) padding token ids are defined at the tokenizer level (with `self.padding_side`, + `self.pad_token_id`). + + + If the `encoded_inputs` passed are dictionary of numpy arrays, PyTorch tensors, the + result will use the same type unless you provide a different tensor type with `return_tensors`. In the case of + PyTorch tensors, you will lose the specific device of your tensors however. + + + + Args: + encoded_inputs ([`BatchEncoding`], list of [`BatchEncoding`], `Dict[str, List[int]]`, `Dict[str, List[List[int]]` or `List[Dict[str, List[int]]]`): + Tokenized inputs. Can represent one input ([`BatchEncoding`] or `Dict[str, List[int]]`) or a batch of + tokenized inputs (list of [`BatchEncoding`], *Dict[str, List[List[int]]]* or *List[Dict[str, + List[int]]]*) so you can use this method during preprocessing as well as in a PyTorch Dataloader + collate function. + + Instead of `List[int]` you can have tensors (numpy arrays, PyTorch tensors), see + the note above for the return type. + padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`): + Select a strategy to pad the returned sequences (according to the model's padding side and padding + index) among: + + - `True` or `'longest'` (default): Pad to the longest sequence in the batch (or no padding if only a single + sequence if provided). 
+ - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum + acceptable input length for the model if that argument is not provided. + - `False` or `'do_not_pad'`: No padding (i.e., can output a batch with sequences of different + lengths). + max_length (`int`, *optional*): + Maximum length of the returned list and optionally padding length (see above). + pad_to_multiple_of (`int`, *optional*): + If set will pad the sequence to a multiple of the provided value. + + This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability + `>= 7.5` (Volta). + padding_side (`str`, *optional*): + The side on which the model should have padding applied. Should be selected between ['right', 'left']. + Default value is picked from the class attribute of the same name. + return_attention_mask (`bool`, *optional*): + Whether to return the attention mask. If left to the default, will return the attention mask according + to the specific tokenizer's default, defined by the `return_outputs` attribute. + + [What are attention masks?](../glossary#attention-mask) + return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors instead of list of python integers. Acceptable values are: + + - `'pt'`: Return PyTorch `torch.Tensor` objects. + - `'np'`: Return Numpy `np.ndarray` objects. + verbose (`bool`, *optional*, defaults to `True`): + Whether or not to print more information and warnings. 
+ """ + # If we have a list of dicts, let's convert it in a dict of lists + # We do this to allow using this method as a collate_fn function in PyTorch Dataloader + if isinstance(encoded_inputs, (list, tuple)) and isinstance(encoded_inputs[0], Mapping): + encoded_inputs = {key: [example[key] for example in encoded_inputs] for key in encoded_inputs[0].keys()} + + # The model's main input name, usually `input_ids`, has been passed for padding + if self.model_input_names[0] not in encoded_inputs: + raise ValueError( + "You should supply an encoding or a list of encodings to this method " + f"that includes {self.model_input_names[0]}, but you provided {list(encoded_inputs.keys())}" + ) + + required_input = encoded_inputs[self.model_input_names[0]] + + if required_input is None or (isinstance(required_input, Sized) and len(required_input) == 0): + if return_attention_mask: + encoded_inputs["attention_mask"] = [] + return encoded_inputs + + # If we have PyTorch/TF/NumPy tensors/arrays as inputs, we cast them as python objects + # and rebuild them afterwards if no return_tensors is specified + # Note that we lose the specific device the tensor may be on for PyTorch + + first_element = required_input[0] + if isinstance(first_element, (list, tuple)): + # first_element might be an empty list/tuple in some edge cases so we grab the first non empty element. + for item in required_input: + if len(item) != 0: + first_element = item[0] + break + # At this state, if `first_element` is still a list/tuple, it's an empty one so there is nothing to do. + if not isinstance(first_element, (int, list, tuple)): + if is_torch_tensor(first_element): + return_tensors = "pt" if return_tensors is None else return_tensors + elif isinstance(first_element, np.ndarray): + return_tensors = "np" if return_tensors is None else return_tensors + else: + raise ValueError( + f"type of {first_element} unknown: {type(first_element)}. " + "Should be one of a python, numpy, pytorch or tensorflow object." 
def truncate_sequences(
    self,
    ids: list[int],
    pair_ids: None = None,
    num_tokens_to_remove: int = 0,
    truncation_strategy: Union[str, TruncationStrategy] = "longest_first",
    stride: int = 0,
    **kwargs,
) -> tuple[list[int], None, list[int]]:
    """
    Truncates a sequence of ids following the strategy. The input list is not modified, new lists are returned.

    Args:
        ids (`List[int]`):
            Tokenized input ids.
        pair_ids (`None`, *optional*):
            Not supported by `MistralCommonTokenizer`. Kept to match the signature of `PreTrainedTokenizerBase.truncate_sequences`.
        num_tokens_to_remove (`int`, *optional*, defaults to 0):
            Number of tokens to remove using the truncation strategy. Nothing is done when it is not positive.
        truncation_strategy (`str` or [`~tokenization_utils_base.TruncationStrategy`], *optional*, defaults to `'longest_first'`):
            The strategy to follow for truncation. Can be:

            - `'longest_first'` (default): Remove `num_tokens_to_remove` tokens from the side given by
              `self.truncation_side`.
            - `'do_not_truncate'`: No truncation.
        stride (`int`, *optional*, defaults to 0):
            If set to a positive number, the overflowing tokens returned will contain some tokens from the main
            sequence returned. The value of this argument defines the number of additional tokens.

    Returns:
        `Tuple[List[int], None, List[int]]`: The truncated `ids` and the list of
        overflowing tokens. `None` is returned to match Transformers signature.

    Raises:
        ValueError: If extra keyword arguments or a non-empty `pair_ids` are given, if the strategy is
        `only_first` or `only_second`, or if `self.truncation_side` is neither `'left'` nor `'right'`.
    """
    if kwargs:
        raise ValueError(
            f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonTokenizer.truncate_sequences`."
        )
    if pair_ids:
        raise ValueError("`pair_ids` is not supported by `MistralCommonTokenizer.truncate_sequences`.")

    if num_tokens_to_remove <= 0:
        return (ids, None, [])

    strategy = (
        truncation_strategy
        if isinstance(truncation_strategy, TruncationStrategy)
        else TruncationStrategy(truncation_strategy)
    )
    if strategy in [TruncationStrategy.ONLY_FIRST, TruncationStrategy.ONLY_SECOND]:
        raise ValueError(
            f"Only {TruncationStrategy.LONGEST_FIRST} and {TruncationStrategy.DO_NOT_TRUNCATE} are supported."
        )

    # Any remaining strategy other than `longest_first` leaves the sequence untouched.
    if strategy != TruncationStrategy.LONGEST_FIRST:
        return (ids, None, [])

    # Removing everything (or more) is reported but not performed.
    if len(ids) <= num_tokens_to_remove:
        error_msg = (
            f"We need to remove {num_tokens_to_remove} to truncate the input "
            f"but the first sequence has a length {len(ids)}. "
        )
        logger.error(error_msg)
        return (ids, None, [])

    # The overflow window holds the removed tokens plus `stride` tokens kept in the main sequence.
    window_len = min(len(ids), stride + num_tokens_to_remove)
    if self.truncation_side == "left":
        return (ids[num_tokens_to_remove:], None, ids[:window_len])
    if self.truncation_side == "right":
        return (ids[:-num_tokens_to_remove], None, ids[-window_len:])
    raise ValueError(f"invalid truncation strategy: {self.truncation_side}, use 'left' or 'right'.")
See our + [chat templating guide](https://huggingface.co/docs/transformers/main/en/chat_templating#automated-function-conversion-for-tool-use) + for more information. + continue_final_message (bool, *optional*): + If this is set, the chat will be formatted so that the final + message in the chat is open-ended, without any EOS tokens. The model will continue this message + rather than starting a new one. This allows you to "prefill" part of + the model's response for it. Cannot be used at the same time as `add_generation_prompt`. + tokenize (`bool`, defaults to `True`): + Whether to tokenize the output. If `False`, the output will be a string. + padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`): + Select a strategy to pad the returned sequences (according to the model's padding side and padding + index) among: + + - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single + sequence if provided). + - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum + acceptable input length for the model if that argument is not provided. + - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different + lengths). + truncation (`bool`, defaults to `False`): + Whether to truncate sequences at the maximum length. Has no effect if tokenize is `False`. + max_length (`int`, *optional*): + Maximum length (in tokens) to use for padding or truncation. Has no effect if tokenize is `False`. If + not specified, the tokenizer's `max_length` attribute will be used as a default. + return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors of a particular framework. Has no effect if tokenize is `False`. Acceptable + values are: + - `'pt'`: Return PyTorch `torch.Tensor` objects. + return_dict (`bool`, defaults to `False`): + Whether to return a dictionary with named outputs. 
Has no effect if tokenize is `False`. + If at least one conversation contains an image, its pixel values will be returned in the `pixel_values` key. + kwargs (additional keyword arguments, *optional*): + Not supported by `MistralCommonTokenizer.apply_chat_template`. + Will raise an error if used. + + Returns: + `Union[str, List[int], List[str], List[List[int]], BatchEncoding]`: A list of token ids representing the tokenized chat so far, including control + tokens. This output is ready to pass to the model, either directly or via methods like `generate()`. + """ + if kwargs: + raise ValueError( + f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonTokenizer.apply_chat_template`." + ) + if not isinstance(truncation, bool): + raise ValueError("`truncation` must be a boolean for `apply_chat_template` method.") + + if isinstance(conversation, (list, tuple)) and ( + isinstance(conversation[0], (list, tuple)) or hasattr(conversation[0], "messages") + ): + conversations = conversation + is_batched = True + else: + conversations = [conversation] + is_batched = False + + def _maybe_adapt_message(message: dict[str, Any]) -> None: + """Adapt message to `mistral-common` format and leave validation to `mistral-common`.""" + if not isinstance(message, dict): + return + maybe_list_content: Optional[Union[str, list[dict[str, Union[str, dict[str, Any]]]]]] = message.get( + "content", None + ) + if not maybe_list_content or isinstance(maybe_list_content, str): + return + + normalized_content: list[dict[str, Union[str, dict[str, Any]]]] = [] + for content in maybe_list_content: + content_type = content.get("type", None) + if not content_type: + continue + elif content_type == "image": + maybe_url: Optional[str] = content.get("url") + maybe_path: Optional[str] = content.get("path") + maybe_base64: Optional[str] = content.get("base64") + if maybe_url: + image_content = maybe_url + elif maybe_path: + if not maybe_path.startswith("file://"): + maybe_path = 
Path(maybe_path).resolve().as_uri() + image_content = maybe_path + elif maybe_base64: + if not maybe_base64.startswith("data:image"): + maybe_base64 = "data:image/unk;base64," + maybe_base64 + image_content = maybe_base64 + else: + raise ValueError("Image content must be specified.") + normalized_content.append({"type": "image_url", "image_url": {"url": image_content}}) + else: + normalized_content.append(content) + message["content"] = normalized_content + + outputs = [] + images: list[np.ndarray] = [] + + for conversation in conversations: + messages: list[dict[str, Union[str, list[dict[str, Union[str, dict[str, Any]]]]]]] = [] + for message in conversation: + _maybe_adapt_message(message) + messages.append(message) + + chat_request = ChatCompletionRequest.from_openai( + messages=messages, + tools=tools, + continue_final_message=continue_final_message, + ) + + tokenized_request = self.tokenizer.encode_chat_completion(chat_request) + if tokenize: + outputs.append(tokenized_request.tokens) + else: + outputs.append(tokenized_request.text) + images.extend(tokenized_request.images) + + if not is_batched: + outputs = outputs[0] + + if tokenize: + out = self( + outputs, + padding=padding, + truncation=truncation, + max_length=max_length, + add_special_tokens=False, + return_tensors=return_tensors, + ) + if return_dict: + if images: + pixel_values: Union[list[np.ndarray], np.ndarray, torch.Tensor] + if return_tensors == "pt": + if not is_torch_available(): + raise ImportError( + "Unable to convert output to PyTorch tensors format, PyTorch is not installed." 
@add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
def __call__(
    self,
    text: Union[TextInput, EncodedInput, list[TextInput], list[EncodedInput], None] = None,
    text_pair: None = None,
    text_target: None = None,
    text_pair_target: None = None,
    add_special_tokens: bool = True,
    padding: Union[bool, str, PaddingStrategy] = False,
    truncation: Union[bool, str, TruncationStrategy, None] = None,
    max_length: Optional[int] = None,
    stride: int = 0,
    pad_to_multiple_of: Optional[int] = None,
    padding_side: Optional[str] = None,
    return_tensors: Optional[Union[str, TensorType]] = None,
    return_attention_mask: Optional[bool] = None,
    return_overflowing_tokens: bool = False,
    return_special_tokens_mask: bool = False,
    return_length: bool = False,
    verbose: bool = True,
    **kwargs,
) -> BatchEncoding:
    """
    Main method to tokenize and prepare for the model one sequence or a batch of sequences. Pairs of sequences
    and target sequences are not supported.

    A list whose first element is a string or a list is treated as a batch; a list of ints is treated as a single
    already-encoded example, and so is an empty list. A `ValueError` is raised for unsupported `kwargs`, for any of
    the pair/target arguments, for `return_tensors` set to `'tf'` or `'jax'`, and for a `text` of an invalid type
    (including `None`).

    Args:
        text (`str`, `List[str]`, `List[int]`, `List[List[int]]`, *optional*):
            The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of int
            (encoded strings).
        text_pair (`None`, *optional*):
            Not supported by `MistralCommonTokenizer`. Kept to match the signature of `PreTrainedTokenizerBase.__call__`.
        text_target (`None`, *optional*):
            Not supported by `MistralCommonTokenizer`. Kept to match the signature of `PreTrainedTokenizerBase.__call__`.
        text_pair_target (`None`, *optional*):
            Not supported by `MistralCommonTokenizer`. Kept to match the signature of `PreTrainedTokenizerBase.__call__`.
    """
    # NOTE: keep the `Args` entries last in the docstring, the decorator adds the shared argument docs to it.
    if kwargs:
        raise ValueError(f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonTokenizer.__call__`.")

    if text_pair or text_target or text_pair_target:
        raise ValueError(
            "`text_pair`, `text_target` and `text_pair_target` are not supported by `MistralCommonTokenizer`."
        )

    if return_tensors in ("tf", "jax"):
        raise ValueError(
            "`MistralCommonTokenizer` does not support `return_tensors='tf'` or `return_tensors='jax'`."
        )

    def _is_valid_text_input(t):
        """Shallow type check: only the first element of each nesting level is inspected."""
        if isinstance(t, str):
            # Strings are fine
            return True
        if not isinstance(t, (list, tuple)):
            return False
        # List are fine as long as they are...
        if len(t) == 0:
            # ... empty
            return True
        if isinstance(t[0], (str, int)):
            # ... list of strings or int
            return True
        if isinstance(t[0], (list, tuple)):
            # ... list with an empty list or with a list of strings or with a list of ints
            return len(t[0]) == 0 or isinstance(t[0][0], (str, int))
        return False

    if not _is_valid_text_input(text):
        raise ValueError(
            "text input must be of type `str` (single example), `List[str]` (batch or single encoded example) "
            "or `List[List[int]]` (batch of encoded examples)."
        )

    # The length guard matters: an empty list passes validation above, and indexing it would raise IndexError.
    is_batched = isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], (str, list, tuple))

    padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
        padding=padding,
        truncation=truncation,
        max_length=max_length,
        pad_to_multiple_of=pad_to_multiple_of,
        verbose=verbose,
        **kwargs,
    )

    if is_batched:
        return self._batch_encode_plus(
            batch_text=text,
            add_special_tokens=add_special_tokens,
            padding_strategy=padding_strategy,
            truncation_strategy=truncation_strategy,
            max_length=max_length,
            stride=stride,
            pad_to_multiple_of=pad_to_multiple_of,
            padding_side=padding_side,
            return_tensors=return_tensors,
            return_attention_mask=return_attention_mask,
            return_overflowing_tokens=return_overflowing_tokens,
            return_special_tokens_mask=return_special_tokens_mask,
            return_length=return_length,
            verbose=verbose,
            **kwargs,
        )

    return self._encode_plus(
        text=text,
        add_special_tokens=add_special_tokens,
        padding_strategy=padding_strategy,
        truncation_strategy=truncation_strategy,
        max_length=max_length,
        stride=stride,
        pad_to_multiple_of=pad_to_multiple_of,
        padding_side=padding_side,
        return_tensors=return_tensors,
        return_attention_mask=return_attention_mask,
        return_overflowing_tokens=return_overflowing_tokens,
        return_special_tokens_mask=return_special_tokens_mask,
        return_length=return_length,
        verbose=verbose,
        **kwargs,
    )
@classmethod
def from_pretrained(
    cls,
    pretrained_model_name_or_path: Union[str, os.PathLike],
    *init_inputs,
    mode: ValidationMode = ValidationMode.test,
    cache_dir: Optional[Union[str, os.PathLike]] = None,
    force_download: bool = False,
    local_files_only: bool = False,
    token: Optional[Union[str, bool]] = None,
    revision: str = "main",
    model_max_length: int = VERY_LARGE_INTEGER,
    padding_side: str = "left",
    truncation_side: str = "right",
    model_input_names: Optional[list[str]] = None,
    clean_up_tokenization_spaces: bool = False,
    **kwargs,
):
    r"""
    Instantiate a `MistralCommonTokenizer` from a predefined
    tokenizer.

    Args:
        pretrained_model_name_or_path (`str` or `os.PathLike`):
            Can be either:

            - A string, the *model id* of a predefined tokenizer hosted inside a model repo on huggingface.co.
            - A path to a tokenizer *file*.
            - A path to a *directory* containing the tokenizer file, for instance saved
              using the `MistralCommonTokenizer.save_pretrained` method, e.g.,
              `./my_model_directory/`. The directory is searched for `tekken.json`, `*.model` or
              `*.model.v<N>` files; `tekken.json` is preferred when several candidates exist.
        mode (`ValidationMode`, *optional*, defaults to `ValidationMode.test`):
            Validation mode for the `MistralTokenizer` tokenizer.
        cache_dir (`str` or `os.PathLike`, *optional*):
            Path to a directory in which a downloaded predefined tokenizer vocabulary files should be cached if the
            standard cache should not be used.
        force_download (`bool`, *optional*, defaults to `False`):
            Whether or not to force the (re-)download the vocabulary files and override the cached versions if they
            exist.
        token (`str` or *bool*, *optional*):
            The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated
            when running `huggingface-cli login` (stored in `~/.huggingface`).
        local_files_only (`bool`, *optional*, defaults to `False`):
            Whether or not to only rely on local files and not to attempt to download any files.
        revision (`str`, *optional*, defaults to `"main"`):
            The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
            git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
            identifier allowed by git.
        model_max_length (`int`, *optional*):
            Maximum length forwarded to the tokenizer constructor. Defaults to `VERY_LARGE_INTEGER`.
        padding_side (`str`, *optional*, defaults to `"left"`):
            The side on which the model should have padding applied. Should be selected between ['right', 'left'].
        truncation_side (`str`, *optional*, defaults to `"right"`):
            The side on which the model should have truncation applied. Should be selected between ['right', 'left'].
        model_input_names (`List[string]`, *optional*):
            The list of inputs accepted by the forward pass of the model (like `"token_type_ids"` or
            `"attention_mask"`). Forwarded to the tokenizer constructor.
        clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
            Whether or not the model should cleanup the spaces that were added when splitting the input text during the
            tokenization process.
        kwargs (additional keyword arguments, *optional*):
            Not supported by `MistralCommonTokenizer.from_pretrained`.
            Will raise an error if used.

    Raises:
        ValueError: If `init_inputs` or unsupported `kwargs` are passed, or if a local directory is given that
            contains no tokenizer file.
    """
    if init_inputs:
        raise ValueError("`init_inputs` are not supported by `MistralCommonTokenizer.from_pretrained`.")

    # Handle kwargs and AutoTokenizer case
    if kwargs and not kwargs.keys() == {"_from_auto"}:
        raise ValueError(
            f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonTokenizer.from_pretrained`."
        )

    if os.path.isfile(pretrained_model_name_or_path):
        tokenizer_path = pretrained_model_name_or_path
    elif os.path.isdir(pretrained_model_name_or_path):
        # A local directory must be resolved here, otherwise its path would be sent to the Hub as a repo id.
        # NOTE(review): file-name patterns follow the usual Mistral naming; confirm against `mistral-common`.
        candidates = sorted(
            file_name
            for file_name in os.listdir(pretrained_model_name_or_path)
            if os.path.isfile(os.path.join(pretrained_model_name_or_path, file_name))
            and (file_name == "tekken.json" or file_name.endswith(".model") or ".model.v" in file_name)
        )
        if not candidates:
            raise ValueError(f"No tokenizer file found in directory: {pretrained_model_name_or_path}")

        tokenizer_file = "tekken.json" if "tekken.json" in candidates else candidates[-1]
        if len(candidates) > 1:
            logger.warning(
                f"Multiple tokenizer files found in directory: {pretrained_model_name_or_path}. Using {tokenizer_file}."
            )
        tokenizer_path = os.path.join(pretrained_model_name_or_path, tokenizer_file)
    else:
        tokenizer_path = download_tokenizer_from_hf_hub(
            repo_id=pretrained_model_name_or_path,
            cache_dir=cache_dir,
            token=token,
            revision=revision,
            force_download=force_download,
            local_files_only=local_files_only,
        )

    return cls(
        tokenizer_path=tokenizer_path,
        mode=mode,
        model_max_length=model_max_length,
        padding_side=padding_side,
        truncation_side=truncation_side,
        model_input_names=model_input_names,
        clean_up_tokenization_spaces=clean_up_tokenization_spaces,
    )
def save_pretrained(
    self,
    save_directory: Union[str, os.PathLike, Path],
    push_to_hub: bool = False,
    token: Optional[Union[str, bool]] = None,
    commit_message: Optional[str] = None,
    repo_id: Optional[str] = None,
    private: Optional[bool] = None,
    repo_url: Optional[str] = None,
    organization: Optional[str] = None,
    **kwargs,
) -> tuple[str]:
    """
    Save the full tokenizer state.

    The tokenizer file is copied into `save_directory` (created if needed) so that the tokenizer can be
    re-loaded with the `MistralCommonTokenizer.from_pretrained` class method. Saving into the directory
    that already holds the tokenizer file is a no-op for the copy step.

    Args:
        save_directory (`str` or `os.PathLike`): The path to a directory where the tokenizer will be saved.
        push_to_hub (`bool`, *optional*, defaults to `False`):
            Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the
            repository you want to push to with `repo_id` (will default to the name of `save_directory` in your
            namespace).
        token (`str` or *bool*, *optional*, defaults to `None`):
            The token to use to push to the model hub. If `True`, will use the token in the `HF_TOKEN` environment
            variable.
        commit_message (`str`, *optional*): The commit message to use when pushing to the hub.
        repo_id (`str`, *optional*): The name of the repository to which push to the Hub.
        private (`bool`, *optional*): Whether the model repository is private or not.
        repo_url (`str`, *optional*): The URL to the Git repository to which push to the Hub.
        organization (`str`, *optional*): The name of the organization in which you would like to push your model.
        kwargs (`Dict[str, Any]`, *optional*):
            Not supported by `MistralCommonTokenizer.save_pretrained`.
            Will raise an error if used.

    Returns:
        A tuple of `str`: The files saved.

    Raises:
        ValueError: If unsupported `kwargs` are passed.
    """
    if kwargs:
        raise ValueError(
            f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonTokenizer.save_pretrained`."
        )

    save_directory = Path(save_directory)
    save_directory.mkdir(parents=True, exist_ok=True)

    # Snapshot the directory *before* writing, so the upload step can tell the tokenizer file is new/modified.
    # NOTE(review): assumes `_upload_modified_files` compares against this snapshot — confirm in the hub mixin.
    files_timestamps = self._get_files_timestamps(save_directory) if push_to_hub else None

    try:
        shutil.copy(self._tokenizer_path, save_directory)
    except shutil.SameFileError:
        # Saving back into the directory the tokenizer was loaded from: the file is already in place.
        pass

    if push_to_hub:
        repo_id = repo_id or str(save_directory).split(os.path.sep)[-1]
        repo_id = self._create_repo(
            repo_id, token=token, private=private, repo_url=repo_url, organization=organization
        )

        self._upload_modified_files(
            save_directory,
            repo_id,
            files_timestamps,
            commit_message=commit_message,
            token=token,
        )

    return (str(save_directory / self._tokenizer_path.name),)
+from ..utils import DummyObject, requires_backends + + +class MistralCommonTokenizer(metaclass=DummyObject): + _backends = ["mistral-common"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["mistral-common"]) diff --git a/src/transformers/utils/import_utils.py b/src/transformers/utils/import_utils.py index 5c9c6d5690..93622f6c3c 100644 --- a/src/transformers/utils/import_utils.py +++ b/src/transformers/utils/import_utils.py @@ -227,6 +227,7 @@ _spqr_available = _is_package_available("spqr_quant") _rich_available = _is_package_available("rich") _kernels_available = _is_package_available("kernels") _matplotlib_available = _is_package_available("matplotlib") +_mistral_common_available = _is_package_available("mistral_common") _torch_version = "N/A" _torch_available = False @@ -1575,6 +1576,10 @@ def is_matplotlib_available(): return _matplotlib_available +def is_mistral_common_available(): + return _mistral_common_available + + def check_torch_load_is_safe(): if not is_torch_greater_or_equal("2.6"): raise ValueError( @@ -1979,6 +1984,11 @@ RICH_IMPORT_ERROR = """ rich`. Please note that you may need to restart your runtime after installation. """ +MISTRAL_COMMON_IMPORT_ERROR = """ +{0} requires the mistral-common library but it was not found in your environment. You can install it with pip: `pip install mistral-common`. Please note that you may need to restart your runtime after installation. 
+""" + + BACKENDS_MAPPING = OrderedDict( [ ("av", (is_av_available, AV_IMPORT_ERROR)), @@ -2031,6 +2041,7 @@ BACKENDS_MAPPING = OrderedDict( ("pydantic", (is_pydantic_available, PYDANTIC_IMPORT_ERROR)), ("fastapi", (is_fastapi_available, FASTAPI_IMPORT_ERROR)), ("uvicorn", (is_uvicorn_available, UVICORN_IMPORT_ERROR)), + ("mistral-common", (is_mistral_common_available, MISTRAL_COMMON_IMPORT_ERROR)), ] ) diff --git a/tests/test_tokenization_mistral_common.py b/tests/test_tokenization_mistral_common.py new file mode 100644 index 0000000000..d225ec5570 --- /dev/null +++ b/tests/test_tokenization_mistral_common.py @@ -0,0 +1,1655 @@ +# Copyright 2025 Mistral AI and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import tempfile +import unittest + +import numpy as np +import torch + +from transformers.models.auto.tokenization_auto import AutoTokenizer +from transformers.testing_utils import require_mistral_common +from transformers.tokenization_mistral_common import MistralCommonTokenizer +from transformers.tokenization_utils_base import BatchEncoding, TruncationStrategy +from transformers.utils import PaddingStrategy, is_mistral_common_available + + +if is_mistral_common_available(): + from mistral_common.exceptions import InvalidMessageStructureException + from mistral_common.protocol.instruct.request import ChatCompletionRequest + from mistral_common.tokens.tokenizers.mistral import MistralTokenizer + + +IMG_URL = "https://picsum.photos/id/237/200/300" +IMG_BASE_64 = """/9j/4QDeRXhpZgAASUkqAAgAAAAGABIBAwABAAAAAQAAABoBBQABAAAAVgAAABsBBQABAAAAXgAAACgBAwABAAAAAgAAABMCAwABAAAAAQAAAGmHBAABAAAAZgAAAAAAAABIAAAAAQAAAEgAAAABAAAABwAAkAcABAAAADAyMTABkQcABAAAAAECAwCGkgcAFgAAAMAAAAAAoAcABAAAADAxMDABoAMAAQAAAP//AAACoAQAAQAAAMgAAAADoAQAAQAAACwBAAAAAAAAQVNDSUkAAABQaWNzdW0gSUQ6IDIzN//bAEMACAYGBwYFCAcHBwkJCAoMFA0MCwsMGRITDxQdGh8eHRocHCAkLicgIiwjHBwoNyksMDE0NDQfJzk9ODI8LjM0Mv/bAEMBCQkJDAsMGA0NGDIhHCEyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMv/CABEIASwAyAMBIgACEQEDEQH/xAAbAAACAwEBAQAAAAAAAAAAAAADBAACBQEGB//EABgBAQEBAQEAAAAAAAAAAAAAAAABAgME/9oADAMBAAIQAxAAAAHRMQ3DqCpzAk9FQU51SWMK6IelhFws0BAdGL9M4iHNAAkwWq3VhAEcgRf5/n9MfRgfPZZ76eDLXt1fHQ9aXxtz37fzUmX0S/nPT4329+S2BagNdDx+8+mycXU3ne3FuctszLlviecnbjOdhXs6c5bhLVgWvIV2cbkfUSfN5jfu/LYlNZtXh9Q3rUtLl0PS9saVjUr5zyTvxkuQDL9KcK0IFfWXq7lUTh6gJzpaluHTM2FSLVNXQ8zeX2k8XMaGWs6YvBWohISAVCY0cs9aJXty6bqkBt24DtoVZX4MBlC/eVJOQLeHpUvSkVeACcJQQ4woaZanVUTo0Xq6Ezy3MJB0lYWnenZSxSEgS0vVXEiB7Z7A1laMFqsKBNDKcGjJIGitwoOAMFROrBwMDBd7UJOQMTnaGcNgQzMC2ti6QulekG2chsbyta6+e0kGEqQZqCNlWPSYLYBMd6HZINGBeuDIE7oo6ItS3BGEHEfTqevUhJrOQNa5jAeUNWwoYGLpWcuXjEzQXF3caWMMj2ecGVawRQoYOO9TaNjPlhk7SYXVhas7A5ah1sG9mqzUmN+XqWnXnDrnqneWDJNigYrcIdcpVgNTTaXEvDpAscHKgwnFB/
See9Rz1yEmN+R4O/o5UtaE72oQgbgKMQW43WBUNw1M3WUWldUqYVX844Ow0sYWxNIzemNeX59GwtPLmZHrLSTTVmTRxQJSdLr2hTTzXYZOt1T5h00qRYxwBBl9IHrcaxZqTOvTKPGzUTnTPKZnrPG9cHAqTealr0Gs8pAu16aLGP0dCCF7BsU5rvZ0n6es56amdJrd5Y8kKn0v5P1C2ng1D378kS9GX4OQUdey3G5dM+3eVY4um5qZPp+PWRwObSNwX4zcowKWXIquee8r9M8b0xlcZX6ZFS1YhRFNB2mtz6YWV7PMufPv7G7GPpE7jd1GbLydkSzUpPp+omyRAYwNdSvLCBfvxFW3V521I9PvYnq+PRdm981IGguqTNyigdAICFhQPGNSpRdBkHUPAFTwo38ftzMO46tcJ49Z67ye7x6FvniNIakU5c/g9VSiOxKKtCuQnNHohXSMZNzwzU9m1eMQ+gs6z839F69SXP62LNoDVGZvGimPbXEKA9CEw5rw/8QAKRAAAgIBAwMEAgMBAQAAAAAAAQIAAxEEEiEQEzEFFCJBFTIgIzAzQv/aAAgBAQABBQL+wRQcdoYGBMNLCUPc3G2zgOWFe/PM25NiCLWQWXAGAcnIPy3zeIOShmebGw0dSz44AOcKs7mIw+RqLF/iE4inEZd0VNkOIrAMRunbwe05i1Yhr47MKgQz7+MG3Acy3UIs9/pwv5GjH5KqN6pVj8sgD+poT+RqMX1OpRV6pVZC6vPiIHQTumLc0N8OoIhulmp2B/V8Sz1K130mra1iwaDCy7W3WkknrmZm6bpmA9Eusqml9SVogVgcYHAIMwRNR6jXVL73ueaTSHUFKu0m0y5+f9dJrm05qtW9Hfar+pUVjVepWaiZ6Uad72op7S8gEhoa+4P5Y/wp1FtMe97IeqJuNFlVI37h5AGJu2n/ABFZMNY2YnHUQ9Mw5Kq877rPf27h6iM06hLT0xNvUKTFonZwGsIiNlNuS1LCbdn8agst8eIeqsVMAhM3TGYQAvcxNxZiSEbk1jYM8ixsOdxhHXJE7hIJ4z1MEx02mVjJtdeieXaVjl27riuYAG2beuOuemOuJiEYiylgob5Ole5mTC/bNulNY2tmY5I5Ccuvxm3hl/gD1BgnmADsBIwHcHxncGTwg/as/HAn0U6cEbeYRHXpjp5hgE89K/8AluxGQNLP0Hl8bF+Ko2IrjG7hR8XMzxvmYzTcZkY6/WckCeYpIh8rZFYRavlt32OeFmIQUHcbcH3TGQeJXLfM7bQgjqIJ9Y58Q8zxEMB43/GJ5KlV7Tut1ZRpWeHEqlnmoZt1Fdtsetqi3npyOhMyMffbDz9Tn+r7lRwzFtuk0L6skKYylYnC4yV4lo4X4x7rG0oXKE5PQCHw0MEqHF4BlfNZ61W8adNQk9syWX7So/VeSQIx6KxWM7P1RC5E3w9VP9Vh5q4usGHEEHmnNYfU3CMGtPbgGI7CMf4440yFnBHQj4mfVXNbH5f+tSP7B56aaz4vyft92KyY3nP8UX46etk6A87o0+q25sGHWPk9PPSuzbN5MEPhRHSY/gg3HsuqVbkPQQ8gdHXevgk9BB48FXxKWzCdoZhlHXDpMAwjpR/1yJ3MkjqpyPsxDw6c9Vh6acYDWb3boHn3DNN/2qRVDLvIhXonk8HPQnIZcdCIIelH6eXSosGrmzEPEH7nyPO2yLXqD0yRMxf2dcHM+s8/eOduZgQwI00+CFpzaAmbLKAj3gxrN3VP3UqYvbNZDA5mZXje6hxsIh8Zn0OJnnMB5oxtX+t7FDSrTe5R9NbSxbMpdK5YxYxYmIKuGqQi/QUmNorRF016mo4baI6wwTwIZtlDGCfVh4O5ugWHzNIm+86eoBEZ22YHtsxKAoVVYepabs2LaDDyCnGwwARxibuMwMRFcNPMKw4EyNzN10aXIwtndjC5iEshrcwrqAbk1NiW07G7pWd2C2fFiwyCmOmJyJvabzN03GBd0q0m8L
o9hBtVXuUT3VaRSyT+yIxjNmNia4EWFN0asr0zNxg5mQOmM/xpODXqiItjsgU797byQYF2n4Gbk3TaZZp0emwGm3uBgeo461iPUYR0Zt0UDOnWolSk4g2o2Vhs+AI21sAGZQFvxGIaepaXkecTiHqBK0zNomo0+B0roLShOxEtGWsGSy4SzM/9fEBWEsckZIHcYx+U1FGxyIQP4LKkXG2hZtSWaVHmn9OXPtq1j1VALp0adhFK10ztKG7ZI7YnELBQLGyXrm+th6o2UD5DHqBmDzpRldmQtQwKgI6c9skLT25yA+XnY2uK1M2xg8w8NeZ2gFtoKhVeaulrNMPJ6BZ4n3o/Cq+3jJ3T54IYQpvOxgvzAZSxKNgXsFNpZ8cbczacgWsTvnbdzcnZ1UbwJiVAGzSjsWsPiNsNgxv4LLMfJWcx13QZUFnwL9GB7zRz3mknvtIJ7/ST8hpIPUNHPyOjnqDUWW5mcqYTxSEZ6LdJVPyGkw+t0YP5DSmDXaWe90kOu0k99pBPfaKe80YnvNKZ7fS49tpRPa6cqdLpQBoNPj2mmz7PS59poVnt9JlvT6rJbobK52rBEoseUaGnZ7XR4Gl0UbQ6Yz2elydPoodNogo0ukM9lpZ7HS5bSaVCNJpCUbFrtwkaIfk37vxAczdEc4sxEwQUUTChc4hHxrHwIw2xYEUx61E2gztqY9STtLs//8QAHREAAgICAwEAAAAAAAAAAAAAAAEREiAwAhAhUP/aAAgBAwEBPwHbYsWZZlmWwklsWmw30lukt86NK1JbERs47UQVI1cUR21oqxYPQsuSxgXHN4LLwlEonCevDwk8xgqVxjr/xAAdEQADAAIDAQEAAAAAAAAAAAAAARECEhAgMCFg/9oACAECAQE/AfXQ0RojRGiHgScrGkSGTu0aCxnGTftqjT8C36N+uXqyizNl5ZM25xfhsh/Sc4vwy7YPo2LIeXddH2jIyMjFwxpkZGRkZGUpSlNx5UpSlKU//8QAMRAAAgECBAQFBAIBBQAAAAAAAAERAiEQEiAxIjIzQQMwUWGBE3GRoSNSQARCYrHw/9oACAEBAAY/Ap2wZkLLRGHoS6i25Jc30X0IsL0LG+FiWiUoWHFo30WNsLlsOY3OxPY6lKL1lqjmO7OQ5S9LORyRU8pwtNF5JUk5TlIjG7gspE9kXpsQQc0eyLvyuGpoyeNZ+pNLlaLwRTSqqjNVh7IhbGakXnQ70mem6LuDiuyKeGnGKURsbkXTPfz3ke5xVs3x9EJUkojDby51Wxl2wtUS2LhHD17F3Bm3IRBHfDi0yRpt5ear4J7+RfysplppxsSz2WxLJt/gN9hvCC2Edicf/XEPzNxx/Y+whsY3qgicI8rufOCLYIbw98L4TjfXfGO2i3cqnlpEsPckmdezZda99DZV7vGKYOGWXUaqV7lS8Cl/S8Pmr9xOVUnezLafY7aLYyZs32ReqPux/wCnfirxP6Ve/oX0z3KPCj+JX+SdqFvovqkqWjJVsP6X8lDW6f8A2ZvFoyJbKo4ozf2XfVKN8YWEaJER6j0ZqW0S6r9jNVfyraqlgmv8BjqeqPUeF9crCdMGyFKtrzeTcsXJ0IW5GXRHl5iNMYImURmXnuBkvZdyzkujbGx3LZvIgvjJY2I9iG4PpqrhTFDmruPhwl4I9T/kXT0SvJq9TNTse7Kkq8niq0dqjiQx1Omauxxb4xW4HdnElV8H8cplrk/TcDpqwsteX1Hl+cPRnFfC+KRMotVY2/JNz2MsH1KOVnacLIsiHpXaMLs3w2xz0o4qDL4apOGtfgvWvwdRfgfEmVUVKmB0sjGdW5c2WO1Rbw5+4o8H8HF4HiJ/YfC6fgcOSZLtYbmb/a9V2ba7saKbbk+hxbFxNsbNixsVJ/sdL8jsTbHlSLshoii0exfFU1JscSREmxys2M9Pk3M9KtjJmaOSTlRLn4O+FyOwspvcu0Q0ba7iinMzhTOFQz+Sr4IkWVZjla+SZcYbk5
rfciXJfMb2LJ/IlB3PDa9dewuA5TYZfYvmJEosX2LykK432OZfJepDWYVaJoT9yq199eSll3hylyRXZYuScpKgvU19jmZMlpOJM4Vc4mV0++lJ7FKpd2zc3LF2RmZmk50Xf7OFYdZM6lJ1UT9ZE/W/R1WdVnW/R9Twq5nfTx15V6lP86fuzron6tJznUR1EdQ5zqHVOsdGmS/hI6FJ0KTpUkPwaTpUnF4SOkh5eBlmqvsXof4LUn8t39y/go6aJ+ijpSdKlHS/Z03+Tl/ZDo/ZtjsjftgjbBSMasbCWVD4UcqNljYnuKxsKUKw7En/xAAmEAEAAgICAwEBAAIDAQEAAAABABEhMUFREGFxgZEgobHB8eHw/9oACAEBAAE/IV4EPV8wznMb4WQbE64n5DMWqj43c2zCCVLvdkVEL6lAtChMPJ3DMLLxMhGXGql7sMI6rUXJoi8J6NzLDPOUBfacMYWkM6IVXZqZjz1iFShUhaKq4Tw7lCmKs19hFKY8Nsd3XyblX+SzeBK95Q7LQ8Sl3WcCmXUaasNXP9S2wwptR7S1MD3LNtYgL/dwFu0sqgEAphTJg6UVZOMe/tzYK6YXZYRtC0NYRVQVWQzC0y4vmDeX1AdTYOhxLMR2hejMSwRerPEMoi/fFwjEi3/BGOzESBoggMVQaI+mIbFPcRZAiXfHh+3W6V5lNxAuutxDIYz4xHyP+Ay1I+N+HZAi+rqA1H0zgY4I1+HHPtjbM3ZzLY3BXJwihEXFDf8AhjxR5V4GPnMsNolnSzGfD5n2RDnJlgjXDCrEI5pucH9S/wDDMqan5Klc1hg6GXr1GntlnUVmD6lHMWwtxBqQ1FumDgUDO4eiIm3A2zuU5fI2YjcDOWJMaQy6kTWwnCEu+N3KItoLdYq45v4Jt8HipTPDLa6lKF5gfCWS3NPBdkG8ErVQpw1+Sx8weRDPrmVjMWWJlg4dxd7exMQuI6t3AxKA8bgnCkOTQXMrM2xqY+QYIDbGKnqgD+mCH9kvMxs3L8WmGtHbF6sQitfrW5cizF8S1kC9xG/Xg+MiamlhHuXCnDUMNQFqci6HEQ5lnVjQD3IBvHwYHEVn1HbX/wAgFji+Iqu+vCEMGmbgKOoo1cTy5i8RM1/JzPpUFmq5iCzaUjZgwCoBxDOGy6ZboQwRge9EvSWYX7g+t9xBA59yzTiUD8czI/KflKsikzXf5FvEqsS0SGHyG6ZR3G1KzmMsOLZgU27lg5hVnEhWkI72CSuRiEzL4RHaVYK9XKV2kcg3FQeAlBY41M13HiZjvxcu1PSZ4mFRiqaY7lnuOpsNxQl4qUn/AMIhSwy0OiekspVwls36jsOIIL7g1dy9pkxMbnvnyN1T6qOfJdGZnCpkaxMBsvqZqqplRb9QD0o0Oa5l0hzASezFxCanJh6qDUzzuENGoe9Q1HsIQuiXRf1KhSLXEIX0fBPQQLcxrrXaZBS9wFtglANNblOeVvC5eDucS3sFaDmKB2Z0fs57On/kYpQqPP3ifxS5gISKtXFxLUL7IOfaXjycna9S4fBCsi2RKdqxtbqK9ylNQkBSYjSdzebJUv592bnSEb1PAl3wNGv/AAjZZZ9PvNfrCf8AcaN/JkDxzCjTzFXDGM4cf4Sl1UsFMSyXgjVw7qNcSwHMsa1FW9zdgww6uoz26OfGRo6ru+5gZr+Q9G71APtlzmMuceCyjK1IblBxmC4lwUlL3mGdo8rrM78yqZuUfiKLqO4FCo8S43LIQvj/AJjbsXqOsv8AUo8R9eQl1huOg9EV1KBC28vU5YqF4cSjrwlOqsxYq88RNfiNImLmLW4YkFtufsZaj8IQK0MdxzcwfD4pTtlfBBTacwb4ipITTmbViCjdwgLnmXC08Km5RXgQNbnALhYG4AYnyJrm+5S1pIArnxOIbj7ofcQZp7ZguXOfAzheIOB1LKTZNf4PiGXLxGuoSaAyi7qouZUVxLNIubQZmhf9mgPnMqwH7GanOSmOvvEs09
IWXxNF1KgnMCUSw3NMy42/YhZKyxfg3QJhvapc2i+5o07jKPE31L+yUmD+poP9Soci4nVQWA3cfLvwy5Qt/oimOkoqskMhXEKj+iH69Ri5YMy5G2AwNe2YmNq+GFnZjNwK2PqPgEpMVepdtyuRqI5oEDgdtkVUvpMZrGh6nKDuKaIasuYWqXtHbGoDXqWLvmOHMyIDyXqEDedRFzg2StDBLRNX65GVMpiCteJfsll8WvEuLJ+Qmirj3K0cxaxjboIB+1EUc8zI3qV9ENPFR1jubDcqizniIU+SyYhlBgQZVKNOo89Er6PUu2lPKzlIGHJOI8m8zfgxXkfNTGqkE1WGCldD1GAlruOVUincbH3MQ0m+B/sEtklmxnWGWX5uGQlooN6iv6GO2mXeDCghLSFtm5gr91HdV1yRGMrvGpwpyEq3JWJCENw1UXmZ3EvAkFWVIXwP9lLq5e0H7Aq29y5hlS0TKT3ZZtc//AnRj5EW9wMqPqZBkQQMdihOgwMNL24EhsaluqRl+TlUQbvtiGFnl6g67nBSmC2cRA4maCbEXfgSvAXCgYOkqGgX1DQArKkGOQ3cz8ThzNn963NSmoIUa4uGr/vGkvn2zBVq5qCLd8cJZBjmOU/srw+GK0W2cwLr/aGMPw+AsgUyDrmM1IdQvZKAh7IpBYz1OT33HZZ1qP8AztB1DmHk8tszl+oFMn7EiqXvMtycQaMpK/wLsw3oruagDUS19ie5edQq4l+ofYzJtD2ylCr1xLYQ3i0rIqruDVkIKCpmZWFO4YUeo2FAcE2gHuKwdJsdwLHF1DrBAc5j5eYkXx9jVohmmLGCc3HsyRhxvYgKlT7LMP1MwRrH2GZmi0uhYJZV0MTrOEPVWSUWmvcAUm/BHaK8qglC/Y2ro4CdCukKzTBY/wCAhIowvA3zEVY3Bl+wO4V2WhAXV/IFY/lxfok9B6ZimXpMCWvW5cRpGO5qgQU9eptHX9iFvsqUrjpqWo0YZlsIqiSyWPENLlmw7KlZVmYAtfkXseJZffqbc14o11L+yuE+QILfcbQDA7P7C2g1AUWlZnG/E4WxNYB7gBSZZzOoEqdQkNL4vdxGsxMLDAHn/QnK/wBI9b2cQNLX7ieBfRFQaMNQRcHyJ/04VFH9iRVnuahIUwDUD/JT2+glOV2G25k3/KYW2wKU9CS8pU4gxhlggg+WjNGmwhtqzIA+p/50p5SX9ko1SXsGWOcpmVtEnCJ2s6ixy7aazC+KfjMgsfsVbL9lNR9xTi+o4Nqo4Z/vjXwOof8AgQ6Bixvx3DBFsFAFjdy5WGaYfJTWi+xmLn2aKfZKEA2GjAeJfcabT7M0K+xOB+y1lHyDIWrhcVFb+xO6EzpFlUvoDjmCTAxMaU+QAMIlNPyYNr6lyH1qdWjqA2g58wF0iF1v2liSZ4mj4Q2hLd4+JguLM//aAAwDAQACAAMAAAAQ+ukG+yi+LSiaOocQMkf4WCUUq8QgoISefE8oCOCkUod+rsQwmDwAuIGegUSskyGY88g4E85x4gW8cwkwIok4IwQiUgw4oo8SdUGEG5kAY8R021JqMKgc/kkdt+ALhhikhNak8+ggsCkkGlysUsIcChUHyDMDM0Rg44rI1Ikm9Weig8SYMkcU1A3DgZojub6gWWyix774i04zXUY+QVn0rMOd7+Sa+Q8YddIZqd0ox8nlZbBRgh9s5sx//8QAHxEBAAICAwEBAQEAAAAAAAAAAQARECAhMUEwQFFh/9oACAEDAQE/EMGy1BvRHk/xoAf3BHrHHsSdS5RA+/AahFs58hHOxh1FJc7h+N5H9IXCErBHY94Gpdke9KnBkjgLi+QjkXD4Hr6DDhwBFeS18xK0MOfXC6l3Kudy/INBWgsiU4MOCLjRhKOckAqPuckOONukM9NryBETnB3KQSXCwCXFEolEolIm4AlEolEolJRP/8QAHxEAAgICAwEBAQAAAAAAAAAAAAEQESFRIDFBMGFx/9oACAECAQE/EI
fzTeigNgvE0jftGfB/YrZKt0hcSGIayPO/BGR0OfwXJD4IdejcNBQxS5Q/o/q/gy6LsUP4MqxKmKHF0ZOLhS4oG7dil8FLO/NyhiGrI/yWdmDAs54Pgit0UKsqi5VL4Y9KhrcFDO4YxCH0JFwotxDLoyC+mJ8G7Y4YoemXiH0d/lUO6px0GHyqptststsTbLoT0NSi2y2y2y2y2y2z/8QAJRABAAICAwACAwEAAwEAAAAAAQARITFBUWFxgRCRobHB0fDh/9oACAEBAAE/EGBFnZLsl7VMg5itE/FalDjJDFpNCMRIJr+iKiF/krJQ5gLbjSxPKeEWkAWWzXUxEHlLldrRDPUXkfIfqOea+JlaTyLYbGIR0jheYY1wsu63qK1BjlM7g54DxCrDPcrEBzbFnFeyCCRj4bITJeE0uMBL9FwqFix1lkK4xFK89J1B/oDEAnVHLKcIsbbw1QD3HKhp+MBGQL4lcm3VRlLCvMFg2cRiSa2iHE/qofsDSKrjlAWayiBPHW5duuDXG8lJzvI6CVm2WfvNZjcXeBFovsniATYbEP40c0BFPE3ETl2QI0hyuZQlKvEKkzgMQOgcRRCvRnjfq3H4WYGebV8xeVdJktHggXYOZb0N4ARJTMqqW9y3cAC4kUY1vEvrcte2WQYuW3MXQ4YSl+AafmGEPNmY/UvBU5QBqOoYdXHHvsQgHtqqolGEVh0HxNOIrByHMEfjSAYrHZQdsSKnMTfxGjKVZPmO/wACWX+BlcxBVR4qZHEOKuyuviYl5kOYTmjRDcYMZbY2anQc1M52csWRhhBbXQRb2VmnmIw1vI8wpXJY0wOoBF3KTJqfMoiU3D+QRqKCxxPGeINNfis6I+7nEOBpQ4i1bOBYkvLrOYnVjZuAAeRQYVZLyNTc4sWYWG5U1oERU2aGMDGJd/gGtQKairhha38/hR4S4AlCcww5orXMRWagHm/khc0TyM8+Igb+kr01Knb+4yMF9LiLgACrhbeQq416KAqJcnRogUQqq2DAjK2DBLuFuAjBxUpnE7OIQgK4gu9+TcRYkqLhlUjViAaBsqAG5U3u+oqBAuuCWW3gdTXCzEFsf5FsGCs39RRbqocEswcFwi64Vr6iSrBcAt6hV8sC2m4caj9Qpwy7bQcPMMkg63DdNclwKg6XpFRuneZWAWchUILbaFgsY1nNkBLXfUVCnCYV3Hop+xMN3tfHUfzy2wEW4NwEDjqCmQjH6ljhjFpTCu2bIqH0RSqWuGAi6t1cpwylobNwWC715EVBwdT5ZYLrBiPFL8CUwS6WxtgTCCnZD/uQa0Lb0dfMvopMi6ioGtfwPxB0ZI4wefMZaN8dQIi27UOIaTrhhlWYLa2yw08QafI8iOUulFm4WMwIgG4ZE7mDkrsIYbh2sKC3ey5jJnCDtuWQoZ1UXrGJk7EquGIqdduY4HpB77qGEhWRLv6h01RKDH/lxQSkcmrlEtEwHEJYlWZb14zCAApJVEut4CMOKCszAW6taij4cwriOo1R22QxIQc25iVSGUGTRcRqB2VpJ+uaou6ADjiu4wm0srmV8KM3CBQCHQcsVS/ZDBoLubedsKTKjmpYIbdK9k30s0rnEcBpim4qxVzfN9TeCmj47i09nSYYHSyAoZ7XioSoRWUBWCpdHEyYNywtAPWAZEYkO9ZYDncohaXJHlW8UtwuQiiUQ0enwlN2lp10SinYR6PYtI9HJz/YQYpuExYyB95WWztwDArQPPMXN8ZH+1GYZ6BMsUEHtyMXGoOLpqYCQUgxiCUpeJuS3L7BcKYlMVF7lngZth6CXbZfmNwQiOoepuLAycNSUFMO2f3QYVvpw6jtjC2XMRtzbEG8n6gNVmQppKBD1axb/wDZeCw3Gry8mO6TaWBLyldDH6iQ7OGv5BTchbWALepYDm8DLZMpWYZ04qsQFAGVoIlWg0WljKrajHtQfh8Psqu4TvgioUVwy4Aj2Gb6tcQJ5lYzcVglJHEtAi+lwi8YeZ
lUucoaQJYmGyFVVE+FPSBuaVLK5+IvWXBSH7jqX33GnPEurhqZltQf8lymmN1iP3BLKRoKrSzx0RwnZeh4ffIwBMwPEYsxx2L1eH5mLw8uBKv9CIga6pEC0d3UGFBvXn5jThEwssVLYLbN9pyRwxqqUszWYlAANdn3iHJZYVArZXB8Q8RpWcHbAU911FqUYp4lJmIU3CyKtGrNwARAqqTDFIut/MUGF7wcwtInMjtq0vSwcRxX4ATi0XB1Hc0YOxV5ObixIPIGojVocGo8lKcDNYVLBOSmycpAO5YAgxcFVdmIZXkgEbuu5WkIzQA69NktGeEoWzuD8SpyzSkuLU1dd3d1LddhR4CtX1LNqChHI6jAAV0NVzL+QAQMyAcbzCzo3Ew0pRy/MM3I0vXxOauUU4lZS3ljoBI8rkgIAPjczhs2VMZD9kqZD9RGuP5F4IuBVrd1PM3/ALMg8lVl0kFN1sURWACy8srVdgM/L8RKNVmG3RKDQCbOHUYvjaYL9mxHJRj6iPaygK1UVkGFW1EG2pzLr0QNO4g4fZL2CvsTIPdxHJfSXpq9YiM0phLKlKDnyCPKAmTEbCp8SgMtYCO/ctUNGL39TQzd8xqoI0g6zKSRW1yY8/5EY7BLHSwQs3T7hFwQ9iUxYSt8Ssqpoept2Bhw/MpAQDyLUT/iUbTZyxLri8dTCD6I+Y0CHe42LChLwEDYZZjJi5qu4Vt5lr5EZDC2HqWOyN2OVmBzlasJkYvFYn7jLgLKag1lFMgRuI1ghouo5jmLWiFSHWquLwXlxZHZbPER9CCoHHsA1TZSahlxeiA6sOyWsr7Qs1ZTMOtzmKX7ECnc0uyKg0bWUKVbu9xlU/oRyIe9wUlvKwQmVPUYqgxSxqC1TOrota3DEN4gmKKOtcdOD51KaMXEvx1CbI5U4htXCcVMX0xzFtuFjj4DfkSiJi/xSi5jjlo4gxSDghFG0M9obiBVEZAZOa5lc5LKsPcBaKvUzPMQ3QFjSqCGh16bvyKJQ8bxBkEoz2yocRKRgBlzfEFin8zM0hhYRLADuPMQQVt5MbZo5jUQxUQQamW3uGQVi4IxqvMSIXKL3GcuUzr/ALiSrrBqeTGwGhzCWNSUqz9QAPEqGrLmGZLBK6gGggIXnCSWcIRpCjqJLMeYdXthKvzZSDTA0Y+5wmkbvNTWgeTC0r9sVBEK7gDK2HryFeVWPaFkVNALYoOyGmW+bXPMq/ZCeDGYspt/Ybg6rKTQGscStAbhW/mKAANWW/E9I8KzGx2YgtC8tRinkgqNVBsVRDREc2FQfy7IFyIhpQLU39QfawYd1oMdPQsn0EQt5o4j2Bv3FXVAlruUhbJal4IzvqUFe2m75Y7jTpeU5IemQTKi9yuJhCgcrwx45vIYTrLjmNLZ6aPacwOnCNZ/AVRazrrbyv6jAoF1EhlaO6FkWa9GoqG1xhlIy2pZEnFWLUkaICuax+4KRHuOIpgUaLyyiY1v5EQNtJGsrYypi1Kye2F0jVrcNNgA4t3nuWuK7iv4cxgXdhnlYmRdBWYPVSlwspW6CVLGRFxLLMW1sfh9vxGi1LFyi7Hxi3GMiZpk+IGanNsx8WRjbFEBynELMLRfw+I2PQ7rMbQPQZhRmFXHPQ7rcIuhxcC0ImiDdL6YEULVVCArQmdR1BWQcsuqFIuMLfc7UbtdeQCIqNBuAvtGcQTca5mUeZ0D9EJNFbXsl2nOel/UAn+mBxMKnK4xYVZFKeBHmWinBWQtvMbHsy4PjqURn2LkA3QuZYa7upYHuX/iFE3NPMIaZaix1+oLVEAET0Za3k+Y8I+wqFYN3Cg2B9IWyXoQwuGrFVANuZEZjhbgrSZRGlZ0fJCm9Qti5vbxXMrptEhGoXQGYhpl4xCKQ9NcTfkMdrpl/YOlxjOBMg0xl0XCIwvqEZ+qVGx9mbNwp4/cZVUI4oqUt3WBl1qZEEoOXM3s2BiP8QFCkHu0swssyD2kFcGCuoNR4bQSuQL231BoG1
PiWWNyL/IdFXyVbhPYlCthckGApqHe7oqLTV7hVmcS5nACGRRuEuWPfUSFR07Jgsw6SJ7Ny4gClx/SOQDWVIkfui4JIYVUwMWeeEvAsOudR9BnQeMFm9YuhOzkIZgMBtRqJm0toJWBzgXSZ6I6rgKOFUMwSildztI8VMoRfw4pj2ALvlNOYBQeoGYMvGeiWW0qFAli9S/AyGyVEA1DF/8AU2ZO2YRCreTA28pd36RXUNguIdkPhg/ktOHDOYOhCQJR6s4JifOjf8iIL2rG/jENq0tCMLlrbmY9brLL3L6TN3f5wDtl3Byqn/ERt4WSo2pp4yoZqs1cvp2YJAIKQeATeO5qHHupw+JWpkouTP8A0swxPdlpVCiNqVtEt2Gwy/cc4XVKgJlRKH/ZR3LkFSwLXarr+xggxBsYroFgCmPZZmTcYqVtr8LauI6OahoSlin8mFQKTiYkRdLiW5npQEfE5A4iIFi0/wCYsQHSoEjzkekZ3DWblmDrm9n0w3acXslOkWxXPxEYOivA7lhdsFWivAitj2DZgXe/hFAxa24Cujkw+ooBVGx9QMtSsJCwdyzW5Yyxqucrdpea6m4MR+UCJjJV0RPABWJuq5o6+YARdnSJuV4plAQUyDUTDA0by/GBzgf9QcbN2jBOGFWFG6poXDQlqgVM0CAqa6l72BORddQHGPgKzCVNLLEe79QDEUbC0qv1DQ26+8w2Cq6hoIB8Sw5IJquNjHdFWMrgoKLp8QiyImk3CtvEBEVH44oNLCwxGLYM4CmXBOhpQqLxU0YDBRCg4iOx+my1CQ18KjAqeHFzeaq1mHToWwfY3AeCaBXGpMvNM5tvaZgmNgKcQYYpMXKKzqFsCOYQMhFK8bj1uhamb0KbErUp7Q9MXPqArEugatjDHekrOH4S0TF+w026Ll0mI4GDh9y4dBUxiUscWVDHjJDBBPnbskOGsCQFpWvKmM1sItw0B02HMKDHYoYu6HBQnBKxFglTu9pD8MeqlowXBJUdFpYqHoxpbcq8hra3Din4sl/Uvq5mhVFDEKRYwgrq2llKAu6tkYGpVC7ZdJx2pUYjecpjJEekINKBaabIh9VoSjX7jBRdnRcYsQaRbDKuTm+YkVVsoMR31GPJdHjpEtXrY1HvTs5TKDi8kYWoQVsN3SFhLdso5bGBmLGC14xA2ihq1ZUi2WyXnmbylnE0aViVpqsLuXkKOLhUte4nIbJmX08L3P/Z""" + + +@require_mistral_common +class TestMistralCommonTokenizer(unittest.TestCase): + @classmethod + def setUpClass(cls): + super().setUpClass() + cls.tokenizer: MistralCommonTokenizer = AutoTokenizer.from_pretrained( + "hf-internal-testing/namespace-mistralai-repo_name-Mistral-Small-3.1-24B-Instruct-2503", + tokenizer_type="mistral", + ) + cls.ref_tokenizer: MistralTokenizer = MistralTokenizer.from_hf_hub( + "hf-internal-testing/namespace-mistralai-repo_name-Mistral-Small-3.1-24B-Instruct-2503" + ) + cls.fixture_conversations = [ + [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Hi!"}, + ], + [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Hi!"}, + {"role": 
"assistant", "content": "Hello! How can I help you?"}, + {"role": "user", "content": "What is the temperature in Paris?"}, + ], + ] + cls.tokenized_fixture_conversations = [ + cls.ref_tokenizer.encode_chat_completion(ChatCompletionRequest.from_openai(conversation)) + for conversation in cls.fixture_conversations + ] + + cls.ref_special_ids = {t["rank"] for t in cls.ref_tokenizer.instruct_tokenizer.tokenizer._all_special_tokens} + + def _ref_piece_to_id(self, piece: str) -> int: + pieces = self.ref_tokenizer.instruct_tokenizer.tokenizer._model.encode( + piece, allowed_special="all", disallowed_special=set() + ) + assert len(pieces) == 1, f"Expected to decode 1 token, got {len(pieces)}" + return pieces[0] + + def test_vocab_size(self): + self.assertEqual(self.tokenizer.vocab_size, self.ref_tokenizer.instruct_tokenizer.tokenizer.n_words) + + def test_save_pretrained(self): + with tempfile.TemporaryDirectory() as tmp_dir: + tmp_file = self.tokenizer.save_pretrained(tmp_dir)[0] + loaded_tokenizer = MistralCommonTokenizer.from_pretrained(tmp_file) + + self.assertIsNotNone(loaded_tokenizer) + self.assertEqual(self.tokenizer.get_vocab(), loaded_tokenizer.get_vocab()) + self.assertEqual( + self.tokenizer.tokenizer.instruct_tokenizer.tokenizer.version, + loaded_tokenizer.tokenizer.instruct_tokenizer.tokenizer.version, + ) + + with self.assertRaises( + ValueError, msg="Kwargs [unk_args] are not supported by `MistralCommonTokenizer.save_pretrained`." + ): + with tempfile.TemporaryDirectory() as tmp_dir: + self.tokenizer.save_pretrained(tmp_dir, unk_args="") + + def test_encode(self): + string = "Hello, world!" 
+ + # Test 1: + # encode with add_special_tokens + expected_with_special = self.ref_tokenizer.instruct_tokenizer.tokenizer.encode(string, bos=True, eos=True) + tokens_with_special = self.tokenizer.encode(string, add_special_tokens=True) + self.assertEqual(tokens_with_special, expected_with_special) + + # Test 2: + # encode without add_special_tokens + expected_without_special = self.ref_tokenizer.instruct_tokenizer.tokenizer.encode(string, bos=False, eos=False) + tokens_without_special = self.tokenizer.encode(string, add_special_tokens=False) + self.assertEqual(tokens_without_special, expected_without_special) + + # Test 3: + # encode with return_tensors + tokens_with_return_tensors = self.tokenizer.encode(string, add_special_tokens=False, return_tensors="pt") + self.assertIsInstance(tokens_with_return_tensors, torch.Tensor) + self.assertEqual(tokens_with_return_tensors.tolist()[0], expected_without_special) + + # Test 4: + # encode with max_length + tokens_with_max_length = self.tokenizer.encode(string, add_special_tokens=False, max_length=3) + self.assertEqual(tokens_with_max_length, expected_without_special[:3]) + + # Test 5: + # encode with padding + tokens_with_padding = self.tokenizer.encode( + string, add_special_tokens=False, padding=True, pad_to_multiple_of=6 + ) + expected_padding = [self.ref_tokenizer.instruct_tokenizer.tokenizer.pad_id] * ( + 6 - len(expected_without_special) % 6 + ) + expected_without_special + self.assertEqual(tokens_with_padding, expected_padding) + + for padding in [ + False, + True, + "longest", + "max_length", + "do_not_pad", + PaddingStrategy.LONGEST, + PaddingStrategy.MAX_LENGTH, + PaddingStrategy.DO_NOT_PAD, + ]: + tokens_with_padding = self.tokenizer.encode(string, add_special_tokens=False, padding=padding) + self.assertEqual(tokens_with_padding, expected_without_special) + + # For truncation, we use a longer string + string_long = ( + "Hello world! It is a beautiful day today. 
The sun is shining brightly and the birds are singing." + ) + expected_long = self.ref_tokenizer.instruct_tokenizer.tokenizer.encode(string_long, bos=False, eos=False) + + # Test 6: + # encode with truncation + tokens_with_truncation = self.tokenizer.encode( + string_long, add_special_tokens=False, truncation=True, max_length=12 + ) + self.assertEqual(tokens_with_truncation, expected_long[:12]) + + # Test 7: + # encode with padding and truncation + tokens_with_padding_and_truncation = self.tokenizer.encode( + string_long, add_special_tokens=False, padding=True, pad_to_multiple_of=12, truncation=True, max_length=36 + ) + expected_long_padding = [self.ref_tokenizer.instruct_tokenizer.tokenizer.pad_id] * ( + 12 - len(expected_long) % 12 + ) + expected_long + self.assertEqual(tokens_with_padding_and_truncation, expected_long_padding) + + # Test encode with unsupported kwargs + with self.assertRaises( + ValueError, msg="Kwargs [unk_args] are not supported by `MistralCommonTokenizer.encode`." + ): + self.tokenizer.encode("Hello, world!", add_special_tokens=True, unk_args="") + + def test_decode(self): + string = "Hello, world!" + string_with_space = "Hello, world !" 
+ + tokens_ids = self.ref_tokenizer.instruct_tokenizer.tokenizer.encode(string, bos=True, eos=True) + tokens_ids_with_space = self.ref_tokenizer.instruct_tokenizer.tokenizer.encode( + string_with_space, bos=True, eos=True + ) + + # Test 1: + # decode with and without skip_special_tokens + self.assertEqual(self.tokenizer.decode(tokens_ids, skip_special_tokens=True), string) + self.assertEqual(self.tokenizer.decode(tokens_ids, skip_special_tokens=False), "" + string + "") + self.assertEqual(self.tokenizer.decode(tokens_ids_with_space, skip_special_tokens=True), string_with_space) + + # Test 2: + # decode with clean_up_tokenization_spaces + self.assertEqual( + self.tokenizer.decode(tokens_ids_with_space, skip_special_tokens=True, clean_up_tokenization_spaces=True), + "Hello, world!", + ) + + # Test 3: + # decode with unsupported kwargs + with self.assertRaises( + ValueError, msg="Kwargs [unk_args] are not supported by `MistralCommonTokenizer.decode`." + ): + self.tokenizer.decode(tokens_ids, skip_special_tokens=False, unk_args="") + + def test_batch_decode(self): + string = "Hello, world!" + string_with_space = "Hello, world !" 
+ + batch_tokens_ids = [ + self.ref_tokenizer.instruct_tokenizer.tokenizer.encode(string, bos=True, eos=True), + self.ref_tokenizer.instruct_tokenizer.tokenizer.encode(string_with_space, bos=True, eos=True), + ] + + # Test 1: + # batch_decode with and without skip_special_tokens + self.assertEqual( + self.tokenizer.batch_decode(batch_tokens_ids, skip_special_tokens=True), + [string, string_with_space], + ) + self.assertEqual( + self.tokenizer.batch_decode(batch_tokens_ids, skip_special_tokens=False), + ["" + string + "", "" + string_with_space + ""], + ) + self.assertEqual( + self.tokenizer.batch_decode(batch_tokens_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True), + ["Hello, world!", "Hello, world!"], + ) + + # Test 2: + # batch_decode with unsupported kwargs + with self.assertRaises( + ValueError, msg="Kwargs [unk_args] are not supported by `MistralCommonTokenizer.batch_decode`." + ): + self.tokenizer.batch_decode(batch_tokens_ids, skip_special_tokens=False, unk_args="") + + def test_convert_ids_to_tokens(self): + # Test 1: + # with skip_special_tokens=False + ids = self.ref_tokenizer.instruct_tokenizer.tokenizer.encode("Hello world!", bos=True, eos=True) + expected_tokens = [self.ref_tokenizer.instruct_tokenizer.tokenizer.id_to_piece(id) for id in ids] + + tokens = self.tokenizer.convert_ids_to_tokens(ids, skip_special_tokens=False) + self.assertEqual(tokens, expected_tokens) + + token = self.tokenizer.convert_ids_to_tokens(ids[0], skip_special_tokens=False) + self.assertEqual(token, expected_tokens[0]) + + # Test 2: + # with skip_special_tokens=True + expected_tokens = expected_tokens[1:-1] + tokens = self.tokenizer.convert_ids_to_tokens(ids, skip_special_tokens=True) + self.assertEqual(tokens, expected_tokens) + + with self.assertRaises(ValueError): + self.tokenizer.convert_ids_to_tokens(ids[0], skip_special_tokens=True) + token = self.tokenizer.convert_ids_to_tokens(ids[1], skip_special_tokens=True) + self.assertEqual(token, 
expected_tokens[0]) + + def test_convert_tokens_to_ids(self): + tokens = ["Hello", "world", "!"] + expected_ids = [self._ref_piece_to_id(token) for token in tokens] + # Test 1: + # list of tokens + ids = self.tokenizer.convert_tokens_to_ids(tokens) + self.assertEqual(ids, expected_ids) + + # Test 2: + # single token + id = self.tokenizer.convert_tokens_to_ids(tokens[0]) + self.assertEqual(id, expected_ids[0]) + self.assertEqual(id, self.tokenizer.convert_tokens_to_ids(tokens[0])) + + def test_tokenize(self): + string = "Hello world!" + expected_tokens = [ + self.ref_tokenizer.instruct_tokenizer.tokenizer.id_to_piece(id) + for id in self.ref_tokenizer.instruct_tokenizer.tokenizer.encode(string, bos=False, eos=False) + ] + tokens = self.tokenizer.tokenize(string) + self.assertEqual(tokens, expected_tokens) + + with self.assertRaises( + ValueError, msg="Kwargs [add_special_tokens] are not supported by `MistralCommonTokenizer.tokenize`." + ): + self.tokenizer.tokenize(string, add_special_tokens=True) + + def test_get_special_tokens_mask(self): + # Test 1: + # default arguments: special tokens are flagged with 1, every other token with 0 + ids = self.ref_tokenizer.instruct_tokenizer.tokenizer.encode("Hello world!", bos=True, eos=True) + expected_mask = [1 if id in self.ref_special_ids else 0 for id in ids] + + mask = self.tokenizer.get_special_tokens_mask(ids) + self.assertEqual(mask, expected_mask) + + # Test 2: + # already_has_special_tokens=True should raise an error + with self.assertRaises(ValueError): + self.tokenizer.get_special_tokens_mask(ids, already_has_special_tokens=True) + + # Test 3: + # token_ids_1 not None should raise an error + with self.assertRaises(ValueError): + self.tokenizer.get_special_tokens_mask(ids, token_ids_1=ids) + + def test_pad_batch_encoding_input(self): + # Test 1: + # padding and default values + + def get_batch_encoding(): + return self.tokenizer("Hello world!", return_special_tokens_mask=True) + + batch_encoding = get_batch_encoding() + + for padding in [ + False, + True, +
"longest", + "max_length", + "do_not_pad", + PaddingStrategy.LONGEST, + PaddingStrategy.MAX_LENGTH, + PaddingStrategy.DO_NOT_PAD, + ]: + padded_batch_encoding = self.tokenizer.pad(get_batch_encoding(), padding=padding) + self.assertEqual(padded_batch_encoding, batch_encoding) + + # Test 2: + # padding_strategy="max_length" or PaddingStrategy.MAX_LENGTH and max_length + for padding in ["max_length", PaddingStrategy.MAX_LENGTH]: + padded_batch_encoding = self.tokenizer.pad(get_batch_encoding(), padding=padding, max_length=12) + self.assertEqual( + padded_batch_encoding["input_ids"], + [self.ref_tokenizer.instruct_tokenizer.tokenizer.pad_id] * (12 - len(batch_encoding["input_ids"])) + + batch_encoding["input_ids"], + ) + self.assertEqual( + padded_batch_encoding["attention_mask"], + [0] * (12 - len(batch_encoding["input_ids"])) + batch_encoding["attention_mask"], + ) + self.assertEqual( + padded_batch_encoding["special_tokens_mask"], + [1] * (12 - len(batch_encoding["input_ids"])) + batch_encoding["special_tokens_mask"], + ) + + # Test 3: + # padding_strategy=True or "longest" or PaddingStrategy.LONGEST and pad_to_multiple_of 16 + for padding in [True, "longest", PaddingStrategy.LONGEST]: + padded_batch_encoding = self.tokenizer.pad(get_batch_encoding(), padding=padding, pad_to_multiple_of=16) + self.assertEqual( + padded_batch_encoding["input_ids"], + [self.ref_tokenizer.instruct_tokenizer.tokenizer.pad_id] * (16 - len(batch_encoding["input_ids"])) + + batch_encoding["input_ids"], + ) + self.assertEqual( + padded_batch_encoding["attention_mask"], + [0] * (16 - len(batch_encoding["input_ids"])) + batch_encoding["attention_mask"], + ) + self.assertEqual( + padded_batch_encoding["special_tokens_mask"], + [1] * (16 - len(batch_encoding["input_ids"])) + batch_encoding["special_tokens_mask"], + ) + + # Test 4: + # padding_side="right" + right_tokenizer = MistralCommonTokenizer.from_pretrained( +
"hf-internal-testing/namespace-mistralai-repo_name-Mistral-Small-3.1-24B-Instruct-2503", + padding_side="right", + ) + right_paddings = [ + right_tokenizer.pad(get_batch_encoding(), padding="max_length", max_length=12), + self.tokenizer.pad(get_batch_encoding(), padding="max_length", max_length=12, padding_side="right"), + ] + for padded_batch_encoding in right_paddings: + self.assertEqual( + padded_batch_encoding["input_ids"], + batch_encoding["input_ids"] + + [self.ref_tokenizer.instruct_tokenizer.tokenizer.pad_id] * (12 - len(batch_encoding["input_ids"])), + ) + self.assertEqual( + padded_batch_encoding["attention_mask"], + batch_encoding["attention_mask"] + [0] * (12 - len(batch_encoding["input_ids"])), + ) + self.assertEqual( + padded_batch_encoding["special_tokens_mask"], + batch_encoding["special_tokens_mask"] + [1] * (12 - len(batch_encoding["input_ids"])), + ) + + # Test 5: + # return_attention_mask=False + padded_batch_encoding = self.tokenizer.pad( + get_batch_encoding(), padding="max_length", max_length=12, return_attention_mask=False + ) + self.assertEqual( + padded_batch_encoding["input_ids"], + [self.ref_tokenizer.instruct_tokenizer.tokenizer.pad_id] * (12 - len(batch_encoding["input_ids"])) + + batch_encoding["input_ids"], + ) + self.assertEqual(padded_batch_encoding["attention_mask"], batch_encoding["attention_mask"]) + self.assertEqual( + padded_batch_encoding["special_tokens_mask"], + [1] * (12 - len(batch_encoding["input_ids"])) + batch_encoding["special_tokens_mask"], + ) + + # Test 6: + # return_tensors="pt" or "np" + for return_tensors in ["pt", "np"]: + padded_batch_encoding = self.tokenizer.pad( + get_batch_encoding(), padding="max_length", max_length=12, return_tensors=return_tensors + ) + self.assertEqual(padded_batch_encoding["input_ids"].shape, torch.Size((12,))) + self.assertEqual(padded_batch_encoding["attention_mask"].shape, torch.Size((12,))) + self.assertEqual(padded_batch_encoding["special_tokens_mask"].shape, torch.Size((12,))) + 
+ def test_list_batch_encoding_input(self): + def get_batch_encoding(): + return self.tokenizer(["Hello world!", "Hello world! Longer sentence."], return_special_tokens_mask=True) + + # Test 1: + # padding=True or "longest" or PaddingStrategy.LONGEST + batch_encoding = get_batch_encoding() + for padding in [ + True, + "longest", + PaddingStrategy.LONGEST, + ]: + padded_batch_encoding = self.tokenizer.pad(get_batch_encoding(), padding=padding) + self.assertEqual( + padded_batch_encoding["input_ids"], + [ + [self.ref_tokenizer.instruct_tokenizer.tokenizer.pad_id] + * (len(batch_encoding["input_ids"][1]) - len(batch_encoding["input_ids"][0])) + + batch_encoding["input_ids"][0], + batch_encoding["input_ids"][1], + ], + ) + self.assertEqual( + padded_batch_encoding["attention_mask"], + [ + [0] * (len(batch_encoding["input_ids"][1]) - len(batch_encoding["input_ids"][0])) + + batch_encoding["attention_mask"][0], + batch_encoding["attention_mask"][1], + ], + ) + self.assertEqual( + padded_batch_encoding["special_tokens_mask"], + [ + [1] * (len(batch_encoding["input_ids"][1]) - len(batch_encoding["input_ids"][0])) + + batch_encoding["special_tokens_mask"][0], + batch_encoding["special_tokens_mask"][1], + ], + ) + + # Test 2: + # padding_strategy="max_length" or PaddingStrategy.MAX_LENGTH and max_length + for padding in ["max_length", PaddingStrategy.MAX_LENGTH]: + padded_batch_encoding = self.tokenizer.pad(get_batch_encoding(), padding=padding, max_length=12) + self.assertEqual( + padded_batch_encoding["input_ids"], + [ + [self.ref_tokenizer.instruct_tokenizer.tokenizer.pad_id] + * (12 - len(batch_encoding["input_ids"][0])) + + batch_encoding["input_ids"][0], + [self.ref_tokenizer.instruct_tokenizer.tokenizer.pad_id] + * (12 - len(batch_encoding["input_ids"][1])) + + batch_encoding["input_ids"][1], + ], + ) + self.assertEqual( + padded_batch_encoding["attention_mask"], + [ + [0] * (12 - len(batch_encoding["input_ids"][0])) + batch_encoding["attention_mask"][0], + [0] * (12 
- len(batch_encoding["input_ids"][1])) + batch_encoding["attention_mask"][1], + ], + ) + self.assertEqual( + padded_batch_encoding["special_tokens_mask"], + [ + [1] * (12 - len(batch_encoding["input_ids"][0])) + batch_encoding["special_tokens_mask"][0], + [1] * (12 - len(batch_encoding["input_ids"][1])) + batch_encoding["special_tokens_mask"][1], + ], + ) + + # Test 3: + # padding_strategy=True or "longest" or PaddingStrategy.LONGEST and pad_to_multiple_of 16 + for padding in [True, "longest", PaddingStrategy.LONGEST]: + padded_batch_encoding = self.tokenizer.pad(get_batch_encoding(), padding=padding, pad_to_multiple_of=16) + self.assertEqual( + padded_batch_encoding["input_ids"], + [ + [self.ref_tokenizer.instruct_tokenizer.tokenizer.pad_id] + * (16 - len(batch_encoding["input_ids"][0])) + + batch_encoding["input_ids"][0], + [self.ref_tokenizer.instruct_tokenizer.tokenizer.pad_id] + * (16 - len(batch_encoding["input_ids"][1])) + + batch_encoding["input_ids"][1], + ], + ) + self.assertEqual( + padded_batch_encoding["attention_mask"], + [ + [0] * (16 - len(batch_encoding["input_ids"][0])) + batch_encoding["attention_mask"][0], + [0] * (16 - len(batch_encoding["input_ids"][1])) + batch_encoding["attention_mask"][1], + ], + ) + self.assertEqual( + padded_batch_encoding["special_tokens_mask"], + [ + [1] * (16 - len(batch_encoding["input_ids"][0])) + batch_encoding["special_tokens_mask"][0], + [1] * (16 - len(batch_encoding["input_ids"][1])) + batch_encoding["special_tokens_mask"][1], + ], + ) + + # Test 4: + # padding_side="right" + right_tokenizer = MistralCommonTokenizer.from_pretrained( + "hf-internal-testing/namespace-mistralai-repo_name-Mistral-Small-3.1-24B-Instruct-2503", + padding_side="right", + ) + right_paddings = [ + right_tokenizer.pad(get_batch_encoding(), padding="max_length", max_length=12), + self.tokenizer.pad(get_batch_encoding(), padding="max_length", max_length=12, padding_side="right"), + ] + for
padded_batch_encoding in right_paddings: + self.assertEqual( + padded_batch_encoding["input_ids"], + [ + batch_encoding["input_ids"][0] + + [self.ref_tokenizer.instruct_tokenizer.tokenizer.pad_id] + * (12 - len(batch_encoding["input_ids"][0])), + batch_encoding["input_ids"][1] + + [self.ref_tokenizer.instruct_tokenizer.tokenizer.pad_id] + * (12 - len(batch_encoding["input_ids"][1])), + ], + ) + self.assertEqual( + padded_batch_encoding["attention_mask"], + [ + batch_encoding["attention_mask"][0] + [0] * (12 - len(batch_encoding["input_ids"][0])), + batch_encoding["attention_mask"][1] + [0] * (12 - len(batch_encoding["input_ids"][1])), + ], + ) + self.assertEqual( + padded_batch_encoding["special_tokens_mask"], + [ + batch_encoding["special_tokens_mask"][0] + [1] * (12 - len(batch_encoding["input_ids"][0])), + batch_encoding["special_tokens_mask"][1] + [1] * (12 - len(batch_encoding["input_ids"][1])), + ], + ) + + # Test 5: + # return_attention_mask=False + padded_batch_encoding = self.tokenizer.pad( + get_batch_encoding(), padding="max_length", max_length=12, return_attention_mask=False + ) + self.assertEqual( + padded_batch_encoding["input_ids"], + [ + [self.ref_tokenizer.instruct_tokenizer.tokenizer.pad_id] * (12 - len(batch_encoding["input_ids"][0])) + + batch_encoding["input_ids"][0], + [self.ref_tokenizer.instruct_tokenizer.tokenizer.pad_id] * (12 - len(batch_encoding["input_ids"][1])) + + batch_encoding["input_ids"][1], + ], + ) + self.assertEqual(padded_batch_encoding["attention_mask"], batch_encoding["attention_mask"]) + self.assertEqual( + padded_batch_encoding["special_tokens_mask"], + [ + [1] * (12 - len(batch_encoding["input_ids"][0])) + batch_encoding["special_tokens_mask"][0], + [1] * (12 - len(batch_encoding["input_ids"][1])) + batch_encoding["special_tokens_mask"][1], + ], + ) + + # Test 6: + # return_tensors="pt" or "np" + for return_tensors in ["pt", "np"]: + padded_batch_encoding = self.tokenizer.pad( + get_batch_encoding(), padding="max_length", 
max_length=12, return_tensors=return_tensors + ) + self.assertEqual(padded_batch_encoding["input_ids"].shape, torch.Size((2, 12))) + self.assertEqual(padded_batch_encoding["attention_mask"].shape, torch.Size((2, 12))) + self.assertEqual(padded_batch_encoding["special_tokens_mask"].shape, torch.Size((2, 12))) + + def test_truncate_sequences(self): + # Test 1: + # truncation_strategy="longest_first" or TruncationStrategy.LONGEST_FIRST + text = "Hello world!" + ids = self.ref_tokenizer.instruct_tokenizer.tokenizer.encode(text, bos=True, eos=True) + for truncation in ["longest_first", TruncationStrategy.LONGEST_FIRST]: + for num_tokens_to_remove in [0, 2]: + tokens, none, overflowing_tokens = self.tokenizer.truncate_sequences( + ids, truncation_strategy=truncation, num_tokens_to_remove=num_tokens_to_remove + ) + self.assertEqual(tokens, ids[:-num_tokens_to_remove] if num_tokens_to_remove > 0 else ids) + self.assertIsNone(none) + self.assertEqual(overflowing_tokens, ids[-num_tokens_to_remove:] if num_tokens_to_remove > 0 else []) + + # Test 2: + # truncation_strategy="only_first" or "only_second" or TruncationStrategy.ONLY_FIRST or TruncationStrategy.ONLY_SECOND + # Should raise a ValueError + for truncation in ["only_first", "only_second", TruncationStrategy.ONLY_FIRST, TruncationStrategy.ONLY_SECOND]: + with self.assertRaises(ValueError): + self.tokenizer.truncate_sequences(ids, truncation_strategy=truncation, num_tokens_to_remove=1) + + # Test 3: + # truncation_strategy="do_not_truncate" or TruncationStrategy.DO_NOT_TRUNCATE + for truncation in ["do_not_truncate", TruncationStrategy.DO_NOT_TRUNCATE]: + tokens, none, overflowing_tokens = self.tokenizer.truncate_sequences( + ids, truncation_strategy=truncation, num_tokens_to_remove=1 + ) + self.assertEqual(tokens, ids) + self.assertIsNone(none) + self.assertEqual(overflowing_tokens, []) + + # Test 4: + # pair_ids is not None + # Should raise a ValueError + with self.assertRaises(ValueError): + 
self.tokenizer.truncate_sequences( + ids, pair_ids=ids, truncation_strategy="longest_first", num_tokens_to_remove=1 + ) + + # Test 5: + # stride + for stride in [0, 2]: + tokens, none, overflowing_tokens = self.tokenizer.truncate_sequences( + ids, truncation_strategy="longest_first", num_tokens_to_remove=2, stride=stride + ) + self.assertEqual(tokens, ids[:-2]) + self.assertIsNone(none) + self.assertEqual(overflowing_tokens, ids[-2 - stride :]) + + # Test 6: + # truncation_side="left" + left_tokenizer = MistralCommonTokenizer.from_pretrained( + "hf-internal-testing/namespace-mistralai-repo_name-Mistral-Small-3.1-24B-Instruct-2503", + truncation_side="left", + ) + tokens, none, overflowing_tokens = left_tokenizer.truncate_sequences( + ids, truncation_strategy="longest_first", num_tokens_to_remove=2 + ) + self.assertEqual(tokens, ids[2:]) + self.assertIsNone(none) + self.assertEqual(overflowing_tokens, ids[:2]) + + def test_apply_chat_template_basic(self): + conversation = [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Hi!"}, + {"role": "assistant", "content": "Hello! How can I help you?"}, + {"role": "user", "content": "What is the capital of France?"}, + ] + + expected_tokenized = self.ref_tokenizer.encode_chat_completion(ChatCompletionRequest.from_openai(conversation)) + + # Test 1: + # without tokenize + self.assertEqual( + self.tokenizer.apply_chat_template(conversation, tokenize=False), + expected_tokenized.text, + ) + + # Test 2: + # with tokenize + self.assertEqual(self.tokenizer.apply_chat_template(conversation, tokenize=True), expected_tokenized.tokens) + + with self.assertRaises( + ValueError, msg="Kwargs [unk_args] are not supported by `MistralCommonTokenizer.apply_chat_template`."
+ ): + self.tokenizer.apply_chat_template(conversation, tokenize=True, unk_args="") + + def test_apply_chat_template_continue_final_message(self): + conversation = [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Hi!"}, + {"role": "assistant", "content": "Hello! How can I help you?"}, + {"role": "user", "content": "What is the capital of France?"}, + {"role": "assistant", "content": "Paris"}, + ] + + expected_tokenized = self.ref_tokenizer.encode_chat_completion( + ChatCompletionRequest.from_openai(conversation, continue_final_message=True) + ) + + self.assertEqual( + self.tokenizer.apply_chat_template(conversation, tokenize=False, continue_final_message=True), + expected_tokenized.text, + ) + self.assertEqual( + self.tokenizer.apply_chat_template(conversation, tokenize=True, continue_final_message=True), + expected_tokenized.tokens, + ) + + with self.assertRaises(InvalidMessageStructureException): + self.tokenizer.apply_chat_template(conversation, tokenize=False, continue_final_message=False) + + def test_apply_chat_template_with_tools(self): + conversation = [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Hi!"}, + {"role": "assistant", "content": "Hello! How can I help you?"}, + {"role": "user", "content": "What is the temperature in Paris?"}, + { + "role": "assistant", + "tool_calls": [ + { + "id": "azerty123", + "function": { + "name": "get_current_weather", + "arguments": {"location": "Paris", "format": "text", "unit": "celsius"}, + }, + } + ], + }, + {"role": "tool", "name": "get_current_weather", "content": "22", "tool_call_id": "azerty123"}, + ] + tools = [ + { + "type": "function", + "function": { + "name": "get_current_weather", + "description": "Get the current weather in a given location", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "The city and state, e.g. 
San Francisco, CA", + "required": ["location"], + }, + "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}, + "format": { + "type": "string", + "enum": ["text", "json"], + "description": "The format of the response", + "required": ["format"], + }, + }, + }, + }, + } + ] + + expected_tokenized = self.ref_tokenizer.encode_chat_completion( + ChatCompletionRequest.from_openai(conversation, tools) + ) + self.assertEqual( + self.tokenizer.apply_chat_template(conversation, tools=tools, tokenize=False), + expected_tokenized.text, + ) + + def test_apply_chat_template_with_image(self): + ref_conversation = conversation = [ + {"role": "system", "content": "You are a helpful assistant."}, + { + "role": "user", + "content": [ + {"type": "text", "text": "What is this?"}, + { + "type": "image_url", + "image_url": {"url": IMG_URL}, + }, + ], + }, + ] + + expected_tokenized = self.ref_tokenizer.encode_chat_completion( + ChatCompletionRequest.from_openai(ref_conversation) + ) + image_contents = [ + { + "type": "image_url", + "image_url": {"url": IMG_URL}, + }, + { + "type": "image", + "url": IMG_URL, + }, + {"type": "image", "base64": IMG_BASE_64}, + ] + for image_content in image_contents: + conversation = [ + {"role": "system", "content": "You are a helpful assistant."}, + { + "role": "user", + "content": [{"type": "text", "text": "What is this?"}, image_content], + }, + ] + + output = self.tokenizer.apply_chat_template(conversation, tokenize=True) + self.assertEqual(output, expected_tokenized.tokens) + + output_dict = self.tokenizer.apply_chat_template(conversation, tokenize=True, return_dict=True) + self.assertEqual(output_dict["input_ids"], expected_tokenized.tokens) + self.assertEqual(len(output_dict["pixel_values"]), len(expected_tokenized.images)) + for o, e in zip(output_dict["pixel_values"], expected_tokenized.images): + self.assertTrue(np.allclose(o, e)) + + output_dict = self.tokenizer.apply_chat_template( + conversation, tokenize=True, return_dict=True, 
return_tensors="pt" + ) + self.assertEqual(output_dict["input_ids"].tolist()[0], expected_tokenized.tokens) + self.assertTrue(torch.allclose(output_dict["pixel_values"], torch.tensor(expected_tokenized.images))) + + def test_appsly_chat_template_with_truncation(self): + conversation = [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Hi!"}, + {"role": "assistant", "content": "Hello! How can I help you?"}, + {"role": "user", "content": "What is the capital of France?"}, + ] + + expected_tokenized = self.ref_tokenizer.encode_chat_completion(ChatCompletionRequest.from_openai(conversation)) + + # Test 1: + # with truncation + self.assertEqual( + self.tokenizer.apply_chat_template(conversation, tokenize=True, truncation=True, max_length=20), + expected_tokenized.tokens[:20], + ) + + # Test 2: + # without truncation + self.assertEqual( + self.tokenizer.apply_chat_template(conversation, tokenize=True, truncation=False, max_length=20), + expected_tokenized.tokens, + ) + + # Test 3: + # assert truncation is boolean + with self.assertRaises(ValueError): + self.tokenizer.apply_chat_template( + conversation, tokenize=True, truncation=TruncationStrategy.LONGEST_FIRST, max_length=20 + ) + + def test_batch_apply_chat_template(self): + conversations = [ + [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Hi!"}, + {"role": "assistant", "content": "Hello! How can I help you?"}, + { + "role": "user", + "content": [ + {"type": "text", "text": "What is this?"}, + { + "type": "image_url", + "image_url": {"url": IMG_URL}, + }, + ], + }, + ], + [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Hi!"}, + {"role": "assistant", "content": "Hello! 
How can I help you?"}, + {"role": "user", "content": "What is the temperature in Paris?"}, + { + "role": "assistant", + "tool_calls": [ + { + "id": "azerty123", + "function": { + "name": "get_current_weather", + "arguments": {"location": "Paris", "format": "text", "unit": "celsius"}, + }, + } + ], + }, + {"role": "tool", "name": "get_current_weather", "content": "22", "tool_call_id": "azerty123"}, + ], + ] + + tools = [ + { + "type": "function", + "function": { + "name": "get_current_weather", + "description": "Get the current weather in a given location", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "The city and state, e.g. San Francisco, CA", + "required": ["location"], + }, + "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}, + "format": { + "type": "string", + "enum": ["text", "json"], + "description": "The format of the response", + "required": ["format"], + }, + }, + }, + }, + } + ] + + expected_tokenized = [ + self.ref_tokenizer.encode_chat_completion(ChatCompletionRequest.from_openai(conversation, tools=tools)) + for conversation in conversations + ] + + text_outputs = self.tokenizer.apply_chat_template(conversations, tools=tools, tokenize=False) + token_outputs = self.tokenizer.apply_chat_template(conversations, tools=tools, tokenize=True) + + self.assertEqual(len(text_outputs), len(token_outputs)) + self.assertEqual(len(text_outputs), len(expected_tokenized)) + for text, token, expected in zip(text_outputs, token_outputs, expected_tokenized): + self.assertEqual(text, expected.text) + self.assertEqual(token, expected.tokens) + + with self.assertRaises( + ValueError, + msg="Kwargs [unk_args] are not supported by `MistralCommonTokenizer.batch_apply_chat_template`.", + ): + self.tokenizer.apply_chat_template(conversations, tools=tools, tokenize=True, unk_args="") + + def test_batch_apply_images(self): + conversations = [ + [ + {"role": "system", "content": "You are a helpful 
assistant."}, + { + "role": "user", + "content": [ + {"type": "text", "text": "What is this?"}, + { + "type": "image_url", + "image_url": {"url": IMG_URL}, + }, + ], + }, + ], + [ + {"role": "system", "content": "You are a helpful assistant."}, + { + "role": "user", + "content": [ + {"type": "text", "text": "What is this?"}, + { + "type": "image", + "url": IMG_URL, + }, + ], + }, + ], + [ + {"role": "system", "content": "You are a helpful assistant."}, + { + "role": "user", + "content": [ + {"type": "text", "text": "What is this?"}, + {"type": "image", "base64": IMG_BASE_64}, + ], + }, + ], + ] + + ref_conversation = [ + {"role": "system", "content": "You are a helpful assistant."}, + { + "role": "user", + "content": [ + {"type": "text", "text": "What is this?"}, + { + "type": "image_url", + "image_url": {"url": IMG_URL}, + }, + ], + }, + ] + + expected_tokenized = self.ref_tokenizer.encode_chat_completion( + ChatCompletionRequest.from_openai(ref_conversation) + ) + + output = self.tokenizer.apply_chat_template(conversations, tokenize=True) + self.assertEqual(output, [expected_tokenized.tokens] * 3) + + output = self.tokenizer.apply_chat_template(conversations, tokenize=True, return_dict=True) + self.assertEqual(output["input_ids"], [expected_tokenized.tokens] * 3) + self.assertEqual(len(output["pixel_values"]), len(expected_tokenized.images) * 3) + for o, e in zip(output["pixel_values"], [expected_tokenized.images] * 3): + self.assertTrue(np.allclose(o, e)) + + output = self.tokenizer.apply_chat_template( + conversations, tokenize=True, return_dict=True, return_tensors="pt" + ) + self.assertEqual(output["input_ids"].tolist(), [expected_tokenized.tokens] * 3) + self.assertEqual(output["input_ids"].shape[0], len(expected_tokenized.images) * 3) + self.assertTrue(torch.allclose(output["pixel_values"], torch.tensor([expected_tokenized.images] * 3))) + + output = self.tokenizer.apply_chat_template( + conversations, tokenize=True, return_dict=True, return_tensors="np" + 
) + self.assertEqual(output["input_ids"].tolist(), [expected_tokenized.tokens] * 3) + self.assertTrue(np.allclose(output["pixel_values"], np.array([expected_tokenized.images] * 3))) + + def test_batch_apply_chat_template_with_continue_final_message(self): + conversations = [ + [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Hi!"}, + {"role": "assistant", "content": "Hello! How can "}, + ], + [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Hi!"}, + {"role": "assistant", "content": "Hello! How can I help you? Ou préférez vous "}, + ], + ] + + # Test 1: + # with continue_final_message + expected_tokenized = [ + self.ref_tokenizer.encode_chat_completion( + ChatCompletionRequest.from_openai(conversation, continue_final_message=True) + ) + for conversation in conversations + ] + + token_outputs = self.tokenizer.apply_chat_template(conversations, tokenize=True, continue_final_message=True) + + for output, expected in zip(token_outputs, expected_tokenized): + self.assertEqual(output, expected.tokens) + + # Test 2: + # without continue_final_message + with self.assertRaises(InvalidMessageStructureException): + self.tokenizer.apply_chat_template( + conversations, + tokenize=False, + continue_final_message=False, + ) + + # Test 3: + # with continue_final_message and last role is not assistant + with self.assertRaises(InvalidMessageStructureException): + self.tokenizer.apply_chat_template( + conversation=[ + [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Hi!"}, + ] + ], + tokenize=True, + continue_final_message=True, + ) + + def test_batch_apply_chat_template_with_truncation( + self, + ): + # Test 1: + # with truncation + token_outputs = self.tokenizer.apply_chat_template( + self.fixture_conversations, tokenize=True, truncation=True, max_length=20 + ) + + for output, expected in zip(token_outputs, 
self.tokenized_fixture_conversations): + self.assertEqual(output, expected.tokens[:20]) + + # Test 2: + # without truncation + token_outputs = self.tokenizer.apply_chat_template( + self.fixture_conversations, tokenize=True, truncation=False, max_length=20 + ) + self.assertEqual(len(token_outputs), len(self.tokenized_fixture_conversations)) + for output, expected in zip(token_outputs, self.tokenized_fixture_conversations): + self.assertEqual(output, expected.tokens) + + # Test 3: + # assert truncation is boolean + with self.assertRaises(ValueError): + self.tokenizer.apply_chat_template( + self.fixture_conversations, tokenize=True, truncation=TruncationStrategy.LONGEST_FIRST, max_length=20 + ) + + def test_batch_apply_chat_template_with_padding( + self, + ): + for padding in [True, "max_length", PaddingStrategy.LONGEST, PaddingStrategy.MAX_LENGTH]: + if padding == PaddingStrategy.MAX_LENGTH: + # No padding if no max length is provided + token_outputs = self.tokenizer.apply_chat_template(self.fixture_conversations, padding=padding) + self.assertEqual(len(token_outputs), len(self.tokenized_fixture_conversations)) + for output, expected in zip(token_outputs, self.tokenized_fixture_conversations): + self.assertEqual(output, expected.tokens) + + max_length = 20 if padding == PaddingStrategy.MAX_LENGTH else None + + token_outputs = self.tokenizer.apply_chat_template( + self.fixture_conversations, tokenize=True, padding=padding, max_length=max_length + ) + + if padding != PaddingStrategy.MAX_LENGTH: + longest = max(len(tokenized.tokens) for tokenized in self.tokenized_fixture_conversations) + self.assertEqual(len(token_outputs), len(self.tokenized_fixture_conversations)) + for output, expected in zip(token_outputs, self.tokenized_fixture_conversations): + self.assertEqual( + output, + [self.tokenizer.pad_token_id] * (longest - len(expected.tokens)) + expected.tokens, + ) + else: + self.assertEqual(len(token_outputs), len(self.tokenized_fixture_conversations)) + for output, 
expected in zip(token_outputs, self.tokenized_fixture_conversations): + if len(expected.tokens) < max_length: + self.assertEqual( + output, + [self.tokenizer.pad_token_id] * (20 - len(expected.tokens)) + expected.tokens, + ) + else: + self.assertEqual(output, expected.tokens) + + for padding in [False, "do_not_pad", PaddingStrategy.DO_NOT_PAD]: + token_outputs = self.tokenizer.apply_chat_template( + self.fixture_conversations, tokenize=True, padding=padding + ) + self.assertEqual(len(token_outputs), len(self.tokenized_fixture_conversations)) + for output, expected in zip(token_outputs, self.tokenized_fixture_conversations): + self.assertEqual(output, expected.tokens) + + def test_batch_apply_chat_template_with_padding_and_truncation( + self, + ): + max_length = 20 + for padding in [True, "max_length", PaddingStrategy.LONGEST, PaddingStrategy.MAX_LENGTH]: + token_outputs = self.tokenizer.apply_chat_template( + self.fixture_conversations, tokenize=True, truncation=True, padding=padding, max_length=max_length + ) + self.assertEqual(len(token_outputs), len(self.tokenized_fixture_conversations)) + for output, expected in zip(token_outputs, self.tokenized_fixture_conversations): + self.assertEqual( + output, [self.tokenizer.pad_token_id] * (20 - len(expected.tokens)) + expected.tokens[:20] + ) + for padding in [False, "do_not_pad", PaddingStrategy.DO_NOT_PAD]: + token_outputs = self.tokenizer.apply_chat_template( + self.fixture_conversations, tokenize=True, truncation=True, padding=padding, max_length=max_length + ) + self.assertEqual(len(token_outputs), len(self.tokenized_fixture_conversations)) + for output, expected in zip(token_outputs, self.tokenized_fixture_conversations): + self.assertEqual(output, expected.tokens[:20]) + + def test_batch_apply_chat_template_return_tensors(self): + # Test 1: + # with tokenize + token_outputs = self.tokenizer.apply_chat_template( + self.fixture_conversations, tokenize=True, return_tensors="pt", padding=True + ) + 
self.assertIsInstance(token_outputs, torch.Tensor) + self.assertEqual( + token_outputs.shape, + (len(self.fixture_conversations), max(len(t.tokens) for t in self.tokenized_fixture_conversations)), + ) + + # Test 2: + # without tokenize, should ignore return_tensors + token_outputs = self.tokenizer.apply_chat_template( + self.fixture_conversations, tokenize=False, return_tensors="pt", padding=True + ) + self.assertEqual(token_outputs, [t.text for t in self.tokenized_fixture_conversations]) + + def test_batch_apply_chat_template_return_dict(self): + # Test 1: + # with tokenize + token_outputs = self.tokenizer.apply_chat_template(self.fixture_conversations, tokenize=True, return_dict=True) + self.assertIn("input_ids", token_outputs) + self.assertIn("attention_mask", token_outputs) + self.assertEqual(token_outputs["input_ids"], [t.tokens for t in self.tokenized_fixture_conversations]) + self.assertEqual( + token_outputs["attention_mask"], [[1] * len(t.tokens) for t in self.tokenized_fixture_conversations] + ) + + # Test 2: + # without tokenize, should ignore return_dict + token_outputs = self.tokenizer.apply_chat_template( + self.fixture_conversations, tokenize=False, return_dict=True + ) + self.assertNotIsInstance(token_outputs, dict) + self.assertEqual(token_outputs, [t.text for t in self.tokenized_fixture_conversations]) + + def test_call(self): + # Test 1: + # default case + text = "Hello world!" 
+ expected_tokens = self.ref_tokenizer.instruct_tokenizer.tokenizer.encode(text, bos=True, eos=True) + tokens = self.tokenizer(text) + self.assertIsInstance(tokens, BatchEncoding) + self.assertEqual(tokens["input_ids"], expected_tokens) + self.assertEqual(tokens["attention_mask"], [1] * len(expected_tokens)) + + # Test 2: + # return_attention_mask=False + tokens = self.tokenizer(text, return_attention_mask=False) + self.assertEqual(tokens["input_ids"], expected_tokens) + self.assertNotIn("attention_mask", tokens) + + # Test 3: + # return_tensors="pt" + tokens = self.tokenizer(text, return_tensors="pt") + self.assertIsInstance(tokens["input_ids"], torch.Tensor) + self.assertTrue(torch.equal(tokens["input_ids"], torch.Tensor(expected_tokens).unsqueeze(0))) + self.assertIsInstance(tokens["attention_mask"], torch.Tensor) + self.assertTrue(torch.equal(tokens["attention_mask"], torch.ones(1, len(expected_tokens)))) + + # Test 4: + # return_special_tokens_mask=True + tokens = self.tokenizer(text, return_special_tokens_mask=True) + self.assertEqual(tokens["input_ids"], expected_tokens) + self.assertEqual(tokens["attention_mask"], [1] * len(expected_tokens)) + self.assertEqual(tokens["special_tokens_mask"], [1] + [0] * (len(expected_tokens) - 2) + [1]) + + # Test 5: + # add_special_tokens=False + expected_tokens = self.ref_tokenizer.instruct_tokenizer.tokenizer.encode(text, bos=False, eos=False) + tokens = self.tokenizer(text, add_special_tokens=False, return_special_tokens_mask=True) + self.assertIsInstance(tokens, BatchEncoding) + self.assertEqual(tokens["input_ids"], expected_tokens) + self.assertEqual(tokens["attention_mask"], [1] * len(expected_tokens)) + self.assertEqual(tokens["special_tokens_mask"], [0] * len(expected_tokens)) + + with self.assertRaises( + ValueError, msg="Kwargs [wrong_kwarg] are not supported by `MistralCommonTokenizer.__call__`." 
+ ): + self.tokenizer(text, wrong_kwarg=True) + + with self.assertRaises( + ValueError, + msg="`text_pair`, `text_target` and `text_pair_target` are not supported by `MistralCommonTokenizer`.", + ): + self.tokenizer(text, text_pair="Hello world!") + with self.assertRaises( + ValueError, + msg="`text_pair`, `text_target` and `text_pair_target` are not supported by `MistralCommonTokenizer`.", + ): + self.tokenizer(text, text_target="Hello world!") + with self.assertRaises( + ValueError, + msg="`text_pair`, `text_target` and `text_pair_target` are not supported by `MistralCommonTokenizer`.", + ): + self.tokenizer(text, text_pair_target="Hello world!") + + def test_call_with_truncation(self): + # Test 1: + # truncation=True or "longest_first" or TruncationStrategy.LONGEST_FIRST + text = "Hello world!" * 10 + expected_tokens = self.ref_tokenizer.instruct_tokenizer.tokenizer.encode(text, bos=True, eos=True) + + for truncation in [True, "longest_first", TruncationStrategy.LONGEST_FIRST]: + tokens = self.tokenizer(text, truncation=truncation, max_length=10, return_special_tokens_mask=True) + self.assertIsInstance(tokens, BatchEncoding) + self.assertEqual(tokens["input_ids"], expected_tokens[:10]) + self.assertEqual(tokens["attention_mask"], [1] * 10) + self.assertEqual(tokens["special_tokens_mask"], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0]) + + # Test 2: + # truncation=False + for truncation in [False, "do_not_truncate", TruncationStrategy.DO_NOT_TRUNCATE]: + tokens = self.tokenizer(text, truncation=truncation, return_special_tokens_mask=True) + self.assertIsInstance(tokens, BatchEncoding) + self.assertEqual(tokens["input_ids"], expected_tokens) + self.assertEqual(tokens["attention_mask"], [1] * len(expected_tokens)) + self.assertEqual(tokens["special_tokens_mask"], [1] + [0] * (len(expected_tokens) - 2) + [1]) + + # Test 3: + # truncation=True or "longest_first" or TruncationStrategy.LONGEST_FIRST with return_overflowing_tokens=True and stride + for truncation in [True, "longest_first", 
TruncationStrategy.LONGEST_FIRST]: + for stride in [0, 2]: + tokens = self.tokenizer( + text, + truncation=truncation, + max_length=10, + return_overflowing_tokens=True, + return_special_tokens_mask=True, + stride=stride, + ) + self.assertIsInstance(tokens, BatchEncoding) + self.assertEqual(tokens["input_ids"], expected_tokens[:10]) + self.assertEqual(tokens["attention_mask"], [1] * 10) + self.assertEqual(tokens["special_tokens_mask"], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0]) + self.assertEqual(tokens["overflowing_tokens"], expected_tokens[10 - stride :]) + self.assertEqual(tokens["num_truncated_tokens"], len(expected_tokens) - 10) + + # Test 4: + # truncation="only_first" or TruncationStrategy.ONLY_FIRST or "only_second" or TruncationStrategy.ONLY_SECOND + # should raise an error + for truncation in ["only_first", TruncationStrategy.ONLY_FIRST, "only_second", TruncationStrategy.ONLY_SECOND]: + with self.assertRaises( + ValueError, + msg="Truncation strategy `only_first` and `only_second` are not supported by `MistralCommonTokenizer`.", + ): + self.tokenizer(text, truncation=truncation) + + def test_call_with_padding(self): + text = "Hello world!" 
+ expected_tokens = self.ref_tokenizer.instruct_tokenizer.tokenizer.encode(text, bos=True, eos=True) + + # Test 1: + # padding=False or padding=True or "do_not_pad" or PaddingStrategy.DO_NOT_PAD or padding="longest" or PaddingStrategy.LONGEST + for padding in [False, True, "do_not_pad", PaddingStrategy.DO_NOT_PAD, "longest", PaddingStrategy.LONGEST]: + tokens = self.tokenizer(text, padding=padding, return_special_tokens_mask=True) + self.assertIsInstance(tokens, BatchEncoding) + self.assertEqual(tokens["input_ids"], expected_tokens) + self.assertEqual(tokens["attention_mask"], [1] * len(expected_tokens)) + self.assertEqual(tokens["special_tokens_mask"], [1] + [0] * (len(expected_tokens) - 2) + [1]) + + # Test 2: + # padding="max_length" or PaddingStrategy.MAX_LENGTH + for padding in ["max_length", PaddingStrategy.MAX_LENGTH]: + tokens = self.tokenizer(text, padding=padding, max_length=20, return_special_tokens_mask=True) + self.assertIsInstance(tokens, BatchEncoding) + num_padding = 20 - len(expected_tokens) + self.assertEqual(tokens["input_ids"], num_padding * [self.tokenizer.pad_token_id] + expected_tokens) + self.assertEqual(tokens["attention_mask"], num_padding * [0] + [1] * len(expected_tokens)) + self.assertEqual( + tokens["special_tokens_mask"], num_padding * [1] + [1] + [0] * (len(expected_tokens) - 2) + [1] + ) + + # Test 3: + # pad_to_multiple_of + tokens = self.tokenizer( + text, padding=True, max_length=20, pad_to_multiple_of=16, return_special_tokens_mask=True + ) + self.assertIsInstance(tokens, BatchEncoding) + num_padding = 16 - len(expected_tokens) + self.assertEqual(tokens["input_ids"], num_padding * [self.tokenizer.pad_token_id] + expected_tokens) + self.assertEqual(tokens["attention_mask"], num_padding * [0] + [1] * len(expected_tokens)) + self.assertEqual( + tokens["special_tokens_mask"], num_padding * [1] + [1] + [0] * (len(expected_tokens) - 2) + [1] + ) + + # Test 4: + # padding="max_length" and padding_side="right" + tokens = self.tokenizer( 
+ text, padding="max_length", max_length=20, padding_side="right", return_special_tokens_mask=True + ) + self.assertIsInstance(tokens, BatchEncoding) + num_padding = 20 - len(expected_tokens) + self.assertEqual(tokens["input_ids"], expected_tokens + num_padding * [self.tokenizer.pad_token_id]) + self.assertEqual(tokens["attention_mask"], [1] * len(expected_tokens) + num_padding * [0]) + self.assertEqual( + tokens["special_tokens_mask"], [1] + [0] * (len(expected_tokens) - 2) + [1] + num_padding * [1] + ) + + def test_batch_call(self): + # Test 1: + # default case + text = ["Hello world!", "Hello world! Longer"] + expected_tokens = [self.ref_tokenizer.instruct_tokenizer.tokenizer.encode(t, bos=True, eos=True) for t in text] + tokens = self.tokenizer(text) + self.assertIsInstance(tokens, BatchEncoding) + self.assertEqual(tokens["input_ids"], expected_tokens) + self.assertEqual(tokens["attention_mask"], [[1] * len(t) for t in expected_tokens]) + + # Test 2: + # return_attention_mask=False + tokens = self.tokenizer(text, return_attention_mask=False) + self.assertEqual(tokens["input_ids"], expected_tokens) + self.assertNotIn("attention_mask", tokens) + + # Test 3: + # return_tensors="pt" + tokens = self.tokenizer(text, return_tensors="pt", padding="longest", return_special_tokens_mask=True) + self.assertIsInstance(tokens["input_ids"], torch.Tensor) + self.assertEqual(tokens["input_ids"].shape, torch.Size([2, len(expected_tokens[1])])) + self.assertTrue( + torch.equal( + tokens["input_ids"][0], + torch.Tensor( + (len(expected_tokens[1]) - len(expected_tokens[0])) + * [self.ref_tokenizer.instruct_tokenizer.tokenizer.pad_id] + + expected_tokens[0] + ), + ) + ) + self.assertIsInstance(tokens["attention_mask"], torch.Tensor) + self.assertEqual(tokens["attention_mask"].shape, torch.Size([2, len(expected_tokens[1])])) + self.assertTrue( + torch.equal( + tokens["attention_mask"][0], + torch.Tensor( + [0] * (len(expected_tokens[1]) - len(expected_tokens[0])) + [1] * 
len(expected_tokens[0]) + ), + ) + ) + self.assertTrue(torch.equal(tokens["attention_mask"][1], torch.Tensor([1] * len(expected_tokens[1])))) + self.assertIsInstance(tokens["special_tokens_mask"], torch.Tensor) + self.assertEqual(tokens["special_tokens_mask"].shape, torch.Size([2, len(expected_tokens[1])])) + self.assertTrue( + torch.equal( + tokens["special_tokens_mask"][0], + torch.Tensor( + (len(expected_tokens[1]) - len(expected_tokens[0])) * [1] + + [1] + + [0] * (len(expected_tokens[0]) - 2) + + [1] + ), + ) + ) + self.assertTrue( + torch.equal( + tokens["special_tokens_mask"][1], torch.Tensor([1] + [0] * (len(expected_tokens[1]) - 2) + [1]) + ) + ) + + # Test 4: + # add_special_tokens=False + expected_tokens = [ + self.ref_tokenizer.instruct_tokenizer.tokenizer.encode(t, bos=False, eos=False) for t in text + ] + tokens = self.tokenizer(text, add_special_tokens=False, return_special_tokens_mask=True) + self.assertIsInstance(tokens, BatchEncoding) + self.assertEqual(tokens["input_ids"], expected_tokens) + self.assertEqual(tokens["attention_mask"], [[1] * len(t) for t in expected_tokens]) + self.assertEqual(tokens["special_tokens_mask"], [[0] * len(t) for t in expected_tokens]) + + def test_batch_call_with_truncation(self): + # Test 1: + # truncation=True + text = ["Hello world!", "Hello world! 
Longer" * 10] + expected_tokens = [self.ref_tokenizer.instruct_tokenizer.tokenizer.encode(t, bos=True, eos=True) for t in text] + + for truncation in [True, "longest_first", TruncationStrategy.LONGEST_FIRST]: + tokens = self.tokenizer(text, truncation=truncation, max_length=10, return_special_tokens_mask=True) + self.assertIsInstance(tokens, BatchEncoding) + self.assertEqual(tokens["input_ids"], [expected_tokens[0][:10], expected_tokens[1][:10]]) + self.assertEqual(tokens["attention_mask"], [[1] * min(len(t), 10) for t in expected_tokens]) + self.assertEqual( + tokens["special_tokens_mask"], + [[1 if id in self.ref_special_ids else 0 for id in ids[:10]] for ids in expected_tokens], + ) + + # Test 2: + # truncation=False + for truncation in [False, "do_not_truncate", TruncationStrategy.DO_NOT_TRUNCATE]: + tokens = self.tokenizer(text, truncation=truncation, return_special_tokens_mask=True) + self.assertIsInstance(tokens, BatchEncoding) + self.assertEqual(tokens["input_ids"], expected_tokens) + self.assertEqual(tokens["attention_mask"], [[1] * len(t) for t in expected_tokens]) + self.assertEqual( + tokens["special_tokens_mask"], + [[1] + [0] * (len(t) - 2) + [1] for t in expected_tokens], + ) + + # Test 3: + # truncation=True or "longest_first" or TruncationStrategy.LONGEST_FIRST with return_overflowing_tokens=True and stride + + for truncation in [True, "longest_first", TruncationStrategy.LONGEST_FIRST]: + for stride in [0, 2]: + tokens = self.tokenizer( + text, + truncation=truncation, + max_length=10, + return_overflowing_tokens=True, + return_special_tokens_mask=True, + stride=stride, + ) + self.assertIsInstance(tokens, BatchEncoding) + self.assertEqual(tokens["input_ids"], [expected_tokens[0][:10], expected_tokens[1][:10]]) + self.assertEqual(tokens["attention_mask"], [[1] * min(len(t), 10) for t in expected_tokens]) + self.assertEqual( + tokens["overflowing_tokens"], + [expected_tokens[0][10 - stride :], expected_tokens[1][10 - stride :]], + ) + self.assertEqual( + 
tokens["num_truncated_tokens"], [len(expected_tokens[0]) - 10, len(expected_tokens[1]) - 10] + ) + self.assertEqual( + tokens["special_tokens_mask"], + [[1 if id in self.ref_special_ids else 0 for id in ids[:10]] for ids in expected_tokens], + ) + + def test_batch_call_with_padding(self): + # Test 1: + # padding=False or padding=True or "do_not_pad" or PaddingStrategy.DO_NOT_PAD or padding="longest" or PaddingStrategy.LONGEST + text = ["Hello world!", "Hello world! Longer"] + expected_tokens = [self.ref_tokenizer.instruct_tokenizer.tokenizer.encode(t, bos=True, eos=True) for t in text] + for padding in [False, "do_not_pad", PaddingStrategy.DO_NOT_PAD]: + tokens = self.tokenizer(text, padding=padding, return_special_tokens_mask=True) + self.assertIsInstance(tokens, BatchEncoding) + self.assertEqual(tokens["input_ids"], expected_tokens) + self.assertEqual(tokens["attention_mask"], [[1] * len(t) for t in expected_tokens]) + self.assertEqual( + tokens["special_tokens_mask"], + [[1] + [0] * (len(t) - 2) + [1] for t in expected_tokens], + ) + + # Test 2: + # padding="max_length" or PaddingStrategy.MAX_LENGTH + for padding in ["max_length", PaddingStrategy.MAX_LENGTH]: + tokens = self.tokenizer(text, padding=padding, max_length=20, return_special_tokens_mask=True) + self.assertIsInstance(tokens, BatchEncoding) + num_padding = [20 - len(t) for t in expected_tokens] + self.assertEqual( + tokens["input_ids"], + [ + num_padding[0] * [self.tokenizer.pad_token_id] + expected_tokens[0], + num_padding[1] * [self.tokenizer.pad_token_id] + expected_tokens[1], + ], + ) + self.assertEqual( + tokens["attention_mask"], + [ + num_padding[0] * [0] + [1] * len(expected_tokens[0]), + num_padding[1] * [0] + [1] * len(expected_tokens[1]), + ], + ) + self.assertEqual( + tokens["special_tokens_mask"], + [ + num_padding[0] * [1] + [1] + [0] * (len(expected_tokens[0]) - 2) + [1], + num_padding[1] * [1] + [1] + [0] * (len(expected_tokens[1]) - 2) + [1], + ], + ) + + # Test 3: + # padding=True or 
"longest" or PaddingStrategy.LONGEST + for padding in [True, "longest", PaddingStrategy.LONGEST]: + tokens = self.tokenizer(text, padding=padding, return_special_tokens_mask=True) + self.assertIsInstance(tokens, BatchEncoding) + num_padding = [len(expected_tokens[1]) - len(t) for t in expected_tokens] + self.assertEqual( + tokens["input_ids"], + [ + num_padding[0] * [self.tokenizer.pad_token_id] + expected_tokens[0], + num_padding[1] * [self.tokenizer.pad_token_id] + expected_tokens[1], + ], + ) + self.assertEqual( + tokens["attention_mask"], + [ + num_padding[0] * [0] + [1] * len(expected_tokens[0]), + num_padding[1] * [0] + [1] * len(expected_tokens[1]), + ], + ) + self.assertEqual( + tokens["special_tokens_mask"], + [ + num_padding[0] * [1] + [1] + [0] * (len(expected_tokens[0]) - 2) + [1], + num_padding[1] * [1] + [1] + [0] * (len(expected_tokens[1]) - 2) + [1], + ], + ) + + # Test 4: + # pad_to_multiple_of + tokens = self.tokenizer( + text, padding=True, max_length=32, pad_to_multiple_of=16, return_special_tokens_mask=True + ) + self.assertIsInstance(tokens, BatchEncoding) + num_padding = [16 - len(t) for t in expected_tokens] + self.assertEqual( + tokens["input_ids"], + [ + num_padding[0] * [self.tokenizer.pad_token_id] + expected_tokens[0], + num_padding[1] * [self.tokenizer.pad_token_id] + expected_tokens[1], + ], + ) + self.assertEqual( + tokens["attention_mask"], + [ + num_padding[0] * [0] + [1] * len(expected_tokens[0]), + num_padding[1] * [0] + [1] * len(expected_tokens[1]), + ], + ) + self.assertEqual( + tokens["special_tokens_mask"], + [ + num_padding[0] * [1] + [1] + [0] * (len(expected_tokens[0]) - 2) + [1], + num_padding[1] * [1] + [1] + [0] * (len(expected_tokens[1]) - 2) + [1], + ], + ) + + # Test 5: + # padding="max_length" or PaddingStrategy.MAX_LENGTH and padding_side="right" + for padding in ["max_length", PaddingStrategy.MAX_LENGTH]: + tokens = self.tokenizer( + text, padding=padding, max_length=20, padding_side="right", 
return_special_tokens_mask=True + ) + self.assertIsInstance(tokens, BatchEncoding) + num_padding = [20 - len(t) for t in expected_tokens] + self.assertEqual( + tokens["input_ids"], + [ + expected_tokens[0] + num_padding[0] * [self.tokenizer.pad_token_id], + expected_tokens[1] + num_padding[1] * [self.tokenizer.pad_token_id], + ], + ) + self.assertEqual( + tokens["attention_mask"], + [ + [1] * len(expected_tokens[0]) + num_padding[0] * [0], + [1] * len(expected_tokens[1]) + num_padding[1] * [0], + ], + ) + self.assertEqual( + tokens["special_tokens_mask"], + [ + [1] + [0] * (len(expected_tokens[0]) - 2) + [1] + num_padding[0] * [1], + [1] + [0] * (len(expected_tokens[1]) - 2) + [1] + num_padding[1] * [1], + ], + ) + + def test_batch_call_with_padding_and_truncation(self): + # Test 1: + # padding=True or "longest" or PaddingStrategy.LONGEST or "max_length" or PaddingStragy.MAX_LENGTH + # and truncation=True or "longest_first" or TruncationStrategy.LONGEST_FIRST + # and max_length + text = ["Hello world!", "Hello world! 
Longer" * 10] + expected_tokens = [self.ref_tokenizer.instruct_tokenizer.tokenizer.encode(t, bos=True, eos=True) for t in text] + for padding in [True, "longest", PaddingStrategy.LONGEST, "max_length", PaddingStrategy.MAX_LENGTH]: + for truncation in [True, "longest_first", TruncationStrategy.LONGEST_FIRST]: + tokens = self.tokenizer( + text, padding=padding, truncation=truncation, max_length=10, return_special_tokens_mask=True + ) + num_padding = [max(0, 10 - len(t)) for t in expected_tokens] + self.assertIsInstance(tokens, BatchEncoding) + self.assertEqual( + tokens["input_ids"], + [num_padding[i] * [self.tokenizer.pad_token_id] + t[:10] for i, t in enumerate(expected_tokens)], + ) + self.assertEqual( + tokens["attention_mask"], + [num_padding[i] * [0] + [1] * min(len(t), 10) for i, t in enumerate(expected_tokens)], + ) + self.assertEqual( + tokens["special_tokens_mask"], + [ + num_padding[i] * [1] + [1 if id in self.ref_special_ids else 0 for id in ids[:10]] + for i, ids in enumerate(expected_tokens) + ], + ) + + # Test 2: + # padding=True or "longest" or PaddingStrategy.LONGEST and truncation=True or "longest_first" or TruncationStrategy.LONGEST_FIRST + # and no max_length + for padding in ["longest", PaddingStrategy.LONGEST]: + for truncation in [True, "longest_first", TruncationStrategy.LONGEST_FIRST]: + tokens = self.tokenizer(text, padding=padding, truncation=truncation, return_special_tokens_mask=True) + self.assertIsInstance(tokens, BatchEncoding) + num_padding = [max(len(t) for t in expected_tokens) - len(t) for t in expected_tokens] + self.assertEqual( + tokens["input_ids"], + [num_padding[i] * [self.tokenizer.pad_token_id] + t for i, t in enumerate(expected_tokens)], + ) + self.assertEqual( + tokens["attention_mask"], + [num_padding[i] * [0] + [1] * len(t) for i, t in enumerate(expected_tokens)], + ) + self.assertEqual( + tokens["special_tokens_mask"], + [ + num_padding[i] * [1] + [1 if id in self.ref_special_ids else 0 for id in ids] + for i, ids in 
enumerate(expected_tokens) + ], + ) diff --git a/utils/tests_fetcher.py b/utils/tests_fetcher.py index 8e1e520b62..e5e166a3b8 100644 --- a/utils/tests_fetcher.py +++ b/utils/tests_fetcher.py @@ -1164,7 +1164,7 @@ def parse_commit_message(commit_message: str) -> dict[str, bool]: JOB_TO_TEST_FILE = { "tests_torch": r"tests/models/.*/test_modeling_(?!(?:flax_|tf_)).*", "tests_generate": r"tests/models/.*/test_modeling_(?!(?:flax_|tf_)).*", - "tests_tokenization": r"tests/models/.*/test_tokenization.*", + "tests_tokenization": r"tests/(?:models/.*/test_tokenization.*|test_tokenization_mistral_common\.py)", "tests_processors": r"tests/models/.*/test_(?!(?:modeling_|tokenization_)).*", # takes feature extractors, image processors, processors "examples_torch": r"examples/pytorch/.*test_.*", "tests_exotic_models": r"tests/models/.*(?=layoutlmv|nat|deta|udop|nougat).*",