From 5f789a687aa6d610ad64fdc39104bc196a5bfcb9 Mon Sep 17 00:00:00 2001 From: NielsRogge <48327001+NielsRogge@users.noreply.github.com> Date: Wed, 3 Nov 2021 08:59:44 +0100 Subject: [PATCH] Add LayoutXLMProcessor (and LayoutXLMTokenizer, LayoutXLMTokenizerFast) (#14115) * Add LayoutXLMTokenizer and LayoutXLMTokenizerFast * Fix styling issues * Fix more styling issues * Fix more styling issues * Fix docstring * Fix unit tests * Fix docs * Fix unit tests * Fix typos and styling issues * Fix styling issues * Fix docstring * Make all tests of test_tokenization_layoutxlm pass * Add LayoutXLMProcessor * Make fixup * Make all LayoutXLMProcessor tests pass * Minor fixes * Leave LayoutLMv2Processor tests unchanged * Fix code quality * Move LayoutXLM tokenizers and processor to separate folder * Fix code quality * Apply suggestions from code review * Replace assertions by value errors * Remove methods from fast tokenizer Co-authored-by: King Yiu Suen --- docs/source/model_doc/layoutxlm.rst | 36 +- src/transformers/__init__.py | 8 + src/transformers/convert_slow_tokenizer.py | 1 + src/transformers/models/__init__.py | 1 + .../models/auto/tokenization_auto.py | 1 + src/transformers/models/layoutxlm/__init__.py | 54 + .../models/layoutxlm/processing_layoutxlm.py | 207 ++ .../layoutxlm/tokenization_layoutxlm.py | 1061 ++++++++++ .../layoutxlm/tokenization_layoutxlm_fast.py | 694 +++++++ .../utils/dummy_sentencepiece_objects.py | 9 + .../utils/dummy_tokenizers_objects.py | 9 + .../utils/dummy_vision_objects.py | 9 + tests/test_processor_layoutxlm.py | 423 ++++ tests/test_tokenization_layoutxlm.py | 1785 +++++++++++++++++ 14 files changed, 4294 insertions(+), 4 deletions(-) create mode 100644 src/transformers/models/layoutxlm/__init__.py create mode 100644 src/transformers/models/layoutxlm/processing_layoutxlm.py create mode 100644 src/transformers/models/layoutxlm/tokenization_layoutxlm.py create mode 100644 src/transformers/models/layoutxlm/tokenization_layoutxlm_fast.py create mode 100644 tests/test_processor_layoutxlm.py create mode 100644 tests/test_tokenization_layoutxlm.py diff --git a/docs/source/model_doc/layoutxlm.rst b/docs/source/model_doc/layoutxlm.rst index 2635eb4cb0..22f659af00 100644 --- a/docs/source/model_doc/layoutxlm.rst +++ b/docs/source/model_doc/layoutxlm.rst @@ -40,17 +40,45 @@ One can directly plug in the weights of LayoutXLM into a LayoutLMv2 model, like model = LayoutLMv2Model.from_pretrained('microsoft/layoutxlm-base') -Note that LayoutXLM requires a different tokenizer, based on :class:`~transformers.XLMRobertaTokenizer`. You can -initialize it as follows: +Note that LayoutXLM has its own tokenizer, based on +:class:`~transformers.LayoutXLMTokenizer`/:class:`~transformers.LayoutXLMTokenizerFast`. You can initialize it as +follows: .. code-block:: - from transformers import AutoTokenizer + from transformers import LayoutXLMTokenizer - tokenizer = AutoTokenizer.from_pretrained('microsoft/layoutxlm-base') + tokenizer = LayoutXLMTokenizer.from_pretrained('microsoft/layoutxlm-base') + +Similar to LayoutLMv2, you can use :class:`~transformers.LayoutXLMProcessor` (which internally applies +:class:`~transformers.LayoutLMv2FeatureExtractor` and +:class:`~transformers.LayoutXLMTokenizer`/:class:`~transformers.LayoutXLMTokenizerFast` in sequence) to prepare all +data for the model. As LayoutXLM's architecture is equivalent to that of LayoutLMv2, one can refer to :doc:`LayoutLMv2's documentation page ` for all tips, code examples and notebooks. This model was contributed by `nielsr `__. The original code can be found `here `__. + + +LayoutXLMTokenizer +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.LayoutXLMTokenizer + :members: __call__, build_inputs_with_special_tokens, get_special_tokens_mask, + create_token_type_ids_from_sequences, save_vocabulary + + +LayoutXLMTokenizerFast +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.LayoutXLMTokenizerFast + :members: __call__ + + +LayoutXLMProcessor +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.LayoutXLMProcessor + :members: __call__ diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 6f969c4e71..16e4642288 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -229,6 +229,7 @@ _import_structure = { "LayoutLMv2Processor", "LayoutLMv2Tokenizer", ], + "models.layoutxlm": ["LayoutXLMProcessor"], "models.led": ["LED_PRETRAINED_CONFIG_ARCHIVE_MAP", "LEDConfig", "LEDTokenizer"], "models.longformer": ["LONGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "LongformerConfig", "LongformerTokenizer"], "models.luke": ["LUKE_PRETRAINED_CONFIG_ARCHIVE_MAP", "LukeConfig", "LukeTokenizer"], @@ -365,6 +366,7 @@ if is_sentencepiece_available(): _import_structure["models.big_bird"].append("BigBirdTokenizer") _import_structure["models.camembert"].append("CamembertTokenizer") _import_structure["models.deberta_v2"].append("DebertaV2Tokenizer") + _import_structure["models.layoutxlm"].append("LayoutXLMTokenizer") _import_structure["models.m2m_100"].append("M2M100Tokenizer") _import_structure["models.marian"].append("MarianTokenizer") _import_structure["models.mbart"].append("MBartTokenizer") @@ -411,6 +413,7 @@ if is_tokenizers_available(): _import_structure["models.herbert"].append("HerbertTokenizerFast") _import_structure["models.layoutlm"].append("LayoutLMTokenizerFast") _import_structure["models.layoutlmv2"].append("LayoutLMv2TokenizerFast") + _import_structure["models.layoutxlm"].append("LayoutXLMTokenizerFast") _import_structure["models.led"].append("LEDTokenizerFast") _import_structure["models.longformer"].append("LongformerTokenizerFast") _import_structure["models.lxmert"].append("LxmertTokenizerFast") @@ -477,6 +480,7 @@ if is_vision_available(): _import_structure["models.detr"].append("DetrFeatureExtractor") _import_structure["models.layoutlmv2"].append("LayoutLMv2FeatureExtractor") _import_structure["models.layoutlmv2"].append("LayoutLMv2Processor") + _import_structure["models.layoutxlm"].append("LayoutXLMProcessor") _import_structure["models.segformer"].append("SegformerFeatureExtractor") _import_structure["models.vit"].append("ViTFeatureExtractor") else: @@ -2140,6 +2144,7 @@ if TYPE_CHECKING: LayoutLMv2Processor, LayoutLMv2Tokenizer, ) + from .models.layoutxlm import LayoutXLMProcessor from .models.led import LED_PRETRAINED_CONFIG_ARCHIVE_MAP, LEDConfig, LEDTokenizer from .models.longformer import LONGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, LongformerConfig, LongformerTokenizer from .models.luke import LUKE_PRETRAINED_CONFIG_ARCHIVE_MAP, LukeConfig, LukeTokenizer @@ -2266,6 +2271,7 @@ if TYPE_CHECKING: from .models.big_bird import BigBirdTokenizer from .models.camembert import CamembertTokenizer from .models.deberta_v2 import DebertaV2Tokenizer + from .models.layoutxlm import LayoutXLMTokenizer from .models.m2m_100 import M2M100Tokenizer from .models.marian import MarianTokenizer from .models.mbart import MBart50Tokenizer, MBartTokenizer @@ -2302,6 +2308,7 @@ if TYPE_CHECKING: from .models.herbert import HerbertTokenizerFast from .models.layoutlm import LayoutLMTokenizerFast from .models.layoutlmv2 import LayoutLMv2TokenizerFast + from .models.layoutxlm import LayoutXLMTokenizerFast from .models.led import LEDTokenizerFast from .models.longformer import LongformerTokenizerFast from .models.lxmert import LxmertTokenizerFast @@ -2349,6 +2356,7 @@ if TYPE_CHECKING: from .models.deit import DeiTFeatureExtractor from .models.detr import DetrFeatureExtractor from .models.layoutlmv2 import LayoutLMv2FeatureExtractor, LayoutLMv2Processor + from .models.layoutxlm import LayoutXLMProcessor from .models.segformer import SegformerFeatureExtractor from .models.vit import ViTFeatureExtractor else: diff --git a/src/transformers/convert_slow_tokenizer.py b/src/transformers/convert_slow_tokenizer.py index a66015c260..767887c442 100644 --- a/src/transformers/convert_slow_tokenizer.py +++ b/src/transformers/convert_slow_tokenizer.py @@ -944,6 +944,7 @@ SLOW_TO_FAST_CONVERTERS = { "HerbertTokenizer": HerbertConverter, "LayoutLMTokenizer": BertConverter, "LayoutLMv2Tokenizer": BertConverter, + "LayoutXLMTokenizer": XLMRobertaConverter, "LongformerTokenizer": RobertaConverter, "LEDTokenizer": RobertaConverter, "LxmertTokenizer": BertConverter, diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index 6ff1557c8d..e166cd2e70 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -59,6 +59,7 @@ from . import ( ibert, layoutlm, layoutlmv2, + layoutxlm, led, longformer, luke, diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index c55206592c..1084464eb6 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -124,6 +124,7 @@ else: ("lxmert", ("LxmertTokenizer", "LxmertTokenizerFast" if is_tokenizers_available() else None)), ("layoutlm", ("LayoutLMTokenizer", "LayoutLMTokenizerFast" if is_tokenizers_available() else None)), ("layoutlmv2", ("LayoutLMv2Tokenizer", "LayoutLMv2TokenizerFast" if is_tokenizers_available() else None)), + ("layoutxlm", ("LayoutXLMTokenizer", "LayoutXLMTokenizerFast" if is_tokenizers_available() else None)), ( "dpr", ( diff --git a/src/transformers/models/layoutxlm/__init__.py b/src/transformers/models/layoutxlm/__init__.py new file mode 100644 index 0000000000..3bfb2bf58b --- /dev/null +++ b/src/transformers/models/layoutxlm/__init__.py @@ -0,0 +1,54 @@ +# flake8: noqa +# There's no way to ignore "F401 '...' imported but unused" warnings in this +# module, but to preserve other warnings. So, don't check this module at all. + +# Copyright 2021 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import TYPE_CHECKING + +from ...file_utils import ( + _LazyModule, + is_sentencepiece_available, + is_tokenizers_available, + is_torch_available, + is_vision_available, +) + + +_import_structure = {} + +if is_sentencepiece_available(): + _import_structure["tokenization_layoutxlm"] = ["LayoutXLMTokenizer"] + +if is_tokenizers_available(): + _import_structure["tokenization_layoutxlm_fast"] = ["LayoutXLMTokenizerFast"] + +if is_vision_available(): + _import_structure["processing_layoutxlm"] = ["LayoutXLMProcessor"] + +if TYPE_CHECKING: + if is_sentencepiece_available(): + from .tokenization_layoutxlm import LayoutXLMTokenizer + + if is_tokenizers_available(): + from .tokenization_layoutxlm_fast import LayoutXLMTokenizerFast + + if is_vision_available(): + from .processing_layoutlmv2 import LayoutXLMProcessor + +else: + import sys + + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure) diff --git a/src/transformers/models/layoutxlm/processing_layoutxlm.py b/src/transformers/models/layoutxlm/processing_layoutxlm.py new file mode 100644 index 0000000000..7178797fbf --- /dev/null +++ b/src/transformers/models/layoutxlm/processing_layoutxlm.py @@ -0,0 +1,207 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Processor class for LayoutXLM. +""" +from typing import List, Optional, Union + +from transformers.models.layoutlmv2.feature_extraction_layoutlmv2 import LayoutLMv2FeatureExtractor + +from ...file_utils import TensorType +from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy +from .tokenization_layoutxlm import LayoutXLMTokenizer +from .tokenization_layoutxlm_fast import LayoutXLMTokenizerFast + + +class LayoutXLMProcessor: + r""" + Constructs a LayoutXLM processor which combines a LayoutXLM feature extractor and a LayoutXLM tokenizer into a + single processor. + + :class:`~transformers.LayoutXLMProcessor` offers all the functionalities you need to prepare data for the model. + + It first uses :class:`~transformers.LayoutLMv2FeatureExtractor` to resize document images to a fixed size, and + optionally applies OCR to get words and normalized bounding boxes. These are then provided to + :class:`~transformers.LayoutXLMTokenizer` or :class:`~transformers.LayoutXLMTokenizerFast`, which turns the words + and bounding boxes into token-level :obj:`input_ids`, :obj:`attention_mask`, :obj:`token_type_ids`, :obj:`bbox`. + Optionally, one can provide integer :obj:`word_labels`, which are turned into token-level :obj:`labels` for token + classification tasks (such as FUNSD, CORD). + + Args: + feature_extractor (:obj:`LayoutLMv2FeatureExtractor`): + An instance of :class:`~transformers.LayoutLMv2FeatureExtractor`. The feature extractor is a required + input. + tokenizer (:obj:`LayoutXLMTokenizer` or :obj:`LayoutXLMTokenizerFast`): + An instance of :class:`~transformers.LayoutXLMTokenizer` or :class:`~transformers.LayoutXLMTokenizerFast`. + The tokenizer is a required input. + """ + + def __init__(self, feature_extractor, tokenizer): + if not isinstance(feature_extractor, LayoutLMv2FeatureExtractor): + raise ValueError( + f"`feature_extractor` has to be of type {LayoutLMv2FeatureExtractor.__class__}, but is {type(feature_extractor)}" + ) + if not isinstance(tokenizer, (LayoutXLMTokenizer, LayoutXLMTokenizerFast)): + raise ValueError( + f"`tokenizer` has to be of type {LayoutXLMTokenizer.__class__} or {LayoutXLMTokenizerFast.__class__}, but is {type(tokenizer)}" + ) + + self.feature_extractor = feature_extractor + self.tokenizer = tokenizer + + def save_pretrained(self, save_directory): + """ + Save a LayoutXLM feature_extractor object and LayoutXLM tokenizer object to the directory ``save_directory``, + so that it can be re-loaded using the :func:`~transformers.LayoutXLMProcessor.from_pretrained` class method. + + .. note:: + + This class method is simply calling + :meth:`~transformers.feature_extraction_utils.FeatureExtractionMixin.save_pretrained` and + :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizer.save_pretrained`. Please refer to the + docstrings of the methods above for more information. + + Args: + save_directory (:obj:`str` or :obj:`os.PathLike`): + Directory where the feature extractor JSON file and the tokenizer files will be saved (directory will + be created if it does not exist). + """ + + self.feature_extractor.save_pretrained(save_directory) + self.tokenizer.save_pretrained(save_directory) + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, use_fast=True, **kwargs): + r""" + Instantiate a :class:`~transformers.LayoutXLMProcessor` from a pretrained LayoutXLM processor. + + .. note:: + + This class method is simply calling Layoutv2FeatureExtractor's + :meth:`~transformers.feature_extraction_utils.FeatureExtractionMixin.from_pretrained` and + LayoutXLMTokenizerFast's :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizer.from_pretrained`. + Please refer to the docstrings of the methods above for more information. + + Args: + pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`): + This can be either: + + - a string, the `model id` of a pretrained feature_extractor hosted inside a model repo on + huggingface.co. Valid model ids can be located at the root-level, like ``bert-base-uncased``, or + namespaced under a user or organization name, like ``dbmdz/bert-base-german-cased``. + - a path to a `directory` containing a feature extractor file saved using the + :meth:`~transformers.SequenceFeatureExtractor.save_pretrained` method, e.g., + ``./my_model_directory/``. + - a path or url to a saved feature extractor JSON `file`, e.g., + ``./my_model_directory/preprocessor_config.json``. + + use_fast (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not to instantiate a fast tokenizer. + + **kwargs + Additional keyword arguments passed along to both :class:`~transformers.SequenceFeatureExtractor` and + :class:`~transformers.PreTrainedTokenizer` + """ + feature_extractor = LayoutLMv2FeatureExtractor.from_pretrained(pretrained_model_name_or_path, **kwargs) + if use_fast: + tokenizer = LayoutXLMTokenizerFast.from_pretrained(pretrained_model_name_or_path, **kwargs) + else: + tokenizer = LayoutXLMTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs) + + return cls(feature_extractor=feature_extractor, tokenizer=tokenizer) + + def __call__( + self, + images, + text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None, + text_pair: Optional[Union[PreTokenizedInput, List[PreTokenizedInput]]] = None, + boxes: Union[List[List[int]], List[List[List[int]]]] = None, + word_labels: Optional[Union[List[int], List[List[int]]]] = None, + add_special_tokens: bool = True, + padding: Union[bool, str, PaddingStrategy] = False, + truncation: Union[bool, str, TruncationStrategy] = False, + max_length: Optional[int] = None, + stride: int = 0, + pad_to_multiple_of: Optional[int] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + return_tensors: Optional[Union[str, TensorType]] = None, + **kwargs + ) -> BatchEncoding: + """ + This method first forwards the :obj:`images` argument to + :meth:`~transformers.LayoutLMv2FeatureExtractor.__call__`. In case :class:`~LayoutLMv2FeatureExtractor` was + initialized with :obj:`apply_ocr` set to ``True``, it passes the obtained words and bounding boxes along with + the additional arguments to :meth:`~transformers.LayoutXLMTokenizer.__call__` and returns the output, together + with resized :obj:`images`. In case :class:`~LayoutLMv2FeatureExtractor` was initialized with :obj:`apply_ocr` + set to ``False``, it passes the words (:obj:`text`/:obj:`text_pair`) and :obj:`boxes` specified by the user + along with the additional arguments to :meth:`~transformers.LayoutXLMTokenizer.__call__` and returns the + output, together with resized :obj:`images`. + + Please refer to the docstring of the above two methods for more information. + """ + # verify input + if self.feature_extractor.apply_ocr and (boxes is not None): + raise ValueError( + "You cannot provide bounding boxes " + "if you initialized the feature extractor with apply_ocr set to True." + ) + + if self.feature_extractor.apply_ocr and (word_labels is not None): + raise ValueError( + "You cannot provide word labels " + "if you initialized the feature extractor with apply_ocr set to True." + ) + + # first, apply the feature extractor + features = self.feature_extractor(images=images, return_tensors=return_tensors) + + # second, apply the tokenizer + if text is not None and self.feature_extractor.apply_ocr and text_pair is None: + if isinstance(text, str): + text = [text] # add batch dimension (as the feature extractor always adds a batch dimension) + text_pair = features["words"] + + encoded_inputs = self.tokenizer( + text=text if text is not None else features["words"], + text_pair=text_pair if text_pair is not None else None, + boxes=boxes if boxes is not None else features["boxes"], + word_labels=word_labels, + add_special_tokens=add_special_tokens, + padding=padding, + truncation=truncation, + max_length=max_length, + stride=stride, + pad_to_multiple_of=pad_to_multiple_of, + return_token_type_ids=return_token_type_ids, + return_attention_mask=return_attention_mask, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_offsets_mapping=return_offsets_mapping, + return_length=return_length, + verbose=verbose, + return_tensors=return_tensors, + **kwargs, + ) + + # add pixel values + encoded_inputs["image"] = features.pop("pixel_values") + + return encoded_inputs diff --git a/src/transformers/models/layoutxlm/tokenization_layoutxlm.py b/src/transformers/models/layoutxlm/tokenization_layoutxlm.py new file mode 100644 index 0000000000..e71e8eef2c --- /dev/null +++ b/src/transformers/models/layoutxlm/tokenization_layoutxlm.py @@ -0,0 +1,1061 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License +""" Tokenization classes for LayoutXLM model.""" + + +import os +from shutil import copyfile +from typing import Any, Dict, List, Optional, Tuple, Union + +import sentencepiece as spm +from transformers.models.layoutlmv2.tokenization_layoutlmv2 import LAYOUTLMV2_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING + +from ...file_utils import PaddingStrategy, TensorType, add_end_docstrings +from ...tokenization_utils import AddedToken, PreTrainedTokenizer +from ...tokenization_utils_base import ( + ENCODE_KWARGS_DOCSTRING, + BatchEncoding, + EncodedInput, + PreTokenizedInput, + TextInput, + TextInputPair, + TruncationStrategy, +) +from ...utils import logging +from ..xlm_roberta.tokenization_xlm_roberta import ( + PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES, + PRETRAINED_VOCAB_FILES_MAP, + SPIECE_UNDERLINE, + VOCAB_FILES_NAMES, +) + + +logger = logging.get_logger(__name__) + + +class LayoutXLMTokenizer(PreTrainedTokenizer): + """ + Adapted from :class:`~transformers.RobertaTokenizer` and :class:`~transformers.XLNetTokenizer`. Based on + `SentencePiece `__. + + This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods. + Users should refer to this superclass for more information regarding those methods. + + Args: + vocab_file (:obj:`str`): + Path to the vocabulary file. + bos_token (:obj:`str`, `optional`, defaults to :obj:`""`): + The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token. + + .. note:: + + When building a sequence using special tokens, this is not the token that is used for the beginning of + sequence. The token used is the :obj:`cls_token`. + eos_token (:obj:`str`, `optional`, defaults to :obj:`""`): + The end of sequence token. + + .. note:: + + When building a sequence using special tokens, this is not the token that is used for the end of + sequence. The token used is the :obj:`sep_token`. + sep_token (:obj:`str`, `optional`, defaults to :obj:`""`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for + sequence classification or for a text and a question for question answering. It is also used as the last + token of a sequence built with special tokens. + cls_token (:obj:`str`, `optional`, defaults to :obj:`""`): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. + unk_token (:obj:`str`, `optional`, defaults to :obj:`""`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + pad_token (:obj:`str`, `optional`, defaults to :obj:`""`): + The token used for padding, for example when batching sequences of different lengths. + mask_token (:obj:`str`, `optional`, defaults to :obj:`""`): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + cls_token_box (:obj:`List[int]`, `optional`, defaults to :obj:`[0, 0, 0, 0]`): + The bounding box to use for the special [CLS] token. + sep_token_box (:obj:`List[int]`, `optional`, defaults to :obj:`[1000, 1000, 1000, 1000]`): + The bounding box to use for the special [SEP] token. + pad_token_box (:obj:`List[int]`, `optional`, defaults to :obj:`[0, 0, 0, 0]`): + The bounding box to use for the special [PAD] token. + pad_token_label (:obj:`int`, `optional`, defaults to -100): + The label to use for padding tokens. Defaults to -100, which is the :obj:`ignore_index` of PyTorch's + CrossEntropyLoss. + only_label_first_subword (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not to only label the first subword, in case word labels are provided. + additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["NOTUSED", "NOTUSED"]`): + Additional special tokens used by the tokenizer. + sp_model_kwargs (:obj:`dict`, `optional`): + Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece + `__ can be used, among other things, to set: + + - ``enable_sampling``: Enable subword regularization. + - ``nbest_size``: Sampling parameters for unigram. Invalid for BPE-Dropout. + + - ``nbest_size = {0,1}``: No sampling is performed. + - ``nbest_size > 1``: samples from the nbest_size results. + - ``nbest_size < 0``: assuming that nbest_size is infinite and samples from the all hypothesis (lattice) + using forward-filtering-and-backward-sampling algorithm. + + - ``alpha``: Smoothing parameter for unigram sampling, and dropout probability of merge operations for + BPE-dropout. + + Attributes: + sp_model (:obj:`SentencePieceProcessor`): + The `SentencePiece` processor that is used for every conversion (string, tokens and IDs). + """ + + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + model_input_names = ["input_ids", "attention_mask"] + + def __init__( + self, + vocab_file, + bos_token="", + eos_token="", + sep_token="", + cls_token="", + unk_token="", + pad_token="", + mask_token="", + cls_token_box=[0, 0, 0, 0], + sep_token_box=[1000, 1000, 1000, 1000], + pad_token_box=[0, 0, 0, 0], + pad_token_label=-100, + only_label_first_subword=True, + sp_model_kwargs: Optional[Dict[str, Any]] = None, + **kwargs + ) -> None: + # Mask token behave like a normal word, i.e. include the space before it + mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token + + self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + + super().__init__( + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, + sep_token=sep_token, + cls_token=cls_token, + pad_token=pad_token, + mask_token=mask_token, + cls_token_box=cls_token_box, + sep_token_box=sep_token_box, + pad_token_box=pad_token_box, + pad_token_label=pad_token_label, + only_label_first_subword=only_label_first_subword, + sp_model_kwargs=self.sp_model_kwargs, + **kwargs, + ) + + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.Load(str(vocab_file)) + self.vocab_file = vocab_file + + # Original fairseq vocab and spm vocab must be "aligned": + # Vocab | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 + # -------- | ------- | ------- | ------ | ------- | --- | --- | --- | ----- | ----- | ---- + # fairseq | '' | '' | '' | '' | ',' | '.' | '▁' | 's' | '▁de' | '-' + # spm | '' | '' | '' | ',' | '.' | '▁' | 's' | '▁de' | '-' | '▁a' + + # Mimic fairseq token-to-id alignment for the first 4 token + self.fairseq_tokens_to_ids = {"": 0, "": 1, "": 2, "": 3} + + # The first "real" token "," has position 4 in the original fairseq vocab and position 3 in the spm vocab + self.fairseq_offset = 1 + + self.fairseq_tokens_to_ids[""] = len(self.sp_model) + self.fairseq_offset + self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()} + + # additional properties + self.cls_token_box = cls_token_box + self.sep_token_box = sep_token_box + self.pad_token_box = pad_token_box + self.pad_token_label = pad_token_label + self.only_label_first_subword = only_label_first_subword + + def __getstate__(self): + state = self.__dict__.copy() + state["sp_model"] = None + state["sp_model_proto"] = self.sp_model.serialized_model_proto() + return state + + def __setstate__(self, d): + self.__dict__ = d + + # for backward compatibility + if not hasattr(self, "sp_model_kwargs"): + self.sp_model_kwargs = {} + + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.LoadFromSerializedProto(self.sp_model_proto) + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. An XLM-RoBERTa sequence has the following format: + + - single sequence: `` X `` + - pair of sequences: `` A B `` + + Args: + token_ids_0 (:obj:`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (:obj:`List[int]`, `optional`): + Optional second list of IDs for sequence pairs. + + Returns: + :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens. + """ + + if token_ids_1 is None: + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + cls = [self.cls_token_id] + sep = [self.sep_token_id] + return cls + token_ids_0 + sep + sep + token_ids_1 + sep + + def get_special_tokens_mask( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False + ) -> List[int]: + """ + Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer ``prepare_for_model`` method. + + Args: + token_ids_0 (:obj:`List[int]`): + List of IDs. + token_ids_1 (:obj:`List[int]`, `optional`): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + + if token_ids_1 is None: + return [1] + ([0] * len(token_ids_0)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1] + + def create_token_type_ids_from_sequences( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. XLM-RoBERTa does + not make use of token type ids, therefore a list of zeros is returned. + + Args: + token_ids_0 (:obj:`List[int]`): + List of IDs. + token_ids_1 (:obj:`List[int]`, `optional`): + Optional second list of IDs for sequence pairs. + + Returns: + :obj:`List[int]`: List of zeros. + + """ + + sep = [self.sep_token_id] + cls = [self.cls_token_id] + + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] + return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0] + + @property + def vocab_size(self): + return len(self.sp_model) + self.fairseq_offset + 1 # Add the token + + def get_vocab(self): + vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab + + def _tokenize(self, text: str) -> List[str]: + return self.sp_model.encode(text, out_type=str) + + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + if token in self.fairseq_tokens_to_ids: + return self.fairseq_tokens_to_ids[token] + spm_id = self.sp_model.PieceToId(token) + + # Need to return unknown token if the SP model returned 0 + return spm_id + self.fairseq_offset if spm_id else self.unk_token_id + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + if index in self.fairseq_ids_to_tokens: + return self.fairseq_ids_to_tokens[index] + return self.sp_model.IdToPiece(index - self.fairseq_offset) + + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (strings for sub-words) in a single string.""" + out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip() + return out_string + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: + if not os.path.isdir(save_directory): + logger.error(f"Vocabulary path ({save_directory}) should be a directory") + return + out_vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + + if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file): + copyfile(self.vocab_file, out_vocab_file) + + return (out_vocab_file,) + + @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, LAYOUTLMV2_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING) + def __call__( + self, + text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]], + text_pair: Optional[Union[PreTokenizedInput, List[PreTokenizedInput]]] = None, + boxes: Union[List[List[int]], List[List[List[int]]]] = None, + word_labels: Optional[Union[List[int], List[List[int]]]] = None, + add_special_tokens: bool = True, + padding: Union[bool, str, PaddingStrategy] = False, + truncation: Union[bool, str, TruncationStrategy] = False, + max_length: Optional[int] = None, + stride: int = 0, + pad_to_multiple_of: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + **kwargs + ) -> BatchEncoding: + """ + Main method to tokenize and prepare for the model one or several sequence(s) or one or several pair(s) of + sequences with word-level normalized bounding boxes and optional labels. + + Args: + text (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`): + The sequence or batch of sequences to be encoded. Each sequence can be a string, a list of strings + (words of a single example or questions of a batch of examples) or a list of list of strings (batch of + words). + text_pair (:obj:`List[str]`, :obj:`List[List[str]]`): + The sequence or batch of sequences to be encoded. Each sequence should be a list of strings + (pretokenized string). + boxes (:obj:`List[List[int]]`, :obj:`List[List[List[int]]]`): + Word-level bounding boxes. Each bounding box should be normalized to be on a 0-1000 scale. + word_labels (:obj:`List[int]`, :obj:`List[List[int]]`, `optional`): + Word-level integer labels (for token classification tasks such as FUNSD, CORD). + """ + # Input type checking for clearer error + def _is_valid_text_input(t): + if isinstance(t, str): + # Strings are fine + return True + elif isinstance(t, (list, tuple)): + # List are fine as long as they are... + if len(t) == 0: + # ... empty + return True + elif isinstance(t[0], str): + # ... list of strings + return True + elif isinstance(t[0], (list, tuple)): + # ... list with an empty list or with a list of strings + return len(t[0]) == 0 or isinstance(t[0][0], str) + else: + return False + else: + return False + + if text_pair is not None: + # in case text + text_pair are provided, text = questions, text_pair = words + if not _is_valid_text_input(text): + raise ValueError("text input must of type `str` (single example) or `List[str]` (batch of examples). ") + if not isinstance(text_pair, (list, tuple)): + raise ValueError( + "words must of type `List[str]` (single pretokenized example), " + "or `List[List[str]]` (batch of pretokenized examples)." + ) + else: + # in case only text is provided => must be words + if not isinstance(text, (list, tuple)): + raise ValueError( + "Words must of type `List[str]` (single pretokenized example), " + "or `List[List[str]]` (batch of pretokenized examples)." + ) + + if text_pair is not None: + is_batched = isinstance(text, (list, tuple)) + else: + is_batched = isinstance(text, (list, tuple)) and text and isinstance(text[0], (list, tuple)) + + words = text if text_pair is None else text_pair + if boxes is None: + raise ValueError("You must provide corresponding bounding boxes") + if is_batched: + if len(words) != len(boxes): + raise ValueError("You must provide words and boxes for an equal amount of examples") + for words_example, boxes_example in zip(words, boxes): + if len(words_example) != len(boxes_example): + raise ValueError("You must provide as many words as there are bounding boxes") + else: + if len(words) != len(boxes): + raise ValueError("You must provide as many words as there are bounding boxes") + + if is_batched: + if text_pair is not None and len(text) != len(text_pair): + raise ValueError( + f"batch length of `text`: {len(text)} does not match batch length of `text_pair`: {len(text_pair)}." + ) + batch_text_or_text_pairs = list(zip(text, text_pair)) if text_pair is not None else text + is_pair = bool(text_pair is not None) + return self.batch_encode_plus( + batch_text_or_text_pairs=batch_text_or_text_pairs, + is_pair=is_pair, + boxes=boxes, + word_labels=word_labels, + add_special_tokens=add_special_tokens, + padding=padding, + truncation=truncation, + max_length=max_length, + stride=stride, + pad_to_multiple_of=pad_to_multiple_of, + return_tensors=return_tensors, + return_token_type_ids=return_token_type_ids, + return_attention_mask=return_attention_mask, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_offsets_mapping=return_offsets_mapping, + return_length=return_length, + verbose=verbose, + **kwargs, + ) + else: + return self.encode_plus( + text=text, + text_pair=text_pair, + boxes=boxes, + word_labels=word_labels, + add_special_tokens=add_special_tokens, + padding=padding, + truncation=truncation, + max_length=max_length, + stride=stride, + pad_to_multiple_of=pad_to_multiple_of, + return_tensors=return_tensors, + return_token_type_ids=return_token_type_ids, + return_attention_mask=return_attention_mask, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_offsets_mapping=return_offsets_mapping, + return_length=return_length, + verbose=verbose, + **kwargs, + ) + + def _batch_encode_plus( + self, + batch_text_or_text_pairs: Union[ + List[TextInput], + List[TextInputPair], + List[PreTokenizedInput], + ], + is_pair: bool = None, + boxes: Optional[List[List[List[int]]]] = None, + word_labels: Optional[List[List[int]]] = None, + add_special_tokens: bool = True, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE, + max_length: Optional[int] = None, + stride: int = 0, + pad_to_multiple_of: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + **kwargs + ) -> BatchEncoding: + + if return_offsets_mapping: + raise NotImplementedError( + "return_offset_mapping is not available when using Python tokenizers. " + "To use this feature, change your tokenizer to one deriving from " + "transformers.PreTrainedTokenizerFast." + ) + + batch_outputs = self._batch_prepare_for_model( + batch_text_or_text_pairs=batch_text_or_text_pairs, + is_pair=is_pair, + boxes=boxes, + word_labels=word_labels, + add_special_tokens=add_special_tokens, + padding_strategy=padding_strategy, + truncation_strategy=truncation_strategy, + max_length=max_length, + stride=stride, + pad_to_multiple_of=pad_to_multiple_of, + return_attention_mask=return_attention_mask, + return_token_type_ids=return_token_type_ids, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_length=return_length, + return_tensors=return_tensors, + verbose=verbose, + ) + + return BatchEncoding(batch_outputs) + + @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, LAYOUTLMV2_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING) + def _batch_prepare_for_model( + self, + batch_text_or_text_pairs, + is_pair: bool = None, + boxes: Optional[List[List[int]]] = None, + word_labels: Optional[List[List[int]]] = None, + add_special_tokens: bool = True, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE, + max_length: Optional[int] = None, + stride: int = 0, + pad_to_multiple_of: Optional[int] = None, + return_tensors: Optional[str] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_length: bool = False, + verbose: bool = True, + ) -> BatchEncoding: + """ + Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model. It + adds special tokens, truncates sequences if overflowing while taking into account the special tokens and + manages a moving window (with user defined stride) for overflowing tokens + + Args: + batch_ids_pairs: list of tokenized input ids or input ids pairs + """ + + batch_outputs = {} + for idx, example in enumerate(zip(batch_text_or_text_pairs, boxes)): + batch_text_or_text_pair, boxes_example = example + outputs = self.prepare_for_model( + batch_text_or_text_pair[0] if is_pair else batch_text_or_text_pair, + batch_text_or_text_pair[1] if is_pair else None, + boxes_example, + word_labels=word_labels[idx] if word_labels is not None else None, + add_special_tokens=add_special_tokens, + padding=PaddingStrategy.DO_NOT_PAD.value, # we pad in batch afterward + truncation=truncation_strategy.value, + max_length=max_length, + stride=stride, + pad_to_multiple_of=None, # we pad in batch afterward + return_attention_mask=False, # we pad in batch afterward + return_token_type_ids=return_token_type_ids, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_length=return_length, + return_tensors=None, # We convert the whole batch to tensors at the end + prepend_batch_axis=False, + verbose=verbose, + ) + + for key, value in outputs.items(): + if key not in batch_outputs: + batch_outputs[key] = [] + batch_outputs[key].append(value) + + batch_outputs = self.pad( + batch_outputs, + padding=padding_strategy.value, + max_length=max_length, + pad_to_multiple_of=pad_to_multiple_of, + return_attention_mask=return_attention_mask, + ) + + batch_outputs = BatchEncoding(batch_outputs, tensor_type=return_tensors) + + return batch_outputs + + def _encode_plus( + self, + text: Union[TextInput, PreTokenizedInput], + text_pair: Optional[PreTokenizedInput] = None, + boxes: Optional[List[List[int]]] = None, + word_labels: Optional[List[int]] = None, + add_special_tokens: bool = True, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE, + max_length: Optional[int] = None, + stride: int = 0, + pad_to_multiple_of: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + **kwargs + ) -> BatchEncoding: + if return_offsets_mapping: + raise NotImplementedError( + "return_offset_mapping is not available when using Python tokenizers. " + "To use this feature, change your tokenizer to one deriving from " + "transformers.PreTrainedTokenizerFast. " + "More information on available tokenizers at " + "https://github.com/huggingface/transformers/pull/2674" + ) + + return self.prepare_for_model( + text=text, + text_pair=text_pair, + boxes=boxes, + word_labels=word_labels, + add_special_tokens=add_special_tokens, + padding=padding_strategy.value, + truncation=truncation_strategy.value, + max_length=max_length, + stride=stride, + pad_to_multiple_of=pad_to_multiple_of, + return_tensors=return_tensors, + prepend_batch_axis=True, + return_attention_mask=return_attention_mask, + return_token_type_ids=return_token_type_ids, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_length=return_length, + verbose=verbose, + ) + + @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, LAYOUTLMV2_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING) + def prepare_for_model( + self, + text: Union[TextInput, PreTokenizedInput], + text_pair: Optional[PreTokenizedInput] = None, + boxes: Optional[List[List[int]]] = None, + word_labels: Optional[List[int]] = None, + add_special_tokens: bool = True, + padding: Union[bool, str, PaddingStrategy] = False, + truncation: Union[bool, str, TruncationStrategy] = False, + max_length: Optional[int] = None, + stride: int = 0, + pad_to_multiple_of: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + prepend_batch_axis: bool = False, + **kwargs + ) -> BatchEncoding: + """ + Prepares a sequence or a pair of sequences so that it can be used by the model. It adds special tokens, + truncates sequences if overflowing while taking into account the special tokens and manages a moving window + (with user defined stride) for overflowing tokens. + + Word-level :obj:`boxes` are turned into token-level :obj:`bbox`. If provided, word-level :obj:`word_labels` are + turned into token-level :obj:`labels`. The word label is used for the first token of the word, while remaining + tokens are labeled with -100, such that they will be ignored by the loss function. + + Args: + text (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`): + The first sequence to be encoded. This can be a string, a list of strings or a list of list of strings. + text_pair (:obj:`List[str]` or :obj:`List[int]`, `optional`): + Optional second sequence to be encoded. This can be a list of strings (words of a single example) or a + list of list of strings (words of a batch of examples). + """ + + # Backward compatibility for 'truncation_strategy', 'pad_to_max_length' + padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies( + padding=padding, + truncation=truncation, + max_length=max_length, + pad_to_multiple_of=pad_to_multiple_of, + verbose=verbose, + **kwargs, + ) + + tokens = [] + pair_tokens = [] + token_boxes = [] + pair_token_boxes = [] + labels = [] + + if text_pair is None: + if word_labels is None: + # CASE 1: document image classification (training + inference) + CASE 2: token classification (inference) + for word, box in zip(text, boxes): + if len(word) < 1: # skip empty words + continue + word_tokens = self.tokenize(word) + tokens.extend(word_tokens) + token_boxes.extend([box] * len(word_tokens)) + else: + # CASE 2: token classification (training) + for word, box, label in zip(text, boxes, word_labels): + if len(word) < 1: # skip empty words + continue + word_tokens = self.tokenize(word) + tokens.extend(word_tokens) + token_boxes.extend([box] * len(word_tokens)) + if self.only_label_first_subword: + # Use the real label id for the first token of the word, and padding ids for the remaining tokens + labels.extend([label] + [self.pad_token_label] * (len(word_tokens) - 1)) + else: + labels.extend([label] * len(word_tokens)) + else: + # CASE 3: document visual question answering (inference) + # text = question + # text_pair = words + tokens = self.tokenize(text) + token_boxes = [self.pad_token_box for _ in range(len(tokens))] + [self.sep_token_box] + + for word, box in zip(text_pair, boxes): + if len(word) < 1: # skip empty words + continue + word_tokens = self.tokenize(word) + pair_tokens.extend(word_tokens) + pair_token_boxes.extend([box] * len(word_tokens)) + + # Create ids + pair_ids + ids = self.convert_tokens_to_ids(tokens) + pair_ids = self.convert_tokens_to_ids(pair_tokens) if pair_tokens else None + + # Compute the total size of the returned encodings + pair = bool(pair_ids is not None) + len_ids = len(ids) + len_pair_ids = len(pair_ids) if pair else 0 + total_len = len_ids + len_pair_ids + (self.num_special_tokens_to_add(pair=pair) if add_special_tokens else 0) + + # Truncation: Handle max sequence length + overflowing_tokens = [] + overflowing_token_boxes = [] + overflowing_labels = [] + if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE and max_length and total_len > max_length: + ( + ids, + token_boxes, + pair_ids, + pair_token_boxes, + labels, + overflowing_tokens, + overflowing_token_boxes, + overflowing_labels, + ) = self.truncate_sequences( + ids, + token_boxes, + pair_ids=pair_ids, + pair_token_boxes=pair_token_boxes, + labels=labels, + num_tokens_to_remove=total_len - max_length, + truncation_strategy=truncation_strategy, + stride=stride, + ) + + if return_token_type_ids and not add_special_tokens: + raise ValueError( + "Asking to return token_type_ids while setting add_special_tokens to False " + "results in an undefined behavior. Please set add_special_tokens to True or " + "set return_token_type_ids to None." + ) + + # Load from model defaults + if return_token_type_ids is None: + return_token_type_ids = "token_type_ids" in self.model_input_names + if return_attention_mask is None: + return_attention_mask = "attention_mask" in self.model_input_names + + encoded_inputs = {} + + if return_overflowing_tokens: + encoded_inputs["overflowing_tokens"] = overflowing_tokens + encoded_inputs["overflowing_token_boxes"] = overflowing_token_boxes + encoded_inputs["overflowing_labels"] = overflowing_labels + encoded_inputs["num_truncated_tokens"] = total_len - max_length + + # Add special tokens + if add_special_tokens: + sequence = self.build_inputs_with_special_tokens(ids, pair_ids) + token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids) + token_boxes = [self.cls_token_box] + token_boxes + [self.sep_token_box] + if pair_token_boxes: + pair_token_boxes = pair_token_boxes + [self.sep_token_box] + if labels: + labels = [self.pad_token_label] + labels + [self.pad_token_label] + else: + sequence = ids + pair_ids if pair else ids + token_type_ids = [0] * len(ids) + ([0] * len(pair_ids) if pair else []) + + # Build output dictionary + encoded_inputs["input_ids"] = sequence + encoded_inputs["bbox"] = token_boxes + pair_token_boxes + if return_token_type_ids: + encoded_inputs["token_type_ids"] = token_type_ids + if return_special_tokens_mask: + if add_special_tokens: + encoded_inputs["special_tokens_mask"] = self.get_special_tokens_mask(ids, pair_ids) + else: + encoded_inputs["special_tokens_mask"] = [0] * len(sequence) + + if labels: + encoded_inputs["labels"] = labels + + # Check lengths + self._eventual_warn_about_too_long_sequence(encoded_inputs["input_ids"], max_length, verbose) + + # Padding + if padding_strategy != PaddingStrategy.DO_NOT_PAD or return_attention_mask: + encoded_inputs = self.pad( + encoded_inputs, + max_length=max_length, + padding=padding_strategy.value, + pad_to_multiple_of=pad_to_multiple_of, + return_attention_mask=return_attention_mask, + ) + + if return_length: + encoded_inputs["length"] = len(encoded_inputs["input_ids"]) + + batch_outputs = BatchEncoding( + encoded_inputs, tensor_type=return_tensors, prepend_batch_axis=prepend_batch_axis + ) + + return batch_outputs + + def truncate_sequences( + self, + ids: List[int], + token_boxes: List[List[int]], + pair_ids: Optional[List[int]] = None, + pair_token_boxes: Optional[List[List[int]]] = None, + labels: Optional[List[int]] = None, + num_tokens_to_remove: int = 0, + truncation_strategy: Union[str, TruncationStrategy] = "longest_first", + stride: int = 0, + ) -> Tuple[List[int], List[int], List[int]]: + """ + Truncates a sequence pair in-place following the strategy. + + Args: + ids (:obj:`List[int]`): + Tokenized input ids of the first sequence. Can be obtained from a string by chaining the ``tokenize`` + and ``convert_tokens_to_ids`` methods. + token_boxes (:obj:`List[List[int]]`): + Bounding boxes of the first sequence. + pair_ids (:obj:`List[int]`, `optional`): + Tokenized input ids of the second sequence. Can be obtained from a string by chaining the ``tokenize`` + and ``convert_tokens_to_ids`` methods. + pair_token_boxes (:obj:`List[List[int]]`, `optional`): + Bounding boxes of the second sequence. + labels (:obj:`List[int]`, `optional`): + Labels of the first sequence (for token classification tasks). + num_tokens_to_remove (:obj:`int`, `optional`, defaults to 0): + Number of tokens to remove using the truncation strategy. + truncation_strategy (:obj:`str` or :class:`~transformers.tokenization_utils_base.TruncationStrategy`, `optional`, defaults to :obj:`False`): + The strategy to follow for truncation. Can be: + + * :obj:`'longest_first'`: Truncate to a maximum length specified with the argument :obj:`max_length` or + to the maximum acceptable input length for the model if that argument is not provided. This will + truncate token by token, removing a token from the longest sequence in the pair if a pair of + sequences (or a batch of pairs) is provided. + * :obj:`'only_first'`: Truncate to a maximum length specified with the argument :obj:`max_length` or to + the maximum acceptable input length for the model if that argument is not provided. This will only + truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided. + * :obj:`'only_second'`: Truncate to a maximum length specified with the argument :obj:`max_length` or + to the maximum acceptable input length for the model if that argument is not provided. This will only + truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided. + * :obj:`'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence lengths + greater than the model maximum admissible input size). + stride (:obj:`int`, `optional`, defaults to 0): + If set to a positive number, the overflowing tokens returned will contain some tokens from the main + sequence returned. The value of this argument defines the number of additional tokens. + + Returns: + :obj:`Tuple[List[int], List[int], List[int]]`: The truncated ``ids``, the truncated ``pair_ids`` and the + list of overflowing tokens. + """ + if num_tokens_to_remove <= 0: + return ids, token_boxes, pair_ids, pair_token_boxes, labels, [], [], [] + + if not isinstance(truncation_strategy, TruncationStrategy): + truncation_strategy = TruncationStrategy(truncation_strategy) + + overflowing_tokens = [] + overflowing_token_boxes = [] + overflowing_labels = [] + if truncation_strategy == TruncationStrategy.LONGEST_FIRST: + for _ in range(num_tokens_to_remove): + if pair_ids is None or len(ids) > len(pair_ids): + if not overflowing_tokens: + window_len = min(len(ids), stride + 1) + else: + window_len = 1 + overflowing_tokens.extend(ids[-window_len:]) + overflowing_token_boxes.extend(token_boxes[-window_len:]) + overflowing_labels.extend(labels[-window_len:]) + ids = ids[:-1] + token_boxes = token_boxes[:-1] + labels = labels[:-1] + else: + if not overflowing_tokens: + window_len = min(len(pair_ids), stride + 1) + else: + window_len = 1 + overflowing_tokens.extend(pair_ids[-window_len:]) + overflowing_token_boxes.extend(pair_token_boxes[-window_len:]) + pair_ids = pair_ids[:-1] + pair_token_boxes = pair_token_boxes[:-1] + elif truncation_strategy == TruncationStrategy.ONLY_FIRST: + if len(ids) > num_tokens_to_remove: + window_len = min(len(ids), stride + num_tokens_to_remove) + overflowing_tokens = ids[-window_len:] + overflowing_token_boxes = token_boxes[-window_len:] + overflowing_labels = labels[-window_len:] + ids = ids[:-num_tokens_to_remove] + token_boxes = token_boxes[:-num_tokens_to_remove] + labels = labels[:-num_tokens_to_remove] + else: + logger.error( + f"We need to remove {num_tokens_to_remove} to truncate the input " + f"but the first sequence has a length {len(ids)}. " + f"Please select another truncation strategy than {truncation_strategy}, " + f"for instance 'longest_first' or 'only_second'." + ) + elif truncation_strategy == TruncationStrategy.ONLY_SECOND and pair_ids is not None: + if len(pair_ids) > num_tokens_to_remove: + window_len = min(len(pair_ids), stride + num_tokens_to_remove) + overflowing_tokens = pair_ids[-window_len:] + overflowing_token_boxes = pair_token_boxes[-window_len:] + pair_ids = pair_ids[:-num_tokens_to_remove] + pair_token_boxes = pair_token_boxes[:-num_tokens_to_remove] + else: + logger.error( + f"We need to remove {num_tokens_to_remove} to truncate the input " + f"but the second sequence has a length {len(pair_ids)}. " + f"Please select another truncation strategy than {truncation_strategy}, " + f"for instance 'longest_first' or 'only_first'." + ) + + return ( + ids, + token_boxes, + pair_ids, + pair_token_boxes, + labels, + overflowing_tokens, + overflowing_token_boxes, + overflowing_labels, + ) + + def _pad( + self, + encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding], + max_length: Optional[int] = None, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + pad_to_multiple_of: Optional[int] = None, + return_attention_mask: Optional[bool] = None, + ) -> dict: + """ + Pad encoded inputs (on left/right and up to predefined length or max length in the batch) + + Args: + encoded_inputs: Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`). + max_length: maximum length of the returned list and optionally padding length (see below). + Will truncate by taking into account the special tokens. + padding_strategy: PaddingStrategy to use for padding. + + - PaddingStrategy.LONGEST Pad to the longest sequence in the batch + - PaddingStrategy.MAX_LENGTH: Pad to the max length (default) + - PaddingStrategy.DO_NOT_PAD: Do not pad + The tokenizer padding sides are defined in self.padding_side: + + - 'left': pads on the left of the sequences + - 'right': pads on the right of the sequences + pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. + This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability + >= 7.5 (Volta). + return_attention_mask: (optional) Set to False to avoid returning attention mask (default: set to model specifics) + """ + # Load from model defaults + if return_attention_mask is None: + return_attention_mask = "attention_mask" in self.model_input_names + + required_input = encoded_inputs[self.model_input_names[0]] + + if padding_strategy == PaddingStrategy.LONGEST: + max_length = len(required_input) + + if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0): + max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of + + needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length + + # Initialize attention mask if not present. + if return_attention_mask and "attention_mask" not in encoded_inputs: + encoded_inputs["attention_mask"] = [1] * len(required_input) + + if needs_to_be_padded: + difference = max_length - len(required_input) + if self.padding_side == "right": + if return_attention_mask: + encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference + if "token_type_ids" in encoded_inputs: + encoded_inputs["token_type_ids"] = ( + encoded_inputs["token_type_ids"] + [self.pad_token_type_id] * difference + ) + if "bbox" in encoded_inputs: + encoded_inputs["bbox"] = encoded_inputs["bbox"] + [self.pad_token_box] * difference + if "labels" in encoded_inputs: + encoded_inputs["labels"] = encoded_inputs["labels"] + [self.pad_token_label] * difference + if "special_tokens_mask" in encoded_inputs: + encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference + encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference + elif self.padding_side == "left": + if return_attention_mask: + encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"] + if "token_type_ids" in encoded_inputs: + encoded_inputs["token_type_ids"] = [self.pad_token_type_id] * difference + encoded_inputs[ + "token_type_ids" + ] + if "bbox" in encoded_inputs: + encoded_inputs["bbox"] = [self.pad_token_box] * difference + encoded_inputs["bbox"] + if "labels" in encoded_inputs: + encoded_inputs["labels"] = [self.pad_token_label] * difference + encoded_inputs["bbox"] + if "special_tokens_mask" in encoded_inputs: + encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"] + encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input + else: + raise ValueError("Invalid padding strategy:" + str(self.padding_side)) + + return encoded_inputs diff --git a/src/transformers/models/layoutxlm/tokenization_layoutxlm_fast.py b/src/transformers/models/layoutxlm/tokenization_layoutxlm_fast.py new file mode 100644 index 0000000000..4f91da1f1c --- /dev/null +++ b/src/transformers/models/layoutxlm/tokenization_layoutxlm_fast.py @@ -0,0 +1,694 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License +""" Tokenization classes for LayoutXLM model.""" + + +import os +from shutil import copyfile +from typing import Dict, List, Optional, Tuple, Union + +from transformers.models.layoutlmv2.tokenization_layoutlmv2 import LAYOUTLMV2_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING + +from ...file_utils import PaddingStrategy, TensorType, add_end_docstrings, is_sentencepiece_available +from ...tokenization_utils import AddedToken +from ...tokenization_utils_base import ( + ENCODE_KWARGS_DOCSTRING, + BatchEncoding, + EncodedInput, + PreTokenizedInput, + TextInput, + TextInputPair, + TruncationStrategy, +) +from ...tokenization_utils_fast import PreTrainedTokenizerFast +from ...utils import logging +from ..xlm_roberta.tokenization_xlm_roberta import ( + PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES, + PRETRAINED_VOCAB_FILES_MAP, + VOCAB_FILES_NAMES, +) + + +if is_sentencepiece_available(): + from .tokenization_layoutxlm import LayoutXLMTokenizer +else: + LayoutXLMTokenizer = None + + +logger = logging.get_logger(__name__) + + +class LayoutXLMTokenizerFast(PreTrainedTokenizerFast): + """ + Construct a "fast" LayoutXLM tokenizer (backed by HuggingFace's `tokenizers` library). Adapted from + :class:`~transformers.RobertaTokenizer` and :class:`~transformers.XLNetTokenizer`. Based on `BPE + `__. + + This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main + methods. Users should refer to this superclass for more information regarding those methods. + + Args: + vocab_file (:obj:`str`): + Path to the vocabulary file. + bos_token (:obj:`str`, `optional`, defaults to :obj:`""`): + The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token. + + .. note:: + + When building a sequence using special tokens, this is not the token that is used for the beginning of + sequence. The token used is the :obj:`cls_token`. + eos_token (:obj:`str`, `optional`, defaults to :obj:`""`): + The end of sequence token. + + .. note:: + + When building a sequence using special tokens, this is not the token that is used for the end of + sequence. The token used is the :obj:`sep_token`. + sep_token (:obj:`str`, `optional`, defaults to :obj:`""`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for + sequence classification or for a text and a question for question answering. It is also used as the last + token of a sequence built with special tokens. + cls_token (:obj:`str`, `optional`, defaults to :obj:`""`): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. + unk_token (:obj:`str`, `optional`, defaults to :obj:`""`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + pad_token (:obj:`str`, `optional`, defaults to :obj:`""`): + The token used for padding, for example when batching sequences of different lengths. + mask_token (:obj:`str`, `optional`, defaults to :obj:`""`): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + cls_token_box (:obj:`List[int]`, `optional`, defaults to :obj:`[0, 0, 0, 0]`): + The bounding box to use for the special [CLS] token. + sep_token_box (:obj:`List[int]`, `optional`, defaults to :obj:`[1000, 1000, 1000, 1000]`): + The bounding box to use for the special [SEP] token. + pad_token_box (:obj:`List[int]`, `optional`, defaults to :obj:`[0, 0, 0, 0]`): + The bounding box to use for the special [PAD] token. + pad_token_label (:obj:`int`, `optional`, defaults to -100): + The label to use for padding tokens. Defaults to -100, which is the :obj:`ignore_index` of PyTorch's + CrossEntropyLoss. + only_label_first_subword (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not to only label the first subword, in case word labels are provided. + additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["NOTUSED", "NOTUSED"]`): + Additional special tokens used by the tokenizer. + """ + + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + model_input_names = ["input_ids", "attention_mask"] + slow_tokenizer_class = LayoutXLMTokenizer + + def __init__( + self, + vocab_file=None, + tokenizer_file=None, + bos_token="", + eos_token="", + sep_token="", + cls_token="", + unk_token="", + pad_token="", + mask_token="", + cls_token_box=[0, 0, 0, 0], + sep_token_box=[1000, 1000, 1000, 1000], + pad_token_box=[0, 0, 0, 0], + pad_token_label=-100, + only_label_first_subword=True, + **kwargs + ): + # Mask token behave like a normal word, i.e. include the space before it + mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token + + super().__init__( + vocab_file, + tokenizer_file=tokenizer_file, + bos_token=bos_token, + eos_token=eos_token, + sep_token=sep_token, + cls_token=cls_token, + unk_token=unk_token, + pad_token=pad_token, + mask_token=mask_token, + cls_token_box=cls_token_box, + sep_token_box=sep_token_box, + pad_token_box=pad_token_box, + pad_token_label=pad_token_label, + only_label_first_subword=only_label_first_subword, + **kwargs, + ) + + self.vocab_file = vocab_file + self.can_save_slow_tokenizer = False if not self.vocab_file else True + + # additional properties + self.cls_token_box = cls_token_box + self.sep_token_box = sep_token_box + self.pad_token_box = pad_token_box + self.pad_token_label = pad_token_label + self.only_label_first_subword = only_label_first_subword + + @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, LAYOUTLMV2_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING) + def __call__( + self, + text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]], + text_pair: Optional[Union[PreTokenizedInput, List[PreTokenizedInput]]] = None, + boxes: Union[List[List[int]], List[List[List[int]]]] = None, + word_labels: Optional[Union[List[int], List[List[int]]]] = None, + add_special_tokens: bool = True, + padding: Union[bool, str, PaddingStrategy] = False, + truncation: Union[bool, str, TruncationStrategy] = False, + max_length: Optional[int] = None, + stride: int = 0, + pad_to_multiple_of: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + **kwargs + ) -> BatchEncoding: + """ + Main method to tokenize and prepare for the model one or several sequence(s) or one or several pair(s) of + sequences with word-level normalized bounding boxes and optional labels. + + Args: + text (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`): + The sequence or batch of sequences to be encoded. Each sequence can be a string, a list of strings + (words of a single example or questions of a batch of examples) or a list of list of strings (batch of + words). + text_pair (:obj:`List[str]`, :obj:`List[List[str]]`): + The sequence or batch of sequences to be encoded. Each sequence should be a list of strings + (pretokenized string). + boxes (:obj:`List[List[int]]`, :obj:`List[List[List[int]]]`): + Word-level bounding boxes. Each bounding box should be normalized to be on a 0-1000 scale. + word_labels (:obj:`List[int]`, :obj:`List[List[int]]`, `optional`): + Word-level integer labels (for token classification tasks such as FUNSD, CORD). + """ + # Input type checking for clearer error + def _is_valid_text_input(t): + if isinstance(t, str): + # Strings are fine + return True + elif isinstance(t, (list, tuple)): + # List are fine as long as they are... + if len(t) == 0: + # ... empty + return True + elif isinstance(t[0], str): + # ... list of strings + return True + elif isinstance(t[0], (list, tuple)): + # ... list with an empty list or with a list of strings + return len(t[0]) == 0 or isinstance(t[0][0], str) + else: + return False + else: + return False + + if text_pair is not None: + # in case text + text_pair are provided, text = questions, text_pair = words + if not _is_valid_text_input(text): + raise ValueError("text input must of type `str` (single example) or `List[str]` (batch of examples). ") + if not isinstance(text_pair, (list, tuple)): + raise ValueError( + "words must of type `List[str]` (single pretokenized example), " + "or `List[List[str]]` (batch of pretokenized examples)." + ) + else: + # in case only text is provided => must be words + if not isinstance(text, (list, tuple)): + raise ValueError( + "Words must of type `List[str]` (single pretokenized example), " + "or `List[List[str]]` (batch of pretokenized examples)." + ) + + if text_pair is not None: + is_batched = isinstance(text, (list, tuple)) + else: + is_batched = isinstance(text, (list, tuple)) and text and isinstance(text[0], (list, tuple)) + + words = text if text_pair is None else text_pair + if boxes is None: + raise ValueError("You must provide corresponding bounding boxes") + if is_batched: + if len(words) != len(boxes): + raise ValueError("You must provide words and boxes for an equal amount of examples") + for words_example, boxes_example in zip(words, boxes): + if len(words_example) != len(boxes_example): + raise ValueError("You must provide as many words as there are bounding boxes") + else: + if len(words) != len(boxes): + raise ValueError("You must provide as many words as there are bounding boxes") + + if is_batched: + if text_pair is not None and len(text) != len(text_pair): + raise ValueError( + f"batch length of `text`: {len(text)} does not match batch length of `text_pair`: {len(text_pair)}." + ) + batch_text_or_text_pairs = list(zip(text, text_pair)) if text_pair is not None else text + is_pair = bool(text_pair is not None) + return self.batch_encode_plus( + batch_text_or_text_pairs=batch_text_or_text_pairs, + is_pair=is_pair, + boxes=boxes, + word_labels=word_labels, + add_special_tokens=add_special_tokens, + padding=padding, + truncation=truncation, + max_length=max_length, + stride=stride, + pad_to_multiple_of=pad_to_multiple_of, + return_tensors=return_tensors, + return_token_type_ids=return_token_type_ids, + return_attention_mask=return_attention_mask, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_offsets_mapping=return_offsets_mapping, + return_length=return_length, + verbose=verbose, + **kwargs, + ) + else: + return self.encode_plus( + text=text, + text_pair=text_pair, + boxes=boxes, + word_labels=word_labels, + add_special_tokens=add_special_tokens, + padding=padding, + truncation=truncation, + max_length=max_length, + stride=stride, + pad_to_multiple_of=pad_to_multiple_of, + return_tensors=return_tensors, + return_token_type_ids=return_token_type_ids, + return_attention_mask=return_attention_mask, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_offsets_mapping=return_offsets_mapping, + return_length=return_length, + verbose=verbose, + **kwargs, + ) + + def tokenize(self, text: str, pair: Optional[str] = None, add_special_tokens: bool = False, **kwargs) -> List[str]: + batched_input = [(text, pair)] if pair else [text] + encodings = self._tokenizer.encode_batch( + batched_input, add_special_tokens=add_special_tokens, is_pretokenized=False, **kwargs + ) + + return encodings[0].tokens + + def _batch_encode_plus( + self, + batch_text_or_text_pairs: Union[ + List[TextInput], + List[TextInputPair], + List[PreTokenizedInput], + ], + is_pair: bool = None, + boxes: Optional[List[List[List[int]]]] = None, + word_labels: Optional[List[List[int]]] = None, + add_special_tokens: bool = True, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE, + max_length: Optional[int] = None, + stride: int = 0, + pad_to_multiple_of: Optional[int] = None, + return_tensors: Optional[str] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + **kwargs, + ) -> BatchEncoding: + + if not isinstance(batch_text_or_text_pairs, list): + raise TypeError(f"batch_text_or_text_pairs has to be a list (got {type(batch_text_or_text_pairs)})") + + # Set the truncation and padding strategy and restore the initial configuration + self.set_truncation_and_padding( + padding_strategy=padding_strategy, + truncation_strategy=truncation_strategy, + max_length=max_length, + stride=stride, + pad_to_multiple_of=pad_to_multiple_of, + ) + + if is_pair: + batch_text_or_text_pairs = [(text.split(), text_pair) for text, text_pair in batch_text_or_text_pairs] + + encodings = self._tokenizer.encode_batch( + batch_text_or_text_pairs, + add_special_tokens=add_special_tokens, + is_pretokenized=True, # we set this to True as LayoutLMv2 always expects pretokenized inputs + ) + + # Convert encoding to dict + # `Tokens` has type: Tuple[ + # List[Dict[str, List[List[int]]]] or List[Dict[str, 2D-Tensor]], + # List[EncodingFast] + # ] + # with nested dimensions corresponding to batch, overflows, sequence length + tokens_and_encodings = [ + self._convert_encoding( + encoding=encoding, + return_token_type_ids=return_token_type_ids, + return_attention_mask=return_attention_mask, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_offsets_mapping=True + if word_labels is not None + else return_offsets_mapping, # we use offsets to create the labels + return_length=return_length, + verbose=verbose, + ) + for encoding in encodings + ] + + # Convert the output to have dict[list] from list[dict] and remove the additional overflows dimension + # From (variable) shape (batch, overflows, sequence length) to ~ (batch * overflows, sequence length) + # (we say ~ because the number of overflow varies with the example in the batch) + # + # To match each overflowing sample with the original sample in the batch + # we add an overflow_to_sample_mapping array (see below) + sanitized_tokens = {} + for key in tokens_and_encodings[0][0].keys(): + stack = [e for item, _ in tokens_and_encodings for e in item[key]] + sanitized_tokens[key] = stack + sanitized_encodings = [e for _, item in tokens_and_encodings for e in item] + + # If returning overflowing tokens, we need to return a mapping + # from the batch idx to the original sample + if return_overflowing_tokens: + overflow_to_sample_mapping = [] + for i, (toks, _) in enumerate(tokens_and_encodings): + overflow_to_sample_mapping += [i] * len(toks["input_ids"]) + sanitized_tokens["overflow_to_sample_mapping"] = overflow_to_sample_mapping + + for input_ids in sanitized_tokens["input_ids"]: + self._eventual_warn_about_too_long_sequence(input_ids, max_length, verbose) + + # create the token boxes + token_boxes = [] + for batch_index in range(len(sanitized_tokens["input_ids"])): + if return_overflowing_tokens: + original_index = sanitized_tokens["overflow_to_sample_mapping"][batch_index] + else: + original_index = batch_index + token_boxes_example = [] + for id, sequence_id, word_id in zip( + sanitized_tokens["input_ids"][batch_index], + sanitized_encodings[batch_index].sequence_ids, + sanitized_encodings[batch_index].word_ids, + ): + if word_id is not None: + if is_pair and sequence_id == 0: + token_boxes_example.append(self.pad_token_box) + else: + token_boxes_example.append(boxes[original_index][word_id]) + else: + if id == self.cls_token_id: + token_boxes_example.append(self.cls_token_box) + elif id == self.sep_token_id: + token_boxes_example.append(self.sep_token_box) + elif id == self.pad_token_id: + token_boxes_example.append(self.pad_token_box) + else: + raise ValueError("Id not recognized") + token_boxes.append(token_boxes_example) + + sanitized_tokens["bbox"] = token_boxes + + # optionally, create the labels + if word_labels is not None: + labels = [] + for batch_index in range(len(sanitized_tokens["input_ids"])): + if return_overflowing_tokens: + original_index = sanitized_tokens["overflow_to_sample_mapping"][batch_index] + else: + original_index = batch_index + labels_example = [] + for id, offset, word_id in zip( + sanitized_tokens["input_ids"][batch_index], + sanitized_tokens["offset_mapping"][batch_index], + sanitized_encodings[batch_index].word_ids, + ): + if word_id is not None: + if self.only_label_first_subword: + if offset[0] == 0: + # Use the real label id for the first token of the word, and padding ids for the remaining tokens + labels_example.append(word_labels[original_index][word_id]) + else: + labels_example.append(self.pad_token_label) + else: + labels_example.append(word_labels[original_index][word_id]) + else: + labels_example.append(self.pad_token_label) + labels.append(labels_example) + + sanitized_tokens["labels"] = labels + # finally, remove offsets if the user didn't want them + if not return_offsets_mapping: + del sanitized_tokens["offset_mapping"] + + return BatchEncoding(sanitized_tokens, sanitized_encodings, tensor_type=return_tensors) + + def _encode_plus( + self, + text: Union[TextInput, PreTokenizedInput], + text_pair: Optional[PreTokenizedInput] = None, + boxes: Optional[List[List[int]]] = None, + word_labels: Optional[List[int]] = None, + add_special_tokens: bool = True, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE, + max_length: Optional[int] = None, + stride: int = 0, + pad_to_multiple_of: Optional[int] = None, + return_tensors: Optional[bool] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + **kwargs + ) -> BatchEncoding: + + # make it a batched input + # 2 options: + # 1) only text, in case text must be a list of str + # 2) text + text_pair, in which case text = str and text_pair a list of str + batched_input = [(text, text_pair)] if text_pair else [text] + batched_boxes = [boxes] + batched_word_labels = [word_labels] if word_labels is not None else None + batched_output = self._batch_encode_plus( + batched_input, + is_pair=bool(text_pair is not None), + boxes=batched_boxes, + word_labels=batched_word_labels, + add_special_tokens=add_special_tokens, + padding_strategy=padding_strategy, + truncation_strategy=truncation_strategy, + max_length=max_length, + stride=stride, + pad_to_multiple_of=pad_to_multiple_of, + return_tensors=return_tensors, + return_token_type_ids=return_token_type_ids, + return_attention_mask=return_attention_mask, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_offsets_mapping=return_offsets_mapping, + return_length=return_length, + verbose=verbose, + **kwargs, + ) + + # Return tensor is None, then we can remove the leading batch axis + # Overflowing tokens are returned as a batch of output so we keep them in this case + if return_tensors is None and not return_overflowing_tokens: + batched_output = BatchEncoding( + { + key: value[0] if len(value) > 0 and isinstance(value[0], list) else value + for key, value in batched_output.items() + }, + batched_output.encodings, + ) + + self._eventual_warn_about_too_long_sequence(batched_output["input_ids"], max_length, verbose) + + return batched_output + + def _pad( + self, + encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding], + max_length: Optional[int] = None, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + pad_to_multiple_of: Optional[int] = None, + return_attention_mask: Optional[bool] = None, + ) -> dict: + """ + Pad encoded inputs (on left/right and up to predefined length or max length in the batch) + + Args: + encoded_inputs: Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`). + max_length: maximum length of the returned list and optionally padding length (see below). + Will truncate by taking into account the special tokens. + padding_strategy: PaddingStrategy to use for padding. + + - PaddingStrategy.LONGEST Pad to the longest sequence in the batch + - PaddingStrategy.MAX_LENGTH: Pad to the max length (default) + - PaddingStrategy.DO_NOT_PAD: Do not pad + The tokenizer padding sides are defined in self.padding_side: + + - 'left': pads on the left of the sequences + - 'right': pads on the right of the sequences + pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. + This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability + >= 7.5 (Volta). + return_attention_mask: (optional) Set to False to avoid returning attention mask (default: set to model specifics) + """ + # Load from model defaults + if return_attention_mask is None: + return_attention_mask = "attention_mask" in self.model_input_names + + required_input = encoded_inputs[self.model_input_names[0]] + + if padding_strategy == PaddingStrategy.LONGEST: + max_length = len(required_input) + + if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0): + max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of + + needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length + + # Initialize attention mask if not present. + if return_attention_mask and "attention_mask" not in encoded_inputs: + encoded_inputs["attention_mask"] = [1] * len(required_input) + + if needs_to_be_padded: + difference = max_length - len(required_input) + if self.padding_side == "right": + if return_attention_mask: + encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference + if "token_type_ids" in encoded_inputs: + encoded_inputs["token_type_ids"] = ( + encoded_inputs["token_type_ids"] + [self.pad_token_type_id] * difference + ) + if "bbox" in encoded_inputs: + encoded_inputs["bbox"] = encoded_inputs["bbox"] + [self.pad_token_box] * difference + if "labels" in encoded_inputs: + encoded_inputs["labels"] = encoded_inputs["labels"] + [self.pad_token_label] * difference + if "special_tokens_mask" in encoded_inputs: + encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference + encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference + elif self.padding_side == "left": + if return_attention_mask: + encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"] + if "token_type_ids" in encoded_inputs: + encoded_inputs["token_type_ids"] = [self.pad_token_type_id] * difference + encoded_inputs[ + "token_type_ids" + ] + if "bbox" in encoded_inputs: + encoded_inputs["bbox"] = [self.pad_token_box] * difference + encoded_inputs["bbox"] + if "labels" in encoded_inputs: + encoded_inputs["labels"] = [self.pad_token_label] * difference + encoded_inputs["bbox"] + if "special_tokens_mask" in encoded_inputs: + encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"] + encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input + else: + raise ValueError("Invalid padding strategy:" + str(self.padding_side)) + + return encoded_inputs + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. An XLM-RoBERTa sequence has the following format: + + - single sequence: `` X `` + - pair of sequences: `` A B `` + + Args: + token_ids_0 (:obj:`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (:obj:`List[int]`, `optional`): + Optional second list of IDs for sequence pairs. + + Returns: + :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens. + """ + + if token_ids_1 is None: + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + cls = [self.cls_token_id] + sep = [self.sep_token_id] + return cls + token_ids_0 + sep + sep + token_ids_1 + sep + + def create_token_type_ids_from_sequences( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. XLM-RoBERTa does + not make use of token type ids, therefore a list of zeros is returned. + + Args: + token_ids_0 (:obj:`List[int]`): + List of IDs. + token_ids_1 (:obj:`List[int]`, `optional`): + Optional second list of IDs for sequence pairs. + + Returns: + :obj:`List[int]`: List of zeros. + + """ + + sep = [self.sep_token_id] + cls = [self.cls_token_id] + + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] + return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0] + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: + if not self.can_save_slow_tokenizer: + raise ValueError( + "Your fast tokenizer does not have the necessary information to save the vocabulary for a slow " + "tokenizer." + ) + + if not os.path.isdir(save_directory): + logger.error(f"Vocabulary path ({save_directory}) should be a directory.") + return + out_vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + + if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file): + copyfile(self.vocab_file, out_vocab_file) + + return (out_vocab_file,) diff --git a/src/transformers/utils/dummy_sentencepiece_objects.py b/src/transformers/utils/dummy_sentencepiece_objects.py index cdc2a892f8..ba59da4c0b 100644 --- a/src/transformers/utils/dummy_sentencepiece_objects.py +++ b/src/transformers/utils/dummy_sentencepiece_objects.py @@ -65,6 +65,15 @@ class DebertaV2Tokenizer: requires_backends(cls, ["sentencepiece"]) +class LayoutXLMTokenizer: + def __init__(self, *args, **kwargs): + requires_backends(self, ["sentencepiece"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["sentencepiece"]) + + class M2M100Tokenizer: def __init__(self, *args, **kwargs): requires_backends(self, ["sentencepiece"]) diff --git a/src/transformers/utils/dummy_tokenizers_objects.py b/src/transformers/utils/dummy_tokenizers_objects.py index c5887c55d9..d641b49ead 100644 --- a/src/transformers/utils/dummy_tokenizers_objects.py +++ b/src/transformers/utils/dummy_tokenizers_objects.py @@ -200,6 +200,15 @@ class LayoutLMv2TokenizerFast: requires_backends(cls, ["tokenizers"]) +class LayoutXLMTokenizerFast: + def __init__(self, *args, **kwargs): + requires_backends(self, ["tokenizers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["tokenizers"]) + + class LEDTokenizerFast: def __init__(self, *args, **kwargs): requires_backends(self, ["tokenizers"]) diff --git a/src/transformers/utils/dummy_vision_objects.py b/src/transformers/utils/dummy_vision_objects.py index 3276185d0f..615c20e080 100644 --- a/src/transformers/utils/dummy_vision_objects.py +++ b/src/transformers/utils/dummy_vision_objects.py @@ -50,6 +50,15 @@ class LayoutLMv2Processor: requires_backends(cls, ["vision"]) +class LayoutXLMProcessor: + def __init__(self, *args, **kwargs): + requires_backends(self, ["vision"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["vision"]) + + class SegformerFeatureExtractor: def __init__(self, *args, **kwargs): requires_backends(self, ["vision"]) diff --git a/tests/test_processor_layoutxlm.py b/tests/test_processor_layoutxlm.py new file mode 100644 index 0000000000..0ba24d3dac --- /dev/null +++ b/tests/test_processor_layoutxlm.py @@ -0,0 +1,423 @@ +# Copyright 2021 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import os +import shutil +import tempfile +import unittest +from typing import List + +from transformers import PreTrainedTokenizer, PreTrainedTokenizerBase, PreTrainedTokenizerFast +from transformers.file_utils import FEATURE_EXTRACTOR_NAME, cached_property, is_pytesseract_available +from transformers.models.layoutxlm import LayoutXLMTokenizer, LayoutXLMTokenizerFast +from transformers.testing_utils import ( + require_pytesseract, + require_sentencepiece, + require_tokenizers, + require_torch, + slow, +) + + +if is_pytesseract_available(): + from PIL import Image + + from transformers import LayoutLMv2FeatureExtractor, LayoutXLMProcessor + + +SAMPLE_SP = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/test_sentencepiece.model") + + +@require_pytesseract +@require_sentencepiece +@require_tokenizers +class LayoutXLMProcessorTest(unittest.TestCase): + tokenizer_class = LayoutXLMTokenizer + rust_tokenizer_class = LayoutXLMTokenizerFast + + def setUp(self): + feature_extractor_map = { + "do_resize": True, + "size": 224, + "apply_ocr": True, + } + + self.tmpdirname = tempfile.mkdtemp() + self.feature_extraction_file = os.path.join(self.tmpdirname, FEATURE_EXTRACTOR_NAME) + with open(self.feature_extraction_file, "w", encoding="utf-8") as fp: + fp.write(json.dumps(feature_extractor_map) + "\n") + + def get_tokenizer(self, **kwargs) -> PreTrainedTokenizer: + return self.tokenizer_class.from_pretrained(SAMPLE_SP, **kwargs) + + def get_rust_tokenizer(self, **kwargs) -> PreTrainedTokenizerFast: + return self.rust_tokenizer_class.from_pretrained(SAMPLE_SP, **kwargs) + + def get_tokenizers(self, **kwargs) -> List[PreTrainedTokenizerBase]: + return [self.get_tokenizer(**kwargs), self.get_rust_tokenizer(**kwargs)] + + def get_feature_extractor(self, **kwargs): + return LayoutLMv2FeatureExtractor.from_pretrained(self.tmpdirname, **kwargs) + + def tearDown(self): + shutil.rmtree(self.tmpdirname) + + def test_save_load_pretrained_default(self): + feature_extractor = self.get_feature_extractor() + tokenizers = self.get_tokenizers() + for tokenizer in tokenizers: + processor = LayoutXLMProcessor(feature_extractor=feature_extractor, tokenizer=tokenizer) + + processor.save_pretrained(self.tmpdirname) + processor = LayoutXLMProcessor.from_pretrained(self.tmpdirname) + + self.assertEqual(processor.tokenizer.get_vocab(), tokenizer.get_vocab()) + self.assertIsInstance(processor.tokenizer, (LayoutXLMTokenizer, LayoutXLMTokenizerFast)) + + self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor.to_json_string()) + self.assertIsInstance(processor.feature_extractor, LayoutLMv2FeatureExtractor) + + def test_save_load_pretrained_additional_features(self): + processor = LayoutXLMProcessor(feature_extractor=self.get_feature_extractor(), tokenizer=self.get_tokenizer()) + processor.save_pretrained(self.tmpdirname) + + # slow tokenizer + tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)") + feature_extractor_add_kwargs = self.get_feature_extractor(do_resize=False, size=30) + + processor = LayoutXLMProcessor.from_pretrained( + self.tmpdirname, + use_fast=False, + bos_token="(BOS)", + eos_token="(EOS)", + do_resize=False, + size=30, + ) + + self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab()) + self.assertIsInstance(processor.tokenizer, LayoutXLMTokenizer) + + self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor_add_kwargs.to_json_string()) + self.assertIsInstance(processor.feature_extractor, LayoutLMv2FeatureExtractor) + + # fast tokenizer + tokenizer_add_kwargs = self.get_rust_tokenizer(bos_token="(BOS)", eos_token="(EOS)") + feature_extractor_add_kwargs = self.get_feature_extractor(do_resize=False, size=30) + + processor = LayoutXLMProcessor.from_pretrained( + self.tmpdirname, use_xlm=True, bos_token="(BOS)", eos_token="(EOS)", do_resize=False, size=30 + ) + + self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab()) + self.assertIsInstance(processor.tokenizer, LayoutXLMTokenizerFast) + + self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor_add_kwargs.to_json_string()) + self.assertIsInstance(processor.feature_extractor, LayoutLMv2FeatureExtractor) + + +# different use cases tests +@require_sentencepiece +@require_torch +@require_pytesseract +class LayoutXLMProcessorIntegrationTests(unittest.TestCase): + @cached_property + def get_images(self): + # we verify our implementation on 2 document images from the DocVQA dataset + from datasets import load_dataset + + ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test") + + image_1 = Image.open(ds[0]["file"]).convert("RGB") + image_2 = Image.open(ds[1]["file"]).convert("RGB") + + return image_1, image_2 + + @cached_property + def get_tokenizers(self): + slow_tokenizer = LayoutXLMTokenizer.from_pretrained("microsoft/layoutxlm-base") + fast_tokenizer = LayoutXLMTokenizerFast.from_pretrained("microsoft/layoutxlm-base") + return [slow_tokenizer, fast_tokenizer] + + @slow + def test_processor_case_1(self): + # case 1: document image classification (training, inference) + token classification (inference), apply_ocr = True + + feature_extractor = LayoutLMv2FeatureExtractor() + tokenizers = self.get_tokenizers + images = self.get_images + + for tokenizer in tokenizers: + processor = LayoutXLMProcessor(feature_extractor=feature_extractor, tokenizer=tokenizer) + + # not batched + input_feat_extract = feature_extractor(images[0], return_tensors="pt") + input_processor = processor(images[0], return_tensors="pt") + + # verify keys + expected_keys = ["attention_mask", "bbox", "image", "input_ids"] + actual_keys = sorted(list(input_processor.keys())) + self.assertListEqual(actual_keys, expected_keys) + + # verify image + self.assertAlmostEqual( + input_feat_extract["pixel_values"].sum(), input_processor["image"].sum(), delta=1e-2 + ) + + # verify input_ids + # fmt: off + expected_decoding = " 11:14 to 11:39 a.m 11:39 to 11:44 a.m. 11:44 a.m. to 12:25 p.m. 12:25 to 12:58 p.m. 12:58 to 4:00 p.m. 2:00 to 5:00 p.m. Coffee Break Coffee will be served for men and women in the lobby adjacent to exhibit area. Please move into exhibit area. (Exhibits Open) TRRF GENERAL SESSION (PART |) Presiding: Lee A. Waller TRRF Vice President “Introductory Remarks” Lee A. Waller, TRRF Vice Presi- dent Individual Interviews with TRRF Public Board Members and Sci- entific Advisory Council Mem- bers Conducted by TRRF Treasurer Philip G. Kuehn to get answers which the public refrigerated warehousing industry is looking for. Plus questions from the floor. Dr. Emil M. Mrak, University of Cal- ifornia, Chairman, TRRF Board; Sam R. Cecil, University of Georgia College of Agriculture; Dr. Stanley Charm, Tufts University School of Medicine; Dr. Robert H. Cotton, ITT Continental Baking Company; Dr. Owen Fennema, University of Wis- consin; Dr. Robert E. Hardenburg, USDA. Questions and Answers Exhibits Open Capt. Jack Stoney Room TRRF Scientific Advisory Council Meeting Ballroom Foyer" # noqa: E231 + # fmt: on + decoding = tokenizer.decode(input_processor.input_ids.squeeze().tolist()) + self.assertSequenceEqual(decoding, expected_decoding) + + # batched + input_feat_extract = feature_extractor(images, return_tensors="pt") + input_processor = processor(images, padding=True, return_tensors="pt") + + # verify keys + expected_keys = ["attention_mask", "bbox", "image", "input_ids"] + actual_keys = sorted(list(input_processor.keys())) + self.assertListEqual(actual_keys, expected_keys) + + # verify images + self.assertAlmostEqual( + input_feat_extract["pixel_values"].sum(), input_processor["image"].sum(), delta=1e-2 + ) + + # verify input_ids + # fmt: off + expected_decoding = " 7 ITC Limited REPORT AND ACCOUNTS 2013 ITC’s Brands: An Asset for the Nation The consumer needs and aspirations they fulfil, the benefit they generate for millions across ITC’s value chains, the future-ready capabilities that support them, and the value that they create for the country, have made ITC’s brands national assets, adding to India’s competitiveness. It is ITC’s aspiration to be the No 1 FMCG player in the country, driven by its new FMCG businesses. A recent Nielsen report has highlighted that ITC's new FMCG businesses are the fastest growing among the top consumer goods companies operating in India. ITC takes justifiable pride that, along with generating economic value, these celebrated Indian brands also drive the creation of larger societal capital through the virtuous cycle of sustainable and inclusive growth. DI WILLS * ; LOVE DELIGHTFULLY SOFT SKIN? aia Ans Source: https://www.industrydocuments.ucsf.edu/docs/snbx0223" # noqa: E231 + # fmt: on + decoding = tokenizer.decode(input_processor.input_ids[1].tolist()) + self.assertSequenceEqual(decoding, expected_decoding) + + @slow + def test_processor_case_2(self): + # case 2: document image classification (training, inference) + token classification (inference), apply_ocr=False + + feature_extractor = LayoutLMv2FeatureExtractor(apply_ocr=False) + tokenizers = self.get_tokenizers + images = self.get_images + + for tokenizer in tokenizers: + processor = LayoutXLMProcessor(feature_extractor=feature_extractor, tokenizer=tokenizer) + + # not batched + words = ["hello", "world"] + boxes = [[1, 2, 3, 4], [5, 6, 7, 8]] + input_processor = processor(images[0], words, boxes=boxes, return_tensors="pt") + + # verify keys + expected_keys = ["input_ids", "bbox", "attention_mask", "image"] + actual_keys = list(input_processor.keys()) + for key in expected_keys: + self.assertIn(key, actual_keys) + + # verify input_ids + expected_decoding = " hello world" + decoding = tokenizer.decode(input_processor.input_ids.squeeze().tolist()) + self.assertSequenceEqual(decoding, expected_decoding) + + # batched + words = [["hello", "world"], ["my", "name", "is", "niels"]] + boxes = [[[1, 2, 3, 4], [5, 6, 7, 8]], [[3, 2, 5, 1], [6, 7, 4, 2], [3, 9, 2, 4], [1, 1, 2, 3]]] + input_processor = processor(images, words, boxes=boxes, padding=True, return_tensors="pt") + + # verify keys + expected_keys = ["attention_mask", "bbox", "image", "input_ids"] + actual_keys = sorted(list(input_processor.keys())) + self.assertListEqual(actual_keys, expected_keys) + + # verify input_ids + expected_decoding = " hello world" + decoding = tokenizer.decode(input_processor.input_ids[0].tolist()) + self.assertSequenceEqual(decoding, expected_decoding) + + # verify bbox + expected_bbox = [ + [0, 0, 0, 0], + [3, 2, 5, 1], + [6, 7, 4, 2], + [3, 9, 2, 4], + [1, 1, 2, 3], + [1, 1, 2, 3], + [1000, 1000, 1000, 1000], + ] + self.assertListEqual(input_processor.bbox[1].tolist(), expected_bbox) + + @slow + def test_processor_case_3(self): + # case 3: token classification (training), apply_ocr=False + + feature_extractor = LayoutLMv2FeatureExtractor(apply_ocr=False) + tokenizers = self.get_tokenizers + images = self.get_images + + for tokenizer in tokenizers: + processor = LayoutXLMProcessor(feature_extractor=feature_extractor, tokenizer=tokenizer) + + # not batched + words = ["weirdly", "world"] + boxes = [[1, 2, 3, 4], [5, 6, 7, 8]] + word_labels = [1, 2] + input_processor = processor(images[0], words, boxes=boxes, word_labels=word_labels, return_tensors="pt") + + # verify keys + expected_keys = ["attention_mask", "bbox", "image", "input_ids", "labels"] + actual_keys = sorted(list(input_processor.keys())) + self.assertListEqual(actual_keys, expected_keys) + + # verify input_ids + expected_decoding = " weirdly world" + decoding = tokenizer.decode(input_processor.input_ids.squeeze().tolist()) + self.assertSequenceEqual(decoding, expected_decoding) + + # verify labels + expected_labels = [-100, 1, -100, 2, -100] + self.assertListEqual(input_processor.labels.squeeze().tolist(), expected_labels) + + # batched + words = [["hello", "world"], ["my", "name", "is", "niels"]] + boxes = [[[1, 2, 3, 4], [5, 6, 7, 8]], [[3, 2, 5, 1], [6, 7, 4, 2], [3, 9, 2, 4], [1, 1, 2, 3]]] + word_labels = [[1, 2], [6, 3, 10, 2]] + input_processor = processor( + images, words, boxes=boxes, word_labels=word_labels, padding=True, return_tensors="pt" + ) + + # verify keys + expected_keys = ["attention_mask", "bbox", "image", "input_ids", "labels"] + actual_keys = sorted(list(input_processor.keys())) + self.assertListEqual(actual_keys, expected_keys) + + # verify input_ids + expected_decoding = " my name is niels" + decoding = tokenizer.decode(input_processor.input_ids[1].tolist()) + self.assertSequenceEqual(decoding, expected_decoding) + + # verify bbox + expected_bbox = [ + [0, 0, 0, 0], + [3, 2, 5, 1], + [6, 7, 4, 2], + [3, 9, 2, 4], + [1, 1, 2, 3], + [1, 1, 2, 3], + [1000, 1000, 1000, 1000], + ] + self.assertListEqual(input_processor.bbox[1].tolist(), expected_bbox) + + # verify labels + expected_labels = [-100, 6, 3, 10, 2, -100, -100] + self.assertListEqual(input_processor.labels[1].tolist(), expected_labels) + + @slow + def test_processor_case_4(self): + # case 4: visual question answering (inference), apply_ocr=True + + feature_extractor = LayoutLMv2FeatureExtractor() + tokenizers = self.get_tokenizers + images = self.get_images + + for tokenizer in tokenizers: + processor = LayoutXLMProcessor(feature_extractor=feature_extractor, tokenizer=tokenizer) + + # not batched + question = "What's his name?" + input_processor = processor(images[0], question, return_tensors="pt") + + # verify keys + expected_keys = ["attention_mask", "bbox", "image", "input_ids"] + actual_keys = sorted(list(input_processor.keys())) + self.assertListEqual(actual_keys, expected_keys) + + # verify input_ids + # fmt: off + expected_decoding = " What's his name? 11:14 to 11:39 a.m 11:39 to 11:44 a.m. 11:44 a.m. to 12:25 p.m. 12:25 to 12:58 p.m. 12:58 to 4:00 p.m. 2:00 to 5:00 p.m. Coffee Break Coffee will be served for men and women in the lobby adjacent to exhibit area. Please move into exhibit area. (Exhibits Open) TRRF GENERAL SESSION (PART |) Presiding: Lee A. Waller TRRF Vice President “Introductory Remarks” Lee A. Waller, TRRF Vice Presi- dent Individual Interviews with TRRF Public Board Members and Sci- entific Advisory Council Mem- bers Conducted by TRRF Treasurer Philip G. Kuehn to get answers which the public refrigerated warehousing industry is looking for. Plus questions from the floor. Dr. Emil M. Mrak, University of Cal- ifornia, Chairman, TRRF Board; Sam R. Cecil, University of Georgia College of Agriculture; Dr. Stanley Charm, Tufts University School of Medicine; Dr. Robert H. Cotton, ITT Continental Baking Company; Dr. Owen Fennema, University of Wis- consin; Dr. Robert E. Hardenburg, USDA. Questions and Answers Exhibits Open Capt. Jack Stoney Room TRRF Scientific Advisory Council Meeting Ballroom Foyer" # noqa: E231 + # fmt: on + decoding = tokenizer.decode(input_processor.input_ids.squeeze().tolist()) + self.assertSequenceEqual(decoding, expected_decoding) + + # batched + questions = ["How old is he?", "what's the time"] + input_processor = processor( + images, questions, padding="max_length", max_length=20, truncation=True, return_tensors="pt" + ) + + # verify keys + expected_keys = ["attention_mask", "bbox", "image", "input_ids"] + actual_keys = sorted(list(input_processor.keys())) + self.assertListEqual(actual_keys, expected_keys) + + # verify input_ids + expected_decoding = " what's the time 7 ITC Limited REPORT AND ACCOUNTS 2013" + decoding = tokenizer.decode(input_processor.input_ids[1].tolist()) + self.assertSequenceEqual(decoding, expected_decoding) + + # verify bbox + # fmt: off + expected_bbox = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [1000, 1000, 1000, 1000], [1000, 1000, 1000, 1000], [0, 45, 67, 80], [72, 56, 109, 67], [72, 56, 109, 67], [116, 56, 189, 67], [198, 59, 253, 66], [257, 59, 285, 66], [289, 59, 365, 66], [289, 59, 365, 66], [289, 59, 365, 66], [289, 59, 365, 66], [372, 59, 407, 66], [1000, 1000, 1000, 1000]] # noqa: E231 + # fmt: on + self.assertListEqual(input_processor.bbox[1].tolist(), expected_bbox) + + @slow + def test_processor_case_5(self): + # case 5: visual question answering (inference), apply_ocr=False + + feature_extractor = LayoutLMv2FeatureExtractor(apply_ocr=False) + tokenizers = self.get_tokenizers + images = self.get_images + + for tokenizer in tokenizers: + processor = LayoutXLMProcessor(feature_extractor=feature_extractor, tokenizer=tokenizer) + + # not batched + question = "What's his name?" + words = ["hello", "world"] + boxes = [[1, 2, 3, 4], [5, 6, 7, 8]] + input_processor = processor(images[0], question, words, boxes, return_tensors="pt") + + # verify keys + expected_keys = ["attention_mask", "bbox", "image", "input_ids"] + actual_keys = sorted(list(input_processor.keys())) + self.assertListEqual(actual_keys, expected_keys) + + # verify input_ids + expected_decoding = " What's his name? hello world" + decoding = tokenizer.decode(input_processor.input_ids.squeeze().tolist()) + self.assertSequenceEqual(decoding, expected_decoding) + + # batched + questions = ["How old is he?", "what's the time"] + words = [["hello", "world"], ["my", "name", "is", "niels"]] + boxes = [[[1, 2, 3, 4], [5, 6, 7, 8]], [[3, 2, 5, 1], [6, 7, 4, 2], [3, 9, 2, 4], [1, 1, 2, 3]]] + input_processor = processor(images, questions, words, boxes, padding=True, return_tensors="pt") + + # verify keys + expected_keys = ["attention_mask", "bbox", "image", "input_ids"] + actual_keys = sorted(list(input_processor.keys())) + self.assertListEqual(actual_keys, expected_keys) + + # verify input_ids + expected_decoding = " How old is he? hello world" + decoding = tokenizer.decode(input_processor.input_ids[0].tolist()) + self.assertSequenceEqual(decoding, expected_decoding) + + expected_decoding = " what's the time my name is niels" + decoding = tokenizer.decode(input_processor.input_ids[1].tolist()) + self.assertSequenceEqual(decoding, expected_decoding) + + # verify bbox + expected_bbox = [[6, 7, 4, 2], [3, 9, 2, 4], [1, 1, 2, 3], [1, 1, 2, 3], [1000, 1000, 1000, 1000]] + self.assertListEqual(input_processor.bbox[1].tolist()[-5:], expected_bbox) diff --git a/tests/test_tokenization_layoutxlm.py b/tests/test_tokenization_layoutxlm.py new file mode 100644 index 0000000000..025a0ef29a --- /dev/null +++ b/tests/test_tokenization_layoutxlm.py @@ -0,0 +1,1785 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +import os +import shutil +import tempfile +import unittest +from typing import List + +from transformers import AddedToken, LayoutXLMTokenizerFast, SpecialTokensMixin, is_tf_available, is_torch_available +from transformers.models.layoutxlm.tokenization_layoutxlm import LayoutXLMTokenizer +from transformers.testing_utils import ( + is_pt_tf_cross_test, + require_pandas, + require_scatter, + require_sentencepiece, + require_tokenizers, + require_torch, + slow, +) + +from .test_tokenization_common import ( + SMALL_TRAINING_CORPUS, + TokenizerTesterMixin, + filter_non_english, + merge_model_tokenizer_mappings, +) + + +SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/test_sentencepiece.model") + + +@require_sentencepiece +@require_tokenizers +@require_pandas +class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + tokenizer_class = LayoutXLMTokenizer + rust_tokenizer_class = LayoutXLMTokenizerFast + test_rust_tokenizer = True + from_pretrained_filter = filter_non_english + test_seq2seq = False + test_sentencepiece = True + maxDiff = None + + def get_words_and_boxes(self): + words = ["a", "weirdly", "test"] + boxes = [[423, 237, 440, 251], [427, 272, 441, 287], [419, 115, 437, 129]] + + return words, boxes + + def get_words_and_boxes_batch(self): + words = [["a", "weirdly", "test"], ["hello", "my", "name", "is", "bob"]] + boxes = [ + [[423, 237, 440, 251], [427, 272, 441, 287], [419, 115, 437, 129]], + [[961, 885, 992, 912], [256, 38, 330, 58], [256, 38, 330, 58], [336, 42, 353, 57], [34, 42, 66, 69]], + ] + + return words, boxes + + def get_question_words_and_boxes(self): + question = "what's his name?" + words = ["a", "weirdly", "test"] + boxes = [[423, 237, 440, 251], [427, 272, 441, 287], [419, 115, 437, 129]] + + return question, words, boxes + + def get_question_words_and_boxes_batch(self): + questions = ["what's his name?", "how is he called?"] + words = [["a", "weirdly", "test"], ["what", "a", "laif", "gastn"]] + boxes = [ + [[423, 237, 440, 251], [427, 272, 441, 287], [419, 115, 437, 129]], + [[256, 38, 330, 58], [256, 38, 330, 58], [336, 42, 353, 57], [34, 42, 66, 69]], + ] + + return questions, words, boxes + + def setUp(self): + super().setUp() + + # We have a SentencePiece fixture for testing + tokenizer = LayoutXLMTokenizer(SAMPLE_VOCAB, keep_accents=True) + tokenizer.save_pretrained(self.tmpdirname) + + def get_input_output_texts(self, tokenizer): + input_text = "UNwant\u00E9d,running" + output_text = "unwanted, running" + return input_text, output_text + + @slow + def test_sequence_builders(self): + tokenizer = self.tokenizer_class.from_pretrained("microsoft/layoutxlm-base") + + question, words, boxes = self.get_question_words_and_boxes() + + text = tokenizer.encode( + question.split(), + boxes=[tokenizer.pad_token_box for _ in range(len(question.split()))], + add_special_tokens=False, + ) + text_2 = tokenizer.encode(words, boxes=boxes, add_special_tokens=False) + + encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2) + + assert encoded_pair == [0] + text + [2] + [2] + text_2 + [2] + + def test_offsets_with_special_characters(self): + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): + tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) + + words, boxes = self.get_words_and_boxes() + words[1] = tokenizer_r.mask_token + tokens = tokenizer_r.encode_plus( + words, + boxes=boxes, + return_attention_mask=False, + return_token_type_ids=False, + return_offsets_mapping=True, + add_special_tokens=True, + ) + + expected_results = [ + ((0, 0), tokenizer_r.cls_token), + ((0, 1), "▁a"), + ((0, 6), tokenizer_r.mask_token), + ((0, 4), "▁test"), + ((0, 0), tokenizer_r.sep_token), + ] + + self.assertEqual( + [e[1] for e in expected_results], tokenizer_r.convert_ids_to_tokens(tokens["input_ids"]) + ) + self.assertEqual([e[0] for e in expected_results], tokens["offset_mapping"]) + + def test_add_special_tokens(self): + tokenizers: List[LayoutXLMTokenizer] = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + + special_token = "[SPECIAL_TOKEN]" + special_token_box = [1000, 1000, 1000, 1000] + + tokenizer.add_special_tokens({"cls_token": special_token}) + encoded_special_token = tokenizer.encode( + [special_token], boxes=[special_token_box], add_special_tokens=False + ) + self.assertEqual(len(encoded_special_token), 1) + + decoded = tokenizer.decode(encoded_special_token, skip_special_tokens=True) + self.assertTrue(special_token not in decoded) + + def test_add_tokens_tokenizer(self): + tokenizers: List[LayoutXLMTokenizer] = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + vocab_size = tokenizer.vocab_size + all_size = len(tokenizer) + + self.assertNotEqual(vocab_size, 0) + + # We usually have added tokens from the start in tests because our vocab fixtures are + # smaller than the original vocabs - let's not assert this + # self.assertEqual(vocab_size, all_size) + + new_toks = ["aaaaa", "bbbbbb", "cccccccccdddddddd"] + added_toks = tokenizer.add_tokens(new_toks) + vocab_size_2 = tokenizer.vocab_size + all_size_2 = len(tokenizer) + + self.assertNotEqual(vocab_size_2, 0) + self.assertEqual(vocab_size, vocab_size_2) + self.assertEqual(added_toks, len(new_toks)) + self.assertEqual(all_size_2, all_size + len(new_toks)) + + words = "aaaaa bbbbbb low cccccccccdddddddd l".split() + boxes = [[1000, 1000, 1000, 1000] for _ in range(len(words))] + + tokens = tokenizer.encode(words, boxes=boxes, add_special_tokens=False) + + self.assertGreaterEqual(len(tokens), 4) + self.assertGreater(tokens[0], tokenizer.vocab_size - 1) + self.assertGreater(tokens[-2], tokenizer.vocab_size - 1) + + new_toks_2 = {"eos_token": ">>>>|||<||<<|<<", "pad_token": "<<<<<|||>|>>>>|>"} + added_toks_2 = tokenizer.add_special_tokens(new_toks_2) + vocab_size_3 = tokenizer.vocab_size + all_size_3 = len(tokenizer) + + self.assertNotEqual(vocab_size_3, 0) + self.assertEqual(vocab_size, vocab_size_3) + self.assertEqual(added_toks_2, len(new_toks_2)) + self.assertEqual(all_size_3, all_size_2 + len(new_toks_2)) + + words = ">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l".split() + boxes = [[1000, 1000, 1000, 1000] for _ in range(len(words))] + + tokens = tokenizer.encode( + words, + boxes=boxes, + add_special_tokens=False, + ) + + self.assertGreaterEqual(len(tokens), 6) + self.assertGreater(tokens[0], tokenizer.vocab_size - 1) + self.assertGreater(tokens[0], tokens[1]) + self.assertGreater(tokens[-2], tokenizer.vocab_size - 1) + self.assertGreater(tokens[-2], tokens[-3]) + self.assertEqual(tokens[0], tokenizer.eos_token_id) + self.assertEqual(tokens[-2], tokenizer.pad_token_id) + + @require_tokenizers + def test_encode_decode_with_spaces(self): + tokenizers = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + words, boxes = self.get_words_and_boxes() + + new_toks = [AddedToken("[ABC]", normalized=False), AddedToken("[DEF]", normalized=False)] + tokenizer.add_tokens(new_toks) + input = "[ABC][DEF][ABC][DEF]" + if self.space_between_special_tokens: + output = "[ABC] [DEF] [ABC] [DEF]" + else: + output = input + encoded = tokenizer.encode(input.split(), boxes=boxes, add_special_tokens=False) + decoded = tokenizer.decode(encoded, spaces_between_special_tokens=self.space_between_special_tokens) + self.assertIn(decoded, [output, output.lower()]) + + def test_encode_plus_with_padding(self): + tokenizers = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + words, boxes = self.get_words_and_boxes() + + # check correct behaviour if no pad_token_id exists and add it eventually + self._check_no_pad_token_padding(tokenizer, words) + + padding_size = 10 + padding_idx = tokenizer.pad_token_id + + encoded_sequence = tokenizer.encode_plus(words, boxes=boxes, return_special_tokens_mask=True) + input_ids = encoded_sequence["input_ids"] + special_tokens_mask = encoded_sequence["special_tokens_mask"] + sequence_length = len(input_ids) + + # Test 'longest' and 'no_padding' don't do anything + tokenizer.padding_side = "right" + + not_padded_sequence = tokenizer.encode_plus( + words, + boxes=boxes, + padding=False, + return_special_tokens_mask=True, + ) + not_padded_input_ids = not_padded_sequence["input_ids"] + + not_padded_special_tokens_mask = not_padded_sequence["special_tokens_mask"] + not_padded_sequence_length = len(not_padded_input_ids) + + self.assertTrue(sequence_length == not_padded_sequence_length) + self.assertTrue(input_ids == not_padded_input_ids) + self.assertTrue(special_tokens_mask == not_padded_special_tokens_mask) + + not_padded_sequence = tokenizer.encode_plus( + words, + boxes=boxes, + padding=False, + return_special_tokens_mask=True, + ) + not_padded_input_ids = not_padded_sequence["input_ids"] + + not_padded_special_tokens_mask = not_padded_sequence["special_tokens_mask"] + not_padded_sequence_length = len(not_padded_input_ids) + + self.assertTrue(sequence_length == not_padded_sequence_length) + self.assertTrue(input_ids == not_padded_input_ids) + self.assertTrue(special_tokens_mask == not_padded_special_tokens_mask) + + # Test right padding + tokenizer.padding_side = "right" + + right_padded_sequence = tokenizer.encode_plus( + words, + boxes=boxes, + max_length=sequence_length + padding_size, + padding="max_length", + return_special_tokens_mask=True, + ) + right_padded_input_ids = right_padded_sequence["input_ids"] + + right_padded_special_tokens_mask = right_padded_sequence["special_tokens_mask"] + right_padded_sequence_length = len(right_padded_input_ids) + + self.assertTrue(sequence_length + padding_size == right_padded_sequence_length) + self.assertTrue(input_ids + [padding_idx] * padding_size == right_padded_input_ids) + self.assertTrue(special_tokens_mask + [1] * padding_size == right_padded_special_tokens_mask) + + # Test left padding + tokenizer.padding_side = "left" + left_padded_sequence = tokenizer.encode_plus( + words, + boxes=boxes, + max_length=sequence_length + padding_size, + padding="max_length", + return_special_tokens_mask=True, + ) + left_padded_input_ids = left_padded_sequence["input_ids"] + left_padded_special_tokens_mask = left_padded_sequence["special_tokens_mask"] + left_padded_sequence_length = len(left_padded_input_ids) + + self.assertTrue(sequence_length + padding_size == left_padded_sequence_length) + self.assertTrue([padding_idx] * padding_size + input_ids == left_padded_input_ids) + self.assertTrue([1] * padding_size + special_tokens_mask == left_padded_special_tokens_mask) + + if "token_type_ids" in tokenizer.model_input_names: + token_type_ids = encoded_sequence["token_type_ids"] + left_padded_token_type_ids = left_padded_sequence["token_type_ids"] + right_padded_token_type_ids = right_padded_sequence["token_type_ids"] + + assert token_type_ids + [0] * padding_size == right_padded_token_type_ids + assert [0] * padding_size + token_type_ids == left_padded_token_type_ids + + if "attention_mask" in tokenizer.model_input_names: + attention_mask = encoded_sequence["attention_mask"] + right_padded_attention_mask = right_padded_sequence["attention_mask"] + left_padded_attention_mask = left_padded_sequence["attention_mask"] + + self.assertTrue(attention_mask + [0] * padding_size == right_padded_attention_mask) + self.assertTrue([0] * padding_size + attention_mask == left_padded_attention_mask) + + def test_internal_consistency(self): + tokenizers = self.get_tokenizers() + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + words, boxes = self.get_words_and_boxes() + + tokens = [] + for word in words: + tokens.extend(tokenizer.tokenize(word)) + ids = tokenizer.convert_tokens_to_ids(tokens) + ids_2 = tokenizer.encode(words, boxes=boxes, add_special_tokens=False) + self.assertListEqual(ids, ids_2) + + tokens_2 = tokenizer.convert_ids_to_tokens(ids) + self.assertNotEqual(len(tokens_2), 0) + text_2 = tokenizer.decode(ids) + self.assertIsInstance(text_2, str) + + output_text = "a weirdly test" + self.assertEqual(text_2, output_text) + + def test_mask_output(self): + tokenizers = self.get_tokenizers(fast=False, do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + words, boxes = self.get_words_and_boxes() + + if ( + tokenizer.build_inputs_with_special_tokens.__qualname__.split(".")[0] != "PreTrainedTokenizer" + and "token_type_ids" in tokenizer.model_input_names + ): + information = tokenizer.encode_plus(words, boxes=boxes, add_special_tokens=True) + sequences, mask = information["input_ids"], information["token_type_ids"] + self.assertEqual(len(sequences), len(mask)) + + def test_number_of_added_tokens(self): + tokenizers = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + + # test 1: single sequence + words, boxes = self.get_words_and_boxes() + + sequences = tokenizer.encode(words, boxes=boxes, add_special_tokens=False) + attached_sequences = tokenizer.encode(words, boxes=boxes, add_special_tokens=True) + + # Method is implemented (e.g. not GPT-2) + if len(attached_sequences) != 2: + self.assertEqual( + tokenizer.num_special_tokens_to_add(pair=False), len(attached_sequences) - len(sequences) + ) + + # test 2: two sequences + question, words, boxes = self.get_question_words_and_boxes() + + sequences = tokenizer.encode(question, words, boxes=boxes, add_special_tokens=False) + attached_sequences = tokenizer.encode(question, words, boxes=boxes, add_special_tokens=True) + + # Method is implemented (e.g. not GPT-2) + if len(attached_sequences) != 2: + self.assertEqual( + tokenizer.num_special_tokens_to_add(pair=True), len(attached_sequences) - len(sequences) + ) + + def test_padding_to_max_length(self): + """We keep this test for backward compatibility but it should be removed when `pad_to_max_length` will be deprecated""" + tokenizers = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + words, boxes = self.get_words_and_boxes() + padding_size = 10 + + # check correct behaviour if no pad_token_id exists and add it eventually + self._check_no_pad_token_padding(tokenizer, words) + + padding_idx = tokenizer.pad_token_id + + # Check that it correctly pads when a maximum length is specified along with the padding flag set to True + tokenizer.padding_side = "right" + encoded_sequence = tokenizer.encode(words, boxes=boxes) + sequence_length = len(encoded_sequence) + # FIXME: the next line should be padding(max_length) to avoid warning + padded_sequence = tokenizer.encode( + words, boxes=boxes, max_length=sequence_length + padding_size, pad_to_max_length=True + ) + padded_sequence_length = len(padded_sequence) + assert sequence_length + padding_size == padded_sequence_length + assert encoded_sequence + [padding_idx] * padding_size == padded_sequence + + # Check that nothing is done when a maximum length is not specified + encoded_sequence = tokenizer.encode(words, boxes=boxes) + sequence_length = len(encoded_sequence) + + tokenizer.padding_side = "right" + padded_sequence_right = tokenizer.encode(words, boxes=boxes, pad_to_max_length=True) + padded_sequence_right_length = len(padded_sequence_right) + assert sequence_length == padded_sequence_right_length + assert encoded_sequence == padded_sequence_right + + def test_padding(self, max_length=50): + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): + tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) + + self.assertEqual(tokenizer_p.pad_token_id, tokenizer_r.pad_token_id) + pad_token_id = tokenizer_p.pad_token_id + + # Encode - Simple input + words, boxes = self.get_words_and_boxes() + input_r = tokenizer_r.encode(words, boxes=boxes, max_length=max_length, pad_to_max_length=True) + input_p = tokenizer_p.encode(words, boxes=boxes, max_length=max_length, pad_to_max_length=True) + self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id) + input_r = tokenizer_r.encode(words, boxes=boxes, max_length=max_length, padding="max_length") + input_p = tokenizer_p.encode(words, boxes=boxes, max_length=max_length, padding="max_length") + self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id) + + input_r = tokenizer_r.encode(words, boxes=boxes, padding="longest") + input_p = tokenizer_p.encode(words, boxes=boxes, padding=True) + self.assert_padded_input_match(input_r, input_p, len(input_r), pad_token_id) + + # Encode - Pair input + question, words, boxes = self.get_question_words_and_boxes() + input_r = tokenizer_r.encode( + question, words, boxes=boxes, max_length=max_length, pad_to_max_length=True + ) + input_p = tokenizer_p.encode( + question, words, boxes=boxes, max_length=max_length, pad_to_max_length=True + ) + self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id) + input_r = tokenizer_r.encode(question, words, boxes=boxes, max_length=max_length, padding="max_length") + input_p = tokenizer_p.encode(question, words, boxes=boxes, max_length=max_length, padding="max_length") + self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id) + input_r = tokenizer_r.encode(question, words, boxes=boxes, padding=True) + input_p = tokenizer_p.encode(question, words, boxes=boxes, padding="longest") + self.assert_padded_input_match(input_r, input_p, len(input_r), pad_token_id) + + # Encode_plus - Simple input + words, boxes = self.get_words_and_boxes() + input_r = tokenizer_r.encode_plus(words, boxes=boxes, max_length=max_length, pad_to_max_length=True) + input_p = tokenizer_p.encode_plus(words, boxes=boxes, max_length=max_length, pad_to_max_length=True) + self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id) + self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"]) + input_r = tokenizer_r.encode_plus(words, boxes=boxes, max_length=max_length, padding="max_length") + input_p = tokenizer_p.encode_plus(words, boxes=boxes, max_length=max_length, padding="max_length") + self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id) + self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"]) + + input_r = tokenizer_r.encode_plus(words, boxes=boxes, padding="longest") + input_p = tokenizer_p.encode_plus(words, boxes=boxes, padding=True) + self.assert_padded_input_match( + input_r["input_ids"], input_p["input_ids"], len(input_r["input_ids"]), pad_token_id + ) + + self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"]) + + # Encode_plus - Pair input + question, words, boxes = self.get_question_words_and_boxes() + input_r = tokenizer_r.encode_plus( + question, words, boxes=boxes, max_length=max_length, pad_to_max_length=True + ) + input_p = tokenizer_p.encode_plus( + question, words, boxes=boxes, max_length=max_length, pad_to_max_length=True + ) + self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id) + self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"]) + input_r = tokenizer_r.encode_plus( + question, words, boxes=boxes, max_length=max_length, padding="max_length" + ) + input_p = tokenizer_p.encode_plus( + question, words, boxes=boxes, max_length=max_length, padding="max_length" + ) + self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id) + self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"]) + input_r = tokenizer_r.encode_plus(question, words, boxes=boxes, padding="longest") + input_p = tokenizer_p.encode_plus(question, words, boxes=boxes, padding=True) + self.assert_padded_input_match( + input_r["input_ids"], input_p["input_ids"], len(input_r["input_ids"]), pad_token_id + ) + self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"]) + + # Batch_encode_plus - Simple input + words, boxes = self.get_words_and_boxes_batch() + + input_r = tokenizer_r.batch_encode_plus( + words, + boxes=boxes, + max_length=max_length, + pad_to_max_length=True, + ) + input_p = tokenizer_p.batch_encode_plus( + words, + boxes=boxes, + max_length=max_length, + pad_to_max_length=True, + ) + self.assert_batch_padded_input_match(input_r, input_p, max_length, pad_token_id) + + input_r = tokenizer_r.batch_encode_plus( + words, + boxes=boxes, + max_length=max_length, + padding="max_length", + ) + input_p = tokenizer_p.batch_encode_plus( + words, + boxes=boxes, + max_length=max_length, + padding="max_length", + ) + self.assert_batch_padded_input_match(input_r, input_p, max_length, pad_token_id) + + input_r = tokenizer_r.batch_encode_plus( + words, + boxes=boxes, + max_length=max_length, + padding="longest", + ) + input_p = tokenizer_p.batch_encode_plus( + words, + boxes=boxes, + max_length=max_length, + padding=True, + ) + self.assert_batch_padded_input_match(input_r, input_p, len(input_r["input_ids"][0]), pad_token_id) + + input_r = tokenizer_r.batch_encode_plus(words, boxes=boxes, padding="longest") + input_p = tokenizer_p.batch_encode_plus(words, boxes=boxes, padding=True) + self.assert_batch_padded_input_match(input_r, input_p, len(input_r["input_ids"][0]), pad_token_id) + + # Batch_encode_plus - Pair input + questions, words, boxes = self.get_question_words_and_boxes_batch() + + input_r = tokenizer_r.batch_encode_plus( + list(zip(questions, words)), + is_pair=True, + boxes=boxes, + max_length=max_length, + truncation=True, + padding="max_length", + ) + input_p = tokenizer_p.batch_encode_plus( + list(zip(questions, words)), + is_pair=True, + boxes=boxes, + max_length=max_length, + truncation=True, + padding="max_length", + ) + self.assert_batch_padded_input_match(input_r, input_p, max_length, pad_token_id) + + input_r = tokenizer_r.batch_encode_plus( + list(zip(questions, words)), + is_pair=True, + boxes=boxes, + padding=True, + ) + input_p = tokenizer_p.batch_encode_plus( + list(zip(questions, words)), + is_pair=True, + boxes=boxes, + padding="longest", + ) + self.assert_batch_padded_input_match(input_r, input_p, len(input_r["input_ids"][0]), pad_token_id) + + # Using pad on single examples after tokenization + words, boxes = self.get_words_and_boxes() + input_r = tokenizer_r.encode_plus(words, boxes=boxes) + input_r = tokenizer_r.pad(input_r) + + input_p = tokenizer_r.encode_plus(words, boxes=boxes) + input_p = tokenizer_r.pad(input_p) + + self.assert_padded_input_match( + input_r["input_ids"], input_p["input_ids"], len(input_r["input_ids"]), pad_token_id + ) + + # Using pad on single examples after tokenization + input_r = tokenizer_r.encode_plus(words, boxes=boxes) + input_r = tokenizer_r.pad(input_r, max_length=max_length, padding="max_length") + + input_p = tokenizer_r.encode_plus(words, boxes=boxes) + input_p = tokenizer_r.pad(input_p, max_length=max_length, padding="max_length") + + self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id) + + # Using pad after tokenization + words, boxes = self.get_words_and_boxes_batch() + input_r = tokenizer_r.batch_encode_plus( + words, + boxes=boxes, + ) + input_r = tokenizer_r.pad(input_r) + + input_p = tokenizer_r.batch_encode_plus( + words, + boxes=boxes, + ) + input_p = tokenizer_r.pad(input_p) + + self.assert_batch_padded_input_match(input_r, input_p, len(input_r["input_ids"][0]), pad_token_id) + + # Using pad after tokenization + words, boxes = self.get_words_and_boxes_batch() + input_r = tokenizer_r.batch_encode_plus( + words, + boxes=boxes, + ) + input_r = tokenizer_r.pad(input_r, max_length=max_length, padding="max_length") + + input_p = tokenizer_r.batch_encode_plus( + words, + boxes=boxes, + ) + input_p = tokenizer_r.pad(input_p, max_length=max_length, padding="max_length") + + self.assert_batch_padded_input_match(input_r, input_p, max_length, pad_token_id) + + def test_call(self): + # Tests that all call wrap to encode_plus and batch_encode_plus + tokenizers = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + # Test not batched + words, boxes = self.get_words_and_boxes() + encoded_sequences_1 = tokenizer.encode_plus(words, boxes=boxes) + encoded_sequences_2 = tokenizer(words, boxes=boxes) + self.assertEqual(encoded_sequences_1, encoded_sequences_2) + + # Test not batched pairs + question, words, boxes = self.get_question_words_and_boxes() + encoded_sequences_1 = tokenizer.encode_plus(words, boxes=boxes) + encoded_sequences_2 = tokenizer(words, boxes=boxes) + self.assertEqual(encoded_sequences_1, encoded_sequences_2) + + # Test batched + words, boxes = self.get_words_and_boxes_batch() + encoded_sequences_1 = tokenizer.batch_encode_plus(words, is_pair=False, boxes=boxes) + encoded_sequences_2 = tokenizer(words, boxes=boxes) + self.assertEqual(encoded_sequences_1, encoded_sequences_2) + + def test_batch_encode_plus_batch_sequence_length(self): + # Tests that all encoded values have the correct size + tokenizers = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + words, boxes = self.get_words_and_boxes_batch() + + encoded_sequences = [ + tokenizer.encode_plus(words_example, boxes=boxes_example) + for words_example, boxes_example in zip(words, boxes) + ] + encoded_sequences_batch = tokenizer.batch_encode_plus(words, is_pair=False, boxes=boxes, padding=False) + self.assertListEqual( + encoded_sequences, self.convert_batch_encode_plus_format_to_encode_plus(encoded_sequences_batch) + ) + + maximum_length = len( + max([encoded_sequence["input_ids"] for encoded_sequence in encoded_sequences], key=len) + ) + + # check correct behaviour if no pad_token_id exists and add it eventually + self._check_no_pad_token_padding(tokenizer, words) + + encoded_sequences_padded = [ + tokenizer.encode_plus( + words_example, boxes=boxes_example, max_length=maximum_length, padding="max_length" + ) + for words_example, boxes_example in zip(words, boxes) + ] + + encoded_sequences_batch_padded = tokenizer.batch_encode_plus( + words, is_pair=False, boxes=boxes, padding=True + ) + self.assertListEqual( + encoded_sequences_padded, + self.convert_batch_encode_plus_format_to_encode_plus(encoded_sequences_batch_padded), + ) + + # check 'longest' is unsensitive to a max length + encoded_sequences_batch_padded_1 = tokenizer.batch_encode_plus( + words, is_pair=False, boxes=boxes, padding=True + ) + encoded_sequences_batch_padded_2 = tokenizer.batch_encode_plus( + words, is_pair=False, boxes=boxes, max_length=maximum_length + 10, padding="longest" + ) + for key in encoded_sequences_batch_padded_1.keys(): + self.assertListEqual( + encoded_sequences_batch_padded_1[key], + encoded_sequences_batch_padded_2[key], + ) + + # check 'no_padding' is unsensitive to a max length + encoded_sequences_batch_padded_1 = tokenizer.batch_encode_plus( + words, is_pair=False, boxes=boxes, padding=False + ) + encoded_sequences_batch_padded_2 = tokenizer.batch_encode_plus( + words, is_pair=False, boxes=boxes, max_length=maximum_length + 10, padding=False + ) + for key in encoded_sequences_batch_padded_1.keys(): + self.assertListEqual( + encoded_sequences_batch_padded_1[key], + encoded_sequences_batch_padded_2[key], + ) + + @unittest.skip("batch_encode_plus does not handle overflowing tokens.") + def test_batch_encode_plus_overflowing_tokens(self): + pass + + def test_batch_encode_plus_padding(self): + # Test that padded sequences are equivalent between batch_encode_plus and encode_plus + + # Right padding tests + tokenizers = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + words, boxes = self.get_words_and_boxes_batch() + + max_length = 100 + + # check correct behaviour if no pad_token_id exists and add it eventually + self._check_no_pad_token_padding(tokenizer, words) + + encoded_sequences = [ + tokenizer.encode_plus( + words_example, boxes=boxes_example, max_length=max_length, padding="max_length" + ) + for words_example, boxes_example in zip(words, boxes) + ] + encoded_sequences_batch = tokenizer.batch_encode_plus( + words, is_pair=False, boxes=boxes, max_length=max_length, padding="max_length" + ) + self.assertListEqual( + encoded_sequences, self.convert_batch_encode_plus_format_to_encode_plus(encoded_sequences_batch) + ) + + # Left padding tests + tokenizers = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + tokenizer.padding_side = "left" + words, boxes = self.get_words_and_boxes_batch() + + max_length = 100 + + # check correct behaviour if no pad_token_id exists and add it eventually + self._check_no_pad_token_padding(tokenizer, words) + + encoded_sequences = [ + tokenizer.encode_plus( + words_example, boxes=boxes_example, max_length=max_length, padding="max_length" + ) + for words_example, boxes_example in zip(words, boxes) + ] + encoded_sequences_batch = tokenizer.batch_encode_plus( + words, is_pair=False, boxes=boxes, max_length=max_length, padding="max_length" + ) + self.assertListEqual( + encoded_sequences, self.convert_batch_encode_plus_format_to_encode_plus(encoded_sequences_batch) + ) + + def test_padding_to_multiple_of(self): + tokenizers = self.get_tokenizers() + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + if tokenizer.pad_token is None: + self.skipTest("No padding token.") + else: + words, boxes = self.get_words_and_boxes() + + # empty_tokens = tokenizer([""], [[]], padding=True, pad_to_multiple_of=8) + normal_tokens = tokenizer(words, boxes=boxes, padding=True, pad_to_multiple_of=8) + # for key, value in empty_tokens.items(): + # self.assertEqual(len(value) % 8, 0, f"BatchEncoding.{key} is not multiple of 8") + for key, value in normal_tokens.items(): + self.assertEqual(len(value) % 8, 0, f"BatchEncoding.{key} is not multiple of 8") + + normal_tokens = tokenizer(words, boxes=boxes, pad_to_multiple_of=8) + for key, value in normal_tokens.items(): + self.assertNotEqual(len(value) % 8, 0, f"BatchEncoding.{key} is not multiple of 8") + + # Should also work with truncation + normal_tokens = tokenizer(words, boxes=boxes, padding=True, truncation=True, pad_to_multiple_of=8) + for key, value in normal_tokens.items(): + self.assertEqual(len(value) % 8, 0, f"BatchEncoding.{key} is not multiple of 8") + + # truncation to something which is not a multiple of pad_to_multiple_of raises an error + self.assertRaises( + ValueError, + tokenizer.__call__, + words, + boxes=boxes, + padding=True, + truncation=True, + max_length=12, + pad_to_multiple_of=8, + ) + + def test_tokenizer_slow_store_full_signature(self): + signature = inspect.signature(self.tokenizer_class.__init__) + tokenizer = self.get_tokenizer() + + for parameter_name, parameter in signature.parameters.items(): + if parameter.default != inspect.Parameter.empty: + self.assertIn(parameter_name, tokenizer.init_kwargs) + + def test_build_inputs_with_special_tokens(self): + if not self.test_slow_tokenizer: + # as we don't have a slow version, we can't compare the outputs between slow and fast versions + return + + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): + tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) + + # Input tokens id + words, boxes = self.get_words_and_boxes() + input_simple = tokenizer_p.encode(words, boxes=boxes, add_special_tokens=False) + input_pair = tokenizer_p.encode(words, boxes=boxes, add_special_tokens=False) + + # Generate output + output_r = tokenizer_r.build_inputs_with_special_tokens(input_simple) + output_p = tokenizer_p.build_inputs_with_special_tokens(input_simple) + self.assertEqual(output_p, output_r) + + # Generate pair output + output_r = tokenizer_r.build_inputs_with_special_tokens(input_simple, input_pair) + output_p = tokenizer_p.build_inputs_with_special_tokens(input_simple, input_pair) + self.assertEqual(output_p, output_r) + + def test_special_tokens_mask_input_pairs(self): + tokenizers = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + words, boxes = self.get_words_and_boxes() + encoded_sequence = tokenizer.encode(words, boxes=boxes, add_special_tokens=False) + encoded_sequence_dict = tokenizer.encode_plus( + words, + boxes=boxes, + add_special_tokens=True, + return_special_tokens_mask=True, + # add_prefix_space=False, + ) + encoded_sequence_w_special = encoded_sequence_dict["input_ids"] + special_tokens_mask = encoded_sequence_dict["special_tokens_mask"] + self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special)) + + filtered_sequence = [ + (x if not special_tokens_mask[i] else None) for i, x in enumerate(encoded_sequence_w_special) + ] + filtered_sequence = [x for x in filtered_sequence if x is not None] + self.assertEqual(encoded_sequence, filtered_sequence) + + def test_special_tokens_mask(self): + tokenizers = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + words, boxes = self.get_words_and_boxes() + # Testing single inputs + encoded_sequence = tokenizer.encode(words, boxes=boxes, add_special_tokens=False) + encoded_sequence_dict = tokenizer.encode_plus( + words, boxes=boxes, add_special_tokens=True, return_special_tokens_mask=True + ) + encoded_sequence_w_special = encoded_sequence_dict["input_ids"] + special_tokens_mask = encoded_sequence_dict["special_tokens_mask"] + self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special)) + + filtered_sequence = [x for i, x in enumerate(encoded_sequence_w_special) if not special_tokens_mask[i]] + self.assertEqual(encoded_sequence, filtered_sequence) + + def test_save_and_load_tokenizer(self): + # safety check on max_len default value so we are sure the test works + tokenizers = self.get_tokenizers() + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + self.assertNotEqual(tokenizer.model_max_length, 42) + + # Now let's start the test + tokenizers = self.get_tokenizers() + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + # Isolate this from the other tests because we save additional tokens/etc + words, boxes = self.get_words_and_boxes() + tmpdirname = tempfile.mkdtemp() + + before_tokens = tokenizer.encode(words, boxes=boxes, add_special_tokens=False) + before_vocab = tokenizer.get_vocab() + tokenizer.save_pretrained(tmpdirname) + + after_tokenizer = tokenizer.__class__.from_pretrained(tmpdirname) + after_tokens = after_tokenizer.encode(words, boxes=boxes, add_special_tokens=False) + after_vocab = after_tokenizer.get_vocab() + self.assertListEqual(before_tokens, after_tokens) + self.assertDictEqual(before_vocab, after_vocab) + + shutil.rmtree(tmpdirname) + + def test_right_and_left_padding(self): + tokenizers = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + words, boxes = self.get_words_and_boxes() + sequence = "Sequence" + padding_size = 10 + + # check correct behaviour if no pad_token_id exists and add it eventually + self._check_no_pad_token_padding(tokenizer, sequence) + + padding_idx = tokenizer.pad_token_id + + # RIGHT PADDING - Check that it correctly pads when a maximum length is specified along with the padding flag set to True + tokenizer.padding_side = "right" + encoded_sequence = tokenizer.encode(words, boxes=boxes) + sequence_length = len(encoded_sequence) + padded_sequence = tokenizer.encode( + words, boxes=boxes, max_length=sequence_length + padding_size, padding="max_length" + ) + padded_sequence_length = len(padded_sequence) + assert sequence_length + padding_size == padded_sequence_length + assert encoded_sequence + [padding_idx] * padding_size == padded_sequence + + # LEFT PADDING - Check that it correctly pads when a maximum length is specified along with the padding flag set to True + tokenizer.padding_side = "left" + encoded_sequence = tokenizer.encode(words, boxes=boxes) + sequence_length = len(encoded_sequence) + padded_sequence = tokenizer.encode( + words, boxes=boxes, max_length=sequence_length + padding_size, padding="max_length" + ) + padded_sequence_length = len(padded_sequence) + assert sequence_length + padding_size == padded_sequence_length + assert [padding_idx] * padding_size + encoded_sequence == padded_sequence + + # RIGHT & LEFT PADDING - Check that nothing is done for 'longest' and 'no_padding' + encoded_sequence = tokenizer.encode(words, boxes=boxes) + sequence_length = len(encoded_sequence) + + tokenizer.padding_side = "right" + padded_sequence_right = tokenizer.encode(words, boxes=boxes, padding=True) + padded_sequence_right_length = len(padded_sequence_right) + assert sequence_length == padded_sequence_right_length + assert encoded_sequence == padded_sequence_right + + tokenizer.padding_side = "left" + padded_sequence_left = tokenizer.encode(words, boxes=boxes, padding="longest") + padded_sequence_left_length = len(padded_sequence_left) + assert sequence_length == padded_sequence_left_length + assert encoded_sequence == padded_sequence_left + + tokenizer.padding_side = "right" + padded_sequence_right = tokenizer.encode(words, boxes=boxes) + padded_sequence_right_length = len(padded_sequence_right) + assert sequence_length == padded_sequence_right_length + assert encoded_sequence == padded_sequence_right + + tokenizer.padding_side = "left" + padded_sequence_left = tokenizer.encode(words, boxes=boxes, padding=False) + padded_sequence_left_length = len(padded_sequence_left) + assert sequence_length == padded_sequence_left_length + assert encoded_sequence == padded_sequence_left + + def test_token_type_ids(self): + tokenizers = self.get_tokenizers() + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + + # test 1: single sequence + words, boxes = self.get_words_and_boxes() + + output = tokenizer(words, boxes=boxes, return_token_type_ids=True) + + # Assert that the token type IDs have the same length as the input IDs + self.assertEqual(len(output["token_type_ids"]), len(output["input_ids"])) + + # Assert that the token type IDs have the same length as the attention mask + self.assertEqual(len(output["token_type_ids"]), len(output["attention_mask"])) + + self.assertIn(0, output["token_type_ids"]) + self.assertNotIn(1, output["token_type_ids"]) + + # test 2: two sequences (question + words) + question, words, boxes = self.get_question_words_and_boxes() + + output = tokenizer(question, words, boxes, return_token_type_ids=True) + + # Assert that the token type IDs have the same length as the input IDs + self.assertEqual(len(output["token_type_ids"]), len(output["input_ids"])) + + # Assert that the token type IDs have the same length as the attention mask + self.assertEqual(len(output["token_type_ids"]), len(output["attention_mask"])) + + self.assertIn(0, output["token_type_ids"]) + self.assertNotIn(1, output["token_type_ids"]) + + def test_offsets_mapping(self): + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): + tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) + + text = ["a", "wonderful", "test"] + boxes = [[1, 8, 12, 20] for _ in range(len(text))] + + # No pair + tokens_with_offsets = tokenizer_r.encode_plus( + text, + boxes=boxes, + return_special_tokens_mask=True, + return_offsets_mapping=True, + add_special_tokens=True, + ) + added_tokens = tokenizer_r.num_special_tokens_to_add(False) + offsets = tokens_with_offsets["offset_mapping"] + + # Assert there is the same number of tokens and offsets + self.assertEqual(len(offsets), len(tokens_with_offsets["input_ids"])) + + # Assert there is online added_tokens special_tokens + self.assertEqual(sum(tokens_with_offsets["special_tokens_mask"]), added_tokens) + + # Pairs + text = "what's his name" + pair = ["a", "wonderful", "test"] + boxes = [[1, 8, 12, 20] for _ in range(len(pair))] + tokens_with_offsets = tokenizer_r.encode_plus( + text, + pair, + boxes=boxes, + return_special_tokens_mask=True, + return_offsets_mapping=True, + add_special_tokens=True, + ) + added_tokens = tokenizer_r.num_special_tokens_to_add(True) + offsets = tokens_with_offsets["offset_mapping"] + + # Assert there is the same number of tokens and offsets + self.assertEqual(len(offsets), len(tokens_with_offsets["input_ids"])) + + # Assert there is online added_tokens special_tokens + self.assertEqual(sum(tokens_with_offsets["special_tokens_mask"]), added_tokens) + + @require_torch + @slow + @require_scatter + def test_torch_encode_plus_sent_to_model(self): + import torch + + from transformers import MODEL_MAPPING, TOKENIZER_MAPPING + + MODEL_TOKENIZER_MAPPING = merge_model_tokenizer_mappings(MODEL_MAPPING, TOKENIZER_MAPPING) + + tokenizers = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + + if tokenizer.__class__ not in MODEL_TOKENIZER_MAPPING: + return + + config_class, model_class = MODEL_TOKENIZER_MAPPING[tokenizer.__class__] + config = config_class() + + if config.is_encoder_decoder or config.pad_token_id is None: + return + + model = model_class(config) + + # Make sure the model contains at least the full vocabulary size in its embedding matrix + is_using_common_embeddings = hasattr(model.get_input_embeddings(), "weight") + assert ( + (model.get_input_embeddings().weight.shape[0] >= len(tokenizer)) + if is_using_common_embeddings + else True + ) + + # Build sequence + words, boxes = self.get_words_and_boxes() + encoded_sequence = tokenizer.encode_plus(words, boxes=boxes, return_tensors="pt") + batch_encoded_sequence = tokenizer.batch_encode_plus( + [words, words], [boxes, boxes], return_tensors="pt" + ) + # This should not fail + + with torch.no_grad(): # saves some time + model(**encoded_sequence) + model(**batch_encoded_sequence) + + def test_rust_and_python_full_tokenizers(self): + if not self.test_rust_tokenizer: + return + + if not self.test_slow_tokenizer: + # as we don't have a slow version, we can't compare the outputs between slow and fast versions + return + + tokenizer = self.get_tokenizer() + rust_tokenizer = self.get_rust_tokenizer() + + words, boxes = self.get_words_and_boxes() + + ids = tokenizer.encode(words, boxes=boxes, add_special_tokens=False) + rust_ids = rust_tokenizer.encode(words, boxes=boxes, add_special_tokens=False) + self.assertListEqual(ids, rust_ids) + + ids = tokenizer.encode(words, boxes=boxes, add_special_tokens=True) + rust_ids = rust_tokenizer.encode(words, boxes=boxes, add_special_tokens=True) + self.assertListEqual(ids, rust_ids) + + def test_tokenization_python_rust_equals(self): + if not self.test_slow_tokenizer: + # as we don't have a slow version, we can't compare the outputs between slow and fast versions + return + + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): + tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) + + words, boxes = self.get_words_and_boxes() + + # Ensure basic input match + input_p = tokenizer_p.encode_plus(words, boxes=boxes) + input_r = tokenizer_r.encode_plus(words, boxes=boxes) + + for key in filter( + lambda x: x in ["input_ids", "token_type_ids", "attention_mask", "bbox"], input_p.keys() + ): + self.assertSequenceEqual(input_p[key], input_r[key]) + + input_pairs_p = tokenizer_p.encode_plus(words, boxes=boxes) + input_pairs_r = tokenizer_r.encode_plus(words, boxes=boxes) + + for key in filter( + lambda x: x in ["input_ids", "token_type_ids", "attention_mask", "bbox"], input_p.keys() + ): + self.assertSequenceEqual(input_pairs_p[key], input_pairs_r[key]) + + words = ["hello" for _ in range(1000)] + boxes = [[1000, 1000, 1000, 1000] for _ in range(1000)] + + # Ensure truncation match + input_p = tokenizer_p.encode_plus(words, boxes=boxes, max_length=512, truncation=True) + input_r = tokenizer_r.encode_plus(words, boxes=boxes, max_length=512, truncation=True) + + for key in filter( + lambda x: x in ["input_ids", "token_type_ids", "attention_mask", "bbox"], input_p.keys() + ): + self.assertSequenceEqual(input_p[key], input_r[key]) + + # Ensure truncation with stride match + input_p = tokenizer_p.encode_plus( + words, boxes=boxes, max_length=512, truncation=True, stride=3, return_overflowing_tokens=True + ) + input_r = tokenizer_r.encode_plus( + words, boxes=boxes, max_length=512, truncation=True, stride=3, return_overflowing_tokens=True + ) + + for key in filter( + lambda x: x in ["input_ids", "token_type_ids", "attention_mask", "bbox"], input_p.keys() + ): + self.assertSequenceEqual(input_p[key], input_r[key][0]) + + def test_embeded_special_tokens(self): + if not self.test_slow_tokenizer: + # as we don't have a slow version, we can't compare the outputs between slow and fast versions + return + + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): + tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) + words, boxes = self.get_words_and_boxes() + tokens_r = tokenizer_r.encode_plus( + words, + boxes=boxes, + add_special_tokens=True, + ) + tokens_p = tokenizer_p.encode_plus( + words, + boxes=boxes, + add_special_tokens=True, + ) + + for key in tokens_p.keys(): + self.assertEqual(tokens_r[key], tokens_p[key]) + + if "token_type_ids" in tokens_r: + self.assertEqual(sum(tokens_r["token_type_ids"]), sum(tokens_p["token_type_ids"])) + + tokens_r = tokenizer_r.convert_ids_to_tokens(tokens_r["input_ids"]) + tokens_p = tokenizer_p.convert_ids_to_tokens(tokens_p["input_ids"]) + self.assertSequenceEqual(tokens_r, tokens_p) + + def test_compare_add_special_tokens(self): + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): + tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) + + simple_num_special_tokens_to_add = tokenizer_r.num_special_tokens_to_add(pair=False) + + words, boxes = self.get_words_and_boxes() + # tokenize() + no_special_tokens = tokenizer_r.tokenize(" ".join(words), add_special_tokens=False) + with_special_tokens = tokenizer_r.tokenize(" ".join(words), add_special_tokens=True) + self.assertEqual(len(no_special_tokens), len(with_special_tokens) - simple_num_special_tokens_to_add) + + # encode() + no_special_tokens = tokenizer_r.encode(words, boxes=boxes, add_special_tokens=False) + with_special_tokens = tokenizer_r.encode(words, boxes=boxes, add_special_tokens=True) + self.assertEqual(len(no_special_tokens), len(with_special_tokens) - simple_num_special_tokens_to_add) + + # encode_plus() + no_special_tokens = tokenizer_r.encode_plus(words, boxes=boxes, add_special_tokens=False) + with_special_tokens = tokenizer_r.encode_plus(words, boxes=boxes, add_special_tokens=True) + for key in no_special_tokens.keys(): + self.assertEqual( + len(no_special_tokens[key]), + len(with_special_tokens[key]) - simple_num_special_tokens_to_add, + ) + + # # batch_encode_plus + words, boxes = self.get_words_and_boxes_batch() + + no_special_tokens = tokenizer_r.batch_encode_plus(words, boxes=boxes, add_special_tokens=False) + with_special_tokens = tokenizer_r.batch_encode_plus(words, boxes=boxes, add_special_tokens=True) + for key in no_special_tokens.keys(): + for i_no, i_with in zip(no_special_tokens[key], with_special_tokens[key]): + self.assertEqual(len(i_no), len(i_with) - simple_num_special_tokens_to_add) + + @slow + def test_layoutxlm_truncation_integration_test(self): + words, boxes = self.get_words_and_boxes() + + tokenizer = LayoutXLMTokenizer.from_pretrained("microsoft/layoutxlm-base", model_max_length=512) + + for i in range(12, 512): + new_encoded_inputs = tokenizer.encode(words, boxes=boxes, max_length=i, truncation=True) + + # Ensure that the input IDs are less than the max length defined. + self.assertLessEqual(len(new_encoded_inputs), i) + + tokenizer.model_max_length = 20 + new_encoded_inputs = tokenizer.encode(words, boxes=boxes, truncation=True) + dropped_encoded_inputs = tokenizer.encode(words, boxes=boxes, truncation=True) + + # Ensure that the input IDs are still truncated when no max_length is specified + self.assertListEqual(new_encoded_inputs, dropped_encoded_inputs) + self.assertLessEqual(len(new_encoded_inputs), 20) + + @is_pt_tf_cross_test + def test_batch_encode_plus_tensors(self): + tokenizers = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + words, boxes = self.get_words_and_boxes_batch() + + # A Tensor cannot be build by sequences which are not the same size + self.assertRaises(ValueError, tokenizer.batch_encode_plus, words, boxes=boxes, return_tensors="pt") + self.assertRaises(ValueError, tokenizer.batch_encode_plus, words, boxes=boxes, return_tensors="tf") + + if tokenizer.pad_token_id is None: + self.assertRaises( + ValueError, + tokenizer.batch_encode_plus, + words, + boxes=boxes, + padding=True, + return_tensors="pt", + ) + self.assertRaises( + ValueError, + tokenizer.batch_encode_plus, + words, + boxes=boxes, + padding="longest", + return_tensors="tf", + ) + else: + pytorch_tensor = tokenizer.batch_encode_plus(words, boxes=boxes, padding=True, return_tensors="pt") + tensorflow_tensor = tokenizer.batch_encode_plus( + words, boxes=boxes, padding="longest", return_tensors="tf" + ) + encoded_sequences = tokenizer.batch_encode_plus(words, boxes=boxes, padding=True) + + for key in encoded_sequences.keys(): + pytorch_value = pytorch_tensor[key].tolist() + tensorflow_value = tensorflow_tensor[key].numpy().tolist() + encoded_value = encoded_sequences[key] + + self.assertEqual(pytorch_value, tensorflow_value, encoded_value) + + def test_sequence_ids(self): + tokenizers = self.get_tokenizers() + for tokenizer in tokenizers: + if not tokenizer.is_fast: + continue + with self.subTest(f"{tokenizer.__class__.__name__}"): + seq_0 = "Test this method." + seq_1 = ["With", "these", "inputs."] + boxes = [[1000, 1000, 1000, 1000] for _ in range(len(seq_1))] + + # We want to have sequence 0 and sequence 1 are tagged + # respectively with 0 and 1 token_ids + # (regardless of whether the model use token type ids) + # We use this assumption in the QA pipeline among other place + output = tokenizer(seq_0.split(), boxes=boxes) + self.assertIn(0, output.sequence_ids()) + + output = tokenizer(seq_0, seq_1, boxes=boxes) + self.assertIn(0, output.sequence_ids()) + self.assertIn(1, output.sequence_ids()) + + if tokenizer.num_special_tokens_to_add(pair=True): + self.assertIn(None, output.sequence_ids()) + + def test_special_tokens_initialization(self): + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): + + added_tokens = [AddedToken("", lstrip=True)] + + tokenizer_r = self.rust_tokenizer_class.from_pretrained( + pretrained_name, additional_special_tokens=added_tokens, **kwargs + ) + words = "Hey this is a token".split() + boxes = [[1000, 1000, 1000, 1000] for _ in range(len(words))] + r_output = tokenizer_r.encode(words, boxes=boxes) + + special_token_id = tokenizer_r.encode( + [""], boxes=[1000, 1000, 1000, 1000], add_special_tokens=False + )[0] + + self.assertTrue(special_token_id in r_output) + + if self.test_slow_tokenizer: + tokenizer_cr = self.rust_tokenizer_class.from_pretrained( + pretrained_name, additional_special_tokens=added_tokens, **kwargs, from_slow=True + ) + tokenizer_p = self.tokenizer_class.from_pretrained( + pretrained_name, additional_special_tokens=added_tokens, **kwargs + ) + + words = "Hey this is a token".split() + boxes = [[1000, 1000, 1000, 1000] for _ in range(len(words))] + + p_output = tokenizer_p.encode(words, boxes=boxes) + cr_output = tokenizer_cr.encode(words, boxes=boxes) + + self.assertEqual(p_output, r_output) + self.assertEqual(cr_output, r_output) + self.assertTrue(special_token_id in p_output) + self.assertTrue(special_token_id in cr_output) + + def test_training_new_tokenizer(self): + # This feature only exists for fast tokenizers + if not self.test_rust_tokenizer: + return + + tokenizer = self.get_rust_tokenizer() + new_tokenizer = tokenizer.train_new_from_iterator(SMALL_TRAINING_CORPUS, 100) + + # Test we can use the new tokenizer with something not seen during training + text = [["this", "is", "the"], ["how", "are", "you"]] + boxes = [[[1, 2, 3, 4], [5, 6, 7, 8], [1, 3, 4, 8]], [[5, 6, 7, 8], [4, 5, 6, 7], [3, 9, 2, 7]]] + inputs = new_tokenizer(text, boxes=boxes) + self.assertEqual(len(inputs["input_ids"]), 2) + decoded_input = new_tokenizer.decode(inputs["input_ids"][0], skip_special_tokens=True) + expected_result = "this is the" + + if tokenizer.backend_tokenizer.normalizer is not None: + expected_result = tokenizer.backend_tokenizer.normalizer.normalize_str(expected_result) + self.assertEqual(expected_result, decoded_input) + + # We check that the parameters of the tokenizer remained the same + # Check we have the same number of added_tokens for both pair and non-pair inputs. + self.assertEqual(tokenizer.num_special_tokens_to_add(False), new_tokenizer.num_special_tokens_to_add(False)) + self.assertEqual(tokenizer.num_special_tokens_to_add(True), new_tokenizer.num_special_tokens_to_add(True)) + + # Check we have the correct max_length for both pair and non-pair inputs. + self.assertEqual(tokenizer.max_len_single_sentence, new_tokenizer.max_len_single_sentence) + self.assertEqual(tokenizer.max_len_sentences_pair, new_tokenizer.max_len_sentences_pair) + + # Assert the set of special tokens match as we didn't ask to change them + self.assertSequenceEqual( + tokenizer.all_special_tokens_extended, + new_tokenizer.all_special_tokens_extended, + ) + + self.assertDictEqual(tokenizer.special_tokens_map, new_tokenizer.special_tokens_map) + + def test_training_new_tokenizer_with_special_tokens_change(self): + # This feature only exists for fast tokenizers + if not self.test_rust_tokenizer: + return + + tokenizer = self.get_rust_tokenizer() + # Test with a special tokens map + class_signature = inspect.signature(tokenizer.__class__) + if "cls_token" in class_signature.parameters: + new_tokenizer = tokenizer.train_new_from_iterator( + SMALL_TRAINING_CORPUS, 100, special_tokens_map={tokenizer.cls_token: ""} + ) + cls_id = new_tokenizer.get_vocab()[""] + self.assertEqual(new_tokenizer.cls_token, "") + self.assertEqual(new_tokenizer.cls_token_id, cls_id) + + # Create a new mapping from the special tokens defined in the original tokenizer + special_tokens_list = SpecialTokensMixin.SPECIAL_TOKENS_ATTRIBUTES.copy() + special_tokens_list.remove("additional_special_tokens") + special_tokens_map = {} + for token in special_tokens_list: + # Get the private one to avoid unnecessary warnings. + if getattr(tokenizer, f"_{token}") is not None: + special_token = getattr(tokenizer, token) + special_tokens_map[special_token] = f"{special_token}a" + + # Train new tokenizer + new_tokenizer = tokenizer.train_new_from_iterator( + SMALL_TRAINING_CORPUS, 100, special_tokens_map=special_tokens_map + ) + + # Check the changes + for token in special_tokens_list: + # Get the private one to avoid unnecessary warnings. + if getattr(tokenizer, f"_{token}") is None: + continue + special_token = getattr(tokenizer, token) + if special_token in special_tokens_map: + new_special_token = getattr(new_tokenizer, token) + self.assertEqual(special_tokens_map[special_token], new_special_token) + + new_id = new_tokenizer.get_vocab()[new_special_token] + self.assertEqual(getattr(new_tokenizer, f"{token}_id"), new_id) + + # Check if the AddedToken / string format has been kept + for special_token in tokenizer.all_special_tokens_extended: + if isinstance(special_token, AddedToken) and special_token.content not in special_tokens_map: + # The special token must appear identically in the list of the new tokenizer. + self.assertTrue( + special_token in new_tokenizer.all_special_tokens_extended, + f"'{special_token}' should be in {new_tokenizer.all_special_tokens_extended}", + ) + elif isinstance(special_token, AddedToken): + # The special token must appear in the list of the new tokenizer as an object of type AddedToken with + # the same parameters as the old AddedToken except the content that the user has requested to change. + special_token_str = special_token.content + new_special_token_str = special_tokens_map[special_token_str] + + find = False + for candidate in new_tokenizer.all_special_tokens_extended: + if ( + isinstance(candidate, AddedToken) + and candidate.content == new_special_token_str + and candidate.lstrip == special_token.lstrip + and candidate.rstrip == special_token.rstrip + and candidate.normalized == special_token.normalized + and candidate.single_word == special_token.single_word + ): + find = True + break + self.assertTrue( + find, + ( + f"'{new_special_token_str}' doesn't appear in the list " + f"'{new_tokenizer.all_special_tokens_extended}' as an AddedToken with the same parameters as " + f"'{special_token}' in the list {tokenizer.all_special_tokens_extended}" + ), + ) + elif special_token not in special_tokens_map: + # The special token must appear identically in the list of the new tokenizer. + self.assertTrue( + special_token in new_tokenizer.all_special_tokens_extended, + f"'{special_token}' should be in {new_tokenizer.all_special_tokens_extended}", + ) + + else: + # The special token must appear in the list of the new tokenizer as an object of type string. + self.assertTrue(special_tokens_map[special_token] in new_tokenizer.all_special_tokens_extended) + + # Test we can use the new tokenizer with something not seen during training + words = [["this", "is"], ["hello", "🤗"]] + boxes = [[[1, 2, 3, 4], [5, 6, 7, 8]], [[1, 2, 3, 4], [5, 6, 7, 8]]] + inputs = new_tokenizer(words, boxes=boxes) + self.assertEqual(len(inputs["input_ids"]), 2) + decoded_input = new_tokenizer.decode(inputs["input_ids"][0], skip_special_tokens=True) + expected_result = "this is" + + if tokenizer.backend_tokenizer.normalizer is not None: + expected_result = tokenizer.backend_tokenizer.normalizer.normalize_str(expected_result) + self.assertEqual(expected_result, decoded_input) + + def test_prepare_for_model(self): + tokenizers = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + # only test prepare_for_model for the slow tokenizer + if tokenizer.__class__.__name__ == "LayoutXLMTokenizerFast": + continue + with self.subTest(f"{tokenizer.__class__.__name__}"): + words, boxes = self.get_words_and_boxes() + prepared_input_dict = tokenizer.prepare_for_model(words, boxes=boxes, add_special_tokens=True) + + input_dict = tokenizer.encode_plus(words, boxes=boxes, add_special_tokens=True) + + self.assertEqual(input_dict, prepared_input_dict) + + def test_padding_different_model_input_name(self): + if not self.test_slow_tokenizer: + # as we don't have a slow version, we can't compare the outputs between slow and fast versions + return + + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): + tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) + self.assertEqual(tokenizer_p.pad_token_id, tokenizer_r.pad_token_id) + pad_token_id = tokenizer_p.pad_token_id + + words, boxes = self.get_words_and_boxes_batch() + + input_r = tokenizer_r.batch_encode_plus(words, boxes=boxes) + input_p = tokenizer_r.batch_encode_plus(words, boxes=boxes) + + # rename encoded batch to "inputs" + input_r["inputs"] = input_r[tokenizer_r.model_input_names[0]] + del input_r[tokenizer_r.model_input_names[0]] + + input_p["inputs"] = input_p[tokenizer_p.model_input_names[0]] + del input_p[tokenizer_p.model_input_names[0]] + + # Renaming `input_ids` to `inputs` + tokenizer_r.model_input_names = ["inputs"] + tokenizer_r.model_input_names[1:] + tokenizer_p.model_input_names = ["inputs"] + tokenizer_p.model_input_names[1:] + + input_r = tokenizer_r.pad(input_r, padding="longest") + input_p = tokenizer_r.pad(input_p, padding="longest") + + max_length = len(input_p["inputs"][0]) + self.assert_batch_padded_input_match( + input_r, input_p, max_length, pad_token_id, model_main_input_name="inputs" + ) + + def test_batch_encode_dynamic_overflowing(self): + """ + When calling batch_encode with multiple sequences, it can return different number of + overflowing encoding for each sequence: + [ + Sequence 1: [Encoding 1, Encoding 2], + Sequence 2: [Encoding 1], + Sequence 3: [Encoding 1, Encoding 2, ... Encoding N] + ] + This needs to be padded so that it can represented as a tensor + """ + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + tokenizer = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) + + with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name}, {tokenizer.__class__.__name__})"): + + if is_torch_available(): + returned_tensor = "pt" + elif is_tf_available(): + returned_tensor = "tf" + else: + returned_tensor = "jax" + + # Single example + words, boxes = self.get_words_and_boxes() + tokens = tokenizer.encode_plus( + words, + boxes=boxes, + max_length=6, + padding=True, + truncation=True, + return_tensors=returned_tensor, + return_overflowing_tokens=True, + ) + + for key in filter(lambda x: "overflow_to_sample_mapping" not in x, tokens.keys()): + if key != "bbox": + self.assertEqual(len(tokens[key].shape), 2) + else: + self.assertEqual(len(tokens[key].shape), 3) + + # Batch of examples + # For these 2 examples, 3 training examples will be created + words, boxes = self.get_words_and_boxes_batch() + tokens = tokenizer.batch_encode_plus( + words, + boxes=boxes, + max_length=6, + padding=True, + truncation="only_first", + return_tensors=returned_tensor, + return_overflowing_tokens=True, + ) + + for key in filter(lambda x: "overflow_to_sample_mapping" not in x, tokens.keys()): + if key != "bbox": + self.assertEqual(len(tokens[key].shape), 2) + self.assertEqual(tokens[key].shape[-1], 6) + else: + self.assertEqual(len(tokens[key].shape), 3) + self.assertEqual(tokens[key].shape[-1], 4) + + @unittest.skip("TO DO: overwrite this very extensive test.") + def test_alignement_methods(self): + pass + + @unittest.skip("layoutxlm tokenizer requires boxes besides sequences.") + def test_maximum_encoding_length_pair_input(self): + pass + + @unittest.skip("layoutxlm tokenizer requires boxes besides sequences.") + def test_maximum_encoding_length_single_input(self): + pass + + @unittest.skip("layoutxlm tokenizer requires boxes besides sequences.") + def test_pretokenized_inputs(self): + pass + + @unittest.skip("layoutxlm tokenizer always expects pretokenized inputs.") + def test_compare_pretokenized_inputs(self): + pass + + @unittest.skip("layoutxlm fast tokenizer does not support prepare_for_model") + def test_compare_prepare_for_model(self): + pass + + @slow + def test_only_label_first_subword(self): + words = ["hello", "niels"] + boxes = [[1000, 1000, 1000, 1000] for _ in range(len(words))] + word_labels = [0, 1] + + # test slow tokenizer + tokenizer_p = LayoutXLMTokenizer.from_pretrained("microsoft/layoutxlm-base") + encoding = tokenizer_p(words, boxes=boxes, word_labels=word_labels) + self.assertListEqual(encoding.labels, [-100, 0, -100, 1, -100, -100]) + + tokenizer_p = LayoutXLMTokenizer.from_pretrained("microsoft/layoutxlm-base", only_label_first_subword=False) + encoding = tokenizer_p(words, boxes=boxes, word_labels=word_labels) + self.assertListEqual(encoding.labels, [-100, 0, 0, 1, 1, -100]) + + # test fast tokenizer + tokenizer_r = LayoutXLMTokenizerFast.from_pretrained("microsoft/layoutxlm-base") + encoding = tokenizer_r(words, boxes=boxes, word_labels=word_labels) + self.assertListEqual(encoding.labels, [-100, 0, -100, 1, -100, -100]) + + tokenizer_r = LayoutXLMTokenizer.from_pretrained("microsoft/layoutxlm-base", only_label_first_subword=False) + encoding = tokenizer_r(words, boxes=boxes, word_labels=word_labels) + self.assertListEqual(encoding.labels, [-100, 0, 0, 1, 1, -100]) + + @slow + def test_layoutxlm_integration_test(self): + + tokenizer_p = LayoutXLMTokenizer.from_pretrained("microsoft/layoutxlm-base") + tokenizer_r = LayoutXLMTokenizerFast.from_pretrained("microsoft/layoutxlm-base") + + # There are 3 cases: + # CASE 1: document image classification (training + inference), document image token classification (inference), + # in which case only words and normalized bounding boxes are provided to the tokenizer + # CASE 2: document image token classification (training), + # in which case one also provides word labels to the tokenizer + # CASE 3: document image visual question answering (inference), + # in which case one also provides a question to the tokenizer + + # We need to test all 3 cases both on batched and non-batched inputs. + + # CASE 1: not batched + words, boxes = self.get_words_and_boxes() + + # fmt: off + expected_results = {'input_ids': [0, 10, 179459, 538, 3034, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'bbox': [[0, 0, 0, 0], [423, 237, 440, 251], [427, 272, 441, 287], [427, 272, 441, 287], [419, 115, 437, 129], [1000, 1000, 1000, 1000], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], 'attention_mask': [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]} # noqa: E231 + # fmt: on + + encoding_p = tokenizer_p(words, boxes=boxes, padding="max_length", max_length=20) + encoding_r = tokenizer_r(words, boxes=boxes, padding="max_length", max_length=20) + self.assertDictEqual(dict(encoding_p), expected_results) + self.assertDictEqual(dict(encoding_r), expected_results) + + # CASE 1: batched + words, boxes = self.get_words_and_boxes_batch() + + # fmt: off + expected_results = {'input_ids': [[0, 10, 179459, 538, 3034, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 33600, 31, 759, 9351, 83, 21895, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'bbox': [[[0, 0, 0, 0], [423, 237, 440, 251], [427, 272, 441, 287], [427, 272, 441, 287], [419, 115, 437, 129], [1000, 1000, 1000, 1000], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], [[0, 0, 0, 0], [961, 885, 992, 912], [961, 885, 992, 912], [256, 38, 330, 58], [256, 38, 330, 58], [336, 42, 353, 57], [34, 42, 66, 69], [1000, 1000, 1000, 1000], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]} # noqa: E231 + # fmt: on + + encoding_p = tokenizer_p(words, boxes=boxes, padding="max_length", max_length=20) + encoding_r = tokenizer_r(words, boxes=boxes, padding="max_length", max_length=20) + self.assertDictEqual(dict(encoding_p), expected_results) + self.assertDictEqual(dict(encoding_r), expected_results) + + # CASE 2: not batched + words, boxes = self.get_words_and_boxes() + word_labels = [1, 2, 3] + + # fmt: off + expected_results = {'input_ids': [0, 10, 179459, 538, 3034, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'bbox': [[0, 0, 0, 0], [423, 237, 440, 251], [427, 272, 441, 287], [427, 272, 441, 287], [419, 115, 437, 129], [1000, 1000, 1000, 1000], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], 'labels': [-100, 1, 2, -100, 3, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100], 'attention_mask': [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]} # noqa: E231 + # fmt: on + + encoding_p = tokenizer_p(words, boxes=boxes, word_labels=word_labels, padding="max_length", max_length=20) + encoding_r = tokenizer_r(words, boxes=boxes, word_labels=word_labels, padding="max_length", max_length=20) + self.assertDictEqual(dict(encoding_p), expected_results) + self.assertDictEqual(dict(encoding_r), expected_results) + + # CASE 2: batched + words, boxes = self.get_words_and_boxes_batch() + word_labels = [[1, 2, 3], [2, 46, 17, 22, 3]] + + # fmt: off + expected_results = {'input_ids': [[0, 10, 179459, 538, 3034, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 33600, 31, 759, 9351, 83, 21895, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'bbox': [[[0, 0, 0, 0], [423, 237, 440, 251], [427, 272, 441, 287], [427, 272, 441, 287], [419, 115, 437, 129], [1000, 1000, 1000, 1000], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], [[0, 0, 0, 0], [961, 885, 992, 912], [961, 885, 992, 912], [256, 38, 330, 58], [256, 38, 330, 58], [336, 42, 353, 57], [34, 42, 66, 69], [1000, 1000, 1000, 1000], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]], 'labels': [[-100, 1, 2, -100, 3, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100], [-100, 2, -100, 46, 17, 22, 3, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]} # noqa: E231 + # fmt: on + + encoding_p = tokenizer_p(words, boxes=boxes, word_labels=word_labels, padding="max_length", max_length=20) + encoding_r = tokenizer_r(words, boxes=boxes, word_labels=word_labels, padding="max_length", max_length=20) + self.assertDictEqual(dict(encoding_p), expected_results) + self.assertDictEqual(dict(encoding_r), expected_results) + + # CASE 3: not batched + question, words, boxes = self.get_question_words_and_boxes() + + # fmt: off + expected_results = {'input_ids': [0, 2367, 25, 7, 1919, 9351, 32, 2, 2, 10, 179459, 538, 3034, 2, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0], 'bbox': [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [1000, 1000, 1000, 1000], [1000, 1000, 1000, 1000], [423, 237, 440, 251], [427, 272, 441, 287], [427, 272, 441, 287], [419, 115, 437, 129], [1000, 1000, 1000, 1000], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]} # noqa: E231 + # fmt: on + + encoding_p = tokenizer_p(question, words, boxes, padding="max_length", max_length=20) + encoding_r = tokenizer_r(question, words, boxes, padding="max_length", max_length=20) + self.assertDictEqual(dict(encoding_p), expected_results) + self.assertDictEqual(dict(encoding_r), expected_results) + + # CASE 3: batched + questions, words, boxes = self.get_question_words_and_boxes_batch() + + # fmt: off + expected_results = {'input_ids': [[0, 2367, 25, 7, 1919, 9351, 32, 2, 2, 10, 179459, 538, 3034, 2, 1, 1, 1, 1, 1, 1], [0, 3642, 83, 764, 35839, 32, 2, 2, 2367, 10, 21, 3190, 53496, 19, 2, 1, 1, 1, 1, 1]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]], 'bbox': [[[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [1000, 1000, 1000, 1000], [1000, 1000, 1000, 1000], [423, 237, 440, 251], [427, 272, 441, 287], [427, 272, 441, 287], [419, 115, 437, 129], [1000, 1000, 1000, 1000], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [1000, 1000, 1000, 1000], [1000, 1000, 1000, 1000], [256, 38, 330, 58], [256, 38, 330, 58], [336, 42, 353, 57], [336, 42, 353, 57], [34, 42, 66, 69], [34, 42, 66, 69], [1000, 1000, 1000, 1000], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]]} # noqa: E231 + # fmt: on + + encoding_p = tokenizer_p(questions, words, boxes, padding="max_length", max_length=20) + encoding_r = tokenizer_r(questions, words, boxes, padding="max_length", max_length=20) + self.assertDictEqual(dict(encoding_p), expected_results) + self.assertDictEqual(dict(encoding_r), expected_results) + + @unittest.skip("Doesn't support another framework than PyTorch") + def test_np_encode_plus_sent_to_model(self): + pass