Add LayoutXLMProcessor (and LayoutXLMTokenizer, LayoutXLMTokenizerFast) (#14115)

* Add LayoutXLMTokenizer and LayoutXLMTokenizerFast

* Fix styling issues

* Fix more styling issues

* Fix more styling issues

* Fix docstring

* Fix unit tests

* Fix docs

* Fix unit tests

* Fix typos and styling issues

* Fix styling issues

* Fix docstring

* Make all tests of test_tokenization_layoutxlm pass

* Add LayoutXLMProcessor

* Make fixup

* Make all LayoutXLMProcessor tests pass

* Minor fixes

* Leave LayoutLMv2Processor tests unchanged

* Fix code quality

* Move LayoutXLM tokenizers and processor to separate folder

* Fix code quality

* Apply suggestions from code review

* Replace assertions by value errors

* Remove methods from fast tokenizer

Co-authored-by: King Yiu Suen <kingyiusuen@gmail.com>
This commit is contained in:
NielsRogge
2021-11-03 08:59:44 +01:00
committed by GitHub
parent 558f8543ba
commit 5f789a687a
14 changed files with 4294 additions and 4 deletions

View File

@@ -40,17 +40,45 @@ One can directly plug in the weights of LayoutXLM into a LayoutLMv2 model, like
model = LayoutLMv2Model.from_pretrained('microsoft/layoutxlm-base') model = LayoutLMv2Model.from_pretrained('microsoft/layoutxlm-base')
Note that LayoutXLM requires a different tokenizer, based on :class:`~transformers.XLMRobertaTokenizer`. You can Note that LayoutXLM has its own tokenizer, based on
initialize it as follows: :class:`~transformers.LayoutXLMTokenizer`/:class:`~transformers.LayoutXLMTokenizerFast`. You can initialize it as
follows:
.. code-block:: .. code-block::
from transformers import AutoTokenizer from transformers import LayoutXLMTokenizer
tokenizer = AutoTokenizer.from_pretrained('microsoft/layoutxlm-base') tokenizer = LayoutXLMTokenizer.from_pretrained('microsoft/layoutxlm-base')
Similar to LayoutLMv2, you can use :class:`~transformers.LayoutXLMProcessor` (which internally applies
:class:`~transformers.LayoutLMv2FeatureExtractor` and
:class:`~transformers.LayoutXLMTokenizer`/:class:`~transformers.LayoutXLMTokenizerFast` in sequence) to prepare all
data for the model.
As LayoutXLM's architecture is equivalent to that of LayoutLMv2, one can refer to :doc:`LayoutLMv2's documentation page As LayoutXLM's architecture is equivalent to that of LayoutLMv2, one can refer to :doc:`LayoutLMv2's documentation page
<layoutlmv2>` for all tips, code examples and notebooks. <layoutlmv2>` for all tips, code examples and notebooks.
This model was contributed by `nielsr <https://huggingface.co/nielsr>`__. The original code can be found `here This model was contributed by `nielsr <https://huggingface.co/nielsr>`__. The original code can be found `here
<https://github.com/microsoft/unilm>`__. <https://github.com/microsoft/unilm>`__.
LayoutXLMTokenizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.LayoutXLMTokenizer
:members: __call__, build_inputs_with_special_tokens, get_special_tokens_mask,
create_token_type_ids_from_sequences, save_vocabulary
LayoutXLMTokenizerFast
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.LayoutXLMTokenizerFast
:members: __call__
LayoutXLMProcessor
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.LayoutXLMProcessor
:members: __call__

View File

@@ -229,6 +229,7 @@ _import_structure = {
"LayoutLMv2Processor", "LayoutLMv2Processor",
"LayoutLMv2Tokenizer", "LayoutLMv2Tokenizer",
], ],
"models.layoutxlm": ["LayoutXLMProcessor"],
"models.led": ["LED_PRETRAINED_CONFIG_ARCHIVE_MAP", "LEDConfig", "LEDTokenizer"], "models.led": ["LED_PRETRAINED_CONFIG_ARCHIVE_MAP", "LEDConfig", "LEDTokenizer"],
"models.longformer": ["LONGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "LongformerConfig", "LongformerTokenizer"], "models.longformer": ["LONGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "LongformerConfig", "LongformerTokenizer"],
"models.luke": ["LUKE_PRETRAINED_CONFIG_ARCHIVE_MAP", "LukeConfig", "LukeTokenizer"], "models.luke": ["LUKE_PRETRAINED_CONFIG_ARCHIVE_MAP", "LukeConfig", "LukeTokenizer"],
@@ -365,6 +366,7 @@ if is_sentencepiece_available():
_import_structure["models.big_bird"].append("BigBirdTokenizer") _import_structure["models.big_bird"].append("BigBirdTokenizer")
_import_structure["models.camembert"].append("CamembertTokenizer") _import_structure["models.camembert"].append("CamembertTokenizer")
_import_structure["models.deberta_v2"].append("DebertaV2Tokenizer") _import_structure["models.deberta_v2"].append("DebertaV2Tokenizer")
_import_structure["models.layoutxlm"].append("LayoutXLMTokenizer")
_import_structure["models.m2m_100"].append("M2M100Tokenizer") _import_structure["models.m2m_100"].append("M2M100Tokenizer")
_import_structure["models.marian"].append("MarianTokenizer") _import_structure["models.marian"].append("MarianTokenizer")
_import_structure["models.mbart"].append("MBartTokenizer") _import_structure["models.mbart"].append("MBartTokenizer")
@@ -411,6 +413,7 @@ if is_tokenizers_available():
_import_structure["models.herbert"].append("HerbertTokenizerFast") _import_structure["models.herbert"].append("HerbertTokenizerFast")
_import_structure["models.layoutlm"].append("LayoutLMTokenizerFast") _import_structure["models.layoutlm"].append("LayoutLMTokenizerFast")
_import_structure["models.layoutlmv2"].append("LayoutLMv2TokenizerFast") _import_structure["models.layoutlmv2"].append("LayoutLMv2TokenizerFast")
_import_structure["models.layoutxlm"].append("LayoutXLMTokenizerFast")
_import_structure["models.led"].append("LEDTokenizerFast") _import_structure["models.led"].append("LEDTokenizerFast")
_import_structure["models.longformer"].append("LongformerTokenizerFast") _import_structure["models.longformer"].append("LongformerTokenizerFast")
_import_structure["models.lxmert"].append("LxmertTokenizerFast") _import_structure["models.lxmert"].append("LxmertTokenizerFast")
@@ -477,6 +480,7 @@ if is_vision_available():
_import_structure["models.detr"].append("DetrFeatureExtractor") _import_structure["models.detr"].append("DetrFeatureExtractor")
_import_structure["models.layoutlmv2"].append("LayoutLMv2FeatureExtractor") _import_structure["models.layoutlmv2"].append("LayoutLMv2FeatureExtractor")
_import_structure["models.layoutlmv2"].append("LayoutLMv2Processor") _import_structure["models.layoutlmv2"].append("LayoutLMv2Processor")
_import_structure["models.layoutxlm"].append("LayoutXLMProcessor")
_import_structure["models.segformer"].append("SegformerFeatureExtractor") _import_structure["models.segformer"].append("SegformerFeatureExtractor")
_import_structure["models.vit"].append("ViTFeatureExtractor") _import_structure["models.vit"].append("ViTFeatureExtractor")
else: else:
@@ -2140,6 +2144,7 @@ if TYPE_CHECKING:
LayoutLMv2Processor, LayoutLMv2Processor,
LayoutLMv2Tokenizer, LayoutLMv2Tokenizer,
) )
from .models.layoutxlm import LayoutXLMProcessor
from .models.led import LED_PRETRAINED_CONFIG_ARCHIVE_MAP, LEDConfig, LEDTokenizer from .models.led import LED_PRETRAINED_CONFIG_ARCHIVE_MAP, LEDConfig, LEDTokenizer
from .models.longformer import LONGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, LongformerConfig, LongformerTokenizer from .models.longformer import LONGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, LongformerConfig, LongformerTokenizer
from .models.luke import LUKE_PRETRAINED_CONFIG_ARCHIVE_MAP, LukeConfig, LukeTokenizer from .models.luke import LUKE_PRETRAINED_CONFIG_ARCHIVE_MAP, LukeConfig, LukeTokenizer
@@ -2266,6 +2271,7 @@ if TYPE_CHECKING:
from .models.big_bird import BigBirdTokenizer from .models.big_bird import BigBirdTokenizer
from .models.camembert import CamembertTokenizer from .models.camembert import CamembertTokenizer
from .models.deberta_v2 import DebertaV2Tokenizer from .models.deberta_v2 import DebertaV2Tokenizer
from .models.layoutxlm import LayoutXLMTokenizer
from .models.m2m_100 import M2M100Tokenizer from .models.m2m_100 import M2M100Tokenizer
from .models.marian import MarianTokenizer from .models.marian import MarianTokenizer
from .models.mbart import MBart50Tokenizer, MBartTokenizer from .models.mbart import MBart50Tokenizer, MBartTokenizer
@@ -2302,6 +2308,7 @@ if TYPE_CHECKING:
from .models.herbert import HerbertTokenizerFast from .models.herbert import HerbertTokenizerFast
from .models.layoutlm import LayoutLMTokenizerFast from .models.layoutlm import LayoutLMTokenizerFast
from .models.layoutlmv2 import LayoutLMv2TokenizerFast from .models.layoutlmv2 import LayoutLMv2TokenizerFast
from .models.layoutxlm import LayoutXLMTokenizerFast
from .models.led import LEDTokenizerFast from .models.led import LEDTokenizerFast
from .models.longformer import LongformerTokenizerFast from .models.longformer import LongformerTokenizerFast
from .models.lxmert import LxmertTokenizerFast from .models.lxmert import LxmertTokenizerFast
@@ -2349,6 +2356,7 @@ if TYPE_CHECKING:
from .models.deit import DeiTFeatureExtractor from .models.deit import DeiTFeatureExtractor
from .models.detr import DetrFeatureExtractor from .models.detr import DetrFeatureExtractor
from .models.layoutlmv2 import LayoutLMv2FeatureExtractor, LayoutLMv2Processor from .models.layoutlmv2 import LayoutLMv2FeatureExtractor, LayoutLMv2Processor
from .models.layoutxlm import LayoutXLMProcessor
from .models.segformer import SegformerFeatureExtractor from .models.segformer import SegformerFeatureExtractor
from .models.vit import ViTFeatureExtractor from .models.vit import ViTFeatureExtractor
else: else:

View File

@@ -944,6 +944,7 @@ SLOW_TO_FAST_CONVERTERS = {
"HerbertTokenizer": HerbertConverter, "HerbertTokenizer": HerbertConverter,
"LayoutLMTokenizer": BertConverter, "LayoutLMTokenizer": BertConverter,
"LayoutLMv2Tokenizer": BertConverter, "LayoutLMv2Tokenizer": BertConverter,
"LayoutXLMTokenizer": XLMRobertaConverter,
"LongformerTokenizer": RobertaConverter, "LongformerTokenizer": RobertaConverter,
"LEDTokenizer": RobertaConverter, "LEDTokenizer": RobertaConverter,
"LxmertTokenizer": BertConverter, "LxmertTokenizer": BertConverter,

View File

@@ -59,6 +59,7 @@ from . import (
ibert, ibert,
layoutlm, layoutlm,
layoutlmv2, layoutlmv2,
layoutxlm,
led, led,
longformer, longformer,
luke, luke,

View File

@@ -124,6 +124,7 @@ else:
("lxmert", ("LxmertTokenizer", "LxmertTokenizerFast" if is_tokenizers_available() else None)), ("lxmert", ("LxmertTokenizer", "LxmertTokenizerFast" if is_tokenizers_available() else None)),
("layoutlm", ("LayoutLMTokenizer", "LayoutLMTokenizerFast" if is_tokenizers_available() else None)), ("layoutlm", ("LayoutLMTokenizer", "LayoutLMTokenizerFast" if is_tokenizers_available() else None)),
("layoutlmv2", ("LayoutLMv2Tokenizer", "LayoutLMv2TokenizerFast" if is_tokenizers_available() else None)), ("layoutlmv2", ("LayoutLMv2Tokenizer", "LayoutLMv2TokenizerFast" if is_tokenizers_available() else None)),
("layoutxlm", ("LayoutXLMTokenizer", "LayoutXLMTokenizerFast" if is_tokenizers_available() else None)),
( (
"dpr", "dpr",
( (

View File

@@ -0,0 +1,54 @@
# flake8: noqa
# There's no way to ignore "F401 '...' imported but unused" warnings in this
# module, but to preserve other warnings. So, don't check this module at all.
# Copyright 2021 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING
from ...file_utils import (
_LazyModule,
is_sentencepiece_available,
is_tokenizers_available,
is_torch_available,
is_vision_available,
)
_import_structure = {}
if is_sentencepiece_available():
_import_structure["tokenization_layoutxlm"] = ["LayoutXLMTokenizer"]
if is_tokenizers_available():
_import_structure["tokenization_layoutxlm_fast"] = ["LayoutXLMTokenizerFast"]
if is_vision_available():
_import_structure["processing_layoutxlm"] = ["LayoutXLMProcessor"]
if TYPE_CHECKING:
if is_sentencepiece_available():
from .tokenization_layoutxlm import LayoutXLMTokenizer
if is_tokenizers_available():
from .tokenization_layoutxlm_fast import LayoutXLMTokenizerFast
if is_vision_available():
from .processing_layoutlmv2 import LayoutXLMProcessor
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure)

View File

@@ -0,0 +1,207 @@
# coding=utf-8
# Copyright 2021 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Processor class for LayoutXLM.
"""
from typing import List, Optional, Union
from transformers.models.layoutlmv2.feature_extraction_layoutlmv2 import LayoutLMv2FeatureExtractor
from ...file_utils import TensorType
from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
from .tokenization_layoutxlm import LayoutXLMTokenizer
from .tokenization_layoutxlm_fast import LayoutXLMTokenizerFast
class LayoutXLMProcessor:
r"""
Constructs a LayoutXLM processor which combines a LayoutXLM feature extractor and a LayoutXLM tokenizer into a
single processor.
:class:`~transformers.LayoutXLMProcessor` offers all the functionalities you need to prepare data for the model.
It first uses :class:`~transformers.LayoutLMv2FeatureExtractor` to resize document images to a fixed size, and
optionally applies OCR to get words and normalized bounding boxes. These are then provided to
:class:`~transformers.LayoutXLMTokenizer` or :class:`~transformers.LayoutXLMTokenizerFast`, which turns the words
and bounding boxes into token-level :obj:`input_ids`, :obj:`attention_mask`, :obj:`token_type_ids`, :obj:`bbox`.
Optionally, one can provide integer :obj:`word_labels`, which are turned into token-level :obj:`labels` for token
classification tasks (such as FUNSD, CORD).
Args:
feature_extractor (:obj:`LayoutLMv2FeatureExtractor`):
An instance of :class:`~transformers.LayoutLMv2FeatureExtractor`. The feature extractor is a required
input.
tokenizer (:obj:`LayoutXLMTokenizer` or :obj:`LayoutXLMTokenizerFast`):
An instance of :class:`~transformers.LayoutXLMTokenizer` or :class:`~transformers.LayoutXLMTokenizerFast`.
The tokenizer is a required input.
"""
def __init__(self, feature_extractor, tokenizer):
if not isinstance(feature_extractor, LayoutLMv2FeatureExtractor):
raise ValueError(
f"`feature_extractor` has to be of type {LayoutLMv2FeatureExtractor.__class__}, but is {type(feature_extractor)}"
)
if not isinstance(tokenizer, (LayoutXLMTokenizer, LayoutXLMTokenizerFast)):
raise ValueError(
f"`tokenizer` has to be of type {LayoutXLMTokenizer.__class__} or {LayoutXLMTokenizerFast.__class__}, but is {type(tokenizer)}"
)
self.feature_extractor = feature_extractor
self.tokenizer = tokenizer
def save_pretrained(self, save_directory):
"""
Save a LayoutXLM feature_extractor object and LayoutXLM tokenizer object to the directory ``save_directory``,
so that it can be re-loaded using the :func:`~transformers.LayoutXLMProcessor.from_pretrained` class method.
.. note::
This class method is simply calling
:meth:`~transformers.feature_extraction_utils.FeatureExtractionMixin.save_pretrained` and
:meth:`~transformers.tokenization_utils_base.PreTrainedTokenizer.save_pretrained`. Please refer to the
docstrings of the methods above for more information.
Args:
save_directory (:obj:`str` or :obj:`os.PathLike`):
Directory where the feature extractor JSON file and the tokenizer files will be saved (directory will
be created if it does not exist).
"""
self.feature_extractor.save_pretrained(save_directory)
self.tokenizer.save_pretrained(save_directory)
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, use_fast=True, **kwargs):
r"""
Instantiate a :class:`~transformers.LayoutXLMProcessor` from a pretrained LayoutXLM processor.
.. note::
This class method is simply calling Layoutv2FeatureExtractor's
:meth:`~transformers.feature_extraction_utils.FeatureExtractionMixin.from_pretrained` and
LayoutXLMTokenizerFast's :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizer.from_pretrained`.
Please refer to the docstrings of the methods above for more information.
Args:
pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`):
This can be either:
- a string, the `model id` of a pretrained feature_extractor hosted inside a model repo on
huggingface.co. Valid model ids can be located at the root-level, like ``bert-base-uncased``, or
namespaced under a user or organization name, like ``dbmdz/bert-base-german-cased``.
- a path to a `directory` containing a feature extractor file saved using the
:meth:`~transformers.SequenceFeatureExtractor.save_pretrained` method, e.g.,
``./my_model_directory/``.
- a path or url to a saved feature extractor JSON `file`, e.g.,
``./my_model_directory/preprocessor_config.json``.
use_fast (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether or not to instantiate a fast tokenizer.
**kwargs
Additional keyword arguments passed along to both :class:`~transformers.SequenceFeatureExtractor` and
:class:`~transformers.PreTrainedTokenizer`
"""
feature_extractor = LayoutLMv2FeatureExtractor.from_pretrained(pretrained_model_name_or_path, **kwargs)
if use_fast:
tokenizer = LayoutXLMTokenizerFast.from_pretrained(pretrained_model_name_or_path, **kwargs)
else:
tokenizer = LayoutXLMTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
return cls(feature_extractor=feature_extractor, tokenizer=tokenizer)
def __call__(
self,
images,
text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
text_pair: Optional[Union[PreTokenizedInput, List[PreTokenizedInput]]] = None,
boxes: Union[List[List[int]], List[List[List[int]]]] = None,
word_labels: Optional[Union[List[int], List[List[int]]]] = None,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = False,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
return_tensors: Optional[Union[str, TensorType]] = None,
**kwargs
) -> BatchEncoding:
"""
This method first forwards the :obj:`images` argument to
:meth:`~transformers.LayoutLMv2FeatureExtractor.__call__`. In case :class:`~LayoutLMv2FeatureExtractor` was
initialized with :obj:`apply_ocr` set to ``True``, it passes the obtained words and bounding boxes along with
the additional arguments to :meth:`~transformers.LayoutXLMTokenizer.__call__` and returns the output, together
with resized :obj:`images`. In case :class:`~LayoutLMv2FeatureExtractor` was initialized with :obj:`apply_ocr`
set to ``False``, it passes the words (:obj:`text`/:obj:`text_pair`) and :obj:`boxes` specified by the user
along with the additional arguments to :meth:`~transformers.LayoutXLMTokenizer.__call__` and returns the
output, together with resized :obj:`images`.
Please refer to the docstring of the above two methods for more information.
"""
# verify input
if self.feature_extractor.apply_ocr and (boxes is not None):
raise ValueError(
"You cannot provide bounding boxes "
"if you initialized the feature extractor with apply_ocr set to True."
)
if self.feature_extractor.apply_ocr and (word_labels is not None):
raise ValueError(
"You cannot provide word labels "
"if you initialized the feature extractor with apply_ocr set to True."
)
# first, apply the feature extractor
features = self.feature_extractor(images=images, return_tensors=return_tensors)
# second, apply the tokenizer
if text is not None and self.feature_extractor.apply_ocr and text_pair is None:
if isinstance(text, str):
text = [text] # add batch dimension (as the feature extractor always adds a batch dimension)
text_pair = features["words"]
encoded_inputs = self.tokenizer(
text=text if text is not None else features["words"],
text_pair=text_pair if text_pair is not None else None,
boxes=boxes if boxes is not None else features["boxes"],
word_labels=word_labels,
add_special_tokens=add_special_tokens,
padding=padding,
truncation=truncation,
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_offsets_mapping=return_offsets_mapping,
return_length=return_length,
verbose=verbose,
return_tensors=return_tensors,
**kwargs,
)
# add pixel values
encoded_inputs["image"] = features.pop("pixel_values")
return encoded_inputs

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,694 @@
# coding=utf-8
# Copyright 2021 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License
""" Tokenization classes for LayoutXLM model."""
import os
from shutil import copyfile
from typing import Dict, List, Optional, Tuple, Union
from transformers.models.layoutlmv2.tokenization_layoutlmv2 import LAYOUTLMV2_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING
from ...file_utils import PaddingStrategy, TensorType, add_end_docstrings, is_sentencepiece_available
from ...tokenization_utils import AddedToken
from ...tokenization_utils_base import (
ENCODE_KWARGS_DOCSTRING,
BatchEncoding,
EncodedInput,
PreTokenizedInput,
TextInput,
TextInputPair,
TruncationStrategy,
)
from ...tokenization_utils_fast import PreTrainedTokenizerFast
from ...utils import logging
from ..xlm_roberta.tokenization_xlm_roberta import (
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES,
PRETRAINED_VOCAB_FILES_MAP,
VOCAB_FILES_NAMES,
)
if is_sentencepiece_available():
from .tokenization_layoutxlm import LayoutXLMTokenizer
else:
LayoutXLMTokenizer = None
logger = logging.get_logger(__name__)
class LayoutXLMTokenizerFast(PreTrainedTokenizerFast):
"""
Construct a "fast" LayoutXLM tokenizer (backed by HuggingFace's `tokenizers` library). Adapted from
:class:`~transformers.RobertaTokenizer` and :class:`~transformers.XLNetTokenizer`. Based on `BPE
<https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=BPE#models>`__.
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
methods. Users should refer to this superclass for more information regarding those methods.
Args:
vocab_file (:obj:`str`):
Path to the vocabulary file.
bos_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
.. note::
When building a sequence using special tokens, this is not the token that is used for the beginning of
sequence. The token used is the :obj:`cls_token`.
eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
The end of sequence token.
.. note::
When building a sequence using special tokens, this is not the token that is used for the end of
sequence. The token used is the :obj:`sep_token`.
sep_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
sequence classification or for a text and a question for question answering. It is also used as the last
token of a sequence built with special tokens.
cls_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
The classifier token which is used when doing sequence classification (classification of the whole sequence
instead of per-token classification). It is the first token of the sequence when built with special tokens.
unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
mask_token (:obj:`str`, `optional`, defaults to :obj:`"<mask>"`):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
cls_token_box (:obj:`List[int]`, `optional`, defaults to :obj:`[0, 0, 0, 0]`):
The bounding box to use for the special [CLS] token.
sep_token_box (:obj:`List[int]`, `optional`, defaults to :obj:`[1000, 1000, 1000, 1000]`):
The bounding box to use for the special [SEP] token.
pad_token_box (:obj:`List[int]`, `optional`, defaults to :obj:`[0, 0, 0, 0]`):
The bounding box to use for the special [PAD] token.
pad_token_label (:obj:`int`, `optional`, defaults to -100):
The label to use for padding tokens. Defaults to -100, which is the :obj:`ignore_index` of PyTorch's
CrossEntropyLoss.
only_label_first_subword (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether or not to only label the first subword, in case word labels are provided.
additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
Additional special tokens used by the tokenizer.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["input_ids", "attention_mask"]
slow_tokenizer_class = LayoutXLMTokenizer
def __init__(
self,
vocab_file=None,
tokenizer_file=None,
bos_token="<s>",
eos_token="</s>",
sep_token="</s>",
cls_token="<s>",
unk_token="<unk>",
pad_token="<pad>",
mask_token="<mask>",
cls_token_box=[0, 0, 0, 0],
sep_token_box=[1000, 1000, 1000, 1000],
pad_token_box=[0, 0, 0, 0],
pad_token_label=-100,
only_label_first_subword=True,
**kwargs
):
# Mask token behave like a normal word, i.e. include the space before it
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
super().__init__(
vocab_file,
tokenizer_file=tokenizer_file,
bos_token=bos_token,
eos_token=eos_token,
sep_token=sep_token,
cls_token=cls_token,
unk_token=unk_token,
pad_token=pad_token,
mask_token=mask_token,
cls_token_box=cls_token_box,
sep_token_box=sep_token_box,
pad_token_box=pad_token_box,
pad_token_label=pad_token_label,
only_label_first_subword=only_label_first_subword,
**kwargs,
)
self.vocab_file = vocab_file
self.can_save_slow_tokenizer = False if not self.vocab_file else True
# additional properties
self.cls_token_box = cls_token_box
self.sep_token_box = sep_token_box
self.pad_token_box = pad_token_box
self.pad_token_label = pad_token_label
self.only_label_first_subword = only_label_first_subword
@add_end_docstrings(ENCODE_KWARGS_DOCSTRING, LAYOUTLMV2_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
def __call__(
self,
text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]],
text_pair: Optional[Union[PreTokenizedInput, List[PreTokenizedInput]]] = None,
boxes: Union[List[List[int]], List[List[List[int]]]] = None,
word_labels: Optional[Union[List[int], List[List[int]]]] = None,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = False,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs
) -> BatchEncoding:
"""
Main method to tokenize and prepare for the model one or several sequence(s) or one or several pair(s) of
sequences with word-level normalized bounding boxes and optional labels.
Args:
text (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`):
The sequence or batch of sequences to be encoded. Each sequence can be a string, a list of strings
(words of a single example or questions of a batch of examples) or a list of list of strings (batch of
words).
text_pair (:obj:`List[str]`, :obj:`List[List[str]]`):
The sequence or batch of sequences to be encoded. Each sequence should be a list of strings
(pretokenized string).
boxes (:obj:`List[List[int]]`, :obj:`List[List[List[int]]]`):
Word-level bounding boxes. Each bounding box should be normalized to be on a 0-1000 scale.
word_labels (:obj:`List[int]`, :obj:`List[List[int]]`, `optional`):
Word-level integer labels (for token classification tasks such as FUNSD, CORD).
"""
# Input type checking for clearer error
def _is_valid_text_input(t):
if isinstance(t, str):
# Strings are fine
return True
elif isinstance(t, (list, tuple)):
# List are fine as long as they are...
if len(t) == 0:
# ... empty
return True
elif isinstance(t[0], str):
# ... list of strings
return True
elif isinstance(t[0], (list, tuple)):
# ... list with an empty list or with a list of strings
return len(t[0]) == 0 or isinstance(t[0][0], str)
else:
return False
else:
return False
if text_pair is not None:
# in case text + text_pair are provided, text = questions, text_pair = words
if not _is_valid_text_input(text):
raise ValueError("text input must of type `str` (single example) or `List[str]` (batch of examples). ")
if not isinstance(text_pair, (list, tuple)):
raise ValueError(
"words must of type `List[str]` (single pretokenized example), "
"or `List[List[str]]` (batch of pretokenized examples)."
)
else:
# in case only text is provided => must be words
if not isinstance(text, (list, tuple)):
raise ValueError(
"Words must of type `List[str]` (single pretokenized example), "
"or `List[List[str]]` (batch of pretokenized examples)."
)
if text_pair is not None:
is_batched = isinstance(text, (list, tuple))
else:
is_batched = isinstance(text, (list, tuple)) and text and isinstance(text[0], (list, tuple))
words = text if text_pair is None else text_pair
if boxes is None:
raise ValueError("You must provide corresponding bounding boxes")
if is_batched:
if len(words) != len(boxes):
raise ValueError("You must provide words and boxes for an equal amount of examples")
for words_example, boxes_example in zip(words, boxes):
if len(words_example) != len(boxes_example):
raise ValueError("You must provide as many words as there are bounding boxes")
else:
if len(words) != len(boxes):
raise ValueError("You must provide as many words as there are bounding boxes")
if is_batched:
if text_pair is not None and len(text) != len(text_pair):
raise ValueError(
f"batch length of `text`: {len(text)} does not match batch length of `text_pair`: {len(text_pair)}."
)
batch_text_or_text_pairs = list(zip(text, text_pair)) if text_pair is not None else text
is_pair = bool(text_pair is not None)
return self.batch_encode_plus(
batch_text_or_text_pairs=batch_text_or_text_pairs,
is_pair=is_pair,
boxes=boxes,
word_labels=word_labels,
add_special_tokens=add_special_tokens,
padding=padding,
truncation=truncation,
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_offsets_mapping=return_offsets_mapping,
return_length=return_length,
verbose=verbose,
**kwargs,
)
else:
return self.encode_plus(
text=text,
text_pair=text_pair,
boxes=boxes,
word_labels=word_labels,
add_special_tokens=add_special_tokens,
padding=padding,
truncation=truncation,
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_offsets_mapping=return_offsets_mapping,
return_length=return_length,
verbose=verbose,
**kwargs,
)
def tokenize(self, text: str, pair: Optional[str] = None, add_special_tokens: bool = False, **kwargs) -> List[str]:
batched_input = [(text, pair)] if pair else [text]
encodings = self._tokenizer.encode_batch(
batched_input, add_special_tokens=add_special_tokens, is_pretokenized=False, **kwargs
)
return encodings[0].tokens
def _batch_encode_plus(
self,
batch_text_or_text_pairs: Union[
List[TextInput],
List[TextInputPair],
List[PreTokenizedInput],
],
is_pair: bool = None,
boxes: Optional[List[List[List[int]]]] = None,
word_labels: Optional[List[List[int]]] = None,
add_special_tokens: bool = True,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[str] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs,
) -> BatchEncoding:
if not isinstance(batch_text_or_text_pairs, list):
raise TypeError(f"batch_text_or_text_pairs has to be a list (got {type(batch_text_or_text_pairs)})")
# Set the truncation and padding strategy and restore the initial configuration
self.set_truncation_and_padding(
padding_strategy=padding_strategy,
truncation_strategy=truncation_strategy,
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
)
if is_pair:
batch_text_or_text_pairs = [(text.split(), text_pair) for text, text_pair in batch_text_or_text_pairs]
encodings = self._tokenizer.encode_batch(
batch_text_or_text_pairs,
add_special_tokens=add_special_tokens,
is_pretokenized=True, # we set this to True as LayoutLMv2 always expects pretokenized inputs
)
# Convert encoding to dict
# `Tokens` has type: Tuple[
# List[Dict[str, List[List[int]]]] or List[Dict[str, 2D-Tensor]],
# List[EncodingFast]
# ]
# with nested dimensions corresponding to batch, overflows, sequence length
tokens_and_encodings = [
self._convert_encoding(
encoding=encoding,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_offsets_mapping=True
if word_labels is not None
else return_offsets_mapping, # we use offsets to create the labels
return_length=return_length,
verbose=verbose,
)
for encoding in encodings
]
# Convert the output to have dict[list] from list[dict] and remove the additional overflows dimension
# From (variable) shape (batch, overflows, sequence length) to ~ (batch * overflows, sequence length)
# (we say ~ because the number of overflow varies with the example in the batch)
#
# To match each overflowing sample with the original sample in the batch
# we add an overflow_to_sample_mapping array (see below)
sanitized_tokens = {}
for key in tokens_and_encodings[0][0].keys():
stack = [e for item, _ in tokens_and_encodings for e in item[key]]
sanitized_tokens[key] = stack
sanitized_encodings = [e for _, item in tokens_and_encodings for e in item]
# If returning overflowing tokens, we need to return a mapping
# from the batch idx to the original sample
if return_overflowing_tokens:
overflow_to_sample_mapping = []
for i, (toks, _) in enumerate(tokens_and_encodings):
overflow_to_sample_mapping += [i] * len(toks["input_ids"])
sanitized_tokens["overflow_to_sample_mapping"] = overflow_to_sample_mapping
for input_ids in sanitized_tokens["input_ids"]:
self._eventual_warn_about_too_long_sequence(input_ids, max_length, verbose)
# create the token boxes
token_boxes = []
for batch_index in range(len(sanitized_tokens["input_ids"])):
if return_overflowing_tokens:
original_index = sanitized_tokens["overflow_to_sample_mapping"][batch_index]
else:
original_index = batch_index
token_boxes_example = []
for id, sequence_id, word_id in zip(
sanitized_tokens["input_ids"][batch_index],
sanitized_encodings[batch_index].sequence_ids,
sanitized_encodings[batch_index].word_ids,
):
if word_id is not None:
if is_pair and sequence_id == 0:
token_boxes_example.append(self.pad_token_box)
else:
token_boxes_example.append(boxes[original_index][word_id])
else:
if id == self.cls_token_id:
token_boxes_example.append(self.cls_token_box)
elif id == self.sep_token_id:
token_boxes_example.append(self.sep_token_box)
elif id == self.pad_token_id:
token_boxes_example.append(self.pad_token_box)
else:
raise ValueError("Id not recognized")
token_boxes.append(token_boxes_example)
sanitized_tokens["bbox"] = token_boxes
# optionally, create the labels
if word_labels is not None:
labels = []
for batch_index in range(len(sanitized_tokens["input_ids"])):
if return_overflowing_tokens:
original_index = sanitized_tokens["overflow_to_sample_mapping"][batch_index]
else:
original_index = batch_index
labels_example = []
for id, offset, word_id in zip(
sanitized_tokens["input_ids"][batch_index],
sanitized_tokens["offset_mapping"][batch_index],
sanitized_encodings[batch_index].word_ids,
):
if word_id is not None:
if self.only_label_first_subword:
if offset[0] == 0:
# Use the real label id for the first token of the word, and padding ids for the remaining tokens
labels_example.append(word_labels[original_index][word_id])
else:
labels_example.append(self.pad_token_label)
else:
labels_example.append(word_labels[original_index][word_id])
else:
labels_example.append(self.pad_token_label)
labels.append(labels_example)
sanitized_tokens["labels"] = labels
# finally, remove offsets if the user didn't want them
if not return_offsets_mapping:
del sanitized_tokens["offset_mapping"]
return BatchEncoding(sanitized_tokens, sanitized_encodings, tensor_type=return_tensors)
def _encode_plus(
self,
text: Union[TextInput, PreTokenizedInput],
text_pair: Optional[PreTokenizedInput] = None,
boxes: Optional[List[List[int]]] = None,
word_labels: Optional[List[int]] = None,
add_special_tokens: bool = True,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[bool] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs
) -> BatchEncoding:
# make it a batched input
# 2 options:
# 1) only text, in case text must be a list of str
# 2) text + text_pair, in which case text = str and text_pair a list of str
batched_input = [(text, text_pair)] if text_pair else [text]
batched_boxes = [boxes]
batched_word_labels = [word_labels] if word_labels is not None else None
batched_output = self._batch_encode_plus(
batched_input,
is_pair=bool(text_pair is not None),
boxes=batched_boxes,
word_labels=batched_word_labels,
add_special_tokens=add_special_tokens,
padding_strategy=padding_strategy,
truncation_strategy=truncation_strategy,
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_offsets_mapping=return_offsets_mapping,
return_length=return_length,
verbose=verbose,
**kwargs,
)
# Return tensor is None, then we can remove the leading batch axis
# Overflowing tokens are returned as a batch of output so we keep them in this case
if return_tensors is None and not return_overflowing_tokens:
batched_output = BatchEncoding(
{
key: value[0] if len(value) > 0 and isinstance(value[0], list) else value
for key, value in batched_output.items()
},
batched_output.encodings,
)
self._eventual_warn_about_too_long_sequence(batched_output["input_ids"], max_length, verbose)
return batched_output
def _pad(
self,
encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
max_length: Optional[int] = None,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
pad_to_multiple_of: Optional[int] = None,
return_attention_mask: Optional[bool] = None,
) -> dict:
"""
Pad encoded inputs (on left/right and up to predefined length or max length in the batch)
Args:
encoded_inputs: Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`).
max_length: maximum length of the returned list and optionally padding length (see below).
Will truncate by taking into account the special tokens.
padding_strategy: PaddingStrategy to use for padding.
- PaddingStrategy.LONGEST Pad to the longest sequence in the batch
- PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
- PaddingStrategy.DO_NOT_PAD: Do not pad
The tokenizer padding sides are defined in self.padding_side:
- 'left': pads on the left of the sequences
- 'right': pads on the right of the sequences
pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
>= 7.5 (Volta).
return_attention_mask: (optional) Set to False to avoid returning attention mask (default: set to model specifics)
"""
# Load from model defaults
if return_attention_mask is None:
return_attention_mask = "attention_mask" in self.model_input_names
required_input = encoded_inputs[self.model_input_names[0]]
if padding_strategy == PaddingStrategy.LONGEST:
max_length = len(required_input)
if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length
# Initialize attention mask if not present.
if return_attention_mask and "attention_mask" not in encoded_inputs:
encoded_inputs["attention_mask"] = [1] * len(required_input)
if needs_to_be_padded:
difference = max_length - len(required_input)
if self.padding_side == "right":
if return_attention_mask:
encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference
if "token_type_ids" in encoded_inputs:
encoded_inputs["token_type_ids"] = (
encoded_inputs["token_type_ids"] + [self.pad_token_type_id] * difference
)
if "bbox" in encoded_inputs:
encoded_inputs["bbox"] = encoded_inputs["bbox"] + [self.pad_token_box] * difference
if "labels" in encoded_inputs:
encoded_inputs["labels"] = encoded_inputs["labels"] + [self.pad_token_label] * difference
if "special_tokens_mask" in encoded_inputs:
encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference
elif self.padding_side == "left":
if return_attention_mask:
encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
if "token_type_ids" in encoded_inputs:
encoded_inputs["token_type_ids"] = [self.pad_token_type_id] * difference + encoded_inputs[
"token_type_ids"
]
if "bbox" in encoded_inputs:
encoded_inputs["bbox"] = [self.pad_token_box] * difference + encoded_inputs["bbox"]
if "labels" in encoded_inputs:
encoded_inputs["labels"] = [self.pad_token_label] * difference + encoded_inputs["bbox"]
if "special_tokens_mask" in encoded_inputs:
encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
else:
raise ValueError("Invalid padding strategy:" + str(self.padding_side))
return encoded_inputs
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. An XLM-RoBERTa sequence has the following format:
- single sequence: ``<s> X </s>``
- pair of sequences: ``<s> A </s></s> B </s>``
Args:
token_ids_0 (:obj:`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (:obj:`List[int]`, `optional`):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
"""
if token_ids_1 is None:
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
cls = [self.cls_token_id]
sep = [self.sep_token_id]
return cls + token_ids_0 + sep + sep + token_ids_1 + sep
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task. XLM-RoBERTa does
not make use of token type ids, therefore a list of zeros is returned.
Args:
token_ids_0 (:obj:`List[int]`):
List of IDs.
token_ids_1 (:obj:`List[int]`, `optional`):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: List of zeros.
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0]
return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
if not self.can_save_slow_tokenizer:
raise ValueError(
"Your fast tokenizer does not have the necessary information to save the vocabulary for a slow "
"tokenizer."
)
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory.")
return
out_vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
copyfile(self.vocab_file, out_vocab_file)
return (out_vocab_file,)

View File

@@ -65,6 +65,15 @@ class DebertaV2Tokenizer:
requires_backends(cls, ["sentencepiece"]) requires_backends(cls, ["sentencepiece"])
class LayoutXLMTokenizer:
def __init__(self, *args, **kwargs):
requires_backends(self, ["sentencepiece"])
@classmethod
def from_pretrained(cls, *args, **kwargs):
requires_backends(cls, ["sentencepiece"])
class M2M100Tokenizer: class M2M100Tokenizer:
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
requires_backends(self, ["sentencepiece"]) requires_backends(self, ["sentencepiece"])

View File

@@ -200,6 +200,15 @@ class LayoutLMv2TokenizerFast:
requires_backends(cls, ["tokenizers"]) requires_backends(cls, ["tokenizers"])
class LayoutXLMTokenizerFast:
def __init__(self, *args, **kwargs):
requires_backends(self, ["tokenizers"])
@classmethod
def from_pretrained(cls, *args, **kwargs):
requires_backends(cls, ["tokenizers"])
class LEDTokenizerFast: class LEDTokenizerFast:
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
requires_backends(self, ["tokenizers"]) requires_backends(self, ["tokenizers"])

View File

@@ -50,6 +50,15 @@ class LayoutLMv2Processor:
requires_backends(cls, ["vision"]) requires_backends(cls, ["vision"])
class LayoutXLMProcessor:
def __init__(self, *args, **kwargs):
requires_backends(self, ["vision"])
@classmethod
def from_pretrained(cls, *args, **kwargs):
requires_backends(cls, ["vision"])
class SegformerFeatureExtractor: class SegformerFeatureExtractor:
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
requires_backends(self, ["vision"]) requires_backends(self, ["vision"])

View File

@@ -0,0 +1,423 @@
# Copyright 2021 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import os
import shutil
import tempfile
import unittest
from typing import List
from transformers import PreTrainedTokenizer, PreTrainedTokenizerBase, PreTrainedTokenizerFast
from transformers.file_utils import FEATURE_EXTRACTOR_NAME, cached_property, is_pytesseract_available
from transformers.models.layoutxlm import LayoutXLMTokenizer, LayoutXLMTokenizerFast
from transformers.testing_utils import (
require_pytesseract,
require_sentencepiece,
require_tokenizers,
require_torch,
slow,
)
if is_pytesseract_available():
from PIL import Image
from transformers import LayoutLMv2FeatureExtractor, LayoutXLMProcessor
SAMPLE_SP = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/test_sentencepiece.model")
@require_pytesseract
@require_sentencepiece
@require_tokenizers
class LayoutXLMProcessorTest(unittest.TestCase):
tokenizer_class = LayoutXLMTokenizer
rust_tokenizer_class = LayoutXLMTokenizerFast
def setUp(self):
feature_extractor_map = {
"do_resize": True,
"size": 224,
"apply_ocr": True,
}
self.tmpdirname = tempfile.mkdtemp()
self.feature_extraction_file = os.path.join(self.tmpdirname, FEATURE_EXTRACTOR_NAME)
with open(self.feature_extraction_file, "w", encoding="utf-8") as fp:
fp.write(json.dumps(feature_extractor_map) + "\n")
def get_tokenizer(self, **kwargs) -> PreTrainedTokenizer:
return self.tokenizer_class.from_pretrained(SAMPLE_SP, **kwargs)
def get_rust_tokenizer(self, **kwargs) -> PreTrainedTokenizerFast:
return self.rust_tokenizer_class.from_pretrained(SAMPLE_SP, **kwargs)
def get_tokenizers(self, **kwargs) -> List[PreTrainedTokenizerBase]:
return [self.get_tokenizer(**kwargs), self.get_rust_tokenizer(**kwargs)]
def get_feature_extractor(self, **kwargs):
return LayoutLMv2FeatureExtractor.from_pretrained(self.tmpdirname, **kwargs)
def tearDown(self):
shutil.rmtree(self.tmpdirname)
def test_save_load_pretrained_default(self):
feature_extractor = self.get_feature_extractor()
tokenizers = self.get_tokenizers()
for tokenizer in tokenizers:
processor = LayoutXLMProcessor(feature_extractor=feature_extractor, tokenizer=tokenizer)
processor.save_pretrained(self.tmpdirname)
processor = LayoutXLMProcessor.from_pretrained(self.tmpdirname)
self.assertEqual(processor.tokenizer.get_vocab(), tokenizer.get_vocab())
self.assertIsInstance(processor.tokenizer, (LayoutXLMTokenizer, LayoutXLMTokenizerFast))
self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor.to_json_string())
self.assertIsInstance(processor.feature_extractor, LayoutLMv2FeatureExtractor)
def test_save_load_pretrained_additional_features(self):
processor = LayoutXLMProcessor(feature_extractor=self.get_feature_extractor(), tokenizer=self.get_tokenizer())
processor.save_pretrained(self.tmpdirname)
# slow tokenizer
tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)")
feature_extractor_add_kwargs = self.get_feature_extractor(do_resize=False, size=30)
processor = LayoutXLMProcessor.from_pretrained(
self.tmpdirname,
use_fast=False,
bos_token="(BOS)",
eos_token="(EOS)",
do_resize=False,
size=30,
)
self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab())
self.assertIsInstance(processor.tokenizer, LayoutXLMTokenizer)
self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor_add_kwargs.to_json_string())
self.assertIsInstance(processor.feature_extractor, LayoutLMv2FeatureExtractor)
# fast tokenizer
tokenizer_add_kwargs = self.get_rust_tokenizer(bos_token="(BOS)", eos_token="(EOS)")
feature_extractor_add_kwargs = self.get_feature_extractor(do_resize=False, size=30)
processor = LayoutXLMProcessor.from_pretrained(
self.tmpdirname, use_xlm=True, bos_token="(BOS)", eos_token="(EOS)", do_resize=False, size=30
)
self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab())
self.assertIsInstance(processor.tokenizer, LayoutXLMTokenizerFast)
self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor_add_kwargs.to_json_string())
self.assertIsInstance(processor.feature_extractor, LayoutLMv2FeatureExtractor)
# different use cases tests
@require_sentencepiece
@require_torch
@require_pytesseract
class LayoutXLMProcessorIntegrationTests(unittest.TestCase):
@cached_property
def get_images(self):
# we verify our implementation on 2 document images from the DocVQA dataset
from datasets import load_dataset
ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test")
image_1 = Image.open(ds[0]["file"]).convert("RGB")
image_2 = Image.open(ds[1]["file"]).convert("RGB")
return image_1, image_2
@cached_property
def get_tokenizers(self):
slow_tokenizer = LayoutXLMTokenizer.from_pretrained("microsoft/layoutxlm-base")
fast_tokenizer = LayoutXLMTokenizerFast.from_pretrained("microsoft/layoutxlm-base")
return [slow_tokenizer, fast_tokenizer]
@slow
def test_processor_case_1(self):
# case 1: document image classification (training, inference) + token classification (inference), apply_ocr = True
feature_extractor = LayoutLMv2FeatureExtractor()
tokenizers = self.get_tokenizers
images = self.get_images
for tokenizer in tokenizers:
processor = LayoutXLMProcessor(feature_extractor=feature_extractor, tokenizer=tokenizer)
# not batched
input_feat_extract = feature_extractor(images[0], return_tensors="pt")
input_processor = processor(images[0], return_tensors="pt")
# verify keys
expected_keys = ["attention_mask", "bbox", "image", "input_ids"]
actual_keys = sorted(list(input_processor.keys()))
self.assertListEqual(actual_keys, expected_keys)
# verify image
self.assertAlmostEqual(
input_feat_extract["pixel_values"].sum(), input_processor["image"].sum(), delta=1e-2
)
# verify input_ids
# fmt: off
expected_decoding = "<s> 11:14 to 11:39 a.m 11:39 to 11:44 a.m. 11:44 a.m. to 12:25 p.m. 12:25 to 12:58 p.m. 12:58 to 4:00 p.m. 2:00 to 5:00 p.m. Coffee Break Coffee will be served for men and women in the lobby adjacent to exhibit area. Please move into exhibit area. (Exhibits Open) TRRF GENERAL SESSION (PART |) Presiding: Lee A. Waller TRRF Vice President “Introductory Remarks” Lee A. Waller, TRRF Vice Presi- dent Individual Interviews with TRRF Public Board Members and Sci- entific Advisory Council Mem- bers Conducted by TRRF Treasurer Philip G. Kuehn to get answers which the public refrigerated warehousing industry is looking for. Plus questions from the floor. Dr. Emil M. Mrak, University of Cal- ifornia, Chairman, TRRF Board; Sam R. Cecil, University of Georgia College of Agriculture; Dr. Stanley Charm, Tufts University School of Medicine; Dr. Robert H. Cotton, ITT Continental Baking Company; Dr. Owen Fennema, University of Wis- consin; Dr. Robert E. Hardenburg, USDA. Questions and Answers Exhibits Open Capt. Jack Stoney Room TRRF Scientific Advisory Council Meeting Ballroom Foyer</s>" # noqa: E231
# fmt: on
decoding = tokenizer.decode(input_processor.input_ids.squeeze().tolist())
self.assertSequenceEqual(decoding, expected_decoding)
# batched
input_feat_extract = feature_extractor(images, return_tensors="pt")
input_processor = processor(images, padding=True, return_tensors="pt")
# verify keys
expected_keys = ["attention_mask", "bbox", "image", "input_ids"]
actual_keys = sorted(list(input_processor.keys()))
self.assertListEqual(actual_keys, expected_keys)
# verify images
self.assertAlmostEqual(
input_feat_extract["pixel_values"].sum(), input_processor["image"].sum(), delta=1e-2
)
# verify input_ids
# fmt: off
expected_decoding = "<s> 7 ITC Limited REPORT AND ACCOUNTS 2013 ITCs Brands: An Asset for the Nation The consumer needs and aspirations they fulfil, the benefit they generate for millions across ITCs value chains, the future-ready capabilities that support them, and the value that they create for the country, have made ITCs brands national assets, adding to Indias competitiveness. It is ITCs aspiration to be the No 1 FMCG player in the country, driven by its new FMCG businesses. A recent Nielsen report has highlighted that ITC's new FMCG businesses are the fastest growing among the top consumer goods companies operating in India. ITC takes justifiable pride that, along with generating economic value, these celebrated Indian brands also drive the creation of larger societal capital through the virtuous cycle of sustainable and inclusive growth. DI WILLS * ; LOVE DELIGHTFULLY SOFT SKIN? aia Ans Source: https://www.industrydocuments.ucsf.edu/docs/snbx0223</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>" # noqa: E231
# fmt: on
decoding = tokenizer.decode(input_processor.input_ids[1].tolist())
self.assertSequenceEqual(decoding, expected_decoding)
@slow
def test_processor_case_2(self):
# case 2: document image classification (training, inference) + token classification (inference), apply_ocr=False
feature_extractor = LayoutLMv2FeatureExtractor(apply_ocr=False)
tokenizers = self.get_tokenizers
images = self.get_images
for tokenizer in tokenizers:
processor = LayoutXLMProcessor(feature_extractor=feature_extractor, tokenizer=tokenizer)
# not batched
words = ["hello", "world"]
boxes = [[1, 2, 3, 4], [5, 6, 7, 8]]
input_processor = processor(images[0], words, boxes=boxes, return_tensors="pt")
# verify keys
expected_keys = ["input_ids", "bbox", "attention_mask", "image"]
actual_keys = list(input_processor.keys())
for key in expected_keys:
self.assertIn(key, actual_keys)
# verify input_ids
expected_decoding = "<s> hello world</s>"
decoding = tokenizer.decode(input_processor.input_ids.squeeze().tolist())
self.assertSequenceEqual(decoding, expected_decoding)
# batched
words = [["hello", "world"], ["my", "name", "is", "niels"]]
boxes = [[[1, 2, 3, 4], [5, 6, 7, 8]], [[3, 2, 5, 1], [6, 7, 4, 2], [3, 9, 2, 4], [1, 1, 2, 3]]]
input_processor = processor(images, words, boxes=boxes, padding=True, return_tensors="pt")
# verify keys
expected_keys = ["attention_mask", "bbox", "image", "input_ids"]
actual_keys = sorted(list(input_processor.keys()))
self.assertListEqual(actual_keys, expected_keys)
# verify input_ids
expected_decoding = "<s> hello world</s><pad><pad>"
decoding = tokenizer.decode(input_processor.input_ids[0].tolist())
self.assertSequenceEqual(decoding, expected_decoding)
# verify bbox
expected_bbox = [
[0, 0, 0, 0],
[3, 2, 5, 1],
[6, 7, 4, 2],
[3, 9, 2, 4],
[1, 1, 2, 3],
[1, 1, 2, 3],
[1000, 1000, 1000, 1000],
]
self.assertListEqual(input_processor.bbox[1].tolist(), expected_bbox)
@slow
def test_processor_case_3(self):
# case 3: token classification (training), apply_ocr=False
feature_extractor = LayoutLMv2FeatureExtractor(apply_ocr=False)
tokenizers = self.get_tokenizers
images = self.get_images
for tokenizer in tokenizers:
processor = LayoutXLMProcessor(feature_extractor=feature_extractor, tokenizer=tokenizer)
# not batched
words = ["weirdly", "world"]
boxes = [[1, 2, 3, 4], [5, 6, 7, 8]]
word_labels = [1, 2]
input_processor = processor(images[0], words, boxes=boxes, word_labels=word_labels, return_tensors="pt")
# verify keys
expected_keys = ["attention_mask", "bbox", "image", "input_ids", "labels"]
actual_keys = sorted(list(input_processor.keys()))
self.assertListEqual(actual_keys, expected_keys)
# verify input_ids
expected_decoding = "<s> weirdly world</s>"
decoding = tokenizer.decode(input_processor.input_ids.squeeze().tolist())
self.assertSequenceEqual(decoding, expected_decoding)
# verify labels
expected_labels = [-100, 1, -100, 2, -100]
self.assertListEqual(input_processor.labels.squeeze().tolist(), expected_labels)
# batched
words = [["hello", "world"], ["my", "name", "is", "niels"]]
boxes = [[[1, 2, 3, 4], [5, 6, 7, 8]], [[3, 2, 5, 1], [6, 7, 4, 2], [3, 9, 2, 4], [1, 1, 2, 3]]]
word_labels = [[1, 2], [6, 3, 10, 2]]
input_processor = processor(
images, words, boxes=boxes, word_labels=word_labels, padding=True, return_tensors="pt"
)
# verify keys
expected_keys = ["attention_mask", "bbox", "image", "input_ids", "labels"]
actual_keys = sorted(list(input_processor.keys()))
self.assertListEqual(actual_keys, expected_keys)
# verify input_ids
expected_decoding = "<s> my name is niels</s>"
decoding = tokenizer.decode(input_processor.input_ids[1].tolist())
self.assertSequenceEqual(decoding, expected_decoding)
# verify bbox
expected_bbox = [
[0, 0, 0, 0],
[3, 2, 5, 1],
[6, 7, 4, 2],
[3, 9, 2, 4],
[1, 1, 2, 3],
[1, 1, 2, 3],
[1000, 1000, 1000, 1000],
]
self.assertListEqual(input_processor.bbox[1].tolist(), expected_bbox)
# verify labels
expected_labels = [-100, 6, 3, 10, 2, -100, -100]
self.assertListEqual(input_processor.labels[1].tolist(), expected_labels)
@slow
def test_processor_case_4(self):
# case 4: visual question answering (inference), apply_ocr=True
feature_extractor = LayoutLMv2FeatureExtractor()
tokenizers = self.get_tokenizers
images = self.get_images
for tokenizer in tokenizers:
processor = LayoutXLMProcessor(feature_extractor=feature_extractor, tokenizer=tokenizer)
# not batched
question = "What's his name?"
input_processor = processor(images[0], question, return_tensors="pt")
# verify keys
expected_keys = ["attention_mask", "bbox", "image", "input_ids"]
actual_keys = sorted(list(input_processor.keys()))
self.assertListEqual(actual_keys, expected_keys)
# verify input_ids
# fmt: off
expected_decoding = "<s> What's his name?</s></s> 11:14 to 11:39 a.m 11:39 to 11:44 a.m. 11:44 a.m. to 12:25 p.m. 12:25 to 12:58 p.m. 12:58 to 4:00 p.m. 2:00 to 5:00 p.m. Coffee Break Coffee will be served for men and women in the lobby adjacent to exhibit area. Please move into exhibit area. (Exhibits Open) TRRF GENERAL SESSION (PART |) Presiding: Lee A. Waller TRRF Vice President “Introductory Remarks” Lee A. Waller, TRRF Vice Presi- dent Individual Interviews with TRRF Public Board Members and Sci- entific Advisory Council Mem- bers Conducted by TRRF Treasurer Philip G. Kuehn to get answers which the public refrigerated warehousing industry is looking for. Plus questions from the floor. Dr. Emil M. Mrak, University of Cal- ifornia, Chairman, TRRF Board; Sam R. Cecil, University of Georgia College of Agriculture; Dr. Stanley Charm, Tufts University School of Medicine; Dr. Robert H. Cotton, ITT Continental Baking Company; Dr. Owen Fennema, University of Wis- consin; Dr. Robert E. Hardenburg, USDA. Questions and Answers Exhibits Open Capt. Jack Stoney Room TRRF Scientific Advisory Council Meeting Ballroom Foyer</s>" # noqa: E231
# fmt: on
decoding = tokenizer.decode(input_processor.input_ids.squeeze().tolist())
self.assertSequenceEqual(decoding, expected_decoding)
# batched
questions = ["How old is he?", "what's the time"]
input_processor = processor(
images, questions, padding="max_length", max_length=20, truncation=True, return_tensors="pt"
)
# verify keys
expected_keys = ["attention_mask", "bbox", "image", "input_ids"]
actual_keys = sorted(list(input_processor.keys()))
self.assertListEqual(actual_keys, expected_keys)
# verify input_ids
expected_decoding = "<s> what's the time</s></s> 7 ITC Limited REPORT AND ACCOUNTS 2013</s>"
decoding = tokenizer.decode(input_processor.input_ids[1].tolist())
self.assertSequenceEqual(decoding, expected_decoding)
# verify bbox
# fmt: off
expected_bbox = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [1000, 1000, 1000, 1000], [1000, 1000, 1000, 1000], [0, 45, 67, 80], [72, 56, 109, 67], [72, 56, 109, 67], [116, 56, 189, 67], [198, 59, 253, 66], [257, 59, 285, 66], [289, 59, 365, 66], [289, 59, 365, 66], [289, 59, 365, 66], [289, 59, 365, 66], [372, 59, 407, 66], [1000, 1000, 1000, 1000]] # noqa: E231
# fmt: on
self.assertListEqual(input_processor.bbox[1].tolist(), expected_bbox)
@slow
def test_processor_case_5(self):
# case 5: visual question answering (inference), apply_ocr=False
feature_extractor = LayoutLMv2FeatureExtractor(apply_ocr=False)
tokenizers = self.get_tokenizers
images = self.get_images
for tokenizer in tokenizers:
processor = LayoutXLMProcessor(feature_extractor=feature_extractor, tokenizer=tokenizer)
# not batched
question = "What's his name?"
words = ["hello", "world"]
boxes = [[1, 2, 3, 4], [5, 6, 7, 8]]
input_processor = processor(images[0], question, words, boxes, return_tensors="pt")
# verify keys
expected_keys = ["attention_mask", "bbox", "image", "input_ids"]
actual_keys = sorted(list(input_processor.keys()))
self.assertListEqual(actual_keys, expected_keys)
# verify input_ids
expected_decoding = "<s> What's his name?</s></s> hello world</s>"
decoding = tokenizer.decode(input_processor.input_ids.squeeze().tolist())
self.assertSequenceEqual(decoding, expected_decoding)
# batched
questions = ["How old is he?", "what's the time"]
words = [["hello", "world"], ["my", "name", "is", "niels"]]
boxes = [[[1, 2, 3, 4], [5, 6, 7, 8]], [[3, 2, 5, 1], [6, 7, 4, 2], [3, 9, 2, 4], [1, 1, 2, 3]]]
input_processor = processor(images, questions, words, boxes, padding=True, return_tensors="pt")
# verify keys
expected_keys = ["attention_mask", "bbox", "image", "input_ids"]
actual_keys = sorted(list(input_processor.keys()))
self.assertListEqual(actual_keys, expected_keys)
# verify input_ids
expected_decoding = "<s> How old is he?</s></s> hello world</s><pad><pad>"
decoding = tokenizer.decode(input_processor.input_ids[0].tolist())
self.assertSequenceEqual(decoding, expected_decoding)
expected_decoding = "<s> what's the time</s></s> my name is niels</s>"
decoding = tokenizer.decode(input_processor.input_ids[1].tolist())
self.assertSequenceEqual(decoding, expected_decoding)
# verify bbox
expected_bbox = [[6, 7, 4, 2], [3, 9, 2, 4], [1, 1, 2, 3], [1, 1, 2, 3], [1000, 1000, 1000, 1000]]
self.assertListEqual(input_processor.bbox[1].tolist()[-5:], expected_bbox)

File diff suppressed because it is too large Load Diff