Add LayoutXLMProcessor (and LayoutXLMTokenizer, LayoutXLMTokenizerFast) (#14115)
* Add LayoutXLMTokenizer and LayoutXLMTokenizerFast * Fix styling issues * Fix more styling issues * Fix more styling issues * Fix docstring * Fix unit tests * Fix docs * Fix unit tests * Fix typos and styling issues * Fix styling issues * Fix docstring * Make all tests of test_tokenization_layoutxlm pass * Add LayoutXLMProcessor * Make fixup * Make all LayoutXLMProcessor tests pass * Minor fixes * Leave LayoutLMv2Processor tests unchanged * Fix code quality * Move LayoutXLM tokenizers and processor to separate folder * Fix code quality * Apply suggestions from code review * Replace assertions by value errors * Remove methods from fast tokenizer Co-authored-by: King Yiu Suen <kingyiusuen@gmail.com>
This commit is contained in:
@@ -40,17 +40,45 @@ One can directly plug in the weights of LayoutXLM into a LayoutLMv2 model, like
|
|||||||
|
|
||||||
model = LayoutLMv2Model.from_pretrained('microsoft/layoutxlm-base')
|
model = LayoutLMv2Model.from_pretrained('microsoft/layoutxlm-base')
|
||||||
|
|
||||||
Note that LayoutXLM requires a different tokenizer, based on :class:`~transformers.XLMRobertaTokenizer`. You can
|
Note that LayoutXLM has its own tokenizer, based on
|
||||||
initialize it as follows:
|
:class:`~transformers.LayoutXLMTokenizer`/:class:`~transformers.LayoutXLMTokenizerFast`. You can initialize it as
|
||||||
|
follows:
|
||||||
|
|
||||||
.. code-block::
|
.. code-block::
|
||||||
|
|
||||||
from transformers import AutoTokenizer
|
from transformers import LayoutXLMTokenizer
|
||||||
|
|
||||||
tokenizer = AutoTokenizer.from_pretrained('microsoft/layoutxlm-base')
|
tokenizer = LayoutXLMTokenizer.from_pretrained('microsoft/layoutxlm-base')
|
||||||
|
|
||||||
|
Similar to LayoutLMv2, you can use :class:`~transformers.LayoutXLMProcessor` (which internally applies
|
||||||
|
:class:`~transformers.LayoutLMv2FeatureExtractor` and
|
||||||
|
:class:`~transformers.LayoutXLMTokenizer`/:class:`~transformers.LayoutXLMTokenizerFast` in sequence) to prepare all
|
||||||
|
data for the model.
|
||||||
|
|
||||||
As LayoutXLM's architecture is equivalent to that of LayoutLMv2, one can refer to :doc:`LayoutLMv2's documentation page
|
As LayoutXLM's architecture is equivalent to that of LayoutLMv2, one can refer to :doc:`LayoutLMv2's documentation page
|
||||||
<layoutlmv2>` for all tips, code examples and notebooks.
|
<layoutlmv2>` for all tips, code examples and notebooks.
|
||||||
|
|
||||||
This model was contributed by `nielsr <https://huggingface.co/nielsr>`__. The original code can be found `here
|
This model was contributed by `nielsr <https://huggingface.co/nielsr>`__. The original code can be found `here
|
||||||
<https://github.com/microsoft/unilm>`__.
|
<https://github.com/microsoft/unilm>`__.
|
||||||
|
|
||||||
|
|
||||||
|
LayoutXLMTokenizer
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.LayoutXLMTokenizer
|
||||||
|
:members: __call__, build_inputs_with_special_tokens, get_special_tokens_mask,
|
||||||
|
create_token_type_ids_from_sequences, save_vocabulary
|
||||||
|
|
||||||
|
|
||||||
|
LayoutXLMTokenizerFast
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.LayoutXLMTokenizerFast
|
||||||
|
:members: __call__
|
||||||
|
|
||||||
|
|
||||||
|
LayoutXLMProcessor
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.LayoutXLMProcessor
|
||||||
|
:members: __call__
|
||||||
|
|||||||
@@ -229,6 +229,7 @@ _import_structure = {
|
|||||||
"LayoutLMv2Processor",
|
"LayoutLMv2Processor",
|
||||||
"LayoutLMv2Tokenizer",
|
"LayoutLMv2Tokenizer",
|
||||||
],
|
],
|
||||||
|
"models.layoutxlm": ["LayoutXLMProcessor"],
|
||||||
"models.led": ["LED_PRETRAINED_CONFIG_ARCHIVE_MAP", "LEDConfig", "LEDTokenizer"],
|
"models.led": ["LED_PRETRAINED_CONFIG_ARCHIVE_MAP", "LEDConfig", "LEDTokenizer"],
|
||||||
"models.longformer": ["LONGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "LongformerConfig", "LongformerTokenizer"],
|
"models.longformer": ["LONGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "LongformerConfig", "LongformerTokenizer"],
|
||||||
"models.luke": ["LUKE_PRETRAINED_CONFIG_ARCHIVE_MAP", "LukeConfig", "LukeTokenizer"],
|
"models.luke": ["LUKE_PRETRAINED_CONFIG_ARCHIVE_MAP", "LukeConfig", "LukeTokenizer"],
|
||||||
@@ -365,6 +366,7 @@ if is_sentencepiece_available():
|
|||||||
_import_structure["models.big_bird"].append("BigBirdTokenizer")
|
_import_structure["models.big_bird"].append("BigBirdTokenizer")
|
||||||
_import_structure["models.camembert"].append("CamembertTokenizer")
|
_import_structure["models.camembert"].append("CamembertTokenizer")
|
||||||
_import_structure["models.deberta_v2"].append("DebertaV2Tokenizer")
|
_import_structure["models.deberta_v2"].append("DebertaV2Tokenizer")
|
||||||
|
_import_structure["models.layoutxlm"].append("LayoutXLMTokenizer")
|
||||||
_import_structure["models.m2m_100"].append("M2M100Tokenizer")
|
_import_structure["models.m2m_100"].append("M2M100Tokenizer")
|
||||||
_import_structure["models.marian"].append("MarianTokenizer")
|
_import_structure["models.marian"].append("MarianTokenizer")
|
||||||
_import_structure["models.mbart"].append("MBartTokenizer")
|
_import_structure["models.mbart"].append("MBartTokenizer")
|
||||||
@@ -411,6 +413,7 @@ if is_tokenizers_available():
|
|||||||
_import_structure["models.herbert"].append("HerbertTokenizerFast")
|
_import_structure["models.herbert"].append("HerbertTokenizerFast")
|
||||||
_import_structure["models.layoutlm"].append("LayoutLMTokenizerFast")
|
_import_structure["models.layoutlm"].append("LayoutLMTokenizerFast")
|
||||||
_import_structure["models.layoutlmv2"].append("LayoutLMv2TokenizerFast")
|
_import_structure["models.layoutlmv2"].append("LayoutLMv2TokenizerFast")
|
||||||
|
_import_structure["models.layoutxlm"].append("LayoutXLMTokenizerFast")
|
||||||
_import_structure["models.led"].append("LEDTokenizerFast")
|
_import_structure["models.led"].append("LEDTokenizerFast")
|
||||||
_import_structure["models.longformer"].append("LongformerTokenizerFast")
|
_import_structure["models.longformer"].append("LongformerTokenizerFast")
|
||||||
_import_structure["models.lxmert"].append("LxmertTokenizerFast")
|
_import_structure["models.lxmert"].append("LxmertTokenizerFast")
|
||||||
@@ -477,6 +480,7 @@ if is_vision_available():
|
|||||||
_import_structure["models.detr"].append("DetrFeatureExtractor")
|
_import_structure["models.detr"].append("DetrFeatureExtractor")
|
||||||
_import_structure["models.layoutlmv2"].append("LayoutLMv2FeatureExtractor")
|
_import_structure["models.layoutlmv2"].append("LayoutLMv2FeatureExtractor")
|
||||||
_import_structure["models.layoutlmv2"].append("LayoutLMv2Processor")
|
_import_structure["models.layoutlmv2"].append("LayoutLMv2Processor")
|
||||||
|
_import_structure["models.layoutxlm"].append("LayoutXLMProcessor")
|
||||||
_import_structure["models.segformer"].append("SegformerFeatureExtractor")
|
_import_structure["models.segformer"].append("SegformerFeatureExtractor")
|
||||||
_import_structure["models.vit"].append("ViTFeatureExtractor")
|
_import_structure["models.vit"].append("ViTFeatureExtractor")
|
||||||
else:
|
else:
|
||||||
@@ -2140,6 +2144,7 @@ if TYPE_CHECKING:
|
|||||||
LayoutLMv2Processor,
|
LayoutLMv2Processor,
|
||||||
LayoutLMv2Tokenizer,
|
LayoutLMv2Tokenizer,
|
||||||
)
|
)
|
||||||
|
from .models.layoutxlm import LayoutXLMProcessor
|
||||||
from .models.led import LED_PRETRAINED_CONFIG_ARCHIVE_MAP, LEDConfig, LEDTokenizer
|
from .models.led import LED_PRETRAINED_CONFIG_ARCHIVE_MAP, LEDConfig, LEDTokenizer
|
||||||
from .models.longformer import LONGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, LongformerConfig, LongformerTokenizer
|
from .models.longformer import LONGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, LongformerConfig, LongformerTokenizer
|
||||||
from .models.luke import LUKE_PRETRAINED_CONFIG_ARCHIVE_MAP, LukeConfig, LukeTokenizer
|
from .models.luke import LUKE_PRETRAINED_CONFIG_ARCHIVE_MAP, LukeConfig, LukeTokenizer
|
||||||
@@ -2266,6 +2271,7 @@ if TYPE_CHECKING:
|
|||||||
from .models.big_bird import BigBirdTokenizer
|
from .models.big_bird import BigBirdTokenizer
|
||||||
from .models.camembert import CamembertTokenizer
|
from .models.camembert import CamembertTokenizer
|
||||||
from .models.deberta_v2 import DebertaV2Tokenizer
|
from .models.deberta_v2 import DebertaV2Tokenizer
|
||||||
|
from .models.layoutxlm import LayoutXLMTokenizer
|
||||||
from .models.m2m_100 import M2M100Tokenizer
|
from .models.m2m_100 import M2M100Tokenizer
|
||||||
from .models.marian import MarianTokenizer
|
from .models.marian import MarianTokenizer
|
||||||
from .models.mbart import MBart50Tokenizer, MBartTokenizer
|
from .models.mbart import MBart50Tokenizer, MBartTokenizer
|
||||||
@@ -2302,6 +2308,7 @@ if TYPE_CHECKING:
|
|||||||
from .models.herbert import HerbertTokenizerFast
|
from .models.herbert import HerbertTokenizerFast
|
||||||
from .models.layoutlm import LayoutLMTokenizerFast
|
from .models.layoutlm import LayoutLMTokenizerFast
|
||||||
from .models.layoutlmv2 import LayoutLMv2TokenizerFast
|
from .models.layoutlmv2 import LayoutLMv2TokenizerFast
|
||||||
|
from .models.layoutxlm import LayoutXLMTokenizerFast
|
||||||
from .models.led import LEDTokenizerFast
|
from .models.led import LEDTokenizerFast
|
||||||
from .models.longformer import LongformerTokenizerFast
|
from .models.longformer import LongformerTokenizerFast
|
||||||
from .models.lxmert import LxmertTokenizerFast
|
from .models.lxmert import LxmertTokenizerFast
|
||||||
@@ -2349,6 +2356,7 @@ if TYPE_CHECKING:
|
|||||||
from .models.deit import DeiTFeatureExtractor
|
from .models.deit import DeiTFeatureExtractor
|
||||||
from .models.detr import DetrFeatureExtractor
|
from .models.detr import DetrFeatureExtractor
|
||||||
from .models.layoutlmv2 import LayoutLMv2FeatureExtractor, LayoutLMv2Processor
|
from .models.layoutlmv2 import LayoutLMv2FeatureExtractor, LayoutLMv2Processor
|
||||||
|
from .models.layoutxlm import LayoutXLMProcessor
|
||||||
from .models.segformer import SegformerFeatureExtractor
|
from .models.segformer import SegformerFeatureExtractor
|
||||||
from .models.vit import ViTFeatureExtractor
|
from .models.vit import ViTFeatureExtractor
|
||||||
else:
|
else:
|
||||||
|
|||||||
@@ -944,6 +944,7 @@ SLOW_TO_FAST_CONVERTERS = {
|
|||||||
"HerbertTokenizer": HerbertConverter,
|
"HerbertTokenizer": HerbertConverter,
|
||||||
"LayoutLMTokenizer": BertConverter,
|
"LayoutLMTokenizer": BertConverter,
|
||||||
"LayoutLMv2Tokenizer": BertConverter,
|
"LayoutLMv2Tokenizer": BertConverter,
|
||||||
|
"LayoutXLMTokenizer": XLMRobertaConverter,
|
||||||
"LongformerTokenizer": RobertaConverter,
|
"LongformerTokenizer": RobertaConverter,
|
||||||
"LEDTokenizer": RobertaConverter,
|
"LEDTokenizer": RobertaConverter,
|
||||||
"LxmertTokenizer": BertConverter,
|
"LxmertTokenizer": BertConverter,
|
||||||
|
|||||||
@@ -59,6 +59,7 @@ from . import (
|
|||||||
ibert,
|
ibert,
|
||||||
layoutlm,
|
layoutlm,
|
||||||
layoutlmv2,
|
layoutlmv2,
|
||||||
|
layoutxlm,
|
||||||
led,
|
led,
|
||||||
longformer,
|
longformer,
|
||||||
luke,
|
luke,
|
||||||
|
|||||||
@@ -124,6 +124,7 @@ else:
|
|||||||
("lxmert", ("LxmertTokenizer", "LxmertTokenizerFast" if is_tokenizers_available() else None)),
|
("lxmert", ("LxmertTokenizer", "LxmertTokenizerFast" if is_tokenizers_available() else None)),
|
||||||
("layoutlm", ("LayoutLMTokenizer", "LayoutLMTokenizerFast" if is_tokenizers_available() else None)),
|
("layoutlm", ("LayoutLMTokenizer", "LayoutLMTokenizerFast" if is_tokenizers_available() else None)),
|
||||||
("layoutlmv2", ("LayoutLMv2Tokenizer", "LayoutLMv2TokenizerFast" if is_tokenizers_available() else None)),
|
("layoutlmv2", ("LayoutLMv2Tokenizer", "LayoutLMv2TokenizerFast" if is_tokenizers_available() else None)),
|
||||||
|
("layoutxlm", ("LayoutXLMTokenizer", "LayoutXLMTokenizerFast" if is_tokenizers_available() else None)),
|
||||||
(
|
(
|
||||||
"dpr",
|
"dpr",
|
||||||
(
|
(
|
||||||
|
|||||||
54
src/transformers/models/layoutxlm/__init__.py
Normal file
54
src/transformers/models/layoutxlm/__init__.py
Normal file
@@ -0,0 +1,54 @@
|
|||||||
|
# flake8: noqa
|
||||||
|
# There's no way to ignore "F401 '...' imported but unused" warnings in this
|
||||||
|
# module, but to preserve other warnings. So, don't check this module at all.
|
||||||
|
|
||||||
|
# Copyright 2021 The HuggingFace Team. All rights reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
from typing import TYPE_CHECKING
|
||||||
|
|
||||||
|
from ...file_utils import (
|
||||||
|
_LazyModule,
|
||||||
|
is_sentencepiece_available,
|
||||||
|
is_tokenizers_available,
|
||||||
|
is_torch_available,
|
||||||
|
is_vision_available,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# Maps each submodule of this package to the public names it exports. Entries are
# only registered when the optional backend guarding them is available; the
# mapping is handed to `_LazyModule` at the bottom of this file.
_import_structure = {}

if is_sentencepiece_available():
    _import_structure["tokenization_layoutxlm"] = ["LayoutXLMTokenizer"]

if is_tokenizers_available():
    _import_structure["tokenization_layoutxlm_fast"] = ["LayoutXLMTokenizerFast"]

if is_vision_available():
    _import_structure["processing_layoutxlm"] = ["LayoutXLMProcessor"]

if TYPE_CHECKING:
    # Eager imports for static analysers; they mirror `_import_structure` above.
    if is_sentencepiece_available():
        from .tokenization_layoutxlm import LayoutXLMTokenizer

    if is_tokenizers_available():
        from .tokenization_layoutxlm_fast import LayoutXLMTokenizerFast

    if is_vision_available():
        # The processor is defined in `processing_layoutxlm` (the module registered in
        # `_import_structure`), not in a `processing_layoutlmv2` module of this package.
        from .processing_layoutxlm import LayoutXLMProcessor

else:
    import sys

    # Replace this module object with the lazy proxy built from `_import_structure`.
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure)
|
||||||
207
src/transformers/models/layoutxlm/processing_layoutxlm.py
Normal file
207
src/transformers/models/layoutxlm/processing_layoutxlm.py
Normal file
@@ -0,0 +1,207 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
# Copyright 2021 The HuggingFace Inc. team.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
"""
|
||||||
|
Processor class for LayoutXLM.
|
||||||
|
"""
|
||||||
|
from typing import List, Optional, Union
|
||||||
|
|
||||||
|
from transformers.models.layoutlmv2.feature_extraction_layoutlmv2 import LayoutLMv2FeatureExtractor
|
||||||
|
|
||||||
|
from ...file_utils import TensorType
|
||||||
|
from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
|
||||||
|
from .tokenization_layoutxlm import LayoutXLMTokenizer
|
||||||
|
from .tokenization_layoutxlm_fast import LayoutXLMTokenizerFast
|
||||||
|
|
||||||
|
|
||||||
|
class LayoutXLMProcessor:
    r"""
    Constructs a LayoutXLM processor which combines a LayoutXLM feature extractor and a LayoutXLM tokenizer into a
    single processor.

    :class:`~transformers.LayoutXLMProcessor` offers all the functionalities you need to prepare data for the model.

    It first uses :class:`~transformers.LayoutLMv2FeatureExtractor` to resize document images to a fixed size, and
    optionally applies OCR to get words and normalized bounding boxes. These are then provided to
    :class:`~transformers.LayoutXLMTokenizer` or :class:`~transformers.LayoutXLMTokenizerFast`, which turns the words
    and bounding boxes into token-level :obj:`input_ids`, :obj:`attention_mask`, :obj:`token_type_ids`, :obj:`bbox`.
    Optionally, one can provide integer :obj:`word_labels`, which are turned into token-level :obj:`labels` for token
    classification tasks (such as FUNSD, CORD).

    Args:
        feature_extractor (:obj:`LayoutLMv2FeatureExtractor`):
            An instance of :class:`~transformers.LayoutLMv2FeatureExtractor`. The feature extractor is a required
            input.
        tokenizer (:obj:`LayoutXLMTokenizer` or :obj:`LayoutXLMTokenizerFast`):
            An instance of :class:`~transformers.LayoutXLMTokenizer` or :class:`~transformers.LayoutXLMTokenizerFast`.
            The tokenizer is a required input.

    Raises:
        ValueError: If :obj:`feature_extractor` or :obj:`tokenizer` is not an instance of the expected class(es).
    """

    def __init__(self, feature_extractor, tokenizer):
        if not isinstance(feature_extractor, LayoutLMv2FeatureExtractor):
            # Interpolate the class itself: `SomeClass.__class__` is its metaclass (`type`), which would make
            # the message read "has to be of type <class 'type'>".
            raise ValueError(
                f"`feature_extractor` has to be of type {LayoutLMv2FeatureExtractor}, "
                f"but is {type(feature_extractor)}"
            )
        if not isinstance(tokenizer, (LayoutXLMTokenizer, LayoutXLMTokenizerFast)):
            raise ValueError(
                f"`tokenizer` has to be of type {LayoutXLMTokenizer} or {LayoutXLMTokenizerFast}, "
                f"but is {type(tokenizer)}"
            )

        self.feature_extractor = feature_extractor
        self.tokenizer = tokenizer

    def save_pretrained(self, save_directory):
        """
        Save a LayoutXLM feature_extractor object and LayoutXLM tokenizer object to the directory ``save_directory``,
        so that it can be re-loaded using the :func:`~transformers.LayoutXLMProcessor.from_pretrained` class method.

        .. note::

            This method is simply calling
            :meth:`~transformers.feature_extraction_utils.FeatureExtractionMixin.save_pretrained` and
            :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizer.save_pretrained`. Please refer to the
            docstrings of the methods above for more information.

        Args:
            save_directory (:obj:`str` or :obj:`os.PathLike`):
                Directory where the feature extractor JSON file and the tokenizer files will be saved (directory will
                be created if it does not exist).
        """

        self.feature_extractor.save_pretrained(save_directory)
        self.tokenizer.save_pretrained(save_directory)

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, use_fast=True, **kwargs):
        r"""
        Instantiate a :class:`~transformers.LayoutXLMProcessor` from a pretrained LayoutXLM processor.

        .. note::

            This class method is simply calling LayoutLMv2FeatureExtractor's
            :meth:`~transformers.feature_extraction_utils.FeatureExtractionMixin.from_pretrained` and
            LayoutXLMTokenizerFast's :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizer.from_pretrained`
            (or LayoutXLMTokenizer's, when :obj:`use_fast` is :obj:`False`). Please refer to the docstrings of the
            methods above for more information.

        Args:
            pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`):
                This can be either:

                - a string, the `model id` of a pretrained feature_extractor hosted inside a model repo on
                  huggingface.co. Valid model ids can be located at the root-level, like ``bert-base-uncased``, or
                  namespaced under a user or organization name, like ``dbmdz/bert-base-german-cased``.
                - a path to a `directory` containing a feature extractor file saved using the
                  :meth:`~transformers.SequenceFeatureExtractor.save_pretrained` method, e.g.,
                  ``./my_model_directory/``.
                - a path or url to a saved feature extractor JSON `file`, e.g.,
                  ``./my_model_directory/preprocessor_config.json``.

            use_fast (:obj:`bool`, `optional`, defaults to :obj:`True`):
                Whether or not to instantiate a fast tokenizer.

            **kwargs
                Additional keyword arguments passed along to both :class:`~transformers.SequenceFeatureExtractor` and
                :class:`~transformers.PreTrainedTokenizer`

        Returns:
            :class:`~transformers.LayoutXLMProcessor`: A processor wrapping the loaded feature extractor and tokenizer.
        """
        feature_extractor = LayoutLMv2FeatureExtractor.from_pretrained(pretrained_model_name_or_path, **kwargs)
        if use_fast:
            tokenizer = LayoutXLMTokenizerFast.from_pretrained(pretrained_model_name_or_path, **kwargs)
        else:
            tokenizer = LayoutXLMTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)

        return cls(feature_extractor=feature_extractor, tokenizer=tokenizer)

    def __call__(
        self,
        images,
        text: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None,
        text_pair: Optional[Union[PreTokenizedInput, List[PreTokenizedInput]]] = None,
        boxes: Optional[Union[List[List[int]], List[List[List[int]]]]] = None,
        word_labels: Optional[Union[List[int], List[List[int]]]] = None,
        add_special_tokens: bool = True,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Union[bool, str, TruncationStrategy] = False,
        max_length: Optional[int] = None,
        stride: int = 0,
        pad_to_multiple_of: Optional[int] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        return_tensors: Optional[Union[str, TensorType]] = None,
        **kwargs
    ) -> BatchEncoding:
        """
        This method first forwards the :obj:`images` argument to
        :meth:`~transformers.LayoutLMv2FeatureExtractor.__call__`. In case :class:`~LayoutLMv2FeatureExtractor` was
        initialized with :obj:`apply_ocr` set to ``True``, it passes the obtained words and bounding boxes along with
        the additional arguments to :meth:`~transformers.LayoutXLMTokenizer.__call__` and returns the output, together
        with resized :obj:`images`. In case :class:`~LayoutLMv2FeatureExtractor` was initialized with :obj:`apply_ocr`
        set to ``False``, it passes the words (:obj:`text`/:obj:`text_pair`) and :obj:`boxes` specified by the user
        along with the additional arguments to :meth:`~transformers.LayoutXLMTokenizer.__call__` and returns the
        output, together with resized :obj:`images`.

        Please refer to the docstring of the above two methods for more information.

        Returns:
            :class:`~transformers.BatchEncoding`: The tokenizer output, with the feature extractor's
            :obj:`pixel_values` added under the :obj:`image` key.

        Raises:
            ValueError: If :obj:`boxes` or :obj:`word_labels` are provided while the feature extractor was
                initialized with :obj:`apply_ocr` set to ``True``.
        """
        # verify input
        if self.feature_extractor.apply_ocr and (boxes is not None):
            raise ValueError(
                "You cannot provide bounding boxes "
                "if you initialized the feature extractor with apply_ocr set to True."
            )

        if self.feature_extractor.apply_ocr and (word_labels is not None):
            raise ValueError(
                "You cannot provide word labels "
                "if you initialized the feature extractor with apply_ocr set to True."
            )

        # first, apply the feature extractor
        features = self.feature_extractor(images=images, return_tensors=return_tensors)

        # second, apply the tokenizer
        # With OCR enabled, a user-supplied `text` is used as the first sequence and the OCR'd words
        # become the second sequence.
        if text is not None and self.feature_extractor.apply_ocr and text_pair is None:
            if isinstance(text, str):
                text = [text]  # add batch dimension (as the feature extractor always adds a batch dimension)
            text_pair = features["words"]

        encoded_inputs = self.tokenizer(
            text=text if text is not None else features["words"],
            text_pair=text_pair,
            boxes=boxes if boxes is not None else features["boxes"],
            word_labels=word_labels,
            add_special_tokens=add_special_tokens,
            padding=padding,
            truncation=truncation,
            max_length=max_length,
            stride=stride,
            pad_to_multiple_of=pad_to_multiple_of,
            return_token_type_ids=return_token_type_ids,
            return_attention_mask=return_attention_mask,
            return_overflowing_tokens=return_overflowing_tokens,
            return_special_tokens_mask=return_special_tokens_mask,
            return_offsets_mapping=return_offsets_mapping,
            return_length=return_length,
            verbose=verbose,
            return_tensors=return_tensors,
            **kwargs,
        )

        # add pixel values
        # NOTE(review): `pixel_values` is attached as returned by the feature extractor; presumably, with
        # `return_overflowing_tokens=True`, the tokenizer may emit more rows than there are images -- confirm
        # whether callers need the images repeated per overflow row.
        encoded_inputs["image"] = features.pop("pixel_values")

        return encoded_inputs
|
||||||
1061
src/transformers/models/layoutxlm/tokenization_layoutxlm.py
Normal file
1061
src/transformers/models/layoutxlm/tokenization_layoutxlm.py
Normal file
File diff suppressed because it is too large
Load Diff
694
src/transformers/models/layoutxlm/tokenization_layoutxlm_fast.py
Normal file
694
src/transformers/models/layoutxlm/tokenization_layoutxlm_fast.py
Normal file
@@ -0,0 +1,694 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
# Copyright 2021 The HuggingFace Inc. team.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License
|
||||||
|
""" Tokenization classes for LayoutXLM model."""
|
||||||
|
|
||||||
|
|
||||||
|
import os
|
||||||
|
from shutil import copyfile
|
||||||
|
from typing import Dict, List, Optional, Tuple, Union
|
||||||
|
|
||||||
|
from transformers.models.layoutlmv2.tokenization_layoutlmv2 import LAYOUTLMV2_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING
|
||||||
|
|
||||||
|
from ...file_utils import PaddingStrategy, TensorType, add_end_docstrings, is_sentencepiece_available
|
||||||
|
from ...tokenization_utils import AddedToken
|
||||||
|
from ...tokenization_utils_base import (
|
||||||
|
ENCODE_KWARGS_DOCSTRING,
|
||||||
|
BatchEncoding,
|
||||||
|
EncodedInput,
|
||||||
|
PreTokenizedInput,
|
||||||
|
TextInput,
|
||||||
|
TextInputPair,
|
||||||
|
TruncationStrategy,
|
||||||
|
)
|
||||||
|
from ...tokenization_utils_fast import PreTrainedTokenizerFast
|
||||||
|
from ...utils import logging
|
||||||
|
from ..xlm_roberta.tokenization_xlm_roberta import (
|
||||||
|
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES,
|
||||||
|
PRETRAINED_VOCAB_FILES_MAP,
|
||||||
|
VOCAB_FILES_NAMES,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
if is_sentencepiece_available():
|
||||||
|
from .tokenization_layoutxlm import LayoutXLMTokenizer
|
||||||
|
else:
|
||||||
|
LayoutXLMTokenizer = None
|
||||||
|
|
||||||
|
|
||||||
|
logger = logging.get_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class LayoutXLMTokenizerFast(PreTrainedTokenizerFast):
|
||||||
|
"""
|
||||||
|
Construct a "fast" LayoutXLM tokenizer (backed by HuggingFace's `tokenizers` library). Adapted from
|
||||||
|
:class:`~transformers.RobertaTokenizer` and :class:`~transformers.XLNetTokenizer`. Based on `BPE
|
||||||
|
<https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=BPE#models>`__.
|
||||||
|
|
||||||
|
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
|
||||||
|
methods. Users should refer to this superclass for more information regarding those methods.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
vocab_file (:obj:`str`):
|
||||||
|
Path to the vocabulary file.
|
||||||
|
bos_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
|
||||||
|
The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
When building a sequence using special tokens, this is not the token that is used for the beginning of
|
||||||
|
sequence. The token used is the :obj:`cls_token`.
|
||||||
|
eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
|
||||||
|
The end of sequence token.
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
When building a sequence using special tokens, this is not the token that is used for the end of
|
||||||
|
sequence. The token used is the :obj:`sep_token`.
|
||||||
|
sep_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
|
||||||
|
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
|
||||||
|
sequence classification or for a text and a question for question answering. It is also used as the last
|
||||||
|
token of a sequence built with special tokens.
|
||||||
|
cls_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
|
||||||
|
The classifier token which is used when doing sequence classification (classification of the whole sequence
|
||||||
|
instead of per-token classification). It is the first token of the sequence when built with special tokens.
|
||||||
|
unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
|
||||||
|
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
|
||||||
|
token instead.
|
||||||
|
pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
|
||||||
|
The token used for padding, for example when batching sequences of different lengths.
|
||||||
|
mask_token (:obj:`str`, `optional`, defaults to :obj:`"<mask>"`):
|
||||||
|
The token used for masking values. This is the token used when training this model with masked language
|
||||||
|
modeling. This is the token which the model will try to predict.
|
||||||
|
cls_token_box (:obj:`List[int]`, `optional`, defaults to :obj:`[0, 0, 0, 0]`):
|
||||||
|
The bounding box to use for the special [CLS] token.
|
||||||
|
sep_token_box (:obj:`List[int]`, `optional`, defaults to :obj:`[1000, 1000, 1000, 1000]`):
|
||||||
|
The bounding box to use for the special [SEP] token.
|
||||||
|
pad_token_box (:obj:`List[int]`, `optional`, defaults to :obj:`[0, 0, 0, 0]`):
|
||||||
|
The bounding box to use for the special [PAD] token.
|
||||||
|
pad_token_label (:obj:`int`, `optional`, defaults to -100):
|
||||||
|
The label to use for padding tokens. Defaults to -100, which is the :obj:`ignore_index` of PyTorch's
|
||||||
|
CrossEntropyLoss.
|
||||||
|
only_label_first_subword (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
||||||
|
Whether or not to only label the first subword, in case word labels are provided.
|
||||||
|
additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
|
||||||
|
Additional special tokens used by the tokenizer.
|
||||||
|
"""
|
||||||
|
|
||||||
|
vocab_files_names = VOCAB_FILES_NAMES
|
||||||
|
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
|
||||||
|
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
|
||||||
|
model_input_names = ["input_ids", "attention_mask"]
|
||||||
|
slow_tokenizer_class = LayoutXLMTokenizer
|
||||||
|
|
||||||
|
def __init__(
    self,
    vocab_file=None,
    tokenizer_file=None,
    bos_token="<s>",
    eos_token="</s>",
    sep_token="</s>",
    cls_token="<s>",
    unk_token="<unk>",
    pad_token="<pad>",
    mask_token="<mask>",
    cls_token_box=[0, 0, 0, 0],
    sep_token_box=[1000, 1000, 1000, 1000],
    pad_token_box=[0, 0, 0, 0],
    pad_token_label=-100,
    only_label_first_subword=True,
    **kwargs
):
    """
    Forward the special tokens and layout options to the parent constructor, then keep the vocabulary path and
    the layout options (special-token boxes, padding label, first-subword labelling flag) on the instance.
    """
    # A mask token given as a plain string is wrapped so that it strips whitespace on its left only,
    # i.e. it includes the space before it like a normal word.
    if isinstance(mask_token, str):
        mask_token = AddedToken(mask_token, lstrip=True, rstrip=False)

    layout_options = dict(
        cls_token_box=cls_token_box,
        sep_token_box=sep_token_box,
        pad_token_box=pad_token_box,
        pad_token_label=pad_token_label,
        only_label_first_subword=only_label_first_subword,
    )

    super().__init__(
        vocab_file,
        tokenizer_file=tokenizer_file,
        bos_token=bos_token,
        eos_token=eos_token,
        sep_token=sep_token,
        cls_token=cls_token,
        unk_token=unk_token,
        pad_token=pad_token,
        mask_token=mask_token,
        **layout_options,
        **kwargs,
    )

    self.vocab_file = vocab_file
    # `save_vocabulary` refuses to run when this flag is False, i.e. when no vocab file was given.
    self.can_save_slow_tokenizer = bool(self.vocab_file)

    # additional, layout-specific properties
    self.cls_token_box = cls_token_box
    self.sep_token_box = sep_token_box
    self.pad_token_box = pad_token_box
    self.pad_token_label = pad_token_label
    self.only_label_first_subword = only_label_first_subword
|
||||||
|
|
||||||
|
@add_end_docstrings(ENCODE_KWARGS_DOCSTRING, LAYOUTLMV2_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
def __call__(
    self,
    text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]],
    text_pair: Optional[Union[PreTokenizedInput, List[PreTokenizedInput]]] = None,
    boxes: Union[List[List[int]], List[List[List[int]]]] = None,
    word_labels: Optional[Union[List[int], List[List[int]]]] = None,
    add_special_tokens: bool = True,
    padding: Union[bool, str, PaddingStrategy] = False,
    truncation: Union[bool, str, TruncationStrategy] = False,
    max_length: Optional[int] = None,
    stride: int = 0,
    pad_to_multiple_of: Optional[int] = None,
    return_tensors: Optional[Union[str, TensorType]] = None,
    return_token_type_ids: Optional[bool] = None,
    return_attention_mask: Optional[bool] = None,
    return_overflowing_tokens: bool = False,
    return_special_tokens_mask: bool = False,
    return_offsets_mapping: bool = False,
    return_length: bool = False,
    verbose: bool = True,
    **kwargs
) -> BatchEncoding:
    """
    Validate the inputs and dispatch to :obj:`encode_plus` (single example) or :obj:`batch_encode_plus` (batch)
    to tokenize one or several sequence(s), or pair(s) of sequences, together with word-level normalized
    bounding boxes and optional word labels.

    Args:
        text (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`):
            The sequence or batch of sequences to be encoded. Each sequence can be a string, a list of strings
            (words of a single example or questions of a batch of examples) or a list of list of strings (batch of
            words).
        text_pair (:obj:`List[str]`, :obj:`List[List[str]]`):
            The sequence or batch of sequences to be encoded. Each sequence should be a list of strings
            (pretokenized string).
        boxes (:obj:`List[List[int]]`, :obj:`List[List[List[int]]]`):
            Word-level bounding boxes. Each bounding box should be normalized to be on a 0-1000 scale.
        word_labels (:obj:`List[int]`, :obj:`List[List[int]]`, `optional`):
            Word-level integer labels (for token classification tasks such as FUNSD, CORD).
    """

    def _looks_like_text(candidate):
        # Accept a string, or a list/tuple that is empty, starts with a string, or starts with a
        # list/tuple that is itself empty or starts with a string. Only first elements are inspected.
        if isinstance(candidate, str):
            return True
        if not isinstance(candidate, (list, tuple)):
            return False
        if len(candidate) == 0:
            return True
        head = candidate[0]
        if isinstance(head, str):
            return True
        if isinstance(head, (list, tuple)):
            return len(head) == 0 or isinstance(head[0], str)
        return False

    has_pair = text_pair is not None
    text_is_sequence = isinstance(text, (list, tuple))

    if has_pair:
        # text + text_pair: text holds the question(s), text_pair holds the words
        if not _looks_like_text(text):
            raise ValueError("text input must of type `str` (single example) or `List[str]` (batch of examples). ")
        if not isinstance(text_pair, (list, tuple)):
            raise ValueError(
                "words must of type `List[str]` (single pretokenized example), "
                "or `List[List[str]]` (batch of pretokenized examples)."
            )
        # with a pair, any list/tuple of questions means a batch
        is_batched = text_is_sequence
        words = text_pair
    else:
        # only text: it must be the words themselves
        if not text_is_sequence:
            raise ValueError(
                "Words must of type `List[str]` (single pretokenized example), "
                "or `List[List[str]]` (batch of pretokenized examples)."
            )
        # without a pair, a batch is a non-empty list whose first element is itself a list/tuple
        is_batched = text and isinstance(text[0], (list, tuple))
        words = text

    if boxes is None:
        raise ValueError("You must provide corresponding bounding boxes")
    if is_batched:
        if len(words) != len(boxes):
            raise ValueError("You must provide words and boxes for an equal amount of examples")
        for words_example, boxes_example in zip(words, boxes):
            if len(words_example) != len(boxes_example):
                raise ValueError("You must provide as many words as there are bounding boxes")
    elif len(words) != len(boxes):
        raise ValueError("You must provide as many words as there are bounding boxes")

    # Keyword arguments that are passed unchanged to either encoding entry point.
    shared_kwargs = dict(
        boxes=boxes,
        word_labels=word_labels,
        add_special_tokens=add_special_tokens,
        padding=padding,
        truncation=truncation,
        max_length=max_length,
        stride=stride,
        pad_to_multiple_of=pad_to_multiple_of,
        return_tensors=return_tensors,
        return_token_type_ids=return_token_type_ids,
        return_attention_mask=return_attention_mask,
        return_overflowing_tokens=return_overflowing_tokens,
        return_special_tokens_mask=return_special_tokens_mask,
        return_offsets_mapping=return_offsets_mapping,
        return_length=return_length,
        verbose=verbose,
        **kwargs,
    )

    if not is_batched:
        return self.encode_plus(text=text, text_pair=text_pair, **shared_kwargs)

    if has_pair and len(text) != len(text_pair):
        raise ValueError(
            f"batch length of `text`: {len(text)} does not match batch length of `text_pair`: {len(text_pair)}."
        )
    batch_text_or_text_pairs = list(zip(text, text_pair)) if has_pair else text
    return self.batch_encode_plus(
        batch_text_or_text_pairs=batch_text_or_text_pairs,
        is_pair=has_pair,
        **shared_kwargs,
    )
|
||||||
|
|
||||||
|
def tokenize(self, text: str, pair: Optional[str] = None, add_special_tokens: bool = False, **kwargs) -> List[str]:
    """
    Return the token strings of ``text`` (optionally paired with ``pair``) as produced by the backend tokenizer.

    The input is encoded as a one-element, non-pretokenized batch; extra keyword arguments are passed to the
    backend's ``encode_batch``.
    """
    # A falsy `pair` (None or empty string) means a single sequence.
    single_example = (text, pair) if pair else text
    batch_result = self._tokenizer.encode_batch(
        [single_example], add_special_tokens=add_special_tokens, is_pretokenized=False, **kwargs
    )
    first_encoding = batch_result[0]
    return first_encoding.tokens
|
||||||
|
|
||||||
|
def _batch_encode_plus(
    self,
    batch_text_or_text_pairs: Union[
        List[TextInput],
        List[TextInputPair],
        List[PreTokenizedInput],
    ],
    is_pair: bool = None,
    boxes: Optional[List[List[List[int]]]] = None,
    word_labels: Optional[List[List[int]]] = None,
    add_special_tokens: bool = True,
    padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
    truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
    max_length: Optional[int] = None,
    stride: int = 0,
    pad_to_multiple_of: Optional[int] = None,
    return_tensors: Optional[str] = None,
    return_token_type_ids: Optional[bool] = None,
    return_attention_mask: Optional[bool] = None,
    return_overflowing_tokens: bool = False,
    return_special_tokens_mask: bool = False,
    return_offsets_mapping: bool = False,
    return_length: bool = False,
    verbose: bool = True,
    **kwargs,
) -> BatchEncoding:
    """
    Encode a batch of pretokenized examples with the backend tokenizer and attach a ``bbox`` entry (one box per
    token) and, when ``word_labels`` is given, a ``labels`` entry (one label per token).

    Raises:
        TypeError: if ``batch_text_or_text_pairs`` is not a list.
        ValueError: if a token without a word id is neither the CLS, SEP nor PAD token.
    """
    if not isinstance(batch_text_or_text_pairs, list):
        raise TypeError(f"batch_text_or_text_pairs has to be a list (got {type(batch_text_or_text_pairs)})")

    # Configure truncation and padding on the backend tokenizer for this call
    self.set_truncation_and_padding(
        padding_strategy=padding_strategy,
        truncation_strategy=truncation_strategy,
        max_length=max_length,
        stride=stride,
        pad_to_multiple_of=pad_to_multiple_of,
    )

    if is_pair:
        # the first element of each pair (the question) is a raw string: split it on whitespace so that both
        # members of the pair are pretokenized
        batch_text_or_text_pairs = [(question.split(), words) for question, words in batch_text_or_text_pairs]

    encodings = self._tokenizer.encode_batch(
        batch_text_or_text_pairs,
        add_special_tokens=add_special_tokens,
        is_pretokenized=True,  # we set this to True as LayoutLMv2 always expects pretokenized inputs
    )

    # offsets are needed to build the labels, even if the caller did not ask for them
    want_offsets = True if word_labels is not None else return_offsets_mapping

    # One (token dict, list of encodings) pair per input example; the nested dimensions correspond to
    # batch, overflows, sequence length.
    tokens_and_encodings = [
        self._convert_encoding(
            encoding=encoding,
            return_token_type_ids=return_token_type_ids,
            return_attention_mask=return_attention_mask,
            return_overflowing_tokens=return_overflowing_tokens,
            return_special_tokens_mask=return_special_tokens_mask,
            return_offsets_mapping=want_offsets,
            return_length=return_length,
            verbose=verbose,
        )
        for encoding in encodings
    ]

    # Flatten list[dict] into dict[list], dropping the overflow dimension:
    # (batch, overflows, sequence length) becomes ~ (batch * overflows, sequence length), where the number of
    # overflows may differ per example. `overflow_to_sample_mapping` (below) links rows back to examples.
    first_token_dict = tokens_and_encodings[0][0]
    sanitized_tokens = {
        key: [e for item, _ in tokens_and_encodings for e in item[key]] for key in first_token_dict.keys()
    }
    sanitized_encodings = [e for _, item in tokens_and_encodings for e in item]

    if return_overflowing_tokens:
        # row index -> index of the example the row was produced from
        sanitized_tokens["overflow_to_sample_mapping"] = [
            i for i, (toks, _) in enumerate(tokens_and_encodings) for _ in range(len(toks["input_ids"]))
        ]

    for input_ids in sanitized_tokens["input_ids"]:
        self._eventual_warn_about_too_long_sequence(input_ids, max_length, verbose)

    def _sample_index(row):
        # Index into `boxes` / `word_labels` for a given output row.
        if return_overflowing_tokens:
            return sanitized_tokens["overflow_to_sample_mapping"][row]
        return row

    # create the token boxes
    token_boxes = []
    for row, row_ids in enumerate(sanitized_tokens["input_ids"]):
        sample = _sample_index(row)
        row_boxes = []
        for token_id, sequence_id, word_id in zip(
            row_ids,
            sanitized_encodings[row].sequence_ids,
            sanitized_encodings[row].word_ids,
        ):
            if word_id is None:
                # no word id: must be one of the known special tokens
                if token_id == self.cls_token_id:
                    row_boxes.append(self.cls_token_box)
                elif token_id == self.sep_token_id:
                    row_boxes.append(self.sep_token_box)
                elif token_id == self.pad_token_id:
                    row_boxes.append(self.pad_token_box)
                else:
                    raise ValueError("Id not recognized")
            elif is_pair and sequence_id == 0:
                # tokens of the first sequence of a pair (the question) get the padding box
                row_boxes.append(self.pad_token_box)
            else:
                row_boxes.append(boxes[sample][word_id])
        token_boxes.append(row_boxes)

    sanitized_tokens["bbox"] = token_boxes

    # optionally, create the labels
    if word_labels is not None:
        labels = []
        for row, row_ids in enumerate(sanitized_tokens["input_ids"]):
            sample = _sample_index(row)
            row_labels = []
            for _, offset, word_id in zip(
                row_ids,
                sanitized_tokens["offset_mapping"][row],
                sanitized_encodings[row].word_ids,
            ):
                if word_id is None:
                    row_labels.append(self.pad_token_label)
                elif self.only_label_first_subword and offset[0] != 0:
                    # not the first token of the word: use the padding label
                    row_labels.append(self.pad_token_label)
                else:
                    # first token of the word, or every token when all subwords are labelled
                    row_labels.append(word_labels[sample][word_id])
            labels.append(row_labels)

        sanitized_tokens["labels"] = labels
        # finally, remove offsets if the user didn't want them
        if not return_offsets_mapping:
            del sanitized_tokens["offset_mapping"]

    return BatchEncoding(sanitized_tokens, sanitized_encodings, tensor_type=return_tensors)
|
||||||
|
|
||||||
|
def _encode_plus(
    self,
    text: Union[TextInput, PreTokenizedInput],
    text_pair: Optional[PreTokenizedInput] = None,
    boxes: Optional[List[List[int]]] = None,
    word_labels: Optional[List[int]] = None,
    add_special_tokens: bool = True,
    padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
    truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
    max_length: Optional[int] = None,
    stride: int = 0,
    pad_to_multiple_of: Optional[int] = None,
    return_tensors: Optional[str] = None,
    return_token_type_ids: Optional[bool] = None,
    return_attention_mask: Optional[bool] = None,
    return_overflowing_tokens: bool = False,
    return_special_tokens_mask: bool = False,
    return_offsets_mapping: bool = False,
    return_length: bool = False,
    verbose: bool = True,
    **kwargs
) -> BatchEncoding:
    """
    Encode a single example by wrapping it into a one-element batch and delegating to
    :obj:`_batch_encode_plus`.

    Two input layouts are supported:

    1. only ``text``, in which case ``text`` must be a list of words;
    2. ``text`` + ``text_pair``, in which case ``text`` is a string and ``text_pair`` a list of words.

    When ``return_tensors`` is ``None`` and overflowing tokens are not requested, the leading batch axis is
    removed from every list-of-lists value of the result.
    """
    # Decide "pair or not" once, with the same `is not None` test for both the batched input and the
    # `is_pair` flag. A truthiness test here would turn an empty `text_pair` list into a non-pair input while
    # `is_pair` stayed True, which breaks the pair unpacking in `_batch_encode_plus`.
    has_pair = text_pair is not None

    # make it a batched input
    batched_input = [(text, text_pair)] if has_pair else [text]
    batched_boxes = [boxes]
    batched_word_labels = [word_labels] if word_labels is not None else None

    batched_output = self._batch_encode_plus(
        batched_input,
        is_pair=has_pair,
        boxes=batched_boxes,
        word_labels=batched_word_labels,
        add_special_tokens=add_special_tokens,
        padding_strategy=padding_strategy,
        truncation_strategy=truncation_strategy,
        max_length=max_length,
        stride=stride,
        pad_to_multiple_of=pad_to_multiple_of,
        return_tensors=return_tensors,
        return_token_type_ids=return_token_type_ids,
        return_attention_mask=return_attention_mask,
        return_overflowing_tokens=return_overflowing_tokens,
        return_special_tokens_mask=return_special_tokens_mask,
        return_offsets_mapping=return_offsets_mapping,
        return_length=return_length,
        verbose=verbose,
        **kwargs,
    )

    # If return_tensors is None, we can remove the leading batch axis.
    # Overflowing tokens are returned as a batch of output so we keep them in this case.
    if return_tensors is None and not return_overflowing_tokens:
        batched_output = BatchEncoding(
            {
                key: value[0] if len(value) > 0 and isinstance(value[0], list) else value
                for key, value in batched_output.items()
            },
            batched_output.encodings,
        )

    self._eventual_warn_about_too_long_sequence(batched_output["input_ids"], max_length, verbose)

    return batched_output
|
||||||
|
|
||||||
|
def _pad(
    self,
    encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
    max_length: Optional[int] = None,
    padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
    pad_to_multiple_of: Optional[int] = None,
    return_attention_mask: Optional[bool] = None,
) -> dict:
    """
    Pad the entries of ``encoded_inputs`` in place, on the side given by ``self.padding_side``.

    Args:
        encoded_inputs: Dictionary of tokenized inputs. The entry named by ``self.model_input_names[0]`` is the
            required input whose length drives the padding.
        max_length: target length of the padded lists.
        padding_strategy: PaddingStrategy to use for padding.

            - PaddingStrategy.LONGEST: ``max_length`` is replaced by the length of the required input
            - PaddingStrategy.MAX_LENGTH: pad to ``max_length``
            - PaddingStrategy.DO_NOT_PAD: do not pad (default)
            The tokenizer padding sides are defined in self.padding_side:

            - 'left': pads on the left of the sequences
            - 'right': pads on the right of the sequences
        pad_to_multiple_of: (optional) if set, ``max_length`` is rounded up to the next multiple of this value.
        return_attention_mask: (optional) whether to create/pad the attention mask. When ``None``, it is enabled
            if ``"attention_mask"`` is listed in ``self.model_input_names``.

    Returns:
        The same ``encoded_inputs`` object, with ``attention_mask``, ``token_type_ids``, ``bbox``, ``labels``,
        ``special_tokens_mask`` (each only if applicable) and the required input padded.

    Raises:
        ValueError: if padding is needed and ``self.padding_side`` is neither ``"right"`` nor ``"left"``.
    """
    # Load from model defaults
    if return_attention_mask is None:
        return_attention_mask = "attention_mask" in self.model_input_names

    required_input = encoded_inputs[self.model_input_names[0]]

    if padding_strategy == PaddingStrategy.LONGEST:
        max_length = len(required_input)

    if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
        max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of

    needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length

    # Initialize attention mask if not present.
    if return_attention_mask and "attention_mask" not in encoded_inputs:
        encoded_inputs["attention_mask"] = [1] * len(required_input)

    if needs_to_be_padded:
        difference = max_length - len(required_input)

        if self.padding_side == "right":
            pad_on_left = False
        elif self.padding_side == "left":
            pad_on_left = True
        else:
            raise ValueError("Invalid padding strategy:" + str(self.padding_side))

        def _padded(values, filler):
            # One code path for both sides, so every key is always padded from its own values.
            # (Left padding previously built "labels" from the "bbox" entry.)
            padding = [filler] * difference
            return padding + values if pad_on_left else values + padding

        if return_attention_mask:
            encoded_inputs["attention_mask"] = _padded(encoded_inputs["attention_mask"], 0)
        if "token_type_ids" in encoded_inputs:
            encoded_inputs["token_type_ids"] = _padded(encoded_inputs["token_type_ids"], self.pad_token_type_id)
        if "bbox" in encoded_inputs:
            encoded_inputs["bbox"] = _padded(encoded_inputs["bbox"], self.pad_token_box)
        if "labels" in encoded_inputs:
            encoded_inputs["labels"] = _padded(encoded_inputs["labels"], self.pad_token_label)
        if "special_tokens_mask" in encoded_inputs:
            encoded_inputs["special_tokens_mask"] = _padded(encoded_inputs["special_tokens_mask"], 1)
        encoded_inputs[self.model_input_names[0]] = _padded(required_input, self.pad_token_id)

    return encoded_inputs
|
||||||
|
|
||||||
|
def build_inputs_with_special_tokens(
    self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
    """
    Wrap one sequence, or a pair of sequences, of token IDs with the CLS and SEP token IDs:

    - single sequence: ``<s> X </s>``
    - pair of sequences: ``<s> A </s></s> B </s>``

    Args:
        token_ids_0 (:obj:`List[int]`):
            List of IDs to which the special tokens will be added.
        token_ids_1 (:obj:`List[int]`, `optional`):
            Optional second list of IDs for sequence pairs.

    Returns:
        :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
    """
    first_segment = [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
    if token_ids_1 is None:
        return first_segment

    # the second sequence is framed by its own separator on each side
    separator = [self.sep_token_id]
    return first_segment + separator + token_ids_1 + separator
|
||||||
|
|
||||||
|
def create_token_type_ids_from_sequences(
    self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
    """
    Return a list of zeros as long as the input(s) once the special tokens are added (``<s> X </s>`` for a single
    sequence, ``<s> A </s></s> B </s>`` for a pair).

    Args:
        token_ids_0 (:obj:`List[int]`):
            List of IDs.
        token_ids_1 (:obj:`List[int]`, `optional`):
            Optional second list of IDs for sequence pairs.

    Returns:
        :obj:`List[int]`: List of zeros.
    """
    separator = [self.sep_token_id]
    start = [self.cls_token_id]

    with_specials = start + token_ids_0 + separator
    if token_ids_1 is not None:
        with_specials = with_specials + separator + token_ids_1 + separator
    return [0] * len(with_specials)
|
||||||
|
|
||||||
|
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
    """
    Copy the vocabulary file this tokenizer was created with into ``save_directory``.

    Args:
        save_directory: existing directory to copy the vocabulary file into.
        filename_prefix: optional prefix, joined to the file name with a ``-``.

    Returns:
        A one-element tuple with the path of the saved file, or ``None`` (after logging an error) when
        ``save_directory`` is not a directory.

    Raises:
        ValueError: if ``self.can_save_slow_tokenizer`` is false.
    """
    if not self.can_save_slow_tokenizer:
        raise ValueError(
            "Your fast tokenizer does not have the necessary information to save the vocabulary for a slow "
            "tokenizer."
        )

    if not os.path.isdir(save_directory):
        logger.error(f"Vocabulary path ({save_directory}) should be a directory.")
        return

    prefix = filename_prefix + "-" if filename_prefix else ""
    out_vocab_file = os.path.join(save_directory, prefix + VOCAB_FILES_NAMES["vocab_file"])

    # skip the copy when source and destination are the same file
    source_path = self.vocab_file
    if os.path.abspath(source_path) != os.path.abspath(out_vocab_file):
        copyfile(source_path, out_vocab_file)

    return (out_vocab_file,)
|
||||||
@@ -65,6 +65,15 @@ class DebertaV2Tokenizer:
|
|||||||
requires_backends(cls, ["sentencepiece"])
|
requires_backends(cls, ["sentencepiece"])
|
||||||
|
|
||||||
|
|
||||||
|
class LayoutXLMTokenizer:
    """Stand-in class: both construction and ``from_pretrained`` call ``requires_backends`` for "sentencepiece"."""

    def __init__(self, *args, **kwargs):
        requires_backends(self, ["sentencepiece"])

    @classmethod
    def from_pretrained(cls, *args, **kwargs):
        requires_backends(cls, ["sentencepiece"])
|
||||||
|
|
||||||
|
|
||||||
class M2M100Tokenizer:
|
class M2M100Tokenizer:
|
||||||
def __init__(self, *args, **kwargs):
|
def __init__(self, *args, **kwargs):
|
||||||
requires_backends(self, ["sentencepiece"])
|
requires_backends(self, ["sentencepiece"])
|
||||||
|
|||||||
@@ -200,6 +200,15 @@ class LayoutLMv2TokenizerFast:
|
|||||||
requires_backends(cls, ["tokenizers"])
|
requires_backends(cls, ["tokenizers"])
|
||||||
|
|
||||||
|
|
||||||
|
class LayoutXLMTokenizerFast:
    """Stand-in class: both construction and ``from_pretrained`` call ``requires_backends`` for "tokenizers"."""

    def __init__(self, *args, **kwargs):
        requires_backends(self, ["tokenizers"])

    @classmethod
    def from_pretrained(cls, *args, **kwargs):
        requires_backends(cls, ["tokenizers"])
|
||||||
|
|
||||||
|
|
||||||
class LEDTokenizerFast:
|
class LEDTokenizerFast:
|
||||||
def __init__(self, *args, **kwargs):
|
def __init__(self, *args, **kwargs):
|
||||||
requires_backends(self, ["tokenizers"])
|
requires_backends(self, ["tokenizers"])
|
||||||
|
|||||||
@@ -50,6 +50,15 @@ class LayoutLMv2Processor:
|
|||||||
requires_backends(cls, ["vision"])
|
requires_backends(cls, ["vision"])
|
||||||
|
|
||||||
|
|
||||||
|
class LayoutXLMProcessor:
    """Stand-in class: both construction and ``from_pretrained`` call ``requires_backends`` for "vision"."""

    def __init__(self, *args, **kwargs):
        requires_backends(self, ["vision"])

    @classmethod
    def from_pretrained(cls, *args, **kwargs):
        requires_backends(cls, ["vision"])
|
||||||
|
|
||||||
|
|
||||||
class SegformerFeatureExtractor:
|
class SegformerFeatureExtractor:
|
||||||
def __init__(self, *args, **kwargs):
|
def __init__(self, *args, **kwargs):
|
||||||
requires_backends(self, ["vision"])
|
requires_backends(self, ["vision"])
|
||||||
|
|||||||
423
tests/test_processor_layoutxlm.py
Normal file
423
tests/test_processor_layoutxlm.py
Normal file
@@ -0,0 +1,423 @@
|
|||||||
|
# Copyright 2021 The HuggingFace Team. All rights reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import shutil
|
||||||
|
import tempfile
|
||||||
|
import unittest
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
from transformers import PreTrainedTokenizer, PreTrainedTokenizerBase, PreTrainedTokenizerFast
|
||||||
|
from transformers.file_utils import FEATURE_EXTRACTOR_NAME, cached_property, is_pytesseract_available
|
||||||
|
from transformers.models.layoutxlm import LayoutXLMTokenizer, LayoutXLMTokenizerFast
|
||||||
|
from transformers.testing_utils import (
|
||||||
|
require_pytesseract,
|
||||||
|
require_sentencepiece,
|
||||||
|
require_tokenizers,
|
||||||
|
require_torch,
|
||||||
|
slow,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
if is_pytesseract_available():
|
||||||
|
from PIL import Image
|
||||||
|
|
||||||
|
from transformers import LayoutLMv2FeatureExtractor, LayoutXLMProcessor
|
||||||
|
|
||||||
|
|
||||||
|
SAMPLE_SP = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/test_sentencepiece.model")
|
||||||
|
|
||||||
|
|
||||||
|
@require_pytesseract
@require_sentencepiece
@require_tokenizers
class LayoutXLMProcessorTest(unittest.TestCase):
    """Save/load round-trip tests for `LayoutXLMProcessor` using the slow and fast LayoutXLM tokenizers."""

    tokenizer_class = LayoutXLMTokenizer
    rust_tokenizer_class = LayoutXLMTokenizerFast

    def setUp(self):
        """Create a temporary directory containing a feature extractor configuration file."""
        feature_extractor_map = {
            "do_resize": True,
            "size": 224,
            "apply_ocr": True,
        }

        self.tmpdirname = tempfile.mkdtemp()
        # Register the removal immediately: cleanups run even when the rest of setUp raises,
        # whereas tearDown would be skipped in that case and the directory would leak.
        self.addCleanup(shutil.rmtree, self.tmpdirname)

        self.feature_extraction_file = os.path.join(self.tmpdirname, FEATURE_EXTRACTOR_NAME)
        with open(self.feature_extraction_file, "w", encoding="utf-8") as fp:
            fp.write(json.dumps(feature_extractor_map) + "\n")

    def get_tokenizer(self, **kwargs) -> PreTrainedTokenizer:
        """Build the slow tokenizer from the SentencePiece fixture, forwarding `kwargs` to `from_pretrained`."""
        return self.tokenizer_class.from_pretrained(SAMPLE_SP, **kwargs)

    def get_rust_tokenizer(self, **kwargs) -> PreTrainedTokenizerFast:
        """Build the fast tokenizer from the SentencePiece fixture, forwarding `kwargs` to `from_pretrained`."""
        return self.rust_tokenizer_class.from_pretrained(SAMPLE_SP, **kwargs)

    def get_tokenizers(self, **kwargs) -> List[PreTrainedTokenizerBase]:
        """Return `[slow_tokenizer, fast_tokenizer]`, both built with the same `kwargs`."""
        return [self.get_tokenizer(**kwargs), self.get_rust_tokenizer(**kwargs)]

    def get_feature_extractor(self, **kwargs):
        """Load a `LayoutLMv2FeatureExtractor` from the configuration written by `setUp`."""
        return LayoutLMv2FeatureExtractor.from_pretrained(self.tmpdirname, **kwargs)

    def test_save_load_pretrained_default(self):
        feature_extractor = self.get_feature_extractor()

        for tokenizer in self.get_tokenizers():
            # subTest labels a failure with the tokenizer class that triggered it.
            with self.subTest(tokenizer=type(tokenizer).__name__):
                processor = LayoutXLMProcessor(feature_extractor=feature_extractor, tokenizer=tokenizer)

                processor.save_pretrained(self.tmpdirname)
                processor = LayoutXLMProcessor.from_pretrained(self.tmpdirname)

                self.assertEqual(processor.tokenizer.get_vocab(), tokenizer.get_vocab())
                self.assertIsInstance(processor.tokenizer, (LayoutXLMTokenizer, LayoutXLMTokenizerFast))

                self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor.to_json_string())
                self.assertIsInstance(processor.feature_extractor, LayoutLMv2FeatureExtractor)

    def test_save_load_pretrained_additional_features(self):
        processor = LayoutXLMProcessor(feature_extractor=self.get_feature_extractor(), tokenizer=self.get_tokenizer())
        processor.save_pretrained(self.tmpdirname)

        # slow tokenizer
        tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)")
        feature_extractor_add_kwargs = self.get_feature_extractor(do_resize=False, size=30)

        processor = LayoutXLMProcessor.from_pretrained(
            self.tmpdirname,
            use_fast=False,
            bos_token="(BOS)",
            eos_token="(EOS)",
            do_resize=False,
            size=30,
        )

        self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab())
        self.assertIsInstance(processor.tokenizer, LayoutXLMTokenizer)

        self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor_add_kwargs.to_json_string())
        self.assertIsInstance(processor.feature_extractor, LayoutLMv2FeatureExtractor)

        # fast tokenizer
        tokenizer_add_kwargs = self.get_rust_tokenizer(bos_token="(BOS)", eos_token="(EOS)")
        feature_extractor_add_kwargs = self.get_feature_extractor(do_resize=False, size=30)

        # Request the fast tokenizer explicitly, mirroring `use_fast=False` in the slow branch above,
        # instead of passing an unrelated flag and relying on the default.
        processor = LayoutXLMProcessor.from_pretrained(
            self.tmpdirname, use_fast=True, bos_token="(BOS)", eos_token="(EOS)", do_resize=False, size=30
        )

        self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab())
        self.assertIsInstance(processor.tokenizer, LayoutXLMTokenizerFast)

        self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor_add_kwargs.to_json_string())
        self.assertIsInstance(processor.feature_extractor, LayoutLMv2FeatureExtractor)
|
||||||
|
|
||||||
|
|
||||||
|
# different use cases tests
|
||||||
|
@require_sentencepiece
|
||||||
|
@require_torch
|
||||||
|
@require_pytesseract
|
||||||
|
class LayoutXLMProcessorIntegrationTests(unittest.TestCase):
|
||||||
|
@cached_property
|
||||||
|
def get_images(self):
|
||||||
|
# we verify our implementation on 2 document images from the DocVQA dataset
|
||||||
|
from datasets import load_dataset
|
||||||
|
|
||||||
|
ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test")
|
||||||
|
|
||||||
|
image_1 = Image.open(ds[0]["file"]).convert("RGB")
|
||||||
|
image_2 = Image.open(ds[1]["file"]).convert("RGB")
|
||||||
|
|
||||||
|
return image_1, image_2
|
||||||
|
|
||||||
|
@cached_property
|
||||||
|
def get_tokenizers(self):
|
||||||
|
slow_tokenizer = LayoutXLMTokenizer.from_pretrained("microsoft/layoutxlm-base")
|
||||||
|
fast_tokenizer = LayoutXLMTokenizerFast.from_pretrained("microsoft/layoutxlm-base")
|
||||||
|
return [slow_tokenizer, fast_tokenizer]
|
||||||
|
|
||||||
|
@slow
|
||||||
|
def test_processor_case_1(self):
|
||||||
|
# case 1: document image classification (training, inference) + token classification (inference), apply_ocr = True
|
||||||
|
|
||||||
|
feature_extractor = LayoutLMv2FeatureExtractor()
|
||||||
|
tokenizers = self.get_tokenizers
|
||||||
|
images = self.get_images
|
||||||
|
|
||||||
|
for tokenizer in tokenizers:
|
||||||
|
processor = LayoutXLMProcessor(feature_extractor=feature_extractor, tokenizer=tokenizer)
|
||||||
|
|
||||||
|
# not batched
|
||||||
|
input_feat_extract = feature_extractor(images[0], return_tensors="pt")
|
||||||
|
input_processor = processor(images[0], return_tensors="pt")
|
||||||
|
|
||||||
|
# verify keys
|
||||||
|
expected_keys = ["attention_mask", "bbox", "image", "input_ids"]
|
||||||
|
actual_keys = sorted(list(input_processor.keys()))
|
||||||
|
self.assertListEqual(actual_keys, expected_keys)
|
||||||
|
|
||||||
|
# verify image
|
||||||
|
self.assertAlmostEqual(
|
||||||
|
input_feat_extract["pixel_values"].sum(), input_processor["image"].sum(), delta=1e-2
|
||||||
|
)
|
||||||
|
|
||||||
|
# verify input_ids
|
||||||
|
# fmt: off
|
||||||
|
expected_decoding = "<s> 11:14 to 11:39 a.m 11:39 to 11:44 a.m. 11:44 a.m. to 12:25 p.m. 12:25 to 12:58 p.m. 12:58 to 4:00 p.m. 2:00 to 5:00 p.m. Coffee Break Coffee will be served for men and women in the lobby adjacent to exhibit area. Please move into exhibit area. (Exhibits Open) TRRF GENERAL SESSION (PART |) Presiding: Lee A. Waller TRRF Vice President “Introductory Remarks” Lee A. Waller, TRRF Vice Presi- dent Individual Interviews with TRRF Public Board Members and Sci- entific Advisory Council Mem- bers Conducted by TRRF Treasurer Philip G. Kuehn to get answers which the public refrigerated warehousing industry is looking for. Plus questions from the floor. Dr. Emil M. Mrak, University of Cal- ifornia, Chairman, TRRF Board; Sam R. Cecil, University of Georgia College of Agriculture; Dr. Stanley Charm, Tufts University School of Medicine; Dr. Robert H. Cotton, ITT Continental Baking Company; Dr. Owen Fennema, University of Wis- consin; Dr. Robert E. Hardenburg, USDA. Questions and Answers Exhibits Open Capt. Jack Stoney Room TRRF Scientific Advisory Council Meeting Ballroom Foyer</s>" # noqa: E231
|
||||||
|
# fmt: on
|
||||||
|
decoding = tokenizer.decode(input_processor.input_ids.squeeze().tolist())
|
||||||
|
self.assertSequenceEqual(decoding, expected_decoding)
|
||||||
|
|
||||||
|
# batched
|
||||||
|
input_feat_extract = feature_extractor(images, return_tensors="pt")
|
||||||
|
input_processor = processor(images, padding=True, return_tensors="pt")
|
||||||
|
|
||||||
|
# verify keys
|
||||||
|
expected_keys = ["attention_mask", "bbox", "image", "input_ids"]
|
||||||
|
actual_keys = sorted(list(input_processor.keys()))
|
||||||
|
self.assertListEqual(actual_keys, expected_keys)
|
||||||
|
|
||||||
|
# verify images
|
||||||
|
self.assertAlmostEqual(
|
||||||
|
input_feat_extract["pixel_values"].sum(), input_processor["image"].sum(), delta=1e-2
|
||||||
|
)
|
||||||
|
|
||||||
|
# verify input_ids
|
||||||
|
# fmt: off
|
||||||
|
expected_decoding = "<s> 7 ITC Limited REPORT AND ACCOUNTS 2013 ITC’s Brands: An Asset for the Nation The consumer needs and aspirations they fulfil, the benefit they generate for millions across ITC’s value chains, the future-ready capabilities that support them, and the value that they create for the country, have made ITC’s brands national assets, adding to India’s competitiveness. It is ITC’s aspiration to be the No 1 FMCG player in the country, driven by its new FMCG businesses. A recent Nielsen report has highlighted that ITC's new FMCG businesses are the fastest growing among the top consumer goods companies operating in India. ITC takes justifiable pride that, along with generating economic value, these celebrated Indian brands also drive the creation of larger societal capital through the virtuous cycle of sustainable and inclusive growth. DI WILLS * ; LOVE DELIGHTFULLY SOFT SKIN? aia Ans Source: https://www.industrydocuments.ucsf.edu/docs/snbx0223</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>" # noqa: E231
|
||||||
|
# fmt: on
|
||||||
|
decoding = tokenizer.decode(input_processor.input_ids[1].tolist())
|
||||||
|
self.assertSequenceEqual(decoding, expected_decoding)
|
||||||
|
|
||||||
|
@slow
|
||||||
|
def test_processor_case_2(self):
|
||||||
|
# case 2: document image classification (training, inference) + token classification (inference), apply_ocr=False
|
||||||
|
|
||||||
|
feature_extractor = LayoutLMv2FeatureExtractor(apply_ocr=False)
|
||||||
|
tokenizers = self.get_tokenizers
|
||||||
|
images = self.get_images
|
||||||
|
|
||||||
|
for tokenizer in tokenizers:
|
||||||
|
processor = LayoutXLMProcessor(feature_extractor=feature_extractor, tokenizer=tokenizer)
|
||||||
|
|
||||||
|
# not batched
|
||||||
|
words = ["hello", "world"]
|
||||||
|
boxes = [[1, 2, 3, 4], [5, 6, 7, 8]]
|
||||||
|
input_processor = processor(images[0], words, boxes=boxes, return_tensors="pt")
|
||||||
|
|
||||||
|
# verify keys
|
||||||
|
expected_keys = ["input_ids", "bbox", "attention_mask", "image"]
|
||||||
|
actual_keys = list(input_processor.keys())
|
||||||
|
for key in expected_keys:
|
||||||
|
self.assertIn(key, actual_keys)
|
||||||
|
|
||||||
|
# verify input_ids
|
||||||
|
expected_decoding = "<s> hello world</s>"
|
||||||
|
decoding = tokenizer.decode(input_processor.input_ids.squeeze().tolist())
|
||||||
|
self.assertSequenceEqual(decoding, expected_decoding)
|
||||||
|
|
||||||
|
# batched
|
||||||
|
words = [["hello", "world"], ["my", "name", "is", "niels"]]
|
||||||
|
boxes = [[[1, 2, 3, 4], [5, 6, 7, 8]], [[3, 2, 5, 1], [6, 7, 4, 2], [3, 9, 2, 4], [1, 1, 2, 3]]]
|
||||||
|
input_processor = processor(images, words, boxes=boxes, padding=True, return_tensors="pt")
|
||||||
|
|
||||||
|
# verify keys
|
||||||
|
expected_keys = ["attention_mask", "bbox", "image", "input_ids"]
|
||||||
|
actual_keys = sorted(list(input_processor.keys()))
|
||||||
|
self.assertListEqual(actual_keys, expected_keys)
|
||||||
|
|
||||||
|
# verify input_ids
|
||||||
|
expected_decoding = "<s> hello world</s><pad><pad>"
|
||||||
|
decoding = tokenizer.decode(input_processor.input_ids[0].tolist())
|
||||||
|
self.assertSequenceEqual(decoding, expected_decoding)
|
||||||
|
|
||||||
|
# verify bbox
|
||||||
|
expected_bbox = [
|
||||||
|
[0, 0, 0, 0],
|
||||||
|
[3, 2, 5, 1],
|
||||||
|
[6, 7, 4, 2],
|
||||||
|
[3, 9, 2, 4],
|
||||||
|
[1, 1, 2, 3],
|
||||||
|
[1, 1, 2, 3],
|
||||||
|
[1000, 1000, 1000, 1000],
|
||||||
|
]
|
||||||
|
self.assertListEqual(input_processor.bbox[1].tolist(), expected_bbox)
|
||||||
|
|
||||||
|
@slow
|
||||||
|
def test_processor_case_3(self):
|
||||||
|
# case 3: token classification (training), apply_ocr=False
|
||||||
|
|
||||||
|
feature_extractor = LayoutLMv2FeatureExtractor(apply_ocr=False)
|
||||||
|
tokenizers = self.get_tokenizers
|
||||||
|
images = self.get_images
|
||||||
|
|
||||||
|
for tokenizer in tokenizers:
|
||||||
|
processor = LayoutXLMProcessor(feature_extractor=feature_extractor, tokenizer=tokenizer)
|
||||||
|
|
||||||
|
# not batched
|
||||||
|
words = ["weirdly", "world"]
|
||||||
|
boxes = [[1, 2, 3, 4], [5, 6, 7, 8]]
|
||||||
|
word_labels = [1, 2]
|
||||||
|
input_processor = processor(images[0], words, boxes=boxes, word_labels=word_labels, return_tensors="pt")
|
||||||
|
|
||||||
|
# verify keys
|
||||||
|
expected_keys = ["attention_mask", "bbox", "image", "input_ids", "labels"]
|
||||||
|
actual_keys = sorted(list(input_processor.keys()))
|
||||||
|
self.assertListEqual(actual_keys, expected_keys)
|
||||||
|
|
||||||
|
# verify input_ids
|
||||||
|
expected_decoding = "<s> weirdly world</s>"
|
||||||
|
decoding = tokenizer.decode(input_processor.input_ids.squeeze().tolist())
|
||||||
|
self.assertSequenceEqual(decoding, expected_decoding)
|
||||||
|
|
||||||
|
# verify labels
|
||||||
|
expected_labels = [-100, 1, -100, 2, -100]
|
||||||
|
self.assertListEqual(input_processor.labels.squeeze().tolist(), expected_labels)
|
||||||
|
|
||||||
|
# batched
|
||||||
|
words = [["hello", "world"], ["my", "name", "is", "niels"]]
|
||||||
|
boxes = [[[1, 2, 3, 4], [5, 6, 7, 8]], [[3, 2, 5, 1], [6, 7, 4, 2], [3, 9, 2, 4], [1, 1, 2, 3]]]
|
||||||
|
word_labels = [[1, 2], [6, 3, 10, 2]]
|
||||||
|
input_processor = processor(
|
||||||
|
images, words, boxes=boxes, word_labels=word_labels, padding=True, return_tensors="pt"
|
||||||
|
)
|
||||||
|
|
||||||
|
# verify keys
|
||||||
|
expected_keys = ["attention_mask", "bbox", "image", "input_ids", "labels"]
|
||||||
|
actual_keys = sorted(list(input_processor.keys()))
|
||||||
|
self.assertListEqual(actual_keys, expected_keys)
|
||||||
|
|
||||||
|
# verify input_ids
|
||||||
|
expected_decoding = "<s> my name is niels</s>"
|
||||||
|
decoding = tokenizer.decode(input_processor.input_ids[1].tolist())
|
||||||
|
self.assertSequenceEqual(decoding, expected_decoding)
|
||||||
|
|
||||||
|
# verify bbox
|
||||||
|
expected_bbox = [
|
||||||
|
[0, 0, 0, 0],
|
||||||
|
[3, 2, 5, 1],
|
||||||
|
[6, 7, 4, 2],
|
||||||
|
[3, 9, 2, 4],
|
||||||
|
[1, 1, 2, 3],
|
||||||
|
[1, 1, 2, 3],
|
||||||
|
[1000, 1000, 1000, 1000],
|
||||||
|
]
|
||||||
|
self.assertListEqual(input_processor.bbox[1].tolist(), expected_bbox)
|
||||||
|
|
||||||
|
# verify labels
|
||||||
|
expected_labels = [-100, 6, 3, 10, 2, -100, -100]
|
||||||
|
self.assertListEqual(input_processor.labels[1].tolist(), expected_labels)
|
||||||
|
|
||||||
|
@slow
|
||||||
|
def test_processor_case_4(self):
|
||||||
|
# case 4: visual question answering (inference), apply_ocr=True
|
||||||
|
|
||||||
|
feature_extractor = LayoutLMv2FeatureExtractor()
|
||||||
|
tokenizers = self.get_tokenizers
|
||||||
|
images = self.get_images
|
||||||
|
|
||||||
|
for tokenizer in tokenizers:
|
||||||
|
processor = LayoutXLMProcessor(feature_extractor=feature_extractor, tokenizer=tokenizer)
|
||||||
|
|
||||||
|
# not batched
|
||||||
|
question = "What's his name?"
|
||||||
|
input_processor = processor(images[0], question, return_tensors="pt")
|
||||||
|
|
||||||
|
# verify keys
|
||||||
|
expected_keys = ["attention_mask", "bbox", "image", "input_ids"]
|
||||||
|
actual_keys = sorted(list(input_processor.keys()))
|
||||||
|
self.assertListEqual(actual_keys, expected_keys)
|
||||||
|
|
||||||
|
# verify input_ids
|
||||||
|
# fmt: off
|
||||||
|
expected_decoding = "<s> What's his name?</s></s> 11:14 to 11:39 a.m 11:39 to 11:44 a.m. 11:44 a.m. to 12:25 p.m. 12:25 to 12:58 p.m. 12:58 to 4:00 p.m. 2:00 to 5:00 p.m. Coffee Break Coffee will be served for men and women in the lobby adjacent to exhibit area. Please move into exhibit area. (Exhibits Open) TRRF GENERAL SESSION (PART |) Presiding: Lee A. Waller TRRF Vice President “Introductory Remarks” Lee A. Waller, TRRF Vice Presi- dent Individual Interviews with TRRF Public Board Members and Sci- entific Advisory Council Mem- bers Conducted by TRRF Treasurer Philip G. Kuehn to get answers which the public refrigerated warehousing industry is looking for. Plus questions from the floor. Dr. Emil M. Mrak, University of Cal- ifornia, Chairman, TRRF Board; Sam R. Cecil, University of Georgia College of Agriculture; Dr. Stanley Charm, Tufts University School of Medicine; Dr. Robert H. Cotton, ITT Continental Baking Company; Dr. Owen Fennema, University of Wis- consin; Dr. Robert E. Hardenburg, USDA. Questions and Answers Exhibits Open Capt. Jack Stoney Room TRRF Scientific Advisory Council Meeting Ballroom Foyer</s>" # noqa: E231
|
||||||
|
# fmt: on
|
||||||
|
decoding = tokenizer.decode(input_processor.input_ids.squeeze().tolist())
|
||||||
|
self.assertSequenceEqual(decoding, expected_decoding)
|
||||||
|
|
||||||
|
# batched
|
||||||
|
questions = ["How old is he?", "what's the time"]
|
||||||
|
input_processor = processor(
|
||||||
|
images, questions, padding="max_length", max_length=20, truncation=True, return_tensors="pt"
|
||||||
|
)
|
||||||
|
|
||||||
|
# verify keys
|
||||||
|
expected_keys = ["attention_mask", "bbox", "image", "input_ids"]
|
||||||
|
actual_keys = sorted(list(input_processor.keys()))
|
||||||
|
self.assertListEqual(actual_keys, expected_keys)
|
||||||
|
|
||||||
|
# verify input_ids
|
||||||
|
expected_decoding = "<s> what's the time</s></s> 7 ITC Limited REPORT AND ACCOUNTS 2013</s>"
|
||||||
|
decoding = tokenizer.decode(input_processor.input_ids[1].tolist())
|
||||||
|
self.assertSequenceEqual(decoding, expected_decoding)
|
||||||
|
|
||||||
|
# verify bbox
|
||||||
|
# fmt: off
|
||||||
|
expected_bbox = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [1000, 1000, 1000, 1000], [1000, 1000, 1000, 1000], [0, 45, 67, 80], [72, 56, 109, 67], [72, 56, 109, 67], [116, 56, 189, 67], [198, 59, 253, 66], [257, 59, 285, 66], [289, 59, 365, 66], [289, 59, 365, 66], [289, 59, 365, 66], [289, 59, 365, 66], [372, 59, 407, 66], [1000, 1000, 1000, 1000]] # noqa: E231
|
||||||
|
# fmt: on
|
||||||
|
self.assertListEqual(input_processor.bbox[1].tolist(), expected_bbox)
|
||||||
|
|
||||||
|
@slow
|
||||||
|
def test_processor_case_5(self):
|
||||||
|
# case 5: visual question answering (inference), apply_ocr=False
|
||||||
|
|
||||||
|
feature_extractor = LayoutLMv2FeatureExtractor(apply_ocr=False)
|
||||||
|
tokenizers = self.get_tokenizers
|
||||||
|
images = self.get_images
|
||||||
|
|
||||||
|
for tokenizer in tokenizers:
|
||||||
|
processor = LayoutXLMProcessor(feature_extractor=feature_extractor, tokenizer=tokenizer)
|
||||||
|
|
||||||
|
# not batched
|
||||||
|
question = "What's his name?"
|
||||||
|
words = ["hello", "world"]
|
||||||
|
boxes = [[1, 2, 3, 4], [5, 6, 7, 8]]
|
||||||
|
input_processor = processor(images[0], question, words, boxes, return_tensors="pt")
|
||||||
|
|
||||||
|
# verify keys
|
||||||
|
expected_keys = ["attention_mask", "bbox", "image", "input_ids"]
|
||||||
|
actual_keys = sorted(list(input_processor.keys()))
|
||||||
|
self.assertListEqual(actual_keys, expected_keys)
|
||||||
|
|
||||||
|
# verify input_ids
|
||||||
|
expected_decoding = "<s> What's his name?</s></s> hello world</s>"
|
||||||
|
decoding = tokenizer.decode(input_processor.input_ids.squeeze().tolist())
|
||||||
|
self.assertSequenceEqual(decoding, expected_decoding)
|
||||||
|
|
||||||
|
# batched
|
||||||
|
questions = ["How old is he?", "what's the time"]
|
||||||
|
words = [["hello", "world"], ["my", "name", "is", "niels"]]
|
||||||
|
boxes = [[[1, 2, 3, 4], [5, 6, 7, 8]], [[3, 2, 5, 1], [6, 7, 4, 2], [3, 9, 2, 4], [1, 1, 2, 3]]]
|
||||||
|
input_processor = processor(images, questions, words, boxes, padding=True, return_tensors="pt")
|
||||||
|
|
||||||
|
# verify keys
|
||||||
|
expected_keys = ["attention_mask", "bbox", "image", "input_ids"]
|
||||||
|
actual_keys = sorted(list(input_processor.keys()))
|
||||||
|
self.assertListEqual(actual_keys, expected_keys)
|
||||||
|
|
||||||
|
# verify input_ids
|
||||||
|
expected_decoding = "<s> How old is he?</s></s> hello world</s><pad><pad>"
|
||||||
|
decoding = tokenizer.decode(input_processor.input_ids[0].tolist())
|
||||||
|
self.assertSequenceEqual(decoding, expected_decoding)
|
||||||
|
|
||||||
|
expected_decoding = "<s> what's the time</s></s> my name is niels</s>"
|
||||||
|
decoding = tokenizer.decode(input_processor.input_ids[1].tolist())
|
||||||
|
self.assertSequenceEqual(decoding, expected_decoding)
|
||||||
|
|
||||||
|
# verify bbox
|
||||||
|
expected_bbox = [[6, 7, 4, 2], [3, 9, 2, 4], [1, 1, 2, 3], [1, 1, 2, 3], [1000, 1000, 1000, 1000]]
|
||||||
|
self.assertListEqual(input_processor.bbox[1].tolist()[-5:], expected_bbox)
|
||||||
1785
tests/test_tokenization_layoutxlm.py
Normal file
1785
tests/test_tokenization_layoutxlm.py
Normal file
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user