Improve LayoutLM (#9476)

* Add LayoutLMForSequenceClassification and integration tests Improve docs Add LayoutLM notebook to list of community notebooks * Make style & quality * Address comments by @sgugger, @patrickvonplaten and @LysandreJik * Fix rebase with master * Reformat in one line * Improve code examples as requested by @patrickvonplaten Co-authored-by: Lysandre <lysandre.debut@reseau.eseo.fr> Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
2021-01-12 15:26:32 +01:00
parent ccd1923f46
commit e45eba3b1c
8 changed files with 438 additions and 105 deletions
--- a/src/transformers/init.py
+++ b/src/transformers/init.py
@@ -561,6 +561,7 @@ if is_torch_available():
        [
            "LAYOUTLM_PRETRAINED_MODEL_ARCHIVE_LIST",
            "LayoutLMForMaskedLM",
+            "LayoutLMForSequenceClassification",
            "LayoutLMForTokenClassification",
            "LayoutLMModel",
        ]
@@ -1597,6 +1598,7 @@ if TYPE_CHECKING:
        from .models.layoutlm import (
            LAYOUTLM_PRETRAINED_MODEL_ARCHIVE_LIST,
            LayoutLMForMaskedLM,
+            LayoutLMForSequenceClassification,
            LayoutLMForTokenClassification,
            LayoutLMModel,
        )
--- a/src/transformers/models/auto/modeling_auto.py
+++ b/src/transformers/models/auto/modeling_auto.py
@@ -101,7 +101,12 @@ from ..funnel.modeling_funnel import (
    FunnelModel,
 )
 from ..gpt2.modeling_gpt2 import GPT2ForSequenceClassification, GPT2LMHeadModel, GPT2Model
-from ..layoutlm.modeling_layoutlm import LayoutLMForMaskedLM, LayoutLMForTokenClassification, LayoutLMModel
+from ..layoutlm.modeling_layoutlm import (
+    LayoutLMForMaskedLM,
+    LayoutLMForSequenceClassification,
+    LayoutLMForTokenClassification,
+    LayoutLMModel,
+)
 from ..led.modeling_led import (
    LEDForConditionalGeneration,
    LEDForQuestionAnswering,
@@ -470,6 +475,7 @@ MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING = OrderedDict(
        (TransfoXLConfig, TransfoXLForSequenceClassification),
        (MPNetConfig, MPNetForSequenceClassification),
        (TapasConfig, TapasForSequenceClassification),
+        (LayoutLMConfig, LayoutLMForSequenceClassification),
    ]
 )

--- a/src/transformers/models/layoutlm/init.py
+++ b/src/transformers/models/layoutlm/init.py
@@ -33,6 +33,7 @@ if is_torch_available():
    _import_structure["modeling_layoutlm"] = [
        "LAYOUTLM_PRETRAINED_MODEL_ARCHIVE_LIST",
        "LayoutLMForMaskedLM",
+        "LayoutLMForSequenceClassification",
        "LayoutLMForTokenClassification",
        "LayoutLMModel",
    ]
@@ -49,6 +50,7 @@ if TYPE_CHECKING:
        from .modeling_layoutlm import (
            LAYOUTLM_PRETRAINED_MODEL_ARCHIVE_LIST,
            LayoutLMForMaskedLM,
+            LayoutLMForSequenceClassification,
            LayoutLMForTokenClassification,
            LayoutLMModel,
        )
--- a/src/transformers/models/layoutlm/modeling_layoutlm.py
+++ b/src/transformers/models/layoutlm/modeling_layoutlm.py
@@ -19,14 +19,15 @@ import math

 import torch
 from torch import nn
-from torch.nn import CrossEntropyLoss
+from torch.nn import CrossEntropyLoss, MSELoss

 from ...activations import ACT2FN
-from ...file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward
+from ...file_utils import add_start_docstrings, add_start_docstrings_to_model_forward, replace_return_docstrings
 from ...modeling_outputs import (
    BaseModelOutputWithPastAndCrossAttentions,
    BaseModelOutputWithPoolingAndCrossAttentions,
    MaskedLMOutput,
+    SequenceClassifierOutput,
    TokenClassifierOutput,
 )
 from ...modeling_utils import (
@@ -596,6 +597,7 @@ class LayoutLMPreTrainedModel(PreTrainedModel):
    """

    config_class = LayoutLMConfig
+    pretrained_model_archive_map = LAYOUTLM_PRETRAINED_MODEL_ARCHIVE_LIST
    base_model_prefix = "layoutlm"
    _keys_to_ignore_on_load_missing = [r"position_ids"]

@@ -614,7 +616,7 @@ class LayoutLMPreTrainedModel(PreTrainedModel):

 LAYOUTLM_START_DOCSTRING = r"""
    The LayoutLM model was proposed in `LayoutLM: Pre-training of Text and Layout for Document Image Understanding
-    <https://arxiv.org/abs/1912.13318>`__ by....
+    <https://arxiv.org/abs/1912.13318>`__ by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei and Ming Zhou.

    This model is a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`_ sub-class. Use
    it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
@@ -638,8 +640,10 @@ LAYOUTLM_INPUTS_DOCSTRING = r"""

            `What are input IDs? <../glossary.html#input-ids>`__
        bbox (:obj:`torch.LongTensor` of shape :obj:`({0}, 4)`, `optional`):
-            Bounding Boxes of each input sequence tokens. Selected in the range ``[0,
-            config.max_2d_position_embeddings-1]``.
+            Bounding boxes of each input sequence tokens. Selected in the range ``[0,
+            config.max_2d_position_embeddings-1]``. Each bounding box should be a normalized version in (x0, y0, x1,
+            y1) format, where (x0, y0) corresponds to the position of the upper left corner in the bounding box, and
+            (x1, y1) represents the position of the lower right corner. See :ref:`Overview` for normalization.
        attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`):
            Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: ``1`` for
            tokens that are NOT MASKED, ``0`` for MASKED tokens.
@@ -679,11 +683,6 @@ LAYOUTLM_INPUTS_DOCSTRING = r"""
    LAYOUTLM_START_DOCSTRING,
 )
 class LayoutLMModel(LayoutLMPreTrainedModel):
-
-    config_class = LayoutLMConfig
-    pretrained_model_archive_map = LAYOUTLM_PRETRAINED_MODEL_ARCHIVE_LIST
-    base_model_prefix = "layoutlm"
-
    def __init__(self, config):
        super(LayoutLMModel, self).__init__(config)
        self.config = config
@@ -709,12 +708,7 @@ class LayoutLMModel(LayoutLMPreTrainedModel):
            self.encoder.layer[layer].attention.prune_heads(heads)

    @add_start_docstrings_to_model_forward(LAYOUTLM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
-    @add_code_sample_docstrings(
-        tokenizer_class=_TOKENIZER_FOR_DOC,
-        checkpoint="layoutlm-base-uncased",
-        output_type=BaseModelOutputWithPoolingAndCrossAttentions,
-        config_class=_CONFIG_FOR_DOC,
-    )
+    @replace_return_docstrings(output_type=BaseModelOutputWithPoolingAndCrossAttentions, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids=None,
@@ -730,31 +724,36 @@ class LayoutLMModel(LayoutLMPreTrainedModel):
        output_hidden_states=None,
        return_dict=None,
    ):
-        """
-        input_ids (torch.LongTensor of shape (batch_size, sequence_length)):
-            Indices of input sequence tokens in the vocabulary.
-        attention_mask (torch.FloatTensor of shape (batch_size, sequence_length), optional):
-            Mask to avoid performing attention on padding token indices. Mask values selected in [0, 1]: 1 for tokens
-            that are NOT MASKED, 0 for MASKED tokens.
-        token_type_ids (torch.LongTensor of shape (batch_size, sequence_length), optional):
-            Segment token indices to indicate first and second portions of the inputs. Indices are selected in [0, 1]:
-            0 corresponds to a sentence A token, 1 corresponds to a sentence B token
-        position_ids (torch.LongTensor of shape (batch_size, sequence_length), optional):
-            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range [0,
-            config.max_position_embeddings - 1].
-        head_mask (torch.FloatTensor of shape (num_heads,) or (num_layers, num_heads), optional):
-            Mask to nullify selected heads of the self-attention modules. Mask values selected in [0, 1]: 1 indicates
-            the head is not masked, 0 indicates the head is masked.
-        inputs_embeds (torch.FloatTensor of shape (batch_size, sequence_length, hidden_size), optional):
-            Optionally, instead of passing input_ids you can choose to directly pass an embedded representation. This
-            is useful if you want more control over how to convert input_ids indices into associated vectors than the
-            model’s internal embedding lookup matrix.
-        output_attentions (bool, optional):
-            If set to True, the attentions tensors of all attention layers are returned.
-        output_hidden_states (bool, optional):
-            If set to True, the hidden states of all layers are returned.
-        return_dict (bool, optional):
-            If set to True, the model will return a ModelOutput instead of a plain tuple.
+        r"""
+        Returns:
+
+        Examples::
+
+            >>> from transformers import LayoutLMTokenizer, LayoutLMModel
+            >>> import torch
+
+            >>> tokenizer = LayoutLMTokenizer.from_pretrained('microsoft/layoutlm-base-uncased')
+            >>> model = LayoutLMModel.from_pretrained('microsoft/layoutlm-base-uncased')
+
+            >>> words = ["Hello", "world"]
+            >>> normalized_word_boxes = [637, 773, 693, 782], [698, 773, 733, 782]
+
+            >>> token_boxes = []
+            >>> for word, box in zip(words, normalized_word_boxes):
+            ...     word_tokens = tokenizer.tokenize(word)
+            ...     token_boxes.extend([box] * len(word_tokens))
+            >>> # add bounding boxes of cls + sep tokens
+            >>> token_boxes = [[0, 0, 0, 0]] + token_boxes + [[1000, 1000, 1000, 1000]]
+
+            >>> encoding = tokenizer(' '.join(words), return_tensors="pt")
+            >>> input_ids = encoding["input_ids"]
+            >>> attention_mask = encoding["attention_mask"]
+            >>> token_type_ids = encoding["token_type_ids"]
+            >>> bbox = torch.tensor([token_boxes])
+
+            >>> outputs = model(input_ids=input_ids, bbox=bbox, attention_mask=attention_mask, token_type_ids=token_type_ids)
+
+            >>> last_hidden_states = outputs.last_hidden_state
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
@@ -828,10 +827,6 @@ class LayoutLMModel(LayoutLMPreTrainedModel):

@add_start_docstrings("""LayoutLM Model with a `language modeling` head on top. """, LAYOUTLM_START_DOCSTRING)
 class LayoutLMForMaskedLM(LayoutLMPreTrainedModel):
-    config_class = LayoutLMConfig
-    pretrained_model_archive_map = LAYOUTLM_PRETRAINED_MODEL_ARCHIVE_LIST
-    base_model_prefix = "layoutlm"
-
    def __init__(self, config):
        super().__init__(config)

@@ -850,12 +845,7 @@ class LayoutLMForMaskedLM(LayoutLMPreTrainedModel):
        self.cls.predictions.decoder = new_embeddings

    @add_start_docstrings_to_model_forward(LAYOUTLM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
-    @add_code_sample_docstrings(
-        tokenizer_class=_TOKENIZER_FOR_DOC,
-        checkpoint="layoutlm-base-uncased",
-        output_type=MaskedLMOutput,
-        config_class=_CONFIG_FOR_DOC,
-    )
+    @replace_return_docstrings(output_type=MaskedLMOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids=None,
@@ -872,7 +862,45 @@ class LayoutLMForMaskedLM(LayoutLMPreTrainedModel):
        output_hidden_states=None,
        return_dict=None,
    ):
+        r"""
+        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+            Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
+            config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored
+            (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``

+        Returns:
+
+        Examples::
+
+            >>> from transformers import LayoutLMTokenizer, LayoutLMForMaskedLM
+            >>> import torch
+
+            >>> tokenizer = LayoutLMTokenizer.from_pretrained('microsoft/layoutlm-base-uncased')
+            >>> model = LayoutLMForMaskedLM.from_pretrained('microsoft/layoutlm-base-uncased')
+
+            >>> words = ["Hello", "[MASK]"]
+            >>> normalized_word_boxes = [637, 773, 693, 782], [698, 773, 733, 782]
+
+            >>> token_boxes = []
+            >>> for word, box in zip(words, normalized_word_boxes):
+            ...     word_tokens = tokenizer.tokenize(word)
+            ...     token_boxes.extend([box] * len(word_tokens))
+            >>> # add bounding boxes of cls + sep tokens
+            >>> token_boxes = [[0, 0, 0, 0]] + token_boxes + [[1000, 1000, 1000, 1000]]
+
+            >>> encoding = tokenizer(' '.join(words), return_tensors="pt")
+            >>> input_ids = encoding["input_ids"]
+            >>> attention_mask = encoding["attention_mask"]
+            >>> token_type_ids = encoding["token_type_ids"]
+            >>> bbox = torch.tensor([token_boxes])
+
+            >>> labels = tokenizer("Hello world", return_tensors="pt")["input_ids"]
+
+            >>> outputs = model(input_ids=input_ids, bbox=bbox, attention_mask=attention_mask, token_type_ids=token_type_ids,
+            ...                 labels=labels)
+
+            >>> loss = outputs.loss
+        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.layoutlm(
@@ -915,16 +943,12 @@ class LayoutLMForMaskedLM(LayoutLMPreTrainedModel):

@add_start_docstrings(
    """
-    LayoutLM Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
-    Named-Entity-Recognition (NER) tasks.
+    LayoutLM Model with a sequence classification head on top (a linear layer on top of the pooled output) e.g. for
+    document image classification tasks such as the `RVL-CDIP <https://www.cs.cmu.edu/~aharley/rvl-cdip/>`__ dataset.
    """,
    LAYOUTLM_START_DOCSTRING,
 )
-class LayoutLMForTokenClassification(LayoutLMPreTrainedModel):
-    config_class = LayoutLMConfig
-    pretrained_model_archive_map = LAYOUTLM_PRETRAINED_MODEL_ARCHIVE_LIST
-    base_model_prefix = "layoutlm"
-
+class LayoutLMForSequenceClassification(LayoutLMPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
@@ -938,12 +962,7 @@ class LayoutLMForTokenClassification(LayoutLMPreTrainedModel):
        return self.layoutlm.embeddings.word_embeddings

    @add_start_docstrings_to_model_forward(LAYOUTLM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
-    @add_code_sample_docstrings(
-        tokenizer_class=_TOKENIZER_FOR_DOC,
-        checkpoint="layoutlm-base-uncased",
-        output_type=TokenClassifierOutput,
-        config_class=_CONFIG_FOR_DOC,
-    )
+    @replace_return_docstrings(output_type=SequenceClassifierOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids=None,
@@ -958,6 +977,162 @@ class LayoutLMForTokenClassification(LayoutLMPreTrainedModel):
        output_hidden_states=None,
        return_dict=None,
    ):
+        r"""
+        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
+            Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
+            config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
+            If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+
+        Returns:
+
+        Examples::
+
+            >>> from transformers import LayoutLMTokenizer, LayoutLMForSequenceClassification
+            >>> import torch
+
+            >>> tokenizer = LayoutLMTokenizer.from_pretrained('microsoft/layoutlm-base-uncased')
+            >>> model = LayoutLMForSequenceClassification.from_pretrained('microsoft/layoutlm-base-uncased')
+
+            >>> words = ["Hello", "world"]
+            >>> normalized_word_boxes = [637, 773, 693, 782], [698, 773, 733, 782]
+
+            >>> token_boxes = []
+            >>> for word, box in zip(words, normalized_word_boxes):
+            ...     word_tokens = tokenizer.tokenize(word)
+            ...     token_boxes.extend([box] * len(word_tokens))
+            >>> # add bounding boxes of cls + sep tokens
+            >>> token_boxes = [[0, 0, 0, 0]] + token_boxes + [[1000, 1000, 1000, 1000]]
+
+            >>> encoding = tokenizer(' '.join(words), return_tensors="pt")
+            >>> input_ids = encoding["input_ids"]
+            >>> attention_mask = encoding["attention_mask"]
+            >>> token_type_ids = encoding["token_type_ids"]
+            >>> bbox = torch.tensor([token_boxes])
+            >>> sequence_label = torch.tensor([1])
+
+            >>> outputs = model(input_ids=input_ids, bbox=bbox, attention_mask=attention_mask, token_type_ids=token_type_ids,
+            ...                 labels=sequence_label)
+
+            >>> loss = outputs.loss
+            >>> logits = outputs.logits
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.layoutlm(
+            input_ids=input_ids,
+            bbox=bbox,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        pooled_output = outputs[1]
+
+        pooled_output = self.dropout(pooled_output)
+        logits = self.classifier(pooled_output)
+
+        loss = None
+        if labels is not None:
+            if self.num_labels == 1:
+                #  We are doing regression
+                loss_fct = MSELoss()
+                loss = loss_fct(logits.view(-1), labels.view(-1))
+            else:
+                loss_fct = CrossEntropyLoss()
+                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+
+        if not return_dict:
+            output = (logits,) + outputs[2:]
+            return ((loss,) + output) if loss is not None else output
+
+        return SequenceClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+@add_start_docstrings(
+    """
+    LayoutLM Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
+    sequence labeling (information extraction) tasks such as the `FUNSD <https://guillaumejaume.github.io/FUNSD/>`__
+    dataset and the `SROIE <https://rrc.cvc.uab.es/?ch=13>`__ dataset.
+    """,
+    LAYOUTLM_START_DOCSTRING,
+)
+class LayoutLMForTokenClassification(LayoutLMPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        self.layoutlm = LayoutLMModel(config)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+
+        self.init_weights()
+
+    def get_input_embeddings(self):
+        return self.layoutlm.embeddings.word_embeddings
+
+    @add_start_docstrings_to_model_forward(LAYOUTLM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @replace_return_docstrings(output_type=TokenClassifierOutput, config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        input_ids=None,
+        bbox=None,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        labels=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+        r"""
+        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+            Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels -
+            1]``.
+
+        Returns:
+
+        Examples::
+
+            >>> from transformers import LayoutLMTokenizer, LayoutLMForTokenClassification
+            >>> import torch
+
+            >>> tokenizer = LayoutLMTokenizer.from_pretrained('microsoft/layoutlm-base-uncased')
+            >>> model = LayoutLMForTokenClassification.from_pretrained('microsoft/layoutlm-base-uncased')
+
+            >>> words = ["Hello", "world"]
+            >>> normalized_word_boxes = [637, 773, 693, 782], [698, 773, 733, 782]
+
+            >>> token_boxes = []
+            >>> for word, box in zip(words, normalized_word_boxes):
+            ...     word_tokens = tokenizer.tokenize(word)
+            ...     token_boxes.extend([box] * len(word_tokens))
+            >>> # add bounding boxes of cls + sep tokens
+            >>> token_boxes = [[0, 0, 0, 0]] + token_boxes + [[1000, 1000, 1000, 1000]]
+
+            >>> encoding = tokenizer(' '.join(words), return_tensors="pt")
+            >>> input_ids = encoding["input_ids"]
+            >>> attention_mask = encoding["attention_mask"]
+            >>> token_type_ids = encoding["token_type_ids"]
+            >>> bbox = torch.tensor([token_boxes])
+            >>> token_labels = torch.tensor([1,1,0,0]).unsqueeze(0) # batch size of 1
+
+            >>> outputs = model(input_ids=input_ids, bbox=bbox, attention_mask=attention_mask, token_type_ids=token_type_ids,
+            ...                 labels=token_labels)
+
+            >>> loss = outputs.loss
+            >>> logits = outputs.logits
+        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.layoutlm(
--- a/src/transformers/utils/dummy_pt_objects.py
+++ b/src/transformers/utils/dummy_pt_objects.py
@@ -1182,6 +1182,15 @@ class LayoutLMForMaskedLM:
        requires_pytorch(self)


+class LayoutLMForSequenceClassification:
+    def __init__(self, *args, **kwargs):
+        requires_pytorch(self)
+
+    @classmethod
+    def from_pretrained(self, *args, **kwargs):
+        requires_pytorch(self)
+
+
 class LayoutLMForTokenClassification:
    def __init__(self, *args, **kwargs):
        requires_pytorch(self)