From 29b0aef871dfb938c6a56884d9dab77950057955 Mon Sep 17 00:00:00 2001
From: NielsRogge <48327001+NielsRogge@users.noreply.github.com>
Date: Thu, 17 Jun 2021 16:37:54 +0200
Subject: [PATCH] Improve detr (#12147)

* Remove unused variables

* Improve docs

* Fix docs of segmentation masks

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
---
 docs/source/model_doc/detr.rst                |  9 +++-
 .../models/detr/feature_extraction_detr.py    |  2 +-
 src/transformers/models/detr/modeling_detr.py | 53 +++++++------------
 3 files changed, 26 insertions(+), 38 deletions(-)

diff --git a/docs/source/model_doc/detr.rst b/docs/source/model_doc/detr.rst
index dbd1fb99aa..279f11d042 100644
--- a/docs/source/model_doc/detr.rst
+++ b/docs/source/model_doc/detr.rst
@@ -40,6 +40,10 @@ baselines.*
 This model was contributed by `nielsr <https://huggingface.co/nielsr>`__. The original code can be found `here
 <https://github.com/facebookresearch/detr>`__.
 
+The quickest way to get started with DETR is by checking the `example notebooks
+<https://github.com/NielsRogge/Transformers-Tutorials/tree/master/DETR>`__ (which showcase both inference and
+fine-tuning on custom data).
+
 Here's a TLDR explaining how :class:`~transformers.DetrForObjectDetection` works:
 
 First, an image is sent through a pre-trained convolutional backbone (in the paper, the authors use
@@ -130,7 +134,7 @@ As a summary, consider the following table:
 +---------------------------------------------+---------------------------------------------------------+----------------------------------------------------------------------+------------------------------------------------------------------------+
 | **Format of annotations to provide to**     | {‘image_id’: int,                                       | {‘image_id’: int,                                                    | {‘file_name: str,                                                      |
 | :class:`~transformers.DetrFeatureExtractor` | ‘annotations’: List[Dict]}, each Dict being a COCO      | ‘annotations’: [List[Dict]] } (in case of COCO detection)            | ‘image_id: int,                                                        |
-|                                             | object annotation (containing keys "image_id",          |                                                                      | ‘segments_info’: List[Dict] }                                          |
+|                                             | object annotation                                       |                                                                      | ‘segments_info’: List[Dict] }                                          |
 |                                             |                                                         | or                                                                   |                                                                        |
 |                                             |                                                         |                                                                      | and masks_path (path to directory containing PNG files of the masks)   |
 |                                             |                                                         | {‘file_name’: str,                                                   |                                                                        |
@@ -151,7 +155,8 @@ In short, one should prepare the data either in COCO detection or COCO panoptic
 outputs of the model using one of the postprocessing methods of :class:`~transformers.DetrFeatureExtractor`. These can
 be be provided to either :obj:`CocoEvaluator` or :obj:`PanopticEvaluator`, which allow you to calculate metrics like
 mean Average Precision (mAP) and Panoptic Quality (PQ). The latter objects are implemented in the `original repository
-<https://github.com/facebookresearch/detr>`__. See the example notebooks for more info regarding evaluation.
+<https://github.com/facebookresearch/detr>`__. See the `example notebooks
+<https://github.com/NielsRogge/Transformers-Tutorials/tree/master/DETR>`__ for more info regarding evaluation.
 
 
 DETR specific outputs
diff --git a/src/transformers/models/detr/feature_extraction_detr.py b/src/transformers/models/detr/feature_extraction_detr.py
index 014ba278e9..94a848f340 100644
--- a/src/transformers/models/detr/feature_extraction_detr.py
+++ b/src/transformers/models/detr/feature_extraction_detr.py
@@ -143,7 +143,7 @@ class DetrFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin):
             :obj:`do_resize` is set to :obj:`True`.
         do_normalize (:obj:`bool`, `optional`, defaults to :obj:`True`):
             Whether or not to normalize the input with mean and standard deviation.
-        image_mean (:obj:`int`, `optional`, defaults to :obj:`[0.485, 0.456, 0.406]s`):
+        image_mean (:obj:`List[float]`, `optional`, defaults to :obj:`[0.485, 0.456, 0.406]`):
             The sequence of means for each channel, to be used when normalizing images. Defaults to the ImageNet mean.
         image_std (:obj:`int`, `optional`, defaults to :obj:`[0.229, 0.224, 0.225]`):
             The sequence of standard deviations for each channel, to be used when normalizing images. Defaults to the
diff --git a/src/transformers/models/detr/modeling_detr.py b/src/transformers/models/detr/modeling_detr.py
index 43d1edb94f..0e4721e2b3 100644
--- a/src/transformers/models/detr/modeling_detr.py
+++ b/src/transformers/models/detr/modeling_detr.py
@@ -98,9 +98,7 @@ class DetrModelOutput(Seq2SeqModelOutput):
 
     Args:
         last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
-            Sequence of hidden-states at the output of the last layer of the decoder of the model. If
-            :obj:`past_key_values` is used only the last hidden-state of the sequences of shape :obj:`(batch_size, 1,
-            hidden_size)` is output.
+            Sequence of hidden-states at the output of the last layer of the decoder of the model.
         decoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
             Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
             of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the decoder at the output of
@@ -148,7 +146,7 @@ class DetrObjectDetectionOutput(ModelOutput):
         pred_boxes (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_queries, 4)`):
             Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These
             values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding
-            possible padding). You can use :class:`~transformers.DetrForObjectDetection.post_process` to retrieve the
+            possible padding). You can use :meth:`~transformers.DetrFeatureExtractor.post_process` to retrieve the
             unnormalized bounding boxes.
         auxiliary_outputs (:obj:`list[Dict]`, `optional`):
             Optional, only returned when auxilary losses are activated (i.e. :obj:`config.auxiliary_loss` is set to
@@ -156,9 +154,6 @@ class DetrObjectDetectionOutput(ModelOutput):
             and :obj:`pred_boxes`) for each decoder layer.
         last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
             Sequence of hidden-states at the output of the last layer of the decoder of the model.
-
-            If :obj:`past_key_values` is used only the last hidden-state of the sequences of shape :obj:`(batch_size,
-            1, hidden_size)` is output.
         decoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
             Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
             of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the decoder at the output of
@@ -214,10 +209,10 @@ class DetrSegmentationOutput(ModelOutput):
         pred_boxes (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_queries, 4)`):
             Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These
             values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding
-            possible padding). You can use :meth:`~transformers.DetrForObjectDetection.post_process` to retrieve the
+            possible padding). You can use :meth:`~transformers.DetrFeatureExtractor.post_process` to retrieve the
             unnormalized bounding boxes.
-        pred_masks (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_queries, width, height)`):
-            Segmentation masks for all queries. See also
+        pred_masks (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_queries, height/4, width/4)`):
+            Segmentation mask logits for all queries. See also
             :meth:`~transformers.DetrFeatureExtractor.post_process_segmentation` or
             :meth:`~transformers.DetrFeatureExtractor.post_process_panoptic` to evaluate instance and panoptic
             segmentation masks respectively.
@@ -227,9 +222,6 @@ class DetrSegmentationOutput(ModelOutput):
             and :obj:`pred_boxes`) for each decoder layer.
         last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
             Sequence of hidden-states at the output of the last layer of the decoder of the model.
-
-            If :obj:`past_key_values` is used only the last hidden-state of the sequences of shape :obj:`(batch_size,
-            1, hidden_size)` is output.
         decoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
             Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
             of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the decoder at the output of
@@ -884,7 +876,6 @@ class DetrEncoder(DetrPreTrainedModel):
 
     Args:
         config: DetrConfig
-        embed_tokens (nn.Embedding): output embedding
     """
 
     def __init__(self, config: DetrConfig):
@@ -893,14 +884,9 @@ class DetrEncoder(DetrPreTrainedModel):
         self.dropout = config.dropout
         self.layerdrop = config.encoder_layerdrop
 
-        embed_dim = config.d_model
-        self.padding_idx = config.pad_token_id
-        self.max_source_positions = config.max_position_embeddings
-        self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0
-
         self.layers = nn.ModuleList([DetrEncoderLayer(config) for _ in range(config.encoder_layers)])
 
-        # in the original DETR, no layernorm is used for the Encoder, as "normalize_before" is set to False by default there
+        # in the original DETR, no layernorm is used at the end of the encoder, as "normalize_before" is set to False by default
 
         self.init_weights()
 
@@ -998,16 +984,13 @@ class DetrDecoder(DetrPreTrainedModel):
 
     Args:
         config: DetrConfig
-        embed_tokens (nn.Embedding): output embedding
     """
 
-    def __init__(self, config: DetrConfig, embed_tokens: Optional[nn.Embedding] = None):
+    def __init__(self, config: DetrConfig):
         super().__init__(config)
         self.dropout = config.dropout
         self.layerdrop = config.decoder_layerdrop
-        self.padding_idx = config.pad_token_id
         self.max_target_positions = config.max_position_embeddings
-        self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0
 
         self.layers = nn.ModuleList([DetrDecoderLayer(config) for _ in range(config.decoder_layers)])
         # in DETR, the decoder uses layernorm after the last decoder layer output
@@ -1371,11 +1354,11 @@ class DetrForObjectDetection(DetrPreTrainedModel):
     ):
         r"""
         labels (:obj:`List[Dict]` of len :obj:`(batch_size,)`, `optional`):
-            Labels for computing the bipartite matching loss. List of dicts, each dictionary containing 2 keys:
-            'class_labels' and 'boxes' (the class labels and bounding boxes of an image in the batch respectively). The
-            class labels themselves should be a :obj:`torch.LongTensor` of len :obj:`(number of bounding boxes in the
-            image,)` and the boxes a :obj:`torch.FloatTensor` of shape :obj:`(number of bounding boxes in the image,
-            4)`.
+            Labels for computing the bipartite matching loss. List of dicts, each dictionary containing at least the
+            following 2 keys: 'class_labels' and 'boxes' (the class labels and bounding boxes of an image in the batch
+            respectively). The class labels themselves should be a :obj:`torch.LongTensor` of len :obj:`(number of
+            bounding boxes in the image,)` and the boxes a :obj:`torch.FloatTensor` of shape :obj:`(number of bounding
+            boxes in the image, 4)`.
 
         Returns:
 
@@ -1524,12 +1507,12 @@ class DetrForSegmentation(DetrPreTrainedModel):
     ):
         r"""
         labels (:obj:`List[Dict]` of len :obj:`(batch_size,)`, `optional`):
-            Labels for computing the bipartite matching loss. List of dicts, each dictionary containing 3 keys:
-            'class_labels', 'boxes' and 'masks' (the class labels, bounding boxes and segmentation masks of an image in
-            the batch respectively). The class labels themselves should be a :obj:`torch.LongTensor` of len
-            :obj:`(number of bounding boxes in the image,)`, the boxes a :obj:`torch.FloatTensor` of shape
-            :obj:`(number of bounding boxes in the image, 4)` and the masks a :obj:`torch.FloatTensor` of shape
-            :obj:`(number of bounding boxes in the image, 4)`.
+            Labels for computing the bipartite matching loss, DICE/F-1 loss and Focal loss. List of dicts, each
+            dictionary containing at least the following 3 keys: 'class_labels', 'boxes' and 'masks' (the class labels,
+            bounding boxes and segmentation masks of an image in the batch respectively). The class labels themselves
+            should be a :obj:`torch.LongTensor` of len :obj:`(number of bounding boxes in the image,)`, the boxes a
+            :obj:`torch.FloatTensor` of shape :obj:`(number of bounding boxes in the image, 4)` and the masks a
+            :obj:`torch.FloatTensor` of shape :obj:`(number of bounding boxes in the image, height, width)`.
 
         Returns: