From 29b0aef871dfb938c6a56884d9dab77950057955 Mon Sep 17 00:00:00 2001 From: NielsRogge <48327001+NielsRogge@users.noreply.github.com> Date: Thu, 17 Jun 2021 16:37:54 +0200 Subject: [PATCH] Improve detr (#12147) * Remove unused variables * Improve docs * Fix docs of segmentation masks Co-authored-by: Lysandre Debut --- docs/source/model_doc/detr.rst | 9 +++- .../models/detr/feature_extraction_detr.py | 2 +- src/transformers/models/detr/modeling_detr.py | 53 +++++++------------ 3 files changed, 26 insertions(+), 38 deletions(-) diff --git a/docs/source/model_doc/detr.rst b/docs/source/model_doc/detr.rst index dbd1fb99aa..279f11d042 100644 --- a/docs/source/model_doc/detr.rst +++ b/docs/source/model_doc/detr.rst @@ -40,6 +40,10 @@ baselines.* This model was contributed by `nielsr `__. The original code can be found `here `__. +The quickest way to get started with DETR is by checking the `example notebooks +`__ (which showcase both inference and +fine-tuning on custom data). + Here's a TLDR explaining how :class:`~transformers.DetrForObjectDetection` works: First, an image is sent through a pre-trained convolutional backbone (in the paper, the authors use @@ -130,7 +134,7 @@ As a summary, consider the following table: +---------------------------------------------+---------------------------------------------------------+----------------------------------------------------------------------+------------------------------------------------------------------------+ | **Format of annotations to provide to** | {‘image_id’: int, | {‘image_id’: int, | {‘file_name: str, | | :class:`~transformers.DetrFeatureExtractor` | ‘annotations’: List[Dict]}, each Dict being a COCO | ‘annotations’: [List[Dict]] } (in case of COCO detection) | ‘image_id: int, | -| | object annotation (containing keys "image_id", | | ‘segments_info’: List[Dict] } | +| | object annotation | | ‘segments_info’: List[Dict] } | | | | or | | | | | | and masks_path (path to directory containing PNG files of the 
masks) | | | | {‘file_name’: str, | | @@ -151,7 +155,8 @@ In short, one should prepare the data either in COCO detection or COCO panoptic outputs of the model using one of the postprocessing methods of :class:`~transformers.DetrFeatureExtractor`. These can be provided to either :obj:`CocoEvaluator` or :obj:`PanopticEvaluator`, which allow you to calculate metrics like mean Average Precision (mAP) and Panoptic Quality (PQ). The latter objects are implemented in the `original repository -`__. See the example notebooks for more info regarding evaluation. +`__. See the `example notebooks +`__ for more info regarding evaluation. DETR specific outputs diff --git a/src/transformers/models/detr/feature_extraction_detr.py b/src/transformers/models/detr/feature_extraction_detr.py index 014ba278e9..94a848f340 100644 --- a/src/transformers/models/detr/feature_extraction_detr.py +++ b/src/transformers/models/detr/feature_extraction_detr.py @@ -143,7 +143,7 @@ class DetrFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin): :obj:`do_resize` is set to :obj:`True`. do_normalize (:obj:`bool`, `optional`, defaults to :obj:`True`): Whether or not to normalize the input with mean and standard deviation. - image_mean (:obj:`int`, `optional`, defaults to :obj:`[0.485, 0.456, 0.406]s`): + image_mean (:obj:`int`, `optional`, defaults to :obj:`[0.485, 0.456, 0.406]`): The sequence of means for each channel, to be used when normalizing images. Defaults to the ImageNet mean. image_std (:obj:`int`, `optional`, defaults to :obj:`[0.229, 0.224, 0.225]`): The sequence of standard deviations for each channel, to be used when normalizing images. 
Defaults to the diff --git a/src/transformers/models/detr/modeling_detr.py b/src/transformers/models/detr/modeling_detr.py index 43d1edb94f..0e4721e2b3 100644 --- a/src/transformers/models/detr/modeling_detr.py +++ b/src/transformers/models/detr/modeling_detr.py @@ -98,9 +98,7 @@ class DetrModelOutput(Seq2SeqModelOutput): Args: last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): - Sequence of hidden-states at the output of the last layer of the decoder of the model. If - :obj:`past_key_values` is used only the last hidden-state of the sequences of shape :obj:`(batch_size, 1, - hidden_size)` is output. + Sequence of hidden-states at the output of the last layer of the decoder of the model. decoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the decoder at the output of @@ -148,7 +146,7 @@ class DetrObjectDetectionOutput(ModelOutput): pred_boxes (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_queries, 4)`): Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding - possible padding). You can use :class:`~transformers.DetrForObjectDetection.post_process` to retrieve the + possible padding). You can use :meth:`~transformers.DetrFeatureExtractor.post_process` to retrieve the unnormalized bounding boxes. auxiliary_outputs (:obj:`list[Dict]`, `optional`): Optional, only returned when auxiliary losses are activated (i.e. 
:obj:`config.auxiliary_loss` is set to @@ -156,9 +154,6 @@ class DetrObjectDetectionOutput(ModelOutput): and :obj:`pred_boxes`) for each decoder layer. last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): Sequence of hidden-states at the output of the last layer of the decoder of the model. - - If :obj:`past_key_values` is used only the last hidden-state of the sequences of shape :obj:`(batch_size, - 1, hidden_size)` is output. decoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the decoder at the output of @@ -214,10 +209,10 @@ class DetrSegmentationOutput(ModelOutput): pred_boxes (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_queries, 4)`): Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding - possible padding). You can use :meth:`~transformers.DetrForObjectDetection.post_process` to retrieve the + possible padding). You can use :meth:`~transformers.DetrFeatureExtractor.post_process` to retrieve the unnormalized bounding boxes. - pred_masks (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_queries, width, height)`): - Segmentation masks for all queries. See also + pred_masks (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_queries, height/4, width/4)`): + Segmentation masks logits for all queries. See also :meth:`~transformers.DetrFeatureExtractor.post_process_segmentation` or :meth:`~transformers.DetrFeatureExtractor.post_process_panoptic` to evaluate instance and panoptic segmentation masks respectively. 
@@ -227,9 +222,6 @@ class DetrSegmentationOutput(ModelOutput): and :obj:`pred_boxes`) for each decoder layer. last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): Sequence of hidden-states at the output of the last layer of the decoder of the model. - - If :obj:`past_key_values` is used only the last hidden-state of the sequences of shape :obj:`(batch_size, - 1, hidden_size)` is output. decoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the decoder at the output of @@ -884,7 +876,6 @@ class DetrEncoder(DetrPreTrainedModel): Args: config: DetrConfig - embed_tokens (nn.Embedding): output embedding """ def __init__(self, config: DetrConfig): @@ -893,14 +884,9 @@ class DetrEncoder(DetrPreTrainedModel): self.dropout = config.dropout self.layerdrop = config.encoder_layerdrop - embed_dim = config.d_model - self.padding_idx = config.pad_token_id - self.max_source_positions = config.max_position_embeddings - self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0 - self.layers = nn.ModuleList([DetrEncoderLayer(config) for _ in range(config.encoder_layers)]) - # in the original DETR, no layernorm is used for the Encoder, as "normalize_before" is set to False by default there + # in the original DETR, no layernorm is used at the end of the encoder, as "normalize_before" is set to False by default self.init_weights() @@ -998,16 +984,13 @@ class DetrDecoder(DetrPreTrainedModel): Args: config: DetrConfig - embed_tokens (nn.Embedding): output embedding """ - def __init__(self, config: DetrConfig, embed_tokens: Optional[nn.Embedding] = None): + def __init__(self, config: DetrConfig): 
super().__init__(config) self.dropout = config.dropout self.layerdrop = config.decoder_layerdrop - self.padding_idx = config.pad_token_id self.max_target_positions = config.max_position_embeddings - self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0 self.layers = nn.ModuleList([DetrDecoderLayer(config) for _ in range(config.decoder_layers)]) # in DETR, the decoder uses layernorm after the last decoder layer output @@ -1371,11 +1354,11 @@ class DetrForObjectDetection(DetrPreTrainedModel): ): r""" labels (:obj:`List[Dict]` of len :obj:`(batch_size,)`, `optional`): - Labels for computing the bipartite matching loss. List of dicts, each dictionary containing 2 keys: - 'class_labels' and 'boxes' (the class labels and bounding boxes of an image in the batch respectively). The - class labels themselves should be a :obj:`torch.LongTensor` of len :obj:`(number of bounding boxes in the - image,)` and the boxes a :obj:`torch.FloatTensor` of shape :obj:`(number of bounding boxes in the image, - 4)`. + Labels for computing the bipartite matching loss. List of dicts, each dictionary containing at least the + following 2 keys: 'class_labels' and 'boxes' (the class labels and bounding boxes of an image in the batch + respectively). The class labels themselves should be a :obj:`torch.LongTensor` of len :obj:`(number of + bounding boxes in the image,)` and the boxes a :obj:`torch.FloatTensor` of shape :obj:`(number of bounding + boxes in the image, 4)`. Returns: @@ -1524,12 +1507,12 @@ class DetrForSegmentation(DetrPreTrainedModel): ): r""" labels (:obj:`List[Dict]` of len :obj:`(batch_size,)`, `optional`): - Labels for computing the bipartite matching loss. List of dicts, each dictionary containing 3 keys: - 'class_labels', 'boxes' and 'masks' (the class labels, bounding boxes and segmentation masks of an image in - the batch respectively). 
The class labels themselves should be a :obj:`torch.LongTensor` of len - :obj:`(number of bounding boxes in the image,)`, the boxes a :obj:`torch.FloatTensor` of shape - :obj:`(number of bounding boxes in the image, 4)` and the masks a :obj:`torch.FloatTensor` of shape - :obj:`(number of bounding boxes in the image, 4)`. + Labels for computing the bipartite matching loss, DICE/F-1 loss and Focal loss. List of dicts, each + dictionary containing at least the following 3 keys: 'class_labels', 'boxes' and 'masks' (the class labels, + bounding boxes and segmentation masks of an image in the batch respectively). The class labels themselves + should be a :obj:`torch.LongTensor` of len :obj:`(number of bounding boxes in the image,)`, the boxes a + :obj:`torch.FloatTensor` of shape :obj:`(number of bounding boxes in the image, 4)` and the masks a + :obj:`torch.FloatTensor` of shape :obj:`(number of bounding boxes in the image, height, width)`. Returns: