From fbb454307dd60e071273ac741a6074ebccbc6d1a Mon Sep 17 00:00:00 2001 From: NielsRogge <48327001+NielsRogge@users.noreply.github.com> Date: Mon, 21 Mar 2022 17:34:10 +0100 Subject: [PATCH] [SegFormer] Remove unused attributes (#16285) * Remove unused attributes * Add link to blog and add clarification about input size * Improve readability of the code Co-authored-by: Niels Rogge --- docs/source/model_doc/segformer.mdx | 3 +- .../segformer/configuration_segformer.py | 8 -- .../models/segformer/modeling_segformer.py | 87 ++++++++++--------- 3 files changed, 46 insertions(+), 52 deletions(-) diff --git a/docs/source/model_doc/segformer.mdx b/docs/source/model_doc/segformer.mdx index 9406891ef3..9563e08430 100644 --- a/docs/source/model_doc/segformer.mdx +++ b/docs/source/model_doc/segformer.mdx @@ -50,7 +50,8 @@ Tips: ADE20K, Cityscapes and COCO-stuff, which are important benchmarks for semantic segmentation. All checkpoints can be found on the [hub](https://huggingface.co/models?other=segformer). - The quickest way to get started with SegFormer is by checking the [example notebooks](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/SegFormer) (which showcase both inference and - fine-tuning on custom data). + fine-tuning on custom data). One can also check out the [blog post](https://huggingface.co/blog/fine-tune-segformer) introducing SegFormer and illustrating how it can be fine-tuned on custom data. +- SegFormer works on any input size, as it pads the input to be divisible by `config.patch_sizes`. - One can use [`SegformerFeatureExtractor`] to prepare images and corresponding segmentation maps for the model. Note that this feature extractor is fairly basic and does not include all data augmentations used in the original paper. The original preprocessing pipelines (for the ADE20k dataset for instance) can be found [here](https://github.com/NVlabs/SegFormer/blob/master/local_configs/_base_/datasets/ade20k_repeat.py). The most diff --git a/src/transformers/models/segformer/configuration_segformer.py b/src/transformers/models/segformer/configuration_segformer.py index bc97dc773e..58c8b29f73 100644 --- a/src/transformers/models/segformer/configuration_segformer.py +++ b/src/transformers/models/segformer/configuration_segformer.py @@ -40,8 +40,6 @@ class SegformerConfig(PretrainedConfig): documentation from [`PretrainedConfig`] for more information. Args: - image_size (`int`, *optional*, defaults to 512): - The size (resolution) of each image. num_channels (`int`, *optional*, defaults to 3): The number of input channels. num_encoder_blocks (`int`, *optional*, defaults to 4): @@ -52,8 +50,6 @@ class SegformerConfig(PretrainedConfig): Sequence reduction ratios in each encoder block. hidden_sizes (`List[int]`, *optional*, defaults to [32, 64, 160, 256]): Dimension of each of the encoder blocks. - downsampling_rates (`List[int]`, *optional*, defaults to [1, 4, 8, 16]): - Downsample rate of the image resolution compared to the original image size before each encoder block. patch_sizes (`List[int]`, *optional*, defaults to [7, 3, 3, 3]): Patch size before each encoder block. strides (`List[int]`, *optional*, defaults to [4, 2, 2, 2]): @@ -101,13 +97,11 @@ class SegformerConfig(PretrainedConfig): def __init__( self, - image_size=224, num_channels=3, num_encoder_blocks=4, depths=[2, 2, 2, 2], sr_ratios=[8, 4, 2, 1], hidden_sizes=[32, 64, 160, 256], - downsampling_rates=[1, 4, 8, 16], patch_sizes=[7, 3, 3, 3], strides=[4, 2, 2, 2], num_attention_heads=[1, 2, 5, 8], @@ -133,13 +127,11 @@ class SegformerConfig(PretrainedConfig): FutureWarning, ) - self.image_size = image_size self.num_channels = num_channels self.num_encoder_blocks = num_encoder_blocks self.depths = depths self.sr_ratios = sr_ratios self.hidden_sizes = hidden_sizes - self.downsampling_rates = downsampling_rates self.patch_sizes = patch_sizes self.strides = strides self.mlp_ratios = mlp_ratios diff --git a/src/transformers/models/segformer/modeling_segformer.py b/src/transformers/models/segformer/modeling_segformer.py index 6263c48d6d..461b15f0c6 100755 --- a/src/transformers/models/segformer/modeling_segformer.py +++ b/src/transformers/models/segformer/modeling_segformer.py @@ -15,7 +15,6 @@ """ PyTorch SegFormer model.""" -import collections import math from typing import Optional, Tuple, Union @@ -58,18 +57,8 @@ SEGFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [ ] -# Inspired by -# https://github.com/rwightman/pytorch-image-models/blob/b9bd960a032c75ca6b808ddeed76bee5f3ed4972/timm/models/layers/helpers.py -# From PyTorch internals -def to_2tuple(x): - if isinstance(x, collections.abc.Iterable): - return x - return (x, x) - - -# Stochastic depth implementation -# Taken from https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/layers/drop.py -def drop_path(x, drop_prob: float = 0.0, training: bool = False): +# Copied from transformers.models.convnext.modeling_convnext.drop_path +def drop_path(x, drop_prob: float = 0.0, training: bool = False, scale_by_keep=True): """ Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). This is the same as the DropConnect impl I created for EfficientNet, etc networks, however, the original name is misleading as 'Drop @@ -87,7 +76,8 @@ def drop_path(x, drop_prob: float = 0.0, training: bool = False): return output -class DropPath(nn.Module): +# Copied from transformers.models.convnext.modeling_convnext.ConvNextDropPath with ConvNext->Segformer +class SegformerDropPath(nn.Module): """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" def __init__(self, drop_prob=None): @@ -99,34 +89,35 @@ class DropPath(nn.Module): class SegformerOverlapPatchEmbeddings(nn.Module): - """Construct the patch embeddings from an image.""" + """Construct the overlapping patch embeddings.""" - def __init__(self, image_size, patch_size, stride, num_channels, hidden_size): + def __init__(self, patch_size, stride, num_channels, hidden_size): super().__init__() - image_size = to_2tuple(image_size) - patch_size = to_2tuple(patch_size) - self.height, self.width = image_size[0] // patch_size[0], image_size[1] // patch_size[1] - self.num_patches = self.height * self.width self.proj = nn.Conv2d( num_channels, hidden_size, kernel_size=patch_size, stride=stride, - padding=(patch_size[0] // 2, patch_size[1] // 2), + padding=patch_size // 2, ) self.layer_norm = nn.LayerNorm(hidden_size) def forward(self, pixel_values): - x = self.proj(pixel_values) - _, _, height, width = x.shape - x = x.flatten(2).transpose(1, 2) - x = self.layer_norm(x) - return x, height, width + embeddings = self.proj(pixel_values) + _, _, height, width = embeddings.shape + # (batch_size, num_channels, height, width) -> (batch_size, num_channels, height*width) -> (batch_size, height*width, num_channels) + # this can be fed to a Transformer layer + embeddings = embeddings.flatten(2).transpose(1, 2) + embeddings = self.layer_norm(embeddings) + return embeddings, height, width class SegformerEfficientSelfAttention(nn.Module): - def __init__(self, config, hidden_size, num_attention_heads, sr_ratio): + """SegFormer's efficient self-attention mechanism. Employs the sequence reduction process introduced in the [PvT + paper](https://arxiv.org/abs/2102.12122).""" + + def __init__(self, config, hidden_size, num_attention_heads, sequence_reduction_ratio): super().__init__() self.hidden_size = hidden_size self.num_attention_heads = num_attention_heads @@ -146,15 +137,17 @@ class SegformerEfficientSelfAttention(nn.Module): self.dropout = nn.Dropout(config.attention_probs_dropout_prob) - self.sr_ratio = sr_ratio - if sr_ratio > 1: - self.sr = nn.Conv2d(hidden_size, hidden_size, kernel_size=sr_ratio, stride=sr_ratio) + self.sr_ratio = sequence_reduction_ratio + if sequence_reduction_ratio > 1: + self.sr = nn.Conv2d( + hidden_size, hidden_size, kernel_size=sequence_reduction_ratio, stride=sequence_reduction_ratio + ) self.layer_norm = nn.LayerNorm(hidden_size) - def transpose_for_scores(self, x): - new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) - x = x.view(*new_x_shape) - return x.permute(0, 2, 1, 3) + def transpose_for_scores(self, hidden_states): + new_shape = hidden_states.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + hidden_states = hidden_states.view(*new_shape) + return hidden_states.permute(0, 2, 1, 3) def forward( self, @@ -167,8 +160,11 @@ class SegformerEfficientSelfAttention(nn.Module): if self.sr_ratio > 1: batch_size, seq_len, num_channels = hidden_states.shape + # Reshape to (batch_size, num_channels, height, width) hidden_states = hidden_states.permute(0, 2, 1).reshape(batch_size, num_channels, height, width) + # Apply sequence reduction hidden_states = self.sr(hidden_states) + # Reshape back to (batch_size, seq_len, num_channels) hidden_states = hidden_states.reshape(batch_size, num_channels, -1).permute(0, 2, 1) hidden_states = self.layer_norm(hidden_states) @@ -211,10 +207,13 @@ class SegformerSelfOutput(nn.Module): class SegformerAttention(nn.Module): - def __init__(self, config, hidden_size, num_attention_heads, sr_ratio): + def __init__(self, config, hidden_size, num_attention_heads, sequence_reduction_ratio): super().__init__() self.self = SegformerEfficientSelfAttention( - config=config, hidden_size=hidden_size, num_attention_heads=num_attention_heads, sr_ratio=sr_ratio + config=config, + hidden_size=hidden_size, + num_attention_heads=num_attention_heads, + sequence_reduction_ratio=sequence_reduction_ratio, ) self.output = SegformerSelfOutput(config, hidden_size=hidden_size) self.pruned_heads = set() @@ -285,13 +284,16 @@ class SegformerMixFFN(nn.Module): class SegformerLayer(nn.Module): """This corresponds to the Block class in the original implementation.""" - def __init__(self, config, hidden_size, num_attention_heads, drop_path, sr_ratio, mlp_ratio): + def __init__(self, config, hidden_size, num_attention_heads, drop_path, sequence_reduction_ratio, mlp_ratio): super().__init__() self.layer_norm_1 = nn.LayerNorm(hidden_size) self.attention = SegformerAttention( - config, hidden_size=hidden_size, num_attention_heads=num_attention_heads, sr_ratio=sr_ratio + config, + hidden_size=hidden_size, + num_attention_heads=num_attention_heads, + sequence_reduction_ratio=sequence_reduction_ratio, ) - self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() + self.drop_path = SegformerDropPath(drop_path) if drop_path > 0.0 else nn.Identity() self.layer_norm_2 = nn.LayerNorm(hidden_size) mlp_hidden_size = int(hidden_size * mlp_ratio) self.mlp = SegformerMixFFN(config, in_features=hidden_size, hidden_features=mlp_hidden_size) @@ -328,14 +330,13 @@ class SegformerEncoder(nn.Module): self.config = config # stochastic depth decay rule - dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths))] + drop_path_decays = [x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths))] # patch embeddings embeddings = [] for i in range(config.num_encoder_blocks): embeddings.append( SegformerOverlapPatchEmbeddings( - image_size=config.image_size // config.downsampling_rates[i], patch_size=config.patch_sizes[i], stride=config.strides[i], num_channels=config.num_channels if i == 0 else config.hidden_sizes[i - 1], @@ -358,8 +359,8 @@ class SegformerEncoder(nn.Module): config, hidden_size=config.hidden_sizes[i], num_attention_heads=config.num_attention_heads[i], - drop_path=dpr[cur + j], - sr_ratio=config.sr_ratios[i], + drop_path=drop_path_decays[cur + j], + sequence_reduction_ratio=config.sr_ratios[i], mlp_ratio=config.mlp_ratios[i], ) )