From fbb454307dd60e071273ac741a6074ebccbc6d1a Mon Sep 17 00:00:00 2001
From: NielsRogge <48327001+NielsRogge@users.noreply.github.com>
Date: Mon, 21 Mar 2022 17:34:10 +0100
Subject: [PATCH] [SegFormer] Remove unused attributes (#16285)

* Remove unused attributes

* Add link to blog and add clarification about input size

* Improve readability of the code

Co-authored-by: Niels Rogge <nielsrogge@Nielss-MacBook-Pro.local>
---
 docs/source/model_doc/segformer.mdx           |  3 +-
 .../segformer/configuration_segformer.py      |  8 --
 .../models/segformer/modeling_segformer.py    | 87 ++++++++++---------
 3 files changed, 46 insertions(+), 52 deletions(-)

diff --git a/docs/source/model_doc/segformer.mdx b/docs/source/model_doc/segformer.mdx
index 9406891ef3..9563e08430 100644
--- a/docs/source/model_doc/segformer.mdx
+++ b/docs/source/model_doc/segformer.mdx
@@ -50,7 +50,8 @@ Tips:
   ADE20K, Cityscapes and COCO-stuff, which are important benchmarks for semantic segmentation. All checkpoints can be
   found on the [hub](https://huggingface.co/models?other=segformer).
 - The quickest way to get started with SegFormer is by checking the [example notebooks](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/SegFormer) (which showcase both inference and
-  fine-tuning on custom data).
+  fine-tuning on custom data). One can also check out the [blog post](https://huggingface.co/blog/fine-tune-segformer) introducing SegFormer and illustrating how it can be fine-tuned on custom data.
+- SegFormer works on any input size, as it pads the input to be divisible by `config.patch_sizes`.
 - One can use [`SegformerFeatureExtractor`] to prepare images and corresponding segmentation maps
   for the model. Note that this feature extractor is fairly basic and does not include all data augmentations used in
   the original paper. The original preprocessing pipelines (for the ADE20k dataset for instance) can be found [here](https://github.com/NVlabs/SegFormer/blob/master/local_configs/_base_/datasets/ade20k_repeat.py). The most
diff --git a/src/transformers/models/segformer/configuration_segformer.py b/src/transformers/models/segformer/configuration_segformer.py
index bc97dc773e..58c8b29f73 100644
--- a/src/transformers/models/segformer/configuration_segformer.py
+++ b/src/transformers/models/segformer/configuration_segformer.py
@@ -40,8 +40,6 @@ class SegformerConfig(PretrainedConfig):
     documentation from [`PretrainedConfig`] for more information.
 
     Args:
-        image_size (`int`, *optional*, defaults to 512):
-            The size (resolution) of each image.
         num_channels (`int`, *optional*, defaults to 3):
             The number of input channels.
         num_encoder_blocks (`int`, *optional*, defaults to 4):
@@ -52,8 +50,6 @@ class SegformerConfig(PretrainedConfig):
             Sequence reduction ratios in each encoder block.
         hidden_sizes (`List[int]`, *optional*, defaults to [32, 64, 160, 256]):
             Dimension of each of the encoder blocks.
-        downsampling_rates (`List[int]`, *optional*, defaults to [1, 4, 8, 16]):
-            Downsample rate of the image resolution compared to the original image size before each encoder block.
         patch_sizes (`List[int]`, *optional*, defaults to [7, 3, 3, 3]):
             Patch size before each encoder block.
         strides (`List[int]`, *optional*, defaults to [4, 2, 2, 2]):
@@ -101,13 +97,11 @@ class SegformerConfig(PretrainedConfig):
 
     def __init__(
         self,
-        image_size=224,
         num_channels=3,
         num_encoder_blocks=4,
         depths=[2, 2, 2, 2],
         sr_ratios=[8, 4, 2, 1],
         hidden_sizes=[32, 64, 160, 256],
-        downsampling_rates=[1, 4, 8, 16],
         patch_sizes=[7, 3, 3, 3],
         strides=[4, 2, 2, 2],
         num_attention_heads=[1, 2, 5, 8],
@@ -133,13 +127,11 @@ class SegformerConfig(PretrainedConfig):
                 FutureWarning,
             )
 
-        self.image_size = image_size
         self.num_channels = num_channels
         self.num_encoder_blocks = num_encoder_blocks
         self.depths = depths
         self.sr_ratios = sr_ratios
         self.hidden_sizes = hidden_sizes
-        self.downsampling_rates = downsampling_rates
         self.patch_sizes = patch_sizes
         self.strides = strides
         self.mlp_ratios = mlp_ratios
diff --git a/src/transformers/models/segformer/modeling_segformer.py b/src/transformers/models/segformer/modeling_segformer.py
index 6263c48d6d..461b15f0c6 100755
--- a/src/transformers/models/segformer/modeling_segformer.py
+++ b/src/transformers/models/segformer/modeling_segformer.py
@@ -15,7 +15,6 @@
 """ PyTorch SegFormer model."""
 
 
-import collections
 import math
 from typing import Optional, Tuple, Union
 
@@ -58,18 +57,8 @@ SEGFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [
 ]
 
 
-# Inspired by
-# https://github.com/rwightman/pytorch-image-models/blob/b9bd960a032c75ca6b808ddeed76bee5f3ed4972/timm/models/layers/helpers.py
-# From PyTorch internals
-def to_2tuple(x):
-    if isinstance(x, collections.abc.Iterable):
-        return x
-    return (x, x)
-
-
-# Stochastic depth implementation
-# Taken from https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/layers/drop.py
-def drop_path(x, drop_prob: float = 0.0, training: bool = False):
+# Copied from transformers.models.convnext.modeling_convnext.drop_path
+def drop_path(x, drop_prob: float = 0.0, training: bool = False, scale_by_keep=True):
     """
     Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). This is the same as the
     DropConnect impl I created for EfficientNet, etc networks, however, the original name is misleading as 'Drop
@@ -87,7 +76,8 @@ def drop_path(x, drop_prob: float = 0.0, training: bool = False):
     return output
 
 
-class DropPath(nn.Module):
+# Copied from transformers.models.convnext.modeling_convnext.ConvNextDropPath with ConvNext->Segformer
+class SegformerDropPath(nn.Module):
     """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
 
     def __init__(self, drop_prob=None):
@@ -99,34 +89,35 @@ class DropPath(nn.Module):
 
 
 class SegformerOverlapPatchEmbeddings(nn.Module):
-    """Construct the patch embeddings from an image."""
+    """Construct the overlapping patch embeddings."""
 
-    def __init__(self, image_size, patch_size, stride, num_channels, hidden_size):
+    def __init__(self, patch_size, stride, num_channels, hidden_size):
         super().__init__()
-        image_size = to_2tuple(image_size)
-        patch_size = to_2tuple(patch_size)
-        self.height, self.width = image_size[0] // patch_size[0], image_size[1] // patch_size[1]
-        self.num_patches = self.height * self.width
         self.proj = nn.Conv2d(
             num_channels,
             hidden_size,
             kernel_size=patch_size,
             stride=stride,
-            padding=(patch_size[0] // 2, patch_size[1] // 2),
+            padding=patch_size // 2,
         )
 
         self.layer_norm = nn.LayerNorm(hidden_size)
 
     def forward(self, pixel_values):
-        x = self.proj(pixel_values)
-        _, _, height, width = x.shape
-        x = x.flatten(2).transpose(1, 2)
-        x = self.layer_norm(x)
-        return x, height, width
+        embeddings = self.proj(pixel_values)
+        _, _, height, width = embeddings.shape
+        # (batch_size, num_channels, height, width) -> (batch_size, num_channels, height*width) -> (batch_size, height*width, num_channels)
+        # this can be fed to a Transformer layer
+        embeddings = embeddings.flatten(2).transpose(1, 2)
+        embeddings = self.layer_norm(embeddings)
+        return embeddings, height, width
 
 
 class SegformerEfficientSelfAttention(nn.Module):
-    def __init__(self, config, hidden_size, num_attention_heads, sr_ratio):
+    """SegFormer's efficient self-attention mechanism. Employs the sequence reduction process introduced in the [PVT
+    paper](https://arxiv.org/abs/2102.12122)."""
+
+    def __init__(self, config, hidden_size, num_attention_heads, sequence_reduction_ratio):
         super().__init__()
         self.hidden_size = hidden_size
         self.num_attention_heads = num_attention_heads
@@ -146,15 +137,17 @@ class SegformerEfficientSelfAttention(nn.Module):
 
         self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
 
-        self.sr_ratio = sr_ratio
-        if sr_ratio > 1:
-            self.sr = nn.Conv2d(hidden_size, hidden_size, kernel_size=sr_ratio, stride=sr_ratio)
+        self.sr_ratio = sequence_reduction_ratio
+        if sequence_reduction_ratio > 1:
+            self.sr = nn.Conv2d(
+                hidden_size, hidden_size, kernel_size=sequence_reduction_ratio, stride=sequence_reduction_ratio
+            )
             self.layer_norm = nn.LayerNorm(hidden_size)
 
-    def transpose_for_scores(self, x):
-        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
-        x = x.view(*new_x_shape)
-        return x.permute(0, 2, 1, 3)
+    def transpose_for_scores(self, hidden_states):
+        new_shape = hidden_states.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
+        hidden_states = hidden_states.view(*new_shape)
+        return hidden_states.permute(0, 2, 1, 3)
 
     def forward(
         self,
@@ -167,8 +160,11 @@ class SegformerEfficientSelfAttention(nn.Module):
 
         if self.sr_ratio > 1:
             batch_size, seq_len, num_channels = hidden_states.shape
+            # Reshape to (batch_size, num_channels, height, width)
             hidden_states = hidden_states.permute(0, 2, 1).reshape(batch_size, num_channels, height, width)
+            # Apply sequence reduction
             hidden_states = self.sr(hidden_states)
+            # Reshape back to (batch_size, seq_len, num_channels)
             hidden_states = hidden_states.reshape(batch_size, num_channels, -1).permute(0, 2, 1)
             hidden_states = self.layer_norm(hidden_states)
 
@@ -211,10 +207,13 @@ class SegformerSelfOutput(nn.Module):
 
 
 class SegformerAttention(nn.Module):
-    def __init__(self, config, hidden_size, num_attention_heads, sr_ratio):
+    def __init__(self, config, hidden_size, num_attention_heads, sequence_reduction_ratio):
         super().__init__()
         self.self = SegformerEfficientSelfAttention(
-            config=config, hidden_size=hidden_size, num_attention_heads=num_attention_heads, sr_ratio=sr_ratio
+            config=config,
+            hidden_size=hidden_size,
+            num_attention_heads=num_attention_heads,
+            sequence_reduction_ratio=sequence_reduction_ratio,
         )
         self.output = SegformerSelfOutput(config, hidden_size=hidden_size)
         self.pruned_heads = set()
@@ -285,13 +284,16 @@ class SegformerMixFFN(nn.Module):
 class SegformerLayer(nn.Module):
     """This corresponds to the Block class in the original implementation."""
 
-    def __init__(self, config, hidden_size, num_attention_heads, drop_path, sr_ratio, mlp_ratio):
+    def __init__(self, config, hidden_size, num_attention_heads, drop_path, sequence_reduction_ratio, mlp_ratio):
         super().__init__()
         self.layer_norm_1 = nn.LayerNorm(hidden_size)
         self.attention = SegformerAttention(
-            config, hidden_size=hidden_size, num_attention_heads=num_attention_heads, sr_ratio=sr_ratio
+            config,
+            hidden_size=hidden_size,
+            num_attention_heads=num_attention_heads,
+            sequence_reduction_ratio=sequence_reduction_ratio,
         )
-        self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
+        self.drop_path = SegformerDropPath(drop_path) if drop_path > 0.0 else nn.Identity()
         self.layer_norm_2 = nn.LayerNorm(hidden_size)
         mlp_hidden_size = int(hidden_size * mlp_ratio)
         self.mlp = SegformerMixFFN(config, in_features=hidden_size, hidden_features=mlp_hidden_size)
@@ -328,14 +330,13 @@ class SegformerEncoder(nn.Module):
         self.config = config
 
         # stochastic depth decay rule
-        dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths))]
+        drop_path_decays = [x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths))]
 
         # patch embeddings
         embeddings = []
         for i in range(config.num_encoder_blocks):
             embeddings.append(
                 SegformerOverlapPatchEmbeddings(
-                    image_size=config.image_size // config.downsampling_rates[i],
                     patch_size=config.patch_sizes[i],
                     stride=config.strides[i],
                     num_channels=config.num_channels if i == 0 else config.hidden_sizes[i - 1],
@@ -358,8 +359,8 @@ class SegformerEncoder(nn.Module):
                         config,
                         hidden_size=config.hidden_sizes[i],
                         num_attention_heads=config.num_attention_heads[i],
-                        drop_path=dpr[cur + j],
-                        sr_ratio=config.sr_ratios[i],
+                        drop_path=drop_path_decays[cur + j],
+                        sequence_reduction_ratio=config.sr_ratios[i],
                         mlp_ratio=config.mlp_ratios[i],
                     )
                 )