[SegFormer] Remove unused attributes (#16285)
* Remove unused attributes
* Add link to blog and add clarification about input size
* Improve readability of the code

Co-authored-by: Niels Rogge <nielsrogge@Nielss-MacBook-Pro.local>
This commit is contained in:
@@ -50,7 +50,8 @@ Tips:
|
|||||||
ADE20K, Cityscapes and COCO-stuff, which are important benchmarks for semantic segmentation. All checkpoints can be
|
ADE20K, Cityscapes and COCO-stuff, which are important benchmarks for semantic segmentation. All checkpoints can be
|
||||||
found on the [hub](https://huggingface.co/models?other=segformer).
|
found on the [hub](https://huggingface.co/models?other=segformer).
|
||||||
- The quickest way to get started with SegFormer is by checking the [example notebooks](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/SegFormer) (which showcase both inference and
|
- The quickest way to get started with SegFormer is by checking the [example notebooks](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/SegFormer) (which showcase both inference and
|
||||||
fine-tuning on custom data).
|
fine-tuning on custom data). One can also check out the [blog post](https://huggingface.co/blog/fine-tune-segformer) introducing SegFormer and illustrating how it can be fine-tuned on custom data.
|
||||||
|
- SegFormer works on any input size, as it pads the input to be divisible by `config.patch_sizes`.
|
||||||
- One can use [`SegformerFeatureExtractor`] to prepare images and corresponding segmentation maps
|
- One can use [`SegformerFeatureExtractor`] to prepare images and corresponding segmentation maps
|
||||||
for the model. Note that this feature extractor is fairly basic and does not include all data augmentations used in
|
for the model. Note that this feature extractor is fairly basic and does not include all data augmentations used in
|
||||||
the original paper. The original preprocessing pipelines (for the ADE20k dataset for instance) can be found [here](https://github.com/NVlabs/SegFormer/blob/master/local_configs/_base_/datasets/ade20k_repeat.py). The most
|
the original paper. The original preprocessing pipelines (for the ADE20k dataset for instance) can be found [here](https://github.com/NVlabs/SegFormer/blob/master/local_configs/_base_/datasets/ade20k_repeat.py). The most
|
||||||
|
|||||||
@@ -40,8 +40,6 @@ class SegformerConfig(PretrainedConfig):
|
|||||||
documentation from [`PretrainedConfig`] for more information.
|
documentation from [`PretrainedConfig`] for more information.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
image_size (`int`, *optional*, defaults to 512):
|
|
||||||
The size (resolution) of each image.
|
|
||||||
num_channels (`int`, *optional*, defaults to 3):
|
num_channels (`int`, *optional*, defaults to 3):
|
||||||
The number of input channels.
|
The number of input channels.
|
||||||
num_encoder_blocks (`int`, *optional*, defaults to 4):
|
num_encoder_blocks (`int`, *optional*, defaults to 4):
|
||||||
@@ -52,8 +50,6 @@ class SegformerConfig(PretrainedConfig):
|
|||||||
Sequence reduction ratios in each encoder block.
|
Sequence reduction ratios in each encoder block.
|
||||||
hidden_sizes (`List[int]`, *optional*, defaults to [32, 64, 160, 256]):
|
hidden_sizes (`List[int]`, *optional*, defaults to [32, 64, 160, 256]):
|
||||||
Dimension of each of the encoder blocks.
|
Dimension of each of the encoder blocks.
|
||||||
downsampling_rates (`List[int]`, *optional*, defaults to [1, 4, 8, 16]):
|
|
||||||
Downsample rate of the image resolution compared to the original image size before each encoder block.
|
|
||||||
patch_sizes (`List[int]`, *optional*, defaults to [7, 3, 3, 3]):
|
patch_sizes (`List[int]`, *optional*, defaults to [7, 3, 3, 3]):
|
||||||
Patch size before each encoder block.
|
Patch size before each encoder block.
|
||||||
strides (`List[int]`, *optional*, defaults to [4, 2, 2, 2]):
|
strides (`List[int]`, *optional*, defaults to [4, 2, 2, 2]):
|
||||||
@@ -101,13 +97,11 @@ class SegformerConfig(PretrainedConfig):
|
|||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
image_size=224,
|
|
||||||
num_channels=3,
|
num_channels=3,
|
||||||
num_encoder_blocks=4,
|
num_encoder_blocks=4,
|
||||||
depths=[2, 2, 2, 2],
|
depths=[2, 2, 2, 2],
|
||||||
sr_ratios=[8, 4, 2, 1],
|
sr_ratios=[8, 4, 2, 1],
|
||||||
hidden_sizes=[32, 64, 160, 256],
|
hidden_sizes=[32, 64, 160, 256],
|
||||||
downsampling_rates=[1, 4, 8, 16],
|
|
||||||
patch_sizes=[7, 3, 3, 3],
|
patch_sizes=[7, 3, 3, 3],
|
||||||
strides=[4, 2, 2, 2],
|
strides=[4, 2, 2, 2],
|
||||||
num_attention_heads=[1, 2, 5, 8],
|
num_attention_heads=[1, 2, 5, 8],
|
||||||
@@ -133,13 +127,11 @@ class SegformerConfig(PretrainedConfig):
|
|||||||
FutureWarning,
|
FutureWarning,
|
||||||
)
|
)
|
||||||
|
|
||||||
self.image_size = image_size
|
|
||||||
self.num_channels = num_channels
|
self.num_channels = num_channels
|
||||||
self.num_encoder_blocks = num_encoder_blocks
|
self.num_encoder_blocks = num_encoder_blocks
|
||||||
self.depths = depths
|
self.depths = depths
|
||||||
self.sr_ratios = sr_ratios
|
self.sr_ratios = sr_ratios
|
||||||
self.hidden_sizes = hidden_sizes
|
self.hidden_sizes = hidden_sizes
|
||||||
self.downsampling_rates = downsampling_rates
|
|
||||||
self.patch_sizes = patch_sizes
|
self.patch_sizes = patch_sizes
|
||||||
self.strides = strides
|
self.strides = strides
|
||||||
self.mlp_ratios = mlp_ratios
|
self.mlp_ratios = mlp_ratios
|
||||||
|
|||||||
@@ -15,7 +15,6 @@
|
|||||||
""" PyTorch SegFormer model."""
|
""" PyTorch SegFormer model."""
|
||||||
|
|
||||||
|
|
||||||
import collections
|
|
||||||
import math
|
import math
|
||||||
from typing import Optional, Tuple, Union
|
from typing import Optional, Tuple, Union
|
||||||
|
|
||||||
@@ -58,18 +57,8 @@ SEGFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [
|
|||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
# Inspired by
|
# Copied from transformers.models.convnext.modeling_convnext.drop_path
|
||||||
# https://github.com/rwightman/pytorch-image-models/blob/b9bd960a032c75ca6b808ddeed76bee5f3ed4972/timm/models/layers/helpers.py
|
def drop_path(x, drop_prob: float = 0.0, training: bool = False, scale_by_keep=True):
|
||||||
# From PyTorch internals
|
|
||||||
def to_2tuple(x):
|
|
||||||
if isinstance(x, collections.abc.Iterable):
|
|
||||||
return x
|
|
||||||
return (x, x)
|
|
||||||
|
|
||||||
|
|
||||||
# Stochastic depth implementation
|
|
||||||
# Taken from https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/layers/drop.py
|
|
||||||
def drop_path(x, drop_prob: float = 0.0, training: bool = False):
|
|
||||||
"""
|
"""
|
||||||
Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). This is the same as the
|
Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). This is the same as the
|
||||||
DropConnect impl I created for EfficientNet, etc networks, however, the original name is misleading as 'Drop
|
DropConnect impl I created for EfficientNet, etc networks, however, the original name is misleading as 'Drop
|
||||||
@@ -87,7 +76,8 @@ def drop_path(x, drop_prob: float = 0.0, training: bool = False):
|
|||||||
return output
|
return output
|
||||||
|
|
||||||
|
|
||||||
class DropPath(nn.Module):
|
# Copied from transformers.models.convnext.modeling_convnext.ConvNextDropPath with ConvNext->Segformer
|
||||||
|
class SegformerDropPath(nn.Module):
|
||||||
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
|
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
|
||||||
|
|
||||||
def __init__(self, drop_prob=None):
|
def __init__(self, drop_prob=None):
|
||||||
@@ -99,34 +89,35 @@ class DropPath(nn.Module):
|
|||||||
|
|
||||||
|
|
||||||
class SegformerOverlapPatchEmbeddings(nn.Module):
|
class SegformerOverlapPatchEmbeddings(nn.Module):
|
||||||
"""Construct the patch embeddings from an image."""
|
"""Construct the overlapping patch embeddings."""
|
||||||
|
|
||||||
def __init__(self, image_size, patch_size, stride, num_channels, hidden_size):
|
def __init__(self, patch_size, stride, num_channels, hidden_size):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
image_size = to_2tuple(image_size)
|
|
||||||
patch_size = to_2tuple(patch_size)
|
|
||||||
self.height, self.width = image_size[0] // patch_size[0], image_size[1] // patch_size[1]
|
|
||||||
self.num_patches = self.height * self.width
|
|
||||||
self.proj = nn.Conv2d(
|
self.proj = nn.Conv2d(
|
||||||
num_channels,
|
num_channels,
|
||||||
hidden_size,
|
hidden_size,
|
||||||
kernel_size=patch_size,
|
kernel_size=patch_size,
|
||||||
stride=stride,
|
stride=stride,
|
||||||
padding=(patch_size[0] // 2, patch_size[1] // 2),
|
padding=patch_size // 2,
|
||||||
)
|
)
|
||||||
|
|
||||||
self.layer_norm = nn.LayerNorm(hidden_size)
|
self.layer_norm = nn.LayerNorm(hidden_size)
|
||||||
|
|
||||||
def forward(self, pixel_values):
|
def forward(self, pixel_values):
|
||||||
x = self.proj(pixel_values)
|
embeddings = self.proj(pixel_values)
|
||||||
_, _, height, width = x.shape
|
_, _, height, width = embeddings.shape
|
||||||
x = x.flatten(2).transpose(1, 2)
|
# (batch_size, num_channels, height, width) -> (batch_size, num_channels, height*width) -> (batch_size, height*width, num_channels)
|
||||||
x = self.layer_norm(x)
|
# this can be fed to a Transformer layer
|
||||||
return x, height, width
|
embeddings = embeddings.flatten(2).transpose(1, 2)
|
||||||
|
embeddings = self.layer_norm(embeddings)
|
||||||
|
return embeddings, height, width
|
||||||
|
|
||||||
|
|
||||||
class SegformerEfficientSelfAttention(nn.Module):
|
class SegformerEfficientSelfAttention(nn.Module):
|
||||||
def __init__(self, config, hidden_size, num_attention_heads, sr_ratio):
|
"""SegFormer's efficient self-attention mechanism. Employs the sequence reduction process introduced in the [PvT
|
||||||
|
paper](https://arxiv.org/abs/2102.12122)."""
|
||||||
|
|
||||||
|
def __init__(self, config, hidden_size, num_attention_heads, sequence_reduction_ratio):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.hidden_size = hidden_size
|
self.hidden_size = hidden_size
|
||||||
self.num_attention_heads = num_attention_heads
|
self.num_attention_heads = num_attention_heads
|
||||||
@@ -146,15 +137,17 @@ class SegformerEfficientSelfAttention(nn.Module):
|
|||||||
|
|
||||||
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
|
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
|
||||||
|
|
||||||
self.sr_ratio = sr_ratio
|
self.sr_ratio = sequence_reduction_ratio
|
||||||
if sr_ratio > 1:
|
if sequence_reduction_ratio > 1:
|
||||||
self.sr = nn.Conv2d(hidden_size, hidden_size, kernel_size=sr_ratio, stride=sr_ratio)
|
self.sr = nn.Conv2d(
|
||||||
|
hidden_size, hidden_size, kernel_size=sequence_reduction_ratio, stride=sequence_reduction_ratio
|
||||||
|
)
|
||||||
self.layer_norm = nn.LayerNorm(hidden_size)
|
self.layer_norm = nn.LayerNorm(hidden_size)
|
||||||
|
|
||||||
def transpose_for_scores(self, x):
|
def transpose_for_scores(self, hidden_states):
|
||||||
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
|
new_shape = hidden_states.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
|
||||||
x = x.view(*new_x_shape)
|
hidden_states = hidden_states.view(*new_shape)
|
||||||
return x.permute(0, 2, 1, 3)
|
return hidden_states.permute(0, 2, 1, 3)
|
||||||
|
|
||||||
def forward(
|
def forward(
|
||||||
self,
|
self,
|
||||||
@@ -167,8 +160,11 @@ class SegformerEfficientSelfAttention(nn.Module):
|
|||||||
|
|
||||||
if self.sr_ratio > 1:
|
if self.sr_ratio > 1:
|
||||||
batch_size, seq_len, num_channels = hidden_states.shape
|
batch_size, seq_len, num_channels = hidden_states.shape
|
||||||
|
# Reshape to (batch_size, num_channels, height, width)
|
||||||
hidden_states = hidden_states.permute(0, 2, 1).reshape(batch_size, num_channels, height, width)
|
hidden_states = hidden_states.permute(0, 2, 1).reshape(batch_size, num_channels, height, width)
|
||||||
|
# Apply sequence reduction
|
||||||
hidden_states = self.sr(hidden_states)
|
hidden_states = self.sr(hidden_states)
|
||||||
|
# Reshape back to (batch_size, seq_len, num_channels)
|
||||||
hidden_states = hidden_states.reshape(batch_size, num_channels, -1).permute(0, 2, 1)
|
hidden_states = hidden_states.reshape(batch_size, num_channels, -1).permute(0, 2, 1)
|
||||||
hidden_states = self.layer_norm(hidden_states)
|
hidden_states = self.layer_norm(hidden_states)
|
||||||
|
|
||||||
@@ -211,10 +207,13 @@ class SegformerSelfOutput(nn.Module):
|
|||||||
|
|
||||||
|
|
||||||
class SegformerAttention(nn.Module):
|
class SegformerAttention(nn.Module):
|
||||||
def __init__(self, config, hidden_size, num_attention_heads, sr_ratio):
|
def __init__(self, config, hidden_size, num_attention_heads, sequence_reduction_ratio):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.self = SegformerEfficientSelfAttention(
|
self.self = SegformerEfficientSelfAttention(
|
||||||
config=config, hidden_size=hidden_size, num_attention_heads=num_attention_heads, sr_ratio=sr_ratio
|
config=config,
|
||||||
|
hidden_size=hidden_size,
|
||||||
|
num_attention_heads=num_attention_heads,
|
||||||
|
sequence_reduction_ratio=sequence_reduction_ratio,
|
||||||
)
|
)
|
||||||
self.output = SegformerSelfOutput(config, hidden_size=hidden_size)
|
self.output = SegformerSelfOutput(config, hidden_size=hidden_size)
|
||||||
self.pruned_heads = set()
|
self.pruned_heads = set()
|
||||||
@@ -285,13 +284,16 @@ class SegformerMixFFN(nn.Module):
|
|||||||
class SegformerLayer(nn.Module):
|
class SegformerLayer(nn.Module):
|
||||||
"""This corresponds to the Block class in the original implementation."""
|
"""This corresponds to the Block class in the original implementation."""
|
||||||
|
|
||||||
def __init__(self, config, hidden_size, num_attention_heads, drop_path, sr_ratio, mlp_ratio):
|
def __init__(self, config, hidden_size, num_attention_heads, drop_path, sequence_reduction_ratio, mlp_ratio):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.layer_norm_1 = nn.LayerNorm(hidden_size)
|
self.layer_norm_1 = nn.LayerNorm(hidden_size)
|
||||||
self.attention = SegformerAttention(
|
self.attention = SegformerAttention(
|
||||||
config, hidden_size=hidden_size, num_attention_heads=num_attention_heads, sr_ratio=sr_ratio
|
config,
|
||||||
|
hidden_size=hidden_size,
|
||||||
|
num_attention_heads=num_attention_heads,
|
||||||
|
sequence_reduction_ratio=sequence_reduction_ratio,
|
||||||
)
|
)
|
||||||
self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
|
self.drop_path = SegformerDropPath(drop_path) if drop_path > 0.0 else nn.Identity()
|
||||||
self.layer_norm_2 = nn.LayerNorm(hidden_size)
|
self.layer_norm_2 = nn.LayerNorm(hidden_size)
|
||||||
mlp_hidden_size = int(hidden_size * mlp_ratio)
|
mlp_hidden_size = int(hidden_size * mlp_ratio)
|
||||||
self.mlp = SegformerMixFFN(config, in_features=hidden_size, hidden_features=mlp_hidden_size)
|
self.mlp = SegformerMixFFN(config, in_features=hidden_size, hidden_features=mlp_hidden_size)
|
||||||
@@ -328,14 +330,13 @@ class SegformerEncoder(nn.Module):
|
|||||||
self.config = config
|
self.config = config
|
||||||
|
|
||||||
# stochastic depth decay rule
|
# stochastic depth decay rule
|
||||||
dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths))]
|
drop_path_decays = [x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths))]
|
||||||
|
|
||||||
# patch embeddings
|
# patch embeddings
|
||||||
embeddings = []
|
embeddings = []
|
||||||
for i in range(config.num_encoder_blocks):
|
for i in range(config.num_encoder_blocks):
|
||||||
embeddings.append(
|
embeddings.append(
|
||||||
SegformerOverlapPatchEmbeddings(
|
SegformerOverlapPatchEmbeddings(
|
||||||
image_size=config.image_size // config.downsampling_rates[i],
|
|
||||||
patch_size=config.patch_sizes[i],
|
patch_size=config.patch_sizes[i],
|
||||||
stride=config.strides[i],
|
stride=config.strides[i],
|
||||||
num_channels=config.num_channels if i == 0 else config.hidden_sizes[i - 1],
|
num_channels=config.num_channels if i == 0 else config.hidden_sizes[i - 1],
|
||||||
@@ -358,8 +359,8 @@ class SegformerEncoder(nn.Module):
|
|||||||
config,
|
config,
|
||||||
hidden_size=config.hidden_sizes[i],
|
hidden_size=config.hidden_sizes[i],
|
||||||
num_attention_heads=config.num_attention_heads[i],
|
num_attention_heads=config.num_attention_heads[i],
|
||||||
drop_path=dpr[cur + j],
|
drop_path=drop_path_decays[cur + j],
|
||||||
sr_ratio=config.sr_ratios[i],
|
sequence_reduction_ratio=config.sr_ratios[i],
|
||||||
mlp_ratio=config.mlp_ratios[i],
|
mlp_ratio=config.mlp_ratios[i],
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|||||||
Reference in New Issue
Block a user