From ac6aa10f23967373142d7a23d84a45ffd494d64b Mon Sep 17 00:00:00 2001
From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Date: Fri, 4 Feb 2022 14:52:07 -0500
Subject: [PATCH] Standardize semantic segmentation models outputs (#15469)

* Standardize instance segmentation models outputs

* Rename output

* Update src/transformers/modeling_outputs.py

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>

* Add legacy argument to the config and model forward

* Update src/transformers/models/beit/modeling_beit.py

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

* Copy fix in Segformer

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>
Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
---
 docs/source/model_doc/auto.mdx                |  4 ++
 src/transformers/__init__.py                  |  2 +
 src/transformers/modeling_outputs.py          | 29 +++++++++
 src/transformers/models/auto/__init__.py      |  4 ++
 src/transformers/models/auto/modeling_auto.py | 21 +++++++
 .../models/beit/configuration_beit.py         |  6 ++
 src/transformers/models/beit/modeling_beit.py | 63 ++++++++++++++-----
 .../segformer/configuration_segformer.py      |  6 ++
 .../models/segformer/modeling_segformer.py    | 52 ++++++++++-----
 src/transformers/utils/dummy_pt_objects.py    |  7 +++
 tests/test_modeling_beit.py                   | 43 +++++++++----
 tests/test_modeling_segformer.py              | 29 +++++----
 utils/check_repo.py                           |  2 -
 13 files changed, 210 insertions(+), 58 deletions(-)

diff --git a/docs/source/model_doc/auto.mdx b/docs/source/model_doc/auto.mdx
index ab7d02d549..3588445b09 100644
--- a/docs/source/model_doc/auto.mdx
+++ b/docs/source/model_doc/auto.mdx
@@ -150,6 +150,10 @@ Likewise, if your `NewModel` is a subclass of [`PreTrainedModel`], make sure its
 
 [[autodoc]] AutoModelForImageSegmentation
 
+## AutoModelForSemanticSegmentation
+
+[[autodoc]] AutoModelForSemanticSegmentation
+
 ## TFAutoModel
 
 [[autodoc]] TFAutoModel
diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index 0a133f71eb..817ecfa375 100755
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -688,6 +688,7 @@ if is_torch_available():
             "AutoModelForObjectDetection",
             "AutoModelForPreTraining",
             "AutoModelForQuestionAnswering",
+            "AutoModelForSemanticSegmentation",
             "AutoModelForSeq2SeqLM",
             "AutoModelForSequenceClassification",
             "AutoModelForSpeechSeq2Seq",
@@ -2797,6 +2798,7 @@ if TYPE_CHECKING:
             AutoModelForObjectDetection,
             AutoModelForPreTraining,
             AutoModelForQuestionAnswering,
+            AutoModelForSemanticSegmentation,
             AutoModelForSeq2SeqLM,
             AutoModelForSequenceClassification,
             AutoModelForSpeechSeq2Seq,
diff --git a/src/transformers/modeling_outputs.py b/src/transformers/modeling_outputs.py
index c7f4a27fb3..b0f3e01861 100644
--- a/src/transformers/modeling_outputs.py
+++ b/src/transformers/modeling_outputs.py
@@ -812,3 +812,32 @@ class Seq2SeqQuestionAnsweringModelOutput(ModelOutput):
     encoder_last_hidden_state: Optional[torch.FloatTensor] = None
     encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
     encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
+
+
+@dataclass
+class SemanticSegmentationModelOutput(ModelOutput):
+    """
+    Base class for outputs of semantic segmentation models.
+
+    Args:
+        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+            Classification (or regression if config.num_labels==1) loss.
+        logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels, height, width)`):
+            Classification scores for each pixel.
+        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
+            shape `(batch_size, patch_size, hidden_size)`.
+
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, patch_size,
+            sequence_length)`.
+
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+            heads.
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    logits: torch.FloatTensor = None
+    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+    attentions: Optional[Tuple[torch.FloatTensor]] = None
diff --git a/src/transformers/models/auto/__init__.py b/src/transformers/models/auto/__init__.py
index 89d1c73b77..7798a51a8d 100644
--- a/src/transformers/models/auto/__init__.py
+++ b/src/transformers/models/auto/__init__.py
@@ -43,6 +43,7 @@ if is_torch_available():
         "MODEL_FOR_OBJECT_DETECTION_MAPPING",
         "MODEL_FOR_PRETRAINING_MAPPING",
         "MODEL_FOR_QUESTION_ANSWERING_MAPPING",
+        "MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING",
         "MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING",
         "MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING",
         "MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING",
@@ -65,6 +66,7 @@ if is_torch_available():
         "AutoModelForObjectDetection",
         "AutoModelForPreTraining",
         "AutoModelForQuestionAnswering",
+        "AutoModelForSemanticSegmentation",
         "AutoModelForSeq2SeqLM",
         "AutoModelForSequenceClassification",
         "AutoModelForSpeechSeq2Seq",
@@ -155,6 +157,7 @@ if TYPE_CHECKING:
             MODEL_FOR_OBJECT_DETECTION_MAPPING,
             MODEL_FOR_PRETRAINING_MAPPING,
             MODEL_FOR_QUESTION_ANSWERING_MAPPING,
+            MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING,
             MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING,
             MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
             MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING,
@@ -177,6 +180,7 @@ if TYPE_CHECKING:
             AutoModelForObjectDetection,
             AutoModelForPreTraining,
             AutoModelForQuestionAnswering,
+            AutoModelForSemanticSegmentation,
             AutoModelForSeq2SeqLM,
             AutoModelForSequenceClassification,
             AutoModelForSpeechSeq2Seq,
diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py
index 94909cfde5..7c412fffff 100644
--- a/src/transformers/models/auto/modeling_auto.py
+++ b/src/transformers/models/auto/modeling_auto.py
@@ -278,11 +278,20 @@ MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
 
 MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES = OrderedDict(
     [
+        # Do not add new models here, this class will be deprecated in the future.
         # Model for Image Segmentation mapping
         ("detr", "DetrForSegmentation"),
     ]
 )
 
+MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES = OrderedDict(
+    [
+        # Model for Semantic Segmentation mapping
+        ("beit", "BeitForSemanticSegmentation"),
+        ("segformer", "SegformerForSemanticSegmentation"),
+    ]
+)
+
 MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES = OrderedDict(
     [
         ("vision-encoder-decoder", "VisionEncoderDecoderModel"),
@@ -603,6 +612,9 @@ MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING = _LazyAutoMapping(
 MODEL_FOR_IMAGE_SEGMENTATION_MAPPING = _LazyAutoMapping(
     CONFIG_MAPPING_NAMES, MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES
 )
+MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING = _LazyAutoMapping(
+    CONFIG_MAPPING_NAMES, MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES
+)
 MODEL_FOR_VISION_2_SEQ_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES)
 MODEL_FOR_MASKED_LM_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FOR_MASKED_LM_MAPPING_NAMES)
 MODEL_FOR_OBJECT_DETECTION_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FOR_OBJECT_DETECTION_MAPPING_NAMES)
@@ -745,6 +757,15 @@ class AutoModelForImageSegmentation(_BaseAutoModelClass):
 AutoModelForImageSegmentation = auto_class_update(AutoModelForImageSegmentation, head_doc="image segmentation")
 
 
+class AutoModelForSemanticSegmentation(_BaseAutoModelClass):
+    _model_mapping = MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING
+
+
+AutoModelForSemanticSegmentation = auto_class_update(
+    AutoModelForSemanticSegmentation, head_doc="semantic segmentation"
+)
+
+
 class AutoModelForObjectDetection(_BaseAutoModelClass):
     _model_mapping = MODEL_FOR_OBJECT_DETECTION_MAPPING
 
diff --git a/src/transformers/models/beit/configuration_beit.py b/src/transformers/models/beit/configuration_beit.py
index b668a2ed8c..ebc46ee313 100644
--- a/src/transformers/models/beit/configuration_beit.py
+++ b/src/transformers/models/beit/configuration_beit.py
@@ -93,6 +93,10 @@ class BeitConfig(PretrainedConfig):
             Whether to concatenate the output of the auxiliary head with the input before the classification layer.
         semantic_loss_ignore_index (`int`, *optional*, defaults to 255):
             The index that is ignored by the loss function of the semantic segmentation model.
+        legacy_output (`bool`, *optional*, defaults to `False`):
+            Whether to return the legacy outputs or not (with logits of shape `height / 4 , width / 4`)
+
+            This argument is only present for backward compatibility reasons and will be removed in v5 of Transformers.
 
     Example:
 
@@ -141,6 +145,7 @@ class BeitConfig(PretrainedConfig):
         auxiliary_num_convs=1,
         auxiliary_concat_input=False,
         semantic_loss_ignore_index=255,
+        legacy_output=False,
         **kwargs
     ):
         super().__init__(**kwargs)
@@ -176,3 +181,4 @@ class BeitConfig(PretrainedConfig):
         self.auxiliary_num_convs = auxiliary_num_convs
         self.auxiliary_concat_input = auxiliary_concat_input
         self.semantic_loss_ignore_index = semantic_loss_ignore_index
+        self.legacy_output = legacy_output
diff --git a/src/transformers/models/beit/modeling_beit.py b/src/transformers/models/beit/modeling_beit.py
index 295d91f7e2..d42e4abc8e 100755
--- a/src/transformers/models/beit/modeling_beit.py
+++ b/src/transformers/models/beit/modeling_beit.py
@@ -17,6 +17,7 @@
 
 import collections.abc
 import math
+import warnings
 from dataclasses import dataclass
 
 import torch
@@ -31,7 +32,13 @@ from ...file_utils import (
     add_start_docstrings_to_model_forward,
     replace_return_docstrings,
 )
-from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, MaskedLMOutput, SequenceClassifierOutput
+from ...modeling_outputs import (
+    BaseModelOutput,
+    BaseModelOutputWithPooling,
+    MaskedLMOutput,
+    SemanticSegmentationModelOutput,
+    SequenceClassifierOutput,
+)
 from ...modeling_utils import PreTrainedModel, find_pruneable_heads_and_indices, prune_linear_layer
 from ...utils import logging
 from .configuration_beit import BeitConfig
@@ -1114,11 +1121,8 @@ class BeitForSemanticSegmentation(BeitPreTrainedModel):
         # Initialize weights and apply final processing
         self.post_init()
 
-    def compute_loss(self, logits, auxiliary_logits, labels):
+    def compute_loss(self, upsampled_logits, auxiliary_logits, labels):
         # upsample logits to the images' original size
-        upsampled_logits = nn.functional.interpolate(
-            logits, size=labels.shape[-2:], mode="bilinear", align_corners=False
-        )
         if auxiliary_logits is not None:
             upsampled_auxiliary_logits = nn.functional.interpolate(
                 auxiliary_logits, size=labels.shape[-2:], mode="bilinear", align_corners=False
@@ -1132,7 +1136,7 @@ class BeitForSemanticSegmentation(BeitPreTrainedModel):
         return loss
 
     @add_start_docstrings_to_model_forward(BEIT_INPUTS_DOCSTRING)
-    @replace_return_docstrings(output_type=SequenceClassifierOutput, config_class=_CONFIG_FOR_DOC)
+    @replace_return_docstrings(output_type=SemanticSegmentationModelOutput, config_class=_CONFIG_FOR_DOC)
     def forward(
         self,
         pixel_values=None,
@@ -1141,11 +1145,17 @@ class BeitForSemanticSegmentation(BeitPreTrainedModel):
         output_attentions=None,
         output_hidden_states=None,
         return_dict=None,
+        legacy_output=None,
     ):
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
             Ground truth semantic segmentation maps for computing the loss. Indices should be in `[0, ...,
             config.num_labels - 1]`. If `config.num_labels > 1`, a classification loss is computed (Cross-Entropy).
+        legacy_output (`bool`, *optional*):
+            Whether to return the legacy outputs or not (with logits of shape `height / 4 , width / 4`). Will default
+            to `self.config.legacy_output`.
+
+            This argument is only present for backward compatibility reasons and will be removed in v5 of Transformers.
 
         Returns:
 
@@ -1164,13 +1174,21 @@ class BeitForSemanticSegmentation(BeitPreTrainedModel):
 
         >>> inputs = feature_extractor(images=image, return_tensors="pt")
         >>> outputs = model(**inputs)
-        >>> # logits are of shape (batch_size, num_labels, height/4, width/4)
+        >>> # logits are of shape (batch_size, num_labels, height, width)
         >>> logits = outputs.logits
         ```"""
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
         )
+        legacy_output = legacy_output if legacy_output is not None else self.config.legacy_output
+        if not legacy_output:
+            warnings.warn(
+                "The output of this model has changed in v4.17.0 and the logits now have the same size as the inputs. "
+                "You can activate the previous behavior by passing `legacy_output=True` to this call or the "
+                "configuration of this model (only until v5, then that argument will be removed).",
+                FutureWarning,
+            )
 
         outputs = self.beit(
             pixel_values,
@@ -1197,6 +1215,11 @@ class BeitForSemanticSegmentation(BeitPreTrainedModel):
             features[i] = ops[i](features[i])
 
         logits = self.decode_head(features)
+
+        upsampled_logits = nn.functional.interpolate(
+            logits, size=pixel_values.shape[-2:], mode="bilinear", align_corners=False
+        )
+
         auxiliary_logits = None
         if self.auxiliary_head is not None:
             auxiliary_logits = self.auxiliary_head(features)
@@ -1206,18 +1229,26 @@ class BeitForSemanticSegmentation(BeitPreTrainedModel):
             if self.config.num_labels == 1:
                 raise ValueError("The number of labels should be greater than one")
             else:
-                loss = self.compute_loss(logits, auxiliary_logits, labels)
+                loss = self.compute_loss(upsampled_logits, auxiliary_logits, labels)
 
         if not return_dict:
             if output_hidden_states:
-                output = (logits,) + outputs[2:]
+                output = (logits if legacy_output else upsampled_logits,) + outputs[2:]
             else:
-                output = (logits,) + outputs[3:]
+                output = (logits if legacy_output else upsampled_logits,) + outputs[3:]
             return ((loss,) + output) if loss is not None else output
 
-        return SequenceClassifierOutput(
-            loss=loss,
-            logits=logits,
-            hidden_states=outputs.hidden_states if output_hidden_states else None,
-            attentions=outputs.attentions,
-        )
+        if legacy_output:
+            return SequenceClassifierOutput(
+                loss=loss,
+                logits=logits,
+                hidden_states=outputs.hidden_states if output_hidden_states else None,
+                attentions=outputs.attentions,
+            )
+        else:
+            return SemanticSegmentationModelOutput(
+                loss=loss,
+                logits=upsampled_logits,
+                hidden_states=outputs.hidden_states if output_hidden_states else None,
+                attentions=outputs.attentions,
+            )
diff --git a/src/transformers/models/segformer/configuration_segformer.py b/src/transformers/models/segformer/configuration_segformer.py
index d1790634e6..0e0b3a9914 100644
--- a/src/transformers/models/segformer/configuration_segformer.py
+++ b/src/transformers/models/segformer/configuration_segformer.py
@@ -83,6 +83,10 @@ class SegformerConfig(PretrainedConfig):
             required for the semantic segmentation model.
         semantic_loss_ignore_index (`int`, *optional*, defaults to 255):
             The index that is ignored by the loss function of the semantic segmentation model.
+        legacy_output (`bool`, *optional*, defaults to `False`):
+            Whether to return the legacy outputs or not (with logits of shape `height / 4 , width / 4`)
+
+            This argument is only present for backward compatibility reasons and will be removed in v5 of Transformers.
 
     Example:
 
@@ -124,6 +128,7 @@ class SegformerConfig(PretrainedConfig):
         is_encoder_decoder=False,
         reshape_last_stage=True,
         semantic_loss_ignore_index=255,
+        legacy_output=False,
         **kwargs
     ):
         super().__init__(**kwargs)
@@ -149,3 +154,4 @@ class SegformerConfig(PretrainedConfig):
         self.decoder_hidden_size = decoder_hidden_size
         self.reshape_last_stage = reshape_last_stage
         self.semantic_loss_ignore_index = semantic_loss_ignore_index
+        self.legacy_output = legacy_output
diff --git a/src/transformers/models/segformer/modeling_segformer.py b/src/transformers/models/segformer/modeling_segformer.py
index 6f0a4a1d78..196d741ce7 100755
--- a/src/transformers/models/segformer/modeling_segformer.py
+++ b/src/transformers/models/segformer/modeling_segformer.py
@@ -17,6 +17,7 @@
 
 import collections
 import math
+import warnings
 
 import torch
 import torch.utils.checkpoint
@@ -30,7 +31,7 @@ from ...file_utils import (
     add_start_docstrings_to_model_forward,
     replace_return_docstrings,
 )
-from ...modeling_outputs import BaseModelOutput, SequenceClassifierOutput
+from ...modeling_outputs import BaseModelOutput, SemanticSegmentationModelOutput, SequenceClassifierOutput
 from ...modeling_utils import PreTrainedModel, find_pruneable_heads_and_indices, prune_linear_layer
 from ...utils import logging
 from .configuration_segformer import SegformerConfig
@@ -688,7 +689,7 @@ class SegformerForSemanticSegmentation(SegformerPreTrainedModel):
         self.post_init()
 
     @add_start_docstrings_to_model_forward(SEGFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
-    @replace_return_docstrings(output_type=SequenceClassifierOutput, config_class=_CONFIG_FOR_DOC)
+    @replace_return_docstrings(output_type=SemanticSegmentationModelOutput, config_class=_CONFIG_FOR_DOC)
     def forward(
         self,
         pixel_values,
@@ -696,11 +697,17 @@ class SegformerForSemanticSegmentation(SegformerPreTrainedModel):
         output_attentions=None,
         output_hidden_states=None,
         return_dict=None,
+        legacy_output=None,
     ):
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
             Ground truth semantic segmentation maps for computing the loss. Indices should be in `[0, ...,
             config.num_labels - 1]`. If `config.num_labels > 1`, a classification loss is computed (Cross-Entropy).
+        legacy_output (`bool`, *optional*):
+            Whether to return the legacy outputs or not (with logits of shape `height / 4 , width / 4`). Will default
+            to `self.config.legacy_output`.
+
+            This argument is only present for backward compatibility reasons and will be removed in v5 of Transformers.
 
         Returns:
 
@@ -719,12 +726,20 @@ class SegformerForSemanticSegmentation(SegformerPreTrainedModel):
 
         >>> inputs = feature_extractor(images=image, return_tensors="pt")
         >>> outputs = model(**inputs)
-        >>> logits = outputs.logits  # shape (batch_size, num_labels, height/4, width/4)
+        >>> logits = outputs.logits  # shape (batch_size, num_labels, height, width)
         ```"""
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
         )
+        legacy_output = legacy_output if legacy_output is not None else self.config.legacy_output
+        if not legacy_output:
+            warnings.warn(
+                "The output of this model has changed in v4.17.0 and the logits now have the same size as the inputs. "
+                "You can activate the previous behavior by passing `legacy_output=True` to this call or the "
+                "configuration of this model (only until v5, then that argument will be removed).",
+                FutureWarning,
+            )
 
         outputs = self.segformer(
             pixel_values,
@@ -737,28 +752,37 @@ class SegformerForSemanticSegmentation(SegformerPreTrainedModel):
 
         logits = self.decode_head(encoder_hidden_states)
 
+        upsampled_logits = nn.functional.interpolate(
+            logits, size=pixel_values.shape[-2:], mode="bilinear", align_corners=False
+        )
+
         loss = None
         if labels is not None:
             if self.config.num_labels == 1:
                 raise ValueError("The number of labels should be greater than one")
             else:
                 # upsample logits to the images' original size
-                upsampled_logits = nn.functional.interpolate(
-                    logits, size=labels.shape[-2:], mode="bilinear", align_corners=False
-                )
                 loss_fct = CrossEntropyLoss(ignore_index=self.config.semantic_loss_ignore_index)
                 loss = loss_fct(upsampled_logits, labels)
 
         if not return_dict:
             if output_hidden_states:
-                output = (logits,) + outputs[1:]
+                output = (logits if legacy_output else upsampled_logits,) + outputs[1:]
             else:
-                output = (logits,) + outputs[2:]
+                output = (logits if legacy_output else upsampled_logits,) + outputs[2:]
             return ((loss,) + output) if loss is not None else output
 
-        return SequenceClassifierOutput(
-            loss=loss,
-            logits=logits,
-            hidden_states=outputs.hidden_states if output_hidden_states else None,
-            attentions=outputs.attentions,
-        )
+        if legacy_output:
+            return SequenceClassifierOutput(
+                loss=loss,
+                logits=logits,
+                hidden_states=outputs.hidden_states if output_hidden_states else None,
+                attentions=outputs.attentions,
+            )
+        else:
+            return SemanticSegmentationModelOutput(
+                loss=loss,
+                logits=upsampled_logits,
+                hidden_states=outputs.hidden_states if output_hidden_states else None,
+                attentions=outputs.attentions,
+            )
diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py
index 21e94075a8..71e5bd291f 100644
--- a/src/transformers/utils/dummy_pt_objects.py
+++ b/src/transformers/utils/dummy_pt_objects.py
@@ -474,6 +474,13 @@ class AutoModelForQuestionAnswering(metaclass=DummyObject):
         requires_backends(self, ["torch"])
 
 
+class AutoModelForSemanticSegmentation(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
 class AutoModelForSeq2SeqLM(metaclass=DummyObject):
     _backends = ["torch"]
 
diff --git a/tests/test_modeling_beit.py b/tests/test_modeling_beit.py
index 777247161a..2304678e79 100644
--- a/tests/test_modeling_beit.py
+++ b/tests/test_modeling_beit.py
@@ -92,17 +92,20 @@ class BeitModelTester:
         self.initializer_range = initializer_range
         self.scope = scope
         self.out_indices = out_indices
+        self.num_labels = num_labels
 
     def prepare_config_and_inputs(self):
         pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
 
         labels = None
+        pixel_labels = None
         if self.use_labels:
             labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
+            pixel_labels = ids_tensor([self.batch_size, self.image_size, self.image_size], self.num_labels)
 
         config = self.get_config()
 
-        return config, pixel_values, labels
+        return config, pixel_values, labels, pixel_labels
 
     def get_config(self):
         return BeitConfig(
@@ -122,7 +125,7 @@ class BeitModelTester:
             out_indices=self.out_indices,
         )
 
-    def create_and_check_model(self, config, pixel_values, labels):
+    def create_and_check_model(self, config, pixel_values, labels, pixel_labels):
         model = BeitModel(config=config)
         model.to(torch_device)
         model.eval()
@@ -133,7 +136,7 @@ class BeitModelTester:
         num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
         self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, num_patches + 1, self.hidden_size))
 
-    def create_and_check_for_masked_lm(self, config, pixel_values, labels):
+    def create_and_check_for_masked_lm(self, config, pixel_values, labels, pixel_labels):
         model = BeitForMaskedImageModeling(config=config)
         model.to(torch_device)
         model.eval()
@@ -144,7 +147,7 @@ class BeitModelTester:
         num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
         self.parent.assertEqual(result.logits.shape, (self.batch_size, num_patches, self.vocab_size))
 
-    def create_and_check_for_image_classification(self, config, pixel_values, labels):
+    def create_and_check_for_image_classification(self, config, pixel_values, labels, pixel_labels):
         config.num_labels = self.type_sequence_label_size
         model = BeitForImageClassification(config)
         model.to(torch_device)
@@ -152,13 +155,23 @@ class BeitModelTester:
         result = model(pixel_values, labels=labels)
         self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size))
 
+    def create_and_check_for_image_segmentation(self, config, pixel_values, labels, pixel_labels):
+        config.num_labels = self.num_labels
+        model = BeitForSemanticSegmentation(config)
+        model.to(torch_device)
+        model.eval()
+        result = model(pixel_values)
+        self.parent.assertEqual(
+            result.logits.shape, (self.batch_size, self.num_labels, self.image_size, self.image_size)
+        )
+        result = model(pixel_values, labels=pixel_labels)
+        self.parent.assertEqual(
+            result.logits.shape, (self.batch_size, self.num_labels, self.image_size, self.image_size)
+        )
+
     def prepare_config_and_inputs_for_common(self):
         config_and_inputs = self.prepare_config_and_inputs()
-        (
-            config,
-            pixel_values,
-            labels,
-        ) = config_and_inputs
+        config, pixel_values, labels, pixel_labels = config_and_inputs
         inputs_dict = {"pixel_values": pixel_values}
         return config, inputs_dict
 
@@ -217,6 +230,10 @@ class BeitModelTest(ModelTesterMixin, unittest.TestCase):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_model(*config_and_inputs)
 
+    def test_for_image_segmentation(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_for_image_segmentation(*config_and_inputs)
+
     def test_training(self):
         if not self.model_tester.is_training:
             return
@@ -516,14 +533,14 @@ class BeitModelIntegrationTest(unittest.TestCase):
         logits = outputs.logits
 
         # verify the logits
-        expected_shape = torch.Size((1, 150, 160, 160))
+        expected_shape = torch.Size((1, 150, 640, 640))
         self.assertEqual(logits.shape, expected_shape)
 
         expected_slice = torch.tensor(
             [
-                [[-4.9225, -2.3954, -3.0522], [-2.8822, -1.0046, -1.7561], [-2.9549, -1.3228, -2.1347]],
-                [[-5.8168, -3.4129, -4.0778], [-3.8651, -2.2214, -3.0277], [-3.8356, -2.4643, -3.3535]],
-                [[-0.0078, 3.9952, 4.0754], [2.9856, 4.6944, 5.0035], [3.2413, 4.7813, 4.9969]],
+                [[-4.9225, -4.9225, -4.6066], [-4.9225, -4.9225, -4.6066], [-4.6675, -4.6675, -4.3617]],
+                [[-5.8168, -5.8168, -5.5163], [-5.8168, -5.8168, -5.5163], [-5.5728, -5.5728, -5.2842]],
+                [[-0.0078, -0.0078, 0.4926], [-0.0078, -0.0078, 0.4926], [0.3664, 0.3664, 0.8309]],
             ]
         ).to(torch_device)
 
diff --git a/tests/test_modeling_segformer.py b/tests/test_modeling_segformer.py
index 2e84b1f41d..3bb2930655 100644
--- a/tests/test_modeling_segformer.py
+++ b/tests/test_modeling_segformer.py
@@ -133,11 +133,11 @@ class SegformerModelTester:
         model.eval()
         result = model(pixel_values)
         self.parent.assertEqual(
-            result.logits.shape, (self.batch_size, self.num_labels, self.image_size // 4, self.image_size // 4)
+            result.logits.shape, (self.batch_size, self.num_labels, self.image_size, self.image_size)
         )
         result = model(pixel_values, labels=labels)
         self.parent.assertEqual(
-            result.logits.shape, (self.batch_size, self.num_labels, self.image_size // 4, self.image_size // 4)
+            result.logits.shape, (self.batch_size, self.num_labels, self.image_size, self.image_size)
         )
 
     def prepare_config_and_inputs_for_common(self):
@@ -245,6 +245,7 @@ class SegformerModelTest(ModelTesterMixin, unittest.TestCase):
                 list(attentions[-1].shape[-3:]),
                 [self.model_tester.num_attention_heads[-1], expected_seq_len, expected_reduced_seq_len],
             )
+            out_len = len(outputs)
 
             # Check attention is always last and order is fine
             inputs_dict["output_attentions"] = True
@@ -255,7 +256,7 @@ class SegformerModelTest(ModelTesterMixin, unittest.TestCase):
             with torch.no_grad():
                 outputs = model(**self._prepare_for_class(inputs_dict, model_class))
 
-            self.assertEqual(3, len(outputs))
+            self.assertEqual(out_len + 1, len(outputs))
 
             self_attentions = outputs.attentions
 
@@ -357,16 +358,17 @@ class SegformerModelIntegrationTest(unittest.TestCase):
         encoded_inputs = feature_extractor(images=image, return_tensors="pt")
         pixel_values = encoded_inputs.pixel_values.to(torch_device)
 
-        outputs = model(pixel_values)
+        with torch.no_grad():
+            outputs = model(pixel_values)
 
-        expected_shape = torch.Size((1, model.config.num_labels, 128, 128))
+        expected_shape = torch.Size((1, model.config.num_labels, 512, 512))
         self.assertEqual(outputs.logits.shape, expected_shape)
 
         expected_slice = torch.tensor(
             [
-                [[-4.6310, -5.5232, -6.2356], [-5.1921, -6.1444, -6.5996], [-5.4424, -6.2790, -6.7574]],
-                [[-12.1391, -13.3122, -13.9554], [-12.8732, -13.9352, -14.3563], [-12.9438, -13.8226, -14.2513]],
-                [[-12.5134, -13.4686, -14.4915], [-12.8669, -14.4343, -14.7758], [-13.2523, -14.5819, -15.0694]],
+                [[-4.6309, -4.6309, -4.7425], [-4.6309, -4.6309, -4.7425], [-4.7011, -4.7011, -4.8136]],
+                [[-12.1391, -12.1391, -12.2858], [-12.1391, -12.1391, -12.2858], [-12.2309, -12.2309, -12.3758]],
+                [[-12.5134, -12.5134, -12.6328], [-12.5134, -12.5134, -12.6328], [-12.5576, -12.5576, -12.6865]],
             ]
         ).to(torch_device)
         self.assertTrue(torch.allclose(outputs.logits[0, :3, :3, :3], expected_slice, atol=1e-4))
@@ -385,16 +387,17 @@ class SegformerModelIntegrationTest(unittest.TestCase):
         encoded_inputs = feature_extractor(images=image, return_tensors="pt")
         pixel_values = encoded_inputs.pixel_values.to(torch_device)
 
-        outputs = model(pixel_values)
+        with torch.no_grad():
+            outputs = model(pixel_values)
 
-        expected_shape = torch.Size((1, model.config.num_labels, 128, 128))
+        expected_shape = torch.Size((1, model.config.num_labels, 512, 512))
         self.assertEqual(outputs.logits.shape, expected_shape)
 
         expected_slice = torch.tensor(
             [
-                [[-13.5748, -13.9111, -12.6500], [-14.3500, -15.3683, -14.2328], [-14.7532, -16.0424, -15.6087]],
-                [[-17.1651, -15.8725, -12.9653], [-17.2580, -17.3718, -14.8223], [-16.6058, -16.8783, -16.7452]],
-                [[-3.6456, -3.0209, -1.4203], [-3.0797, -3.1959, -2.0000], [-1.8757, -1.9217, -1.6997]],
+                [[-13.5729, -13.5729, -13.6149], [-13.5729, -13.5729, -13.6149], [-13.6697, -13.6697, -13.7224]],
+                [[-17.1638, -17.1638, -17.0022], [-17.1638, -17.1638, -17.0022], [-17.1754, -17.1754, -17.0358]],
+                [[-3.6452, -3.6452, -3.5670], [-3.6452, -3.6452, -3.5670], [-3.5744, -3.5744, -3.5079]],
             ]
         ).to(torch_device)
         self.assertTrue(torch.allclose(outputs.logits[0, :3, :3, :3], expected_slice, atol=1e-1))
diff --git a/utils/check_repo.py b/utils/check_repo.py
index a583196581..9ee2266ca7 100644
--- a/utils/check_repo.py
+++ b/utils/check_repo.py
@@ -118,8 +118,6 @@ IGNORE_NON_AUTO_CONFIGURED = PRIVATE_MODELS.copy() + [
     "PerceiverForMultimodalAutoencoding",
     "PerceiverForOpticalFlow",
     "SegformerDecodeHead",
-    "SegformerForSemanticSegmentation",
-    "BeitForSemanticSegmentation",
     "FlaxBeitForMaskedImageModeling",
     "BeitForMaskedImageModeling",
     "CLIPTextModel",