Add segmentation + object detection image processors (#20160)
* Add transforms for object detection * DETR models + Yolos * Scrappy additions * Maskformer image processor * Fix up; MaskFormer tests * Update owlvit processor * Add to docs * OwlViT tests * Update pad logic * Remove changes to transforms * Import fn directly * Update to include pad transformation * Remove unintended changes * Add new owlvit post processing function * Tidy up * Fix copies * Fix some copies * Include device fix * Fix scipy imports * Update _pad_image * Update padding functionality * Fix bug * Properly handle ignore index * Fix up * Remove defaults to None in docstrings * Fix docstrings & docs * Fix sizes bug * Resolve conflicts in init * Cast to float after resizing * Tidy & add size if missing * Allow kwargs when processing for owlvit * Update test values
This commit is contained in:
@@ -32,6 +32,16 @@ This model was contributed by [DepuMeng](https://huggingface.co/DepuMeng). The o
|
|||||||
|
|
||||||
[[autodoc]] ConditionalDetrConfig
|
[[autodoc]] ConditionalDetrConfig
|
||||||
|
|
||||||
|
## ConditionalDetrImageProcessor
|
||||||
|
|
||||||
|
[[autodoc]] ConditionalDetrImageProcessor
|
||||||
|
- preprocess
|
||||||
|
- pad_and_create_pixel_mask
|
||||||
|
- post_process_object_detection
|
||||||
|
- post_process_instance_segmentation
|
||||||
|
- post_process_semantic_segmentation
|
||||||
|
- post_process_panoptic_segmentation
|
||||||
|
|
||||||
## ConditionalDetrFeatureExtractor
|
## ConditionalDetrFeatureExtractor
|
||||||
|
|
||||||
[[autodoc]] ConditionalDetrFeatureExtractor
|
[[autodoc]] ConditionalDetrFeatureExtractor
|
||||||
|
|||||||
@@ -33,6 +33,13 @@ alt="drawing" width="600"/>
|
|||||||
|
|
||||||
This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code can be found [here](https://github.com/fundamentalvision/Deformable-DETR).
|
This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code can be found [here](https://github.com/fundamentalvision/Deformable-DETR).
|
||||||
|
|
||||||
|
## DeformableDetrImageProcessor
|
||||||
|
|
||||||
|
[[autodoc]] DeformableDetrImageProcessor
|
||||||
|
- preprocess
|
||||||
|
- pad_and_create_pixel_mask
|
||||||
|
- post_process_object_detection
|
||||||
|
|
||||||
## DeformableDetrFeatureExtractor
|
## DeformableDetrFeatureExtractor
|
||||||
|
|
||||||
[[autodoc]] DeformableDetrFeatureExtractor
|
[[autodoc]] DeformableDetrFeatureExtractor
|
||||||
|
|||||||
@@ -166,6 +166,15 @@ mean Average Precision (mAP) and Panoptic Quality (PQ). The latter objects are i
|
|||||||
|
|
||||||
[[autodoc]] DetrConfig
|
[[autodoc]] DetrConfig
|
||||||
|
|
||||||
|
## DetrImageProcessor
|
||||||
|
|
||||||
|
[[autodoc]] DetrImageProcessor
|
||||||
|
- preprocess
|
||||||
|
- post_process_object_detection
|
||||||
|
- post_process_semantic_segmentation
|
||||||
|
- post_process_instance_segmentation
|
||||||
|
- post_process_panoptic_segmentation
|
||||||
|
|
||||||
## DetrFeatureExtractor
|
## DetrFeatureExtractor
|
||||||
|
|
||||||
[[autodoc]] DetrFeatureExtractor
|
[[autodoc]] DetrFeatureExtractor
|
||||||
|
|||||||
@@ -57,6 +57,15 @@ This model was contributed by [francesco](https://huggingface.co/francesco). The
|
|||||||
|
|
||||||
[[autodoc]] MaskFormerConfig
|
[[autodoc]] MaskFormerConfig
|
||||||
|
|
||||||
|
## MaskFormerImageProcessor
|
||||||
|
|
||||||
|
[[autodoc]] MaskFormerImageProcessor
|
||||||
|
- preprocess
|
||||||
|
- encode_inputs
|
||||||
|
- post_process_semantic_segmentation
|
||||||
|
- post_process_instance_segmentation
|
||||||
|
- post_process_panoptic_segmentation
|
||||||
|
|
||||||
## MaskFormerFeatureExtractor
|
## MaskFormerFeatureExtractor
|
||||||
|
|
||||||
[[autodoc]] MaskFormerFeatureExtractor
|
[[autodoc]] MaskFormerFeatureExtractor
|
||||||
|
|||||||
@@ -76,6 +76,13 @@ This model was contributed by [adirik](https://huggingface.co/adirik). The origi
|
|||||||
|
|
||||||
[[autodoc]] OwlViTVisionConfig
|
[[autodoc]] OwlViTVisionConfig
|
||||||
|
|
||||||
|
## OwlViTImageProcessor
|
||||||
|
|
||||||
|
[[autodoc]] OwlViTImageProcessor
|
||||||
|
- preprocess
|
||||||
|
- post_process
|
||||||
|
- post_process_image_guided_detection
|
||||||
|
|
||||||
## OwlViTFeatureExtractor
|
## OwlViTFeatureExtractor
|
||||||
|
|
||||||
[[autodoc]] OwlViTFeatureExtractor
|
[[autodoc]] OwlViTFeatureExtractor
|
||||||
|
|||||||
@@ -37,6 +37,12 @@ This model was contributed by [nielsr](https://huggingface.co/nielsr). The origi
|
|||||||
|
|
||||||
[[autodoc]] YolosConfig
|
[[autodoc]] YolosConfig
|
||||||
|
|
||||||
|
## YolosImageProcessor
|
||||||
|
|
||||||
|
[[autodoc]] YolosImageProcessor
|
||||||
|
- preprocess
|
||||||
|
- pad
|
||||||
|
- post_process_object_detection
|
||||||
|
|
||||||
## YolosFeatureExtractor
|
## YolosFeatureExtractor
|
||||||
|
|
||||||
|
|||||||
@@ -736,11 +736,15 @@ else:
|
|||||||
_import_structure["image_utils"] = ["ImageFeatureExtractionMixin"]
|
_import_structure["image_utils"] = ["ImageFeatureExtractionMixin"]
|
||||||
_import_structure["models.beit"].extend(["BeitFeatureExtractor", "BeitImageProcessor"])
|
_import_structure["models.beit"].extend(["BeitFeatureExtractor", "BeitImageProcessor"])
|
||||||
_import_structure["models.clip"].extend(["CLIPFeatureExtractor", "CLIPImageProcessor"])
|
_import_structure["models.clip"].extend(["CLIPFeatureExtractor", "CLIPImageProcessor"])
|
||||||
_import_structure["models.conditional_detr"].append("ConditionalDetrFeatureExtractor")
|
_import_structure["models.conditional_detr"].extend(
|
||||||
|
["ConditionalDetrFeatureExtractor", "ConditionalDetrImageProcessor"]
|
||||||
|
)
|
||||||
_import_structure["models.convnext"].extend(["ConvNextFeatureExtractor", "ConvNextImageProcessor"])
|
_import_structure["models.convnext"].extend(["ConvNextFeatureExtractor", "ConvNextImageProcessor"])
|
||||||
_import_structure["models.deformable_detr"].append("DeformableDetrFeatureExtractor")
|
_import_structure["models.deformable_detr"].extend(
|
||||||
|
["DeformableDetrFeatureExtractor", "DeformableDetrImageProcessor"]
|
||||||
|
)
|
||||||
_import_structure["models.deit"].extend(["DeiTFeatureExtractor", "DeiTImageProcessor"])
|
_import_structure["models.deit"].extend(["DeiTFeatureExtractor", "DeiTImageProcessor"])
|
||||||
_import_structure["models.detr"].append("DetrFeatureExtractor")
|
_import_structure["models.detr"].extend(["DetrFeatureExtractor", "DetrImageProcessor"])
|
||||||
_import_structure["models.donut"].extend(["DonutFeatureExtractor", "DonutImageProcessor"])
|
_import_structure["models.donut"].extend(["DonutFeatureExtractor", "DonutImageProcessor"])
|
||||||
_import_structure["models.dpt"].extend(["DPTFeatureExtractor", "DPTImageProcessor"])
|
_import_structure["models.dpt"].extend(["DPTFeatureExtractor", "DPTImageProcessor"])
|
||||||
_import_structure["models.flava"].extend(["FlavaFeatureExtractor", "FlavaImageProcessor", "FlavaProcessor"])
|
_import_structure["models.flava"].extend(["FlavaFeatureExtractor", "FlavaImageProcessor", "FlavaProcessor"])
|
||||||
@@ -749,18 +753,18 @@ else:
|
|||||||
_import_structure["models.layoutlmv2"].extend(["LayoutLMv2FeatureExtractor", "LayoutLMv2ImageProcessor"])
|
_import_structure["models.layoutlmv2"].extend(["LayoutLMv2FeatureExtractor", "LayoutLMv2ImageProcessor"])
|
||||||
_import_structure["models.layoutlmv3"].extend(["LayoutLMv3FeatureExtractor", "LayoutLMv3ImageProcessor"])
|
_import_structure["models.layoutlmv3"].extend(["LayoutLMv3FeatureExtractor", "LayoutLMv3ImageProcessor"])
|
||||||
_import_structure["models.levit"].extend(["LevitFeatureExtractor", "LevitImageProcessor"])
|
_import_structure["models.levit"].extend(["LevitFeatureExtractor", "LevitImageProcessor"])
|
||||||
_import_structure["models.maskformer"].append("MaskFormerFeatureExtractor")
|
_import_structure["models.maskformer"].extend(["MaskFormerFeatureExtractor", "MaskFormerImageProcessor"])
|
||||||
_import_structure["models.mobilenet_v1"].extend(["MobileNetV1FeatureExtractor", "MobileNetV1ImageProcessor"])
|
_import_structure["models.mobilenet_v1"].extend(["MobileNetV1FeatureExtractor", "MobileNetV1ImageProcessor"])
|
||||||
_import_structure["models.mobilenet_v2"].extend(["MobileNetV2FeatureExtractor", "MobileNetV2ImageProcessor"])
|
_import_structure["models.mobilenet_v2"].extend(["MobileNetV2FeatureExtractor", "MobileNetV2ImageProcessor"])
|
||||||
_import_structure["models.mobilevit"].extend(["MobileViTFeatureExtractor", "MobileViTImageProcessor"])
|
_import_structure["models.mobilevit"].extend(["MobileViTFeatureExtractor", "MobileViTImageProcessor"])
|
||||||
_import_structure["models.owlvit"].append("OwlViTFeatureExtractor")
|
_import_structure["models.owlvit"].extend(["OwlViTFeatureExtractor", "OwlViTImageProcessor"])
|
||||||
_import_structure["models.perceiver"].extend(["PerceiverFeatureExtractor", "PerceiverImageProcessor"])
|
_import_structure["models.perceiver"].extend(["PerceiverFeatureExtractor", "PerceiverImageProcessor"])
|
||||||
_import_structure["models.poolformer"].extend(["PoolFormerFeatureExtractor", "PoolFormerImageProcessor"])
|
_import_structure["models.poolformer"].extend(["PoolFormerFeatureExtractor", "PoolFormerImageProcessor"])
|
||||||
_import_structure["models.segformer"].extend(["SegformerFeatureExtractor", "SegformerImageProcessor"])
|
_import_structure["models.segformer"].extend(["SegformerFeatureExtractor", "SegformerImageProcessor"])
|
||||||
_import_structure["models.videomae"].extend(["VideoMAEFeatureExtractor", "VideoMAEImageProcessor"])
|
_import_structure["models.videomae"].extend(["VideoMAEFeatureExtractor", "VideoMAEImageProcessor"])
|
||||||
_import_structure["models.vilt"].extend(["ViltFeatureExtractor", "ViltImageProcessor", "ViltProcessor"])
|
_import_structure["models.vilt"].extend(["ViltFeatureExtractor", "ViltImageProcessor", "ViltProcessor"])
|
||||||
_import_structure["models.vit"].extend(["ViTFeatureExtractor", "ViTImageProcessor"])
|
_import_structure["models.vit"].extend(["ViTFeatureExtractor", "ViTImageProcessor"])
|
||||||
_import_structure["models.yolos"].extend(["YolosFeatureExtractor"])
|
_import_structure["models.yolos"].extend(["YolosFeatureExtractor", "YolosImageProcessor"])
|
||||||
|
|
||||||
# Timm-backed objects
|
# Timm-backed objects
|
||||||
try:
|
try:
|
||||||
@@ -3869,11 +3873,11 @@ if TYPE_CHECKING:
|
|||||||
from .image_utils import ImageFeatureExtractionMixin
|
from .image_utils import ImageFeatureExtractionMixin
|
||||||
from .models.beit import BeitFeatureExtractor, BeitImageProcessor
|
from .models.beit import BeitFeatureExtractor, BeitImageProcessor
|
||||||
from .models.clip import CLIPFeatureExtractor, CLIPImageProcessor
|
from .models.clip import CLIPFeatureExtractor, CLIPImageProcessor
|
||||||
from .models.conditional_detr import ConditionalDetrFeatureExtractor
|
from .models.conditional_detr import ConditionalDetrFeatureExtractor, ConditionalDetrImageProcessor
|
||||||
from .models.convnext import ConvNextFeatureExtractor, ConvNextImageProcessor
|
from .models.convnext import ConvNextFeatureExtractor, ConvNextImageProcessor
|
||||||
from .models.deformable_detr import DeformableDetrFeatureExtractor
|
from .models.deformable_detr import DeformableDetrFeatureExtractor, DeformableDetrImageProcessor
|
||||||
from .models.deit import DeiTFeatureExtractor, DeiTImageProcessor
|
from .models.deit import DeiTFeatureExtractor, DeiTImageProcessor
|
||||||
from .models.detr import DetrFeatureExtractor
|
from .models.detr import DetrFeatureExtractor, DetrImageProcessor
|
||||||
from .models.donut import DonutFeatureExtractor, DonutImageProcessor
|
from .models.donut import DonutFeatureExtractor, DonutImageProcessor
|
||||||
from .models.dpt import DPTFeatureExtractor, DPTImageProcessor
|
from .models.dpt import DPTFeatureExtractor, DPTImageProcessor
|
||||||
from .models.flava import FlavaFeatureExtractor, FlavaImageProcessor, FlavaProcessor
|
from .models.flava import FlavaFeatureExtractor, FlavaImageProcessor, FlavaProcessor
|
||||||
@@ -3882,18 +3886,18 @@ if TYPE_CHECKING:
|
|||||||
from .models.layoutlmv2 import LayoutLMv2FeatureExtractor, LayoutLMv2ImageProcessor
|
from .models.layoutlmv2 import LayoutLMv2FeatureExtractor, LayoutLMv2ImageProcessor
|
||||||
from .models.layoutlmv3 import LayoutLMv3FeatureExtractor, LayoutLMv3ImageProcessor
|
from .models.layoutlmv3 import LayoutLMv3FeatureExtractor, LayoutLMv3ImageProcessor
|
||||||
from .models.levit import LevitFeatureExtractor, LevitImageProcessor
|
from .models.levit import LevitFeatureExtractor, LevitImageProcessor
|
||||||
from .models.maskformer import MaskFormerFeatureExtractor
|
from .models.maskformer import MaskFormerFeatureExtractor, MaskFormerImageProcessor
|
||||||
from .models.mobilenet_v1 import MobileNetV1FeatureExtractor, MobileNetV1ImageProcessor
|
from .models.mobilenet_v1 import MobileNetV1FeatureExtractor, MobileNetV1ImageProcessor
|
||||||
from .models.mobilenet_v2 import MobileNetV2FeatureExtractor, MobileNetV2ImageProcessor
|
from .models.mobilenet_v2 import MobileNetV2FeatureExtractor, MobileNetV2ImageProcessor
|
||||||
from .models.mobilevit import MobileViTFeatureExtractor, MobileViTImageProcessor
|
from .models.mobilevit import MobileViTFeatureExtractor, MobileViTImageProcessor
|
||||||
from .models.owlvit import OwlViTFeatureExtractor
|
from .models.owlvit import OwlViTFeatureExtractor, OwlViTImageProcessor
|
||||||
from .models.perceiver import PerceiverFeatureExtractor, PerceiverImageProcessor
|
from .models.perceiver import PerceiverFeatureExtractor, PerceiverImageProcessor
|
||||||
from .models.poolformer import PoolFormerFeatureExtractor, PoolFormerImageProcessor
|
from .models.poolformer import PoolFormerFeatureExtractor, PoolFormerImageProcessor
|
||||||
from .models.segformer import SegformerFeatureExtractor, SegformerImageProcessor
|
from .models.segformer import SegformerFeatureExtractor, SegformerImageProcessor
|
||||||
from .models.videomae import VideoMAEFeatureExtractor, VideoMAEImageProcessor
|
from .models.videomae import VideoMAEFeatureExtractor, VideoMAEImageProcessor
|
||||||
from .models.vilt import ViltFeatureExtractor, ViltImageProcessor, ViltProcessor
|
from .models.vilt import ViltFeatureExtractor, ViltImageProcessor, ViltProcessor
|
||||||
from .models.vit import ViTFeatureExtractor, ViTImageProcessor
|
from .models.vit import ViTFeatureExtractor, ViTImageProcessor
|
||||||
from .models.yolos import YolosFeatureExtractor
|
from .models.yolos import YolosFeatureExtractor, YolosImageProcessor
|
||||||
|
|
||||||
# Modeling
|
# Modeling
|
||||||
try:
|
try:
|
||||||
|
|||||||
@@ -14,7 +14,7 @@
|
|||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
|
|
||||||
import os
|
import os
|
||||||
from typing import TYPE_CHECKING, List, Tuple, Union
|
from typing import TYPE_CHECKING, Dict, Iterable, List, Tuple, Union
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from packaging import version
|
from packaging import version
|
||||||
@@ -163,6 +163,47 @@ def get_image_size(image: np.ndarray, channel_dim: ChannelDimension = None) -> T
|
|||||||
raise ValueError(f"Unsupported data format: {channel_dim}")
|
raise ValueError(f"Unsupported data format: {channel_dim}")
|
||||||
|
|
||||||
|
|
||||||
|
def is_valid_annotation_coco_detection(annotation: Dict[str, Union[List, Tuple]]) -> bool:
|
||||||
|
if (
|
||||||
|
isinstance(annotation, dict)
|
||||||
|
and "image_id" in annotation
|
||||||
|
and "annotations" in annotation
|
||||||
|
and isinstance(annotation["annotations"], (list, tuple))
|
||||||
|
and (
|
||||||
|
# an image can have no annotations
|
||||||
|
len(annotation["annotations"]) == 0
|
||||||
|
or isinstance(annotation["annotations"][0], dict)
|
||||||
|
)
|
||||||
|
):
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def is_valid_annotation_coco_panoptic(annotation: Dict[str, Union[List, Tuple]]) -> bool:
|
||||||
|
if (
|
||||||
|
isinstance(annotation, dict)
|
||||||
|
and "image_id" in annotation
|
||||||
|
and "segments_info" in annotation
|
||||||
|
and "file_name" in annotation
|
||||||
|
and isinstance(annotation["segments_info"], (list, tuple))
|
||||||
|
and (
|
||||||
|
# an image can have no segments
|
||||||
|
len(annotation["segments_info"]) == 0
|
||||||
|
or isinstance(annotation["segments_info"][0], dict)
|
||||||
|
)
|
||||||
|
):
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def valid_coco_detection_annotations(annotations: Iterable[Dict[str, Union[List, Tuple]]]) -> bool:
|
||||||
|
return all(is_valid_annotation_coco_detection(ann) for ann in annotations)
|
||||||
|
|
||||||
|
|
||||||
|
def valid_coco_panoptic_annotations(annotations: Iterable[Dict[str, Union[List, Tuple]]]) -> bool:
|
||||||
|
return all(is_valid_annotation_coco_panoptic(ann) for ann in annotations)
|
||||||
|
|
||||||
|
|
||||||
def load_image(image: Union[str, "PIL.Image.Image"]) -> "PIL.Image.Image":
|
def load_image(image: Union[str, "PIL.Image.Image"]) -> "PIL.Image.Image":
|
||||||
"""
|
"""
|
||||||
Loads `image` to a PIL Image.
|
Loads `image` to a PIL Image.
|
||||||
|
|||||||
@@ -39,10 +39,14 @@ IMAGE_PROCESSOR_MAPPING_NAMES = OrderedDict(
|
|||||||
[
|
[
|
||||||
("beit", "BeitImageProcessor"),
|
("beit", "BeitImageProcessor"),
|
||||||
("clip", "CLIPImageProcessor"),
|
("clip", "CLIPImageProcessor"),
|
||||||
|
("clipseg", "ViTImageProcessor"),
|
||||||
|
("conditional_detr", "ConditionalDetrImageProcessor"),
|
||||||
("convnext", "ConvNextImageProcessor"),
|
("convnext", "ConvNextImageProcessor"),
|
||||||
("cvt", "ConvNextImageProcessor"),
|
("cvt", "ConvNextImageProcessor"),
|
||||||
("data2vec-vision", "BeitImageProcessor"),
|
("data2vec-vision", "BeitImageProcessor"),
|
||||||
|
("deformable_detr", "DeformableDetrImageProcessor"),
|
||||||
("deit", "DeiTImageProcessor"),
|
("deit", "DeiTImageProcessor"),
|
||||||
|
("detr", "DetrImageProcessor"),
|
||||||
("dinat", "ViTImageProcessor"),
|
("dinat", "ViTImageProcessor"),
|
||||||
("donut-swin", "DonutImageProcessor"),
|
("donut-swin", "DonutImageProcessor"),
|
||||||
("dpt", "DPTImageProcessor"),
|
("dpt", "DPTImageProcessor"),
|
||||||
@@ -53,10 +57,14 @@ IMAGE_PROCESSOR_MAPPING_NAMES = OrderedDict(
|
|||||||
("layoutlmv2", "LayoutLMv2ImageProcessor"),
|
("layoutlmv2", "LayoutLMv2ImageProcessor"),
|
||||||
("layoutlmv3", "LayoutLMv3ImageProcessor"),
|
("layoutlmv3", "LayoutLMv3ImageProcessor"),
|
||||||
("levit", "LevitImageProcessor"),
|
("levit", "LevitImageProcessor"),
|
||||||
|
("maskformer", "MaskFormerImageProcessor"),
|
||||||
("mobilenet_v1", "MobileNetV1ImageProcessor"),
|
("mobilenet_v1", "MobileNetV1ImageProcessor"),
|
||||||
("mobilenet_v2", "MobileNetV2ImageProcessor"),
|
("mobilenet_v2", "MobileNetV2ImageProcessor"),
|
||||||
|
("mobilenet_v2", "MobileNetV2ImageProcessor"),
|
||||||
|
("mobilevit", "MobileViTImageProcessor"),
|
||||||
("mobilevit", "MobileViTImageProcessor"),
|
("mobilevit", "MobileViTImageProcessor"),
|
||||||
("nat", "ViTImageProcessor"),
|
("nat", "ViTImageProcessor"),
|
||||||
|
("owlvit", "OwlViTImageProcessor"),
|
||||||
("perceiver", "PerceiverImageProcessor"),
|
("perceiver", "PerceiverImageProcessor"),
|
||||||
("poolformer", "PoolFormerImageProcessor"),
|
("poolformer", "PoolFormerImageProcessor"),
|
||||||
("regnet", "ConvNextImageProcessor"),
|
("regnet", "ConvNextImageProcessor"),
|
||||||
@@ -64,6 +72,7 @@ IMAGE_PROCESSOR_MAPPING_NAMES = OrderedDict(
|
|||||||
("segformer", "SegformerImageProcessor"),
|
("segformer", "SegformerImageProcessor"),
|
||||||
("swin", "ViTImageProcessor"),
|
("swin", "ViTImageProcessor"),
|
||||||
("swinv2", "ViTImageProcessor"),
|
("swinv2", "ViTImageProcessor"),
|
||||||
|
("table-transformer", "DetrImageProcessor"),
|
||||||
("van", "ConvNextImageProcessor"),
|
("van", "ConvNextImageProcessor"),
|
||||||
("videomae", "VideoMAEImageProcessor"),
|
("videomae", "VideoMAEImageProcessor"),
|
||||||
("vilt", "ViltImageProcessor"),
|
("vilt", "ViltImageProcessor"),
|
||||||
@@ -71,6 +80,7 @@ IMAGE_PROCESSOR_MAPPING_NAMES = OrderedDict(
|
|||||||
("vit_mae", "ViTImageProcessor"),
|
("vit_mae", "ViTImageProcessor"),
|
||||||
("vit_msn", "ViTImageProcessor"),
|
("vit_msn", "ViTImageProcessor"),
|
||||||
("xclip", "CLIPImageProcessor"),
|
("xclip", "CLIPImageProcessor"),
|
||||||
|
("yolos", "YolosImageProcessor"),
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -113,7 +123,7 @@ def get_image_processor_config(
|
|||||||
**kwargs,
|
**kwargs,
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Loads the image processor configuration from a pretrained model imag processor configuration. # FIXME
|
Loads the image processor configuration from a pretrained model image processor configuration.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
pretrained_model_name_or_path (`str` or `os.PathLike`):
|
pretrained_model_name_or_path (`str` or `os.PathLike`):
|
||||||
|
|||||||
@@ -36,6 +36,7 @@ except OptionalDependencyNotAvailable:
|
|||||||
pass
|
pass
|
||||||
else:
|
else:
|
||||||
_import_structure["feature_extraction_conditional_detr"] = ["ConditionalDetrFeatureExtractor"]
|
_import_structure["feature_extraction_conditional_detr"] = ["ConditionalDetrFeatureExtractor"]
|
||||||
|
_import_structure["image_processing_conditional_detr"] = ["ConditionalDetrImageProcessor"]
|
||||||
|
|
||||||
try:
|
try:
|
||||||
if not is_timm_available():
|
if not is_timm_available():
|
||||||
@@ -66,6 +67,7 @@ if TYPE_CHECKING:
|
|||||||
pass
|
pass
|
||||||
else:
|
else:
|
||||||
from .feature_extraction_conditional_detr import ConditionalDetrFeatureExtractor
|
from .feature_extraction_conditional_detr import ConditionalDetrFeatureExtractor
|
||||||
|
from .image_processing_conditional_detr import ConditionalDetrImageProcessor
|
||||||
|
|
||||||
try:
|
try:
|
||||||
if not is_timm_available():
|
if not is_timm_available():
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -32,6 +32,7 @@ except OptionalDependencyNotAvailable:
|
|||||||
pass
|
pass
|
||||||
else:
|
else:
|
||||||
_import_structure["feature_extraction_deformable_detr"] = ["DeformableDetrFeatureExtractor"]
|
_import_structure["feature_extraction_deformable_detr"] = ["DeformableDetrFeatureExtractor"]
|
||||||
|
_import_structure["image_processing_deformable_detr"] = ["DeformableDetrImageProcessor"]
|
||||||
|
|
||||||
try:
|
try:
|
||||||
if not is_timm_available():
|
if not is_timm_available():
|
||||||
@@ -57,6 +58,7 @@ if TYPE_CHECKING:
|
|||||||
pass
|
pass
|
||||||
else:
|
else:
|
||||||
from .feature_extraction_deformable_detr import DeformableDetrFeatureExtractor
|
from .feature_extraction_deformable_detr import DeformableDetrFeatureExtractor
|
||||||
|
from .image_processing_deformable_detr import DeformableDetrImageProcessor
|
||||||
|
|
||||||
try:
|
try:
|
||||||
if not is_timm_available():
|
if not is_timm_available():
|
||||||
|
|||||||
@@ -14,729 +14,10 @@
|
|||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
"""Feature extractor class for Deformable DETR."""
|
"""Feature extractor class for Deformable DETR."""
|
||||||
|
|
||||||
import pathlib
|
from ...utils import logging
|
||||||
import warnings
|
from .image_processing_deformable_detr import DeformableDetrImageProcessor
|
||||||
from typing import Dict, List, Optional, Tuple, Union
|
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
from PIL import Image
|
|
||||||
|
|
||||||
from ...feature_extraction_utils import BatchFeature, FeatureExtractionMixin
|
|
||||||
from ...image_transforms import center_to_corners_format, corners_to_center_format, rgb_to_id
|
|
||||||
from ...image_utils import ImageFeatureExtractionMixin
|
|
||||||
from ...utils import TensorType, is_torch_available, is_torch_tensor, logging
|
|
||||||
|
|
||||||
|
|
||||||
if is_torch_available():
|
|
||||||
import torch
|
|
||||||
from torch import nn
|
|
||||||
|
|
||||||
logger = logging.get_logger(__name__)
|
logger = logging.get_logger(__name__)
|
||||||
|
|
||||||
|
DeformableDetrFeatureExtractor = DeformableDetrImageProcessor
|
||||||
ImageInput = Union[Image.Image, np.ndarray, "torch.Tensor", List[Image.Image], List[np.ndarray], List["torch.Tensor"]]
|
|
||||||
|
|
||||||
|
|
||||||
# Copied from transformers.models.detr.feature_extraction_detr.masks_to_boxes
|
|
||||||
def masks_to_boxes(masks):
|
|
||||||
"""
|
|
||||||
Compute the bounding boxes around the provided panoptic segmentation masks.
|
|
||||||
|
|
||||||
The masks should be in format [N, H, W] where N is the number of masks, (H, W) are the spatial dimensions.
|
|
||||||
|
|
||||||
Returns a [N, 4] tensor, with the boxes in corner (xyxy) format.
|
|
||||||
"""
|
|
||||||
if masks.size == 0:
|
|
||||||
return np.zeros((0, 4))
|
|
||||||
|
|
||||||
h, w = masks.shape[-2:]
|
|
||||||
|
|
||||||
y = np.arange(0, h, dtype=np.float32)
|
|
||||||
x = np.arange(0, w, dtype=np.float32)
|
|
||||||
# see https://github.com/pytorch/pytorch/issues/50276
|
|
||||||
y, x = np.meshgrid(y, x, indexing="ij")
|
|
||||||
|
|
||||||
x_mask = masks * np.expand_dims(x, axis=0)
|
|
||||||
x_max = x_mask.reshape(x_mask.shape[0], -1).max(-1)
|
|
||||||
x = np.ma.array(x_mask, mask=~(np.array(masks, dtype=bool)))
|
|
||||||
x_min = x.filled(fill_value=1e8)
|
|
||||||
x_min = x_min.reshape(x_min.shape[0], -1).min(-1)
|
|
||||||
|
|
||||||
y_mask = masks * np.expand_dims(y, axis=0)
|
|
||||||
y_max = y_mask.reshape(x_mask.shape[0], -1).max(-1)
|
|
||||||
y = np.ma.array(y_mask, mask=~(np.array(masks, dtype=bool)))
|
|
||||||
y_min = y.filled(fill_value=1e8)
|
|
||||||
y_min = y_min.reshape(y_min.shape[0], -1).min(-1)
|
|
||||||
|
|
||||||
return np.stack([x_min, y_min, x_max, y_max], 1)
|
|
||||||
|
|
||||||
|
|
||||||
class DeformableDetrFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin):
|
|
||||||
r"""
|
|
||||||
Constructs a Deformable DETR feature extractor. Differs only in the postprocessing of object detection compared to
|
|
||||||
DETR.
|
|
||||||
|
|
||||||
This feature extractor inherits from [`FeatureExtractionMixin`] which contains most of the main methods. Users
|
|
||||||
should refer to this superclass for more information regarding those methods.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
format (`str`, *optional*, defaults to `"coco_detection"`):
|
|
||||||
Data format of the annotations. One of "coco_detection" or "coco_panoptic".
|
|
||||||
do_resize (`bool`, *optional*, defaults to `True`):
|
|
||||||
Whether to resize the input to a certain `size`.
|
|
||||||
size (`int`, *optional*, defaults to 800):
|
|
||||||
Resize the input to the given size. Only has an effect if `do_resize` is set to `True`. If size is a
|
|
||||||
sequence like `(width, height)`, output size will be matched to this. If size is an int, smaller edge of
|
|
||||||
the image will be matched to this number. i.e, if `height > width`, then image will be rescaled to `(size *
|
|
||||||
height / width, size)`.
|
|
||||||
max_size (`int`, *optional*, defaults to 1333):
|
|
||||||
The largest size an image dimension can have (otherwise it's capped). Only has an effect if `do_resize` is
|
|
||||||
set to `True`.
|
|
||||||
do_normalize (`bool`, *optional*, defaults to `True`):
|
|
||||||
Whether or not to normalize the input with mean and standard deviation.
|
|
||||||
image_mean (`int`, *optional*, defaults to `[0.485, 0.456, 0.406]`):
|
|
||||||
The sequence of means for each channel, to be used when normalizing images. Defaults to the ImageNet mean.
|
|
||||||
image_std (`int`, *optional*, defaults to `[0.229, 0.224, 0.225]`):
|
|
||||||
The sequence of standard deviations for each channel, to be used when normalizing images. Defaults to the
|
|
||||||
ImageNet std.
|
|
||||||
"""
|
|
||||||
|
|
||||||
model_input_names = ["pixel_values", "pixel_mask"]
|
|
||||||
|
|
||||||
# Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor.__init__
|
|
||||||
def __init__(
|
|
||||||
self,
|
|
||||||
format="coco_detection",
|
|
||||||
do_resize=True,
|
|
||||||
size=800,
|
|
||||||
max_size=1333,
|
|
||||||
do_normalize=True,
|
|
||||||
image_mean=None,
|
|
||||||
image_std=None,
|
|
||||||
**kwargs
|
|
||||||
):
|
|
||||||
super().__init__(**kwargs)
|
|
||||||
self.format = self._is_valid_format(format)
|
|
||||||
self.do_resize = do_resize
|
|
||||||
self.size = size
|
|
||||||
self.max_size = max_size
|
|
||||||
self.do_normalize = do_normalize
|
|
||||||
self.image_mean = image_mean if image_mean is not None else [0.485, 0.456, 0.406] # ImageNet mean
|
|
||||||
self.image_std = image_std if image_std is not None else [0.229, 0.224, 0.225] # ImageNet std
|
|
||||||
|
|
||||||
# Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor._is_valid_format
|
|
||||||
def _is_valid_format(self, format):
|
|
||||||
if format not in ["coco_detection", "coco_panoptic"]:
|
|
||||||
raise ValueError(f"Format {format} not supported")
|
|
||||||
return format
|
|
||||||
|
|
||||||
# Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor.prepare
|
|
||||||
def prepare(self, image, target, return_segmentation_masks=False, masks_path=None):
|
|
||||||
if self.format == "coco_detection":
|
|
||||||
image, target = self.prepare_coco_detection(image, target, return_segmentation_masks)
|
|
||||||
return image, target
|
|
||||||
elif self.format == "coco_panoptic":
|
|
||||||
image, target = self.prepare_coco_panoptic(image, target, masks_path)
|
|
||||||
return image, target
|
|
||||||
else:
|
|
||||||
raise ValueError(f"Format {self.format} not supported")
|
|
||||||
|
|
||||||
# Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor.convert_coco_poly_to_mask
|
|
||||||
def convert_coco_poly_to_mask(self, segmentations, height, width):
|
|
||||||
|
|
||||||
try:
|
|
||||||
from pycocotools import mask as coco_mask
|
|
||||||
except ImportError:
|
|
||||||
raise ImportError("Pycocotools is not installed in your environment.")
|
|
||||||
|
|
||||||
masks = []
|
|
||||||
for polygons in segmentations:
|
|
||||||
rles = coco_mask.frPyObjects(polygons, height, width)
|
|
||||||
mask = coco_mask.decode(rles)
|
|
||||||
if len(mask.shape) < 3:
|
|
||||||
mask = mask[..., None]
|
|
||||||
mask = np.asarray(mask, dtype=np.uint8)
|
|
||||||
mask = np.any(mask, axis=2)
|
|
||||||
masks.append(mask)
|
|
||||||
if masks:
|
|
||||||
masks = np.stack(masks, axis=0)
|
|
||||||
else:
|
|
||||||
masks = np.zeros((0, height, width), dtype=np.uint8)
|
|
||||||
|
|
||||||
return masks
|
|
||||||
|
|
||||||
# Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor.prepare_coco_detection
|
|
||||||
def prepare_coco_detection(self, image, target, return_segmentation_masks=False):
    """
    Convert the target in COCO format into the format expected by DETR.

    Args:
        image: Object exposing `size` as `(width, height)`; it is returned unchanged.
        target (`dict`): Must contain `"image_id"` and `"annotations"` (a list of COCO object annotations
            with `"bbox"`, `"category_id"` and `"area"`, and optionally `"iscrowd"`, `"segmentation"` and
            `"keypoints"`).
        return_segmentation_masks (`bool`, *optional*, defaults to `False`):
            Whether to rasterize `"segmentation"` entries into a `"masks"` array.

    Returns:
        `(image, target)` where the new `target` holds NumPy arrays under `"boxes"` (corner format, clipped
        to the image), `"class_labels"`, `"image_id"`, `"area"`, `"iscrowd"`, `"orig_size"`, `"size"`, and,
        when available, `"masks"` and `"keypoints"` (one `(x, y, v)`-style triple per row).
    """
    w, h = image.size

    image_id = np.asarray([target["image_id"]], dtype=np.int64)

    # Crowd regions are dropped; an annotation without an "iscrowd" key is treated as a regular object.
    anno = [obj for obj in target["annotations"] if "iscrowd" not in obj or obj["iscrowd"] == 0]

    # The reshape keeps the array 2D even when there are no boxes at all.
    boxes = np.asarray([obj["bbox"] for obj in anno], dtype=np.float32).reshape(-1, 4)
    # (x, y, width, height) -> (x0, y0, x1, y1), then clip to the image bounds.
    boxes[:, 2:] += boxes[:, :2]
    boxes[:, 0::2] = boxes[:, 0::2].clip(min=0, max=w)
    boxes[:, 1::2] = boxes[:, 1::2].clip(min=0, max=h)

    classes = np.asarray([obj["category_id"] for obj in anno], dtype=np.int64)

    # Only boxes that still have a positive width and height after clipping are kept.
    keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0])

    masks = None
    if return_segmentation_masks:
        segmentations = [obj["segmentation"] for obj in anno]
        masks = self.convert_coco_poly_to_mask(segmentations, h, w)[keep]

    keypoints = None
    if anno and "keypoints" in anno[0]:
        keypoints = np.asarray([obj["keypoints"] for obj in anno], dtype=np.float32)
        # `keep` has one entry per object, so it must be applied while the array still has one row per
        # object. Flattening to triples first (as was done before) makes the row count differ from
        # `len(keep)` as soon as an object has more than one keypoint, and the indexing then fails.
        keypoints = keypoints[keep].reshape((-1, 3))

    target = {}
    target["boxes"] = boxes[keep]
    target["class_labels"] = classes[keep]
    if masks is not None:
        target["masks"] = masks
    target["image_id"] = image_id
    if keypoints is not None:
        target["keypoints"] = keypoints

    # for conversion to coco api
    area = np.asarray([obj["area"] for obj in anno], dtype=np.float32)
    iscrowd = np.asarray([obj["iscrowd"] if "iscrowd" in obj else 0 for obj in anno], dtype=np.int64)
    target["area"] = area[keep]
    target["iscrowd"] = iscrowd[keep]

    target["orig_size"] = np.asarray([int(h), int(w)], dtype=np.int64)
    target["size"] = np.asarray([int(h), int(w)], dtype=np.int64)

    return image, target
|
|
||||||
|
|
||||||
# Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor.prepare_coco_panoptic
|
|
||||||
def prepare_coco_panoptic(self, image, target, masks_path, return_masks=True):
    """
    Convert a COCO panoptic annotation into the target dictionary used for training.

    Args:
        image: Object exposing `size` as `(width, height)`; it is returned unchanged.
        target (`dict`): Panoptic annotation with `"file_name"`, `"segments_info"` and either `"image_id"`
            or `"id"`. Each segment needs `"id"`, `"category_id"`, `"iscrowd"` and `"area"`.
        masks_path: Directory containing the PNG named by `target["file_name"]`.
        return_masks (`bool`, *optional*, defaults to `True`):
            Whether to store the per-segment masks under `"masks"`.

    Returns:
        `(image, target)` where the new `target` holds NumPy arrays under `"image_id"`, `"masks"` (if
        requested), `"class_labels"`, `"boxes"`, `"size"`, `"orig_size"`, `"iscrowd"` and `"area"`.

    Raises:
        ValueError: If `target` has no `"segments_info"` entry. Masks, labels and boxes are all derived
            from it, so nothing useful can be built without it.
    """
    w, h = image.size
    ann_info = target.copy()
    ann_path = pathlib.Path(masks_path) / ann_info["file_name"]

    if "segments_info" not in ann_info:
        # Previously this fell through and failed later on an unbound `masks` variable.
        raise ValueError("Panoptic annotations must contain the key 'segments_info'.")
    segments_info = ann_info["segments_info"]

    # Read the pixels while the file is open, then release the handle right away.
    with Image.open(ann_path) as panoptic_png:
        masks = np.asarray(panoptic_png, dtype=np.uint32)
    masks = rgb_to_id(masks)

    # Broadcasting against the id map yields one mask per segment id.
    # NOTE(review): assumes `rgb_to_id` returns a 2D (height, width) id map; confirm against its definition.
    ids = np.array([ann["id"] for ann in segments_info])
    masks = masks == ids[:, None, None]
    masks = np.asarray(masks, dtype=np.uint8)

    labels = np.asarray([ann["category_id"] for ann in segments_info], dtype=np.int64)

    target = {}
    target["image_id"] = np.asarray(
        [ann_info["image_id"] if "image_id" in ann_info else ann_info["id"]], dtype=np.int64
    )
    if return_masks:
        target["masks"] = masks
    target["class_labels"] = labels

    target["boxes"] = masks_to_boxes(masks)

    target["size"] = np.asarray([int(h), int(w)], dtype=np.int64)
    target["orig_size"] = np.asarray([int(h), int(w)], dtype=np.int64)
    target["iscrowd"] = np.asarray([ann["iscrowd"] for ann in segments_info], dtype=np.int64)
    target["area"] = np.asarray([ann["area"] for ann in segments_info], dtype=np.float32)

    return image, target
|
|
||||||
|
|
||||||
# Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor._resize
|
|
||||||
def _resize(self, image, size, target=None, max_size=None):
    """
    Resize the image to the given size. Size can be min_size (scalar) or (w, h) tuple. If size is an int, smaller
    edge of the image will be matched to this number.

    If given, also resize the target accordingly.

    Args:
        image: PIL image, or anything `self.to_pil_image` can convert to one.
        size (`int` or `list`/`tuple`): Target shorter-edge length, or an explicit `(w, h)` pair used as is.
        target (`dict`, *optional*): Annotation whose `"boxes"`, `"area"`, `"size"` and `"masks"` entries
            are rescaled to the new image size. The input dictionary is not modified.
        max_size (`int`, *optional*): Upper bound for the longer edge when `size` is a scalar.

    Returns:
        `(rescaled_image, target)`; `target` is `None` when no target was given.
    """
    if not isinstance(image, Image.Image):
        image = self.to_pil_image(image)

    def _aspect_preserving_size(original_size, shorter_edge, longest_edge=None):
        # Returns (height, width) with the shorter side equal to `shorter_edge`, shrunk if needed so the
        # longer side does not exceed `longest_edge`.
        width, height = original_size
        if longest_edge is not None:
            short_side = float(min((width, height)))
            long_side = float(max((width, height)))
            if long_side / short_side * shorter_edge > longest_edge:
                shorter_edge = int(round(longest_edge * short_side / long_side))

        # The image already has the requested shorter edge: keep its size.
        if (width <= height and width == shorter_edge) or (height <= width and height == shorter_edge):
            return (height, width)

        if width < height:
            return (int(shorter_edge * height / width), shorter_edge)
        return (shorter_edge, int(shorter_edge * width / height))

    if not isinstance(size, (list, tuple)):
        # size returned must be (w, h) since we use PIL to resize images
        # so we revert the tuple
        size = _aspect_preserving_size(image.size, size, max_size)[::-1]

    rescaled_image = self.resize(image, size=size)

    if target is None:
        return rescaled_image, None

    ratio_width, ratio_height = (
        float(new_side) / float(old_side) for new_side, old_side in zip(rescaled_image.size, image.size)
    )

    target = target.copy()
    if "boxes" in target:
        box_scale = np.asarray([ratio_width, ratio_height, ratio_width, ratio_height], dtype=np.float32)
        target["boxes"] = target["boxes"] * box_scale

    if "area" in target:
        target["area"] = target["area"] * (ratio_width * ratio_height)

    w, h = size
    target["size"] = np.asarray([h, w], dtype=np.int64)

    if "masks" in target:
        # use PyTorch as current workaround
        # TODO replace by self.resize
        mask_batch = torch.from_numpy(target["masks"][:, None]).float()
        resized_masks = nn.functional.interpolate(mask_batch, size=(h, w), mode="nearest")[:, 0] > 0.5
        target["masks"] = resized_masks.numpy()

    return rescaled_image, target
|
|
||||||
|
|
||||||
# Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor._normalize
|
|
||||||
def _normalize(self, image, mean, std, target=None):
|
|
||||||
"""
|
|
||||||
Normalize the image with a certain mean and std.
|
|
||||||
|
|
||||||
If given, also normalize the target bounding boxes based on the size of the image.
|
|
||||||
"""
|
|
||||||
|
|
||||||
image = self.normalize(image, mean=mean, std=std)
|
|
||||||
if target is None:
|
|
||||||
return image, None
|
|
||||||
|
|
||||||
target = target.copy()
|
|
||||||
h, w = image.shape[-2:]
|
|
||||||
|
|
||||||
if "boxes" in target:
|
|
||||||
boxes = target["boxes"]
|
|
||||||
boxes = corners_to_center_format(boxes)
|
|
||||||
boxes = boxes / np.asarray([w, h, w, h], dtype=np.float32)
|
|
||||||
target["boxes"] = boxes
|
|
||||||
|
|
||||||
return image, target
|
|
||||||
|
|
||||||
# Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor.__call__
|
|
||||||
def __call__(
    self,
    images: ImageInput,
    annotations: Union[List[Dict], List[List[Dict]]] = None,
    return_segmentation_masks: Optional[bool] = False,
    masks_path: Optional[pathlib.Path] = None,
    pad_and_return_pixel_mask: Optional[bool] = True,
    return_tensors: Optional[Union[str, TensorType]] = None,
    **kwargs,
) -> BatchFeature:
    """
    Main method to prepare for the model one or several image(s) and optional annotations. Images are by default
    padded up to the largest image in a batch, and a pixel mask is created that indicates which pixels are
    real/which are padding.

    <Tip warning={true}>

    NumPy arrays and PyTorch tensors are converted to PIL images when resizing, so the most efficient is to pass
    PIL images.

    </Tip>

    Args:
        images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
            The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
            tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a
            number of channels, H and W are image height and width.

        annotations (`Dict`, `List[Dict]`, *optional*):
            The corresponding annotations in COCO format.

            In case the feature extractor was initialized with `format = "coco_detection"`, the annotations for
            each image should have the following format: {'image_id': int, 'annotations': [annotation]}, with the
            annotations being a list of COCO object annotations.

            In case the feature extractor was initialized with `format = "coco_panoptic"`, the annotations for
            each image should have the following format: {'image_id': int, 'file_name': str, 'segments_info':
            [segment_info]} with segments_info being a list of COCO panoptic annotations.

        return_segmentation_masks (`bool`, *optional*, defaults to `False`):
            Whether to also include instance segmentation masks as part of the labels in case `format =
            "coco_detection"`.

        masks_path (`pathlib.Path`, *optional*):
            Path to the directory containing the PNG files that store the class-agnostic image segmentations. Only
            relevant in case the feature extractor was initialized with `format = "coco_panoptic"`.

        pad_and_return_pixel_mask (`bool`, *optional*, defaults to `True`):
            Whether or not to pad images up to the largest image in a batch and create a pixel mask.

            If left to the default, will return a pixel mask that is:

            - 1 for pixels that are real (i.e. **not masked**),
            - 0 for pixels that are padding (i.e. **masked**).

        return_tensors (`str` or [`~utils.TensorType`], *optional*):
            If set, will return tensors instead of NumPy arrays. If set to `'pt'`, return PyTorch `torch.Tensor`
            objects. Must be `'pt'` when `annotations` are provided.

    Returns:
        [`BatchFeature`]: A [`BatchFeature`] with the following fields:

        - **pixel_values** -- Pixel values to be fed to a model.
        - **pixel_mask** -- Pixel mask to be fed to a model (when `pad_and_return_pixel_mask=True` or if
          *"pixel_mask"* is in `self.model_input_names`).
        - **labels** -- Optional labels to be fed to a model (when `annotations` are provided)

    Raises:
        ValueError: If `images`, `annotations` or `masks_path` have an unsupported type or structure, or if
            annotations are given with a tensor type other than PyTorch.
        ImportError: If labels must be converted to PyTorch tensors but PyTorch is not installed.
    """
    # Input type checking for clearer error

    valid_images = False
    valid_annotations = False
    valid_masks_path = False

    # Check that images has a valid type
    if isinstance(images, (Image.Image, np.ndarray)) or is_torch_tensor(images):
        valid_images = True
    elif isinstance(images, (list, tuple)):
        if len(images) == 0 or isinstance(images[0], (Image.Image, np.ndarray)) or is_torch_tensor(images[0]):
            valid_images = True

    if not valid_images:
        raise ValueError(
            "Images must of type `PIL.Image.Image`, `np.ndarray` or `torch.Tensor` (single example), "
            "`List[PIL.Image.Image]`, `List[np.ndarray]` or `List[torch.Tensor]` (batch of examples)."
        )

    # Any list/tuple that passed the check above is a batch. Not indexing `images[0]` here means an empty
    # batch, which the validation explicitly accepts, no longer dies with an IndexError.
    is_batched = isinstance(images, (list, tuple))

    # Check that annotations has a valid type
    if annotations is not None:
        if not is_batched:
            if self.format == "coco_detection":
                if isinstance(annotations, dict) and "image_id" in annotations and "annotations" in annotations:
                    if isinstance(annotations["annotations"], (list, tuple)):
                        # an image can have no annotations
                        if len(annotations["annotations"]) == 0 or isinstance(annotations["annotations"][0], dict):
                            valid_annotations = True
            elif self.format == "coco_panoptic":
                if isinstance(annotations, dict) and "image_id" in annotations and "segments_info" in annotations:
                    if isinstance(annotations["segments_info"], (list, tuple)):
                        # an image can have no segments (?)
                        if len(annotations["segments_info"]) == 0 or isinstance(
                            annotations["segments_info"][0], dict
                        ):
                            valid_annotations = True
        else:
            if isinstance(annotations, (list, tuple)):
                if len(images) != len(annotations):
                    raise ValueError("There must be as many annotations as there are images")
                if len(annotations) == 0:
                    # Empty batch: there is nothing to inspect.
                    valid_annotations = True
                elif isinstance(annotations[0], dict):
                    # `.get` turns a missing key into the descriptive ValueError below instead of a KeyError.
                    if self.format == "coco_detection":
                        if isinstance(annotations[0].get("annotations"), (list, tuple)):
                            valid_annotations = True
                    elif self.format == "coco_panoptic":
                        if isinstance(annotations[0].get("segments_info"), (list, tuple)):
                            valid_annotations = True

        if not valid_annotations:
            raise ValueError(
                """
                Annotations must of type `Dict` (single image) or `List[Dict]` (batch of images). In case of object
                detection, each dictionary should contain the keys 'image_id' and 'annotations', with the latter
                being a list of annotations in COCO format. In case of panoptic segmentation, each dictionary
                should contain the keys 'file_name', 'image_id' and 'segments_info', with the latter being a list
                of annotations in COCO format.
                """
            )

    # Check that masks_path has a valid type
    if masks_path is not None:
        if self.format == "coco_panoptic":
            if isinstance(masks_path, pathlib.Path):
                valid_masks_path = True
        if not valid_masks_path:
            raise ValueError(
                "The path to the directory containing the mask PNG files should be provided as a"
                " `pathlib.Path` object."
            )

    if not is_batched:
        images = [images]
        if annotations is not None:
            annotations = [annotations]

    # Create a copy of the list to avoid editing it in place
    images = list(images)

    if annotations is not None:
        annotations = list(annotations)

    # prepare (COCO annotations as a list of Dict -> DETR target as a single Dict per image)
    if annotations is not None:
        for idx, (image, target) in enumerate(zip(images, annotations)):
            if not isinstance(image, Image.Image):
                image = self.to_pil_image(image)
            image, target = self.prepare(image, target, return_segmentation_masks, masks_path)
            images[idx] = image
            annotations[idx] = target

    # transformations (resizing + normalization)
    if self.do_resize and self.size is not None:
        if annotations is not None:
            for idx, (image, target) in enumerate(zip(images, annotations)):
                image, target = self._resize(image=image, target=target, size=self.size, max_size=self.max_size)
                images[idx] = image
                annotations[idx] = target
        else:
            for idx, image in enumerate(images):
                images[idx] = self._resize(image=image, target=None, size=self.size, max_size=self.max_size)[0]

    if self.do_normalize:
        if annotations is not None:
            for idx, (image, target) in enumerate(zip(images, annotations)):
                image, target = self._normalize(
                    image=image, mean=self.image_mean, std=self.image_std, target=target
                )
                images[idx] = image
                annotations[idx] = target
        else:
            images = [
                self._normalize(image=image, mean=self.image_mean, std=self.image_std)[0] for image in images
            ]
    else:
        images = [np.array(image) for image in images]

    if pad_and_return_pixel_mask:
        # pad images up to largest image in batch and create pixel_mask
        padded_images = []
        pixel_mask = []
        # An empty batch has no largest image; leave both lists empty in that case.
        if images:
            max_size = self._max_by_axis([list(image.shape) for image in images])
            c, h, w = max_size
            for image in images:
                # create padded image
                padded_image = np.zeros((c, h, w), dtype=np.float32)
                padded_image[: image.shape[0], : image.shape[1], : image.shape[2]] = np.copy(image)
                padded_images.append(padded_image)
                # create pixel mask
                mask = np.zeros((h, w), dtype=np.int64)
                mask[: image.shape[1], : image.shape[2]] = True
                pixel_mask.append(mask)
        images = padded_images

    # return as BatchFeature
    data = {}
    data["pixel_values"] = images
    if pad_and_return_pixel_mask:
        data["pixel_mask"] = pixel_mask
    encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors)

    if annotations is not None:
        # Convert to TensorType
        tensor_type = return_tensors
        if not isinstance(tensor_type, TensorType):
            tensor_type = TensorType(tensor_type)

        if not tensor_type == TensorType.PYTORCH:
            raise ValueError("Only PyTorch is supported for the moment.")
        else:
            if not is_torch_available():
                raise ImportError("Unable to convert output to PyTorch tensors format, PyTorch is not installed.")

            encoded_inputs["labels"] = [
                {k: torch.from_numpy(v) for k, v in target.items()} for target in annotations
            ]

    return encoded_inputs
|
|
||||||
|
|
||||||
# Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor._max_by_axis
|
|
||||||
def _max_by_axis(self, the_list):
|
|
||||||
# type: (List[List[int]]) -> List[int]
|
|
||||||
maxes = the_list[0]
|
|
||||||
for sublist in the_list[1:]:
|
|
||||||
for index, item in enumerate(sublist):
|
|
||||||
maxes[index] = max(maxes[index], item)
|
|
||||||
return maxes
|
|
||||||
|
|
||||||
# Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor.pad_and_create_pixel_mask
|
|
||||||
def pad_and_create_pixel_mask(
    self, pixel_values_list: List["torch.Tensor"], return_tensors: Optional[Union[str, TensorType]] = None
):
    """
    Pad images up to the largest image in a batch and create a corresponding `pixel_mask`.

    Args:
        pixel_values_list (`List[torch.Tensor]`):
            List of images (pixel values) to be padded. Each image should be a tensor of shape (C, H, W).
        return_tensors (`str` or [`~utils.TensorType`], *optional*):
            If set, will return tensors instead of NumPy arrays. If set to `'pt'`, return PyTorch `torch.Tensor`
            objects.

    Returns:
        [`BatchFeature`]: A [`BatchFeature`] with the following fields:

        - **pixel_values** -- The images, zero-padded at the bottom/right to a common shape.
        - **pixel_mask** -- One mask per image, 1 where the pixel comes from the image and 0 where it is
          padding.
    """
    shapes = [list(image.shape) for image in pixel_values_list]
    c, h, w = self._max_by_axis(shapes)

    padded_images, pixel_mask = [], []
    for image in pixel_values_list:
        channels, height, width = image.shape[0], image.shape[1], image.shape[2]

        # The image occupies the top-left corner of a zero canvas of the common shape.
        canvas = np.zeros((c, h, w), dtype=np.float32)
        canvas[:channels, :height, :width] = np.copy(image)
        padded_images.append(canvas)

        # Mark the region covered by the real image.
        valid_region = np.zeros((h, w), dtype=np.int64)
        valid_region[:height, :width] = True
        pixel_mask.append(valid_region)

    # return as BatchFeature
    return BatchFeature(
        data={"pixel_values": padded_images, "pixel_mask": pixel_mask}, tensor_type=return_tensors
    )
|
|
||||||
|
|
||||||
def post_process(self, outputs, target_sizes):
    """
    Converts the output of [`DeformableDetrForObjectDetection`] into the format expected by the COCO api. Only
    supports PyTorch.

    Args:
        outputs ([`DeformableDetrObjectDetectionOutput`]):
            Raw outputs of the model.
        target_sizes (`torch.Tensor` of shape `(batch_size, 2)`):
            Tensor containing the size (height, width) of each image of the batch. For evaluation, this must be the
            original image size (before any data augmentation). For visualization, this should be the image size
            after data augment, but before padding.
    Returns:
        `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image
        in the batch as predicted by the model.

    Raises:
        ValueError: If the number of target sizes differs from the batch size, or if `target_sizes` does not
            hold exactly two values per image.
    """
    warnings.warn(
        "`post_process` is deprecated and will be removed in v5 of Transformers, please use"
        " `post_process_object_detection`.",
        FutureWarning,
    )

    out_logits, out_bbox = outputs.logits, outputs.pred_boxes

    if len(out_logits) != len(target_sizes):
        raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the logits")
    if target_sizes.shape[1] != 2:
        raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch")

    prob = out_logits.sigmoid()
    # Scores are ranked jointly over all (query, class) pairs of an image; the 100 best pairs are kept.
    topk_values, topk_indexes = torch.topk(prob.view(out_logits.shape[0], -1), 100, dim=1)
    scores = topk_values
    # A flat index decomposes into the query it came from (quotient) and the class (remainder).
    topk_boxes = topk_indexes // out_logits.shape[2]
    labels = topk_indexes % out_logits.shape[2]
    boxes = center_to_corners_format(out_bbox)
    boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1, 1, 4))

    # and from relative [0, 1] to absolute [0, height] coordinates
    img_h, img_w = target_sizes.unbind(1)
    # Move the scale to the boxes' device so CPU target sizes can be combined with model outputs that
    # live elsewhere, matching `post_process_object_detection`.
    scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device)
    boxes = boxes * scale_fct[:, None, :]

    results = [{"scores": s, "labels": l, "boxes": b} for s, l, b in zip(scores, labels, boxes)]

    return results
|
|
||||||
|
|
||||||
def post_process_object_detection(
    self, outputs, threshold: float = 0.5, target_sizes: Union[TensorType, List[Tuple]] = None
):
    """
    Converts the output of [`DeformableDetrForObjectDetection`] into the format expected by the COCO api. Only
    supports PyTorch.

    Args:
        outputs ([`DeformableDetrObjectDetectionOutput`]):
            Raw outputs of the model.
        threshold (`float`, *optional*, defaults to 0.5):
            Score threshold to keep object detection predictions.
        target_sizes (`torch.Tensor` or `List[Tuple[int, int]]`, *optional*):
            Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size
            (height, width) of each image in the batch. If left to None, predictions will not be resized.

    Returns:
        `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image
        in the batch as predicted by the model.

    Raises:
        ValueError: If `target_sizes` is given and its length differs from the batch size.
    """
    out_logits, out_bbox = outputs.logits, outputs.pred_boxes

    if target_sizes is not None:
        if len(out_logits) != len(target_sizes):
            raise ValueError(
                "Make sure that you pass in as many target sizes as the batch dimension of the logits"
            )

    prob = out_logits.sigmoid()
    # Scores are ranked jointly over all (query, class) pairs of an image; the 100 best pairs are kept.
    topk_values, topk_indexes = torch.topk(prob.view(out_logits.shape[0], -1), 100, dim=1)
    scores = topk_values
    # A flat index decomposes into the query it came from (quotient) and the class (remainder).
    topk_boxes = topk_indexes // out_logits.shape[2]
    labels = topk_indexes % out_logits.shape[2]
    boxes = center_to_corners_format(out_bbox)
    boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1, 1, 4))

    # and from relative [0, 1] to absolute [0, height] coordinates
    # Only rescale when sizes were supplied: with `target_sizes=None` the boxes stay in relative
    # coordinates, as documented, instead of failing on `None.unbind`.
    if target_sizes is not None:
        if isinstance(target_sizes, (list, tuple)):
            img_h = torch.Tensor([i[0] for i in target_sizes])
            img_w = torch.Tensor([i[1] for i in target_sizes])
        else:
            img_h, img_w = target_sizes.unbind(1)
        scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device)
        boxes = boxes * scale_fct[:, None, :]

    results = []
    for s, l, b in zip(scores, labels, boxes):
        # One mask per image, shared by scores, labels and boxes.
        above_threshold = s > threshold
        results.append({"scores": s[above_threshold], "labels": l[above_threshold], "boxes": b[above_threshold]})

    return results
|
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
@@ -30,6 +30,7 @@ except OptionalDependencyNotAvailable:
|
|||||||
pass
|
pass
|
||||||
else:
|
else:
|
||||||
_import_structure["feature_extraction_detr"] = ["DetrFeatureExtractor"]
|
_import_structure["feature_extraction_detr"] = ["DetrFeatureExtractor"]
|
||||||
|
_import_structure["image_processing_detr"] = ["DetrImageProcessor"]
|
||||||
|
|
||||||
try:
|
try:
|
||||||
if not is_timm_available():
|
if not is_timm_available():
|
||||||
@@ -56,6 +57,7 @@ if TYPE_CHECKING:
|
|||||||
pass
|
pass
|
||||||
else:
|
else:
|
||||||
from .feature_extraction_detr import DetrFeatureExtractor
|
from .feature_extraction_detr import DetrFeatureExtractor
|
||||||
|
from .image_processing_detr import DetrImageProcessor
|
||||||
|
|
||||||
try:
|
try:
|
||||||
if not is_timm_available():
|
if not is_timm_available():
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
1766
src/transformers/models/detr/image_processing_detr.py
Normal file
1766
src/transformers/models/detr/image_processing_detr.py
Normal file
File diff suppressed because it is too large
Load Diff
@@ -1589,7 +1589,7 @@ class DetrForSegmentation(DetrPreTrainedModel):
|
|||||||
>>> import numpy
|
>>> import numpy
|
||||||
|
|
||||||
>>> from transformers import DetrFeatureExtractor, DetrForSegmentation
|
>>> from transformers import DetrFeatureExtractor, DetrForSegmentation
|
||||||
>>> from transformers.models.detr.feature_extraction_detr import rgb_to_id
|
>>> from transformers.image_transforms import rgb_to_id
|
||||||
|
|
||||||
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
|
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
|
||||||
>>> image = Image.open(requests.get(url, stream=True).raw)
|
>>> image = Image.open(requests.get(url, stream=True).raw)
|
||||||
@@ -2289,8 +2289,6 @@ def generalized_box_iou(boxes1, boxes2):
|
|||||||
|
|
||||||
|
|
||||||
# below: taken from https://github.com/facebookresearch/detr/blob/master/util/misc.py#L306
|
# below: taken from https://github.com/facebookresearch/detr/blob/master/util/misc.py#L306
|
||||||
|
|
||||||
|
|
||||||
def _max_by_axis(the_list):
|
def _max_by_axis(the_list):
|
||||||
# type: (List[List[int]]) -> List[int]
|
# type: (List[List[int]]) -> List[int]
|
||||||
maxes = the_list[0]
|
maxes = the_list[0]
|
||||||
|
|||||||
@@ -32,6 +32,7 @@ except OptionalDependencyNotAvailable:
|
|||||||
pass
|
pass
|
||||||
else:
|
else:
|
||||||
_import_structure["feature_extraction_maskformer"] = ["MaskFormerFeatureExtractor"]
|
_import_structure["feature_extraction_maskformer"] = ["MaskFormerFeatureExtractor"]
|
||||||
|
_import_structure["image_processing_maskformer"] = ["MaskFormerImageProcessor"]
|
||||||
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
@@ -63,6 +64,7 @@ if TYPE_CHECKING:
|
|||||||
pass
|
pass
|
||||||
else:
|
else:
|
||||||
from .feature_extraction_maskformer import MaskFormerFeatureExtractor
|
from .feature_extraction_maskformer import MaskFormerFeatureExtractor
|
||||||
|
from .image_processing_maskformer import MaskFormerImageProcessor
|
||||||
try:
|
try:
|
||||||
if not is_torch_available():
|
if not is_torch_available():
|
||||||
raise OptionalDependencyNotAvailable()
|
raise OptionalDependencyNotAvailable()
|
||||||
|
|||||||
@@ -14,923 +14,11 @@
|
|||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
"""Feature extractor class for MaskFormer."""
|
"""Feature extractor class for MaskFormer."""
|
||||||
|
|
||||||
from typing import TYPE_CHECKING, Dict, List, Optional, Set, Tuple, Union
|
from transformers.utils import logging
|
||||||
|
|
||||||
import numpy as np
|
from .image_processing_maskformer import MaskFormerImageProcessor
|
||||||
from PIL import Image
|
|
||||||
|
|
||||||
from transformers.image_utils import PILImageResampling
|
|
||||||
|
|
||||||
from ...feature_extraction_utils import BatchFeature, FeatureExtractionMixin
|
|
||||||
from ...image_utils import ImageFeatureExtractionMixin, ImageInput, is_torch_tensor
|
|
||||||
from ...utils import TensorType, is_torch_available, logging
|
|
||||||
|
|
||||||
|
|
||||||
if is_torch_available():
|
|
||||||
import torch
|
|
||||||
from torch import Tensor, nn
|
|
||||||
from torch.nn.functional import interpolate
|
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
|
||||||
from transformers.models.maskformer.modeling_maskformer import MaskFormerForInstanceSegmentationOutput
|
|
||||||
|
|
||||||
logger = logging.get_logger(__name__)
|
logger = logging.get_logger(__name__)
|
||||||
|
|
||||||
|
MaskFormerFeatureExtractor = MaskFormerImageProcessor
|
||||||
# Copied from transformers.models.detr.feature_extraction_detr.binary_mask_to_rle
|
|
||||||
def binary_mask_to_rle(mask):
    """
    Converts given binary mask of shape `(height, width)` to the run-length encoding (RLE) format.

    Args:
        mask (`torch.Tensor` or `numpy.array`):
            A binary mask tensor of shape `(height, width)` where 0 denotes background and 1 denotes the target
            segment_id or class_id.

    Returns:
        `List`: Run-length encoded list of the binary mask, laid out as `[start, length, start, length, ...]`
        where each start is a 1-based position in the flattened mask. Refer to COCO API for more information
        about the RLE format.
    """
    if not isinstance(mask, np.ndarray) and is_torch_tensor(mask):
        # `Tensor.numpy()` refuses CUDA tensors and tensors that track gradients, so detach and move to the
        # host before converting.
        mask = mask.detach().cpu().numpy()

    # Zero sentinels on both ends guarantee that every run of ones has both a rising and a falling edge.
    pixels = np.concatenate([[0], mask.flatten(), [0]])
    # 1-based positions where the value flips: even slots hold run starts, odd slots hold run ends.
    runs = np.where(pixels[1:] != pixels[:-1])[0] + 1
    # Replace every run end by the run length.
    runs[1::2] -= runs[::2]
    return list(runs)
|
|
||||||
|
|
||||||
|
|
||||||
# Copied from transformers.models.detr.feature_extraction_detr.convert_segmentation_to_rle
|
|
||||||
def convert_segmentation_to_rle(segmentation):
    """
    Converts given segmentation map of shape `(height, width)` to the run-length encoding (RLE) format.

    Args:
        segmentation (`torch.Tensor` or `numpy.array`):
            A segmentation map of shape `(height, width)` where each value denotes a segment or class id.

    Returns:
        `List[List]`: A list of lists, where each list is the run-length encoding of a segment / class id. There
        is one entry per distinct id found in `segmentation`, in the order returned by `torch.unique`.
    """
    # `torch.unique` / `torch.where` only accept tensors; `torch.as_tensor` returns tensors unchanged and
    # converts NumPy arrays, which the documented contract allows.
    segmentation = torch.as_tensor(segmentation)
    segment_ids = torch.unique(segmentation)

    # One binary mask (1 inside the segment, 0 elsewhere) per id, each encoded independently.
    return [binary_mask_to_rle(torch.where(segmentation == idx, 1, 0)) for idx in segment_ids]
|
|
||||||
|
|
||||||
|
|
||||||
# Copied from transformers.models.detr.feature_extraction_detr.remove_low_and_no_objects
|
|
||||||
def remove_low_and_no_objects(masks, scores, labels, object_mask_threshold, num_labels):
    """
    Drop the queries that predict the null class or whose score does not exceed `object_mask_threshold`, and
    return the matching entries of `masks`, `scores` and `labels`. The masks themselves are not modified.

    Args:
        masks (`torch.Tensor`):
            A tensor of shape `(num_queries, height, width)`.
        scores (`torch.Tensor`):
            A tensor of shape `(num_queries)`.
        labels (`torch.Tensor`):
            A tensor of shape `(num_queries)`.
        object_mask_threshold (`float`):
            Minimum score, exclusive, that a query needs in order to be kept.
        num_labels (`int`):
            Label value identifying the null ("no object") class; queries carrying this label are dropped.

    Raises:
        `ValueError`: Raised when the first dimension doesn't match in all input tensors.

    Returns:
        `Tuple[`torch.Tensor`, `torch.Tensor`, `torch.Tensor`]`: The `masks`, `scores` and `labels` restricted to
        the kept queries.
    """
    if not (masks.shape[0] == scores.shape[0] == labels.shape[0]):
        raise ValueError("mask, scores and labels must have the same shape!")

    # Keep a query only if it is a real class AND confident enough.
    to_keep = labels.ne(num_labels) & (scores > object_mask_threshold)

    return masks[to_keep], scores[to_keep], labels[to_keep]
|
|
||||||
|
|
||||||
|
|
||||||
# Copied from transformers.models.detr.feature_extraction_detr.check_segment_validity
|
|
||||||
def check_segment_validity(mask_labels, mask_probs, k, mask_threshold=0.5, overlap_mask_area_threshold=0.8):
    """
    Decide whether query `k` yields a segment worth keeping.

    Args:
        mask_labels (`torch.Tensor`):
            Per-pixel index of the winning query.
        mask_probs (`torch.Tensor`):
            Per-query mask values; `mask_probs[k]` is compared against `mask_threshold`.
        k (`int`):
            Index of the query under test.
        mask_threshold (`float`, *optional*, defaults to 0.5):
            Value from which a pixel of `mask_probs[k]` counts as belonging to query `k`.
        overlap_mask_area_threshold (`float`, *optional*, defaults to 0.8):
            Fraction of the query's own area that it must still own after competing with the other queries.

    Returns:
        A pair `(mask_exists, mask_k)`: a truth value telling whether the segment is kept, and the boolean mask
        of the pixels where query `k` is the winning label.
    """
    # Pixels that query k wins in the label map
    mask_k = mask_labels == k
    won_area = mask_k.sum()

    # Pixels that query k claims on its own, regardless of the other queries
    claimed_area = (mask_probs[k] >= mask_threshold).sum()

    mask_exists = won_area > 0 and claimed_area > 0

    # Eliminate disconnected tiny segments: the won share of the claimed area must exceed the threshold
    if mask_exists and not (won_area / claimed_area).item() > overlap_mask_area_threshold:
        mask_exists = False

    return mask_exists, mask_k
|
|
||||||
|
|
||||||
|
|
||||||
# Copied from transformers.models.detr.feature_extraction_detr.compute_segments
|
|
||||||
def compute_segments(
    mask_probs,
    pred_scores,
    pred_labels,
    mask_threshold: float = 0.5,
    overlap_mask_area_threshold: float = 0.8,
    label_ids_to_fuse: Optional[Set[int]] = None,
    target_size: Tuple[int, int] = None,
):
    """
    Build a segmentation map and its per-segment metadata from per-query mask probabilities.

    Args:
        mask_probs (`torch.Tensor`):
            A tensor of shape `(num_queries, height, width)`. It is not modified.
        pred_scores (`torch.Tensor`):
            One score per query, used to weigh the corresponding mask.
        pred_labels (`torch.Tensor`):
            One predicted class id per query.
        mask_threshold (`float`, *optional*, defaults to 0.5):
            Forwarded to `check_segment_validity`.
        overlap_mask_area_threshold (`float`, *optional*, defaults to 0.8):
            Forwarded to `check_segment_validity`.
        label_ids_to_fuse (`Set[int]`, *optional*):
            Class ids whose queries share a single segment id. `None` means no class is fused.
        target_size (`Tuple[int, int]`, *optional*):
            `(height, width)` to which the masks are bilinearly resized before the segments are computed.

    Returns:
        A pair `(segmentation, segments)`. `segmentation` is an `int32` tensor of shape `(height, width)` holding
        the segment id of each pixel, 0 where no segment was assigned. `segments` is a list of dictionaries with
        the keys `id`, `label_id`, `was_fused` and `score`, one per query that produced a valid segment (so a
        fused class contributes one entry per contributing query).
    """
    if label_ids_to_fuse is None:
        # The membership test below would raise a TypeError on None.
        label_ids_to_fuse = set()

    height = mask_probs.shape[1] if target_size is None else target_size[0]
    width = mask_probs.shape[2] if target_size is None else target_size[1]

    segmentation = torch.zeros((height, width), dtype=torch.int32, device=mask_probs.device)
    segments: List[Dict] = []

    if target_size is not None:
        mask_probs = nn.functional.interpolate(
            mask_probs.unsqueeze(0), size=target_size, mode="bilinear", align_corners=False
        )[0]

    # Weigh each mask by its prediction score. Done out of place so the caller's tensor is left untouched.
    mask_probs = mask_probs * pred_scores.view(-1, 1, 1)
    mask_labels = mask_probs.argmax(0)  # [height, width]

    # Highest segment id handed out so far. It is tracked separately from the id used for the current query so
    # that reusing the id of a fused class never rewinds the counter and causes a later id collision.
    last_segment_id = 0

    # Segment id already assigned to each fused class
    stuff_memory_list: Dict[int, int] = {}
    for k in range(pred_labels.shape[0]):
        pred_class = pred_labels[k].item()
        should_fuse = pred_class in label_ids_to_fuse

        # Check if mask exists and large enough to be a segment
        mask_exists, mask_k = check_segment_validity(
            mask_labels, mask_probs, k, mask_threshold, overlap_mask_area_threshold
        )
        if not mask_exists:
            continue

        if pred_class in stuff_memory_list:
            segment_id = stuff_memory_list[pred_class]
        else:
            last_segment_id += 1
            segment_id = last_segment_id

        # Add current object segment to final segmentation map
        segmentation[mask_k] = segment_id
        segments.append(
            {
                "id": segment_id,
                "label_id": pred_class,
                "was_fused": should_fuse,
                "score": round(pred_scores[k].item(), 6),
            }
        )
        if should_fuse:
            stuff_memory_list[pred_class] = segment_id

    return segmentation, segments
|
|
||||||
|
|
||||||
|
|
||||||
class MaskFormerFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin):
|
|
||||||
r"""
|
|
||||||
Constructs a MaskFormer feature extractor. The feature extractor can be used to prepare image(s) and optional
|
|
||||||
targets for the model.
|
|
||||||
|
|
||||||
This feature extractor inherits from [`FeatureExtractionMixin`] which contains most of the main methods. Users
|
|
||||||
should refer to this superclass for more information regarding those methods.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
do_resize (`bool`, *optional*, defaults to `True`):
|
|
||||||
Whether to resize the input to a certain `size`.
|
|
||||||
size (`int`, *optional*, defaults to 800):
|
|
||||||
Resize the input to the given size. Only has an effect if `do_resize` is set to `True`. If size is a
|
|
||||||
sequence like `(width, height)`, output size will be matched to this. If size is an int, smaller edge of
|
|
||||||
the image will be matched to this number. i.e, if `height > width`, then image will be rescaled to `(size *
|
|
||||||
height / width, size)`.
|
|
||||||
max_size (`int`, *optional*, defaults to 1333):
|
|
||||||
The largest size an image dimension can have (otherwise it's capped). Only has an effect if `do_resize` is
|
|
||||||
set to `True`.
|
|
||||||
resample (`int`, *optional*, defaults to `PILImageResampling.BILINEAR`):
|
|
||||||
An optional resampling filter. This can be one of `PILImageResampling.NEAREST`, `PILImageResampling.BOX`,
|
|
||||||
`PILImageResampling.BILINEAR`, `PILImageResampling.HAMMING`, `PILImageResampling.BICUBIC` or
|
|
||||||
`PILImageResampling.LANCZOS`. Only has an effect if `do_resize` is set to `True`.
|
|
||||||
size_divisibility (`int`, *optional*, defaults to 32):
|
|
||||||
Some backbones need images divisible by a certain number. If not passed, it defaults to the value used in
|
|
||||||
Swin Transformer.
|
|
||||||
do_normalize (`bool`, *optional*, defaults to `True`):
|
|
||||||
Whether or not to normalize the input with mean and standard deviation.
|
|
||||||
image_mean (`int`, *optional*, defaults to `[0.485, 0.456, 0.406]`):
|
|
||||||
The sequence of means for each channel, to be used when normalizing images. Defaults to the ImageNet mean.
|
|
||||||
image_std (`int`, *optional*, defaults to `[0.229, 0.224, 0.225]`):
|
|
||||||
The sequence of standard deviations for each channel, to be used when normalizing images. Defaults to the
|
|
||||||
ImageNet std.
|
|
||||||
ignore_index (`int`, *optional*):
|
|
||||||
Label to be assigned to background pixels in segmentation maps. If provided, segmentation map pixels
|
|
||||||
denoted with 0 (background) will be replaced with `ignore_index`. The ignore index of the loss function of
|
|
||||||
the model should then correspond to this ignore index.
|
|
||||||
reduce_labels (`bool`, *optional*, defaults to `False`):
|
|
||||||
Whether or not to decrement all label values of segmentation maps by 1. Usually used for datasets where 0
|
|
||||||
is used for background, and background itself is not included in all classes of a dataset (e.g. ADE20k).
|
|
||||||
The background label will be replaced by `ignore_index`.
|
|
||||||
|
|
||||||
"""
|
|
||||||
|
|
||||||
model_input_names = ["pixel_values", "pixel_mask"]
|
|
||||||
|
|
||||||
def __init__(
    self,
    do_resize=True,
    size=800,
    max_size=1333,
    resample=PILImageResampling.BILINEAR,
    size_divisibility=32,
    do_normalize=True,
    image_mean=None,
    image_std=None,
    ignore_index=None,
    reduce_labels=False,
    **kwargs
):
    """
    Store the preprocessing configuration on the instance; remaining keyword arguments go to the parent class.

    `image_mean` and `image_std` fall back to the ImageNet statistics when they are `None`. Every other
    argument is stored unchanged under an attribute of the same name.
    """
    super().__init__(**kwargs)

    if image_mean is None:
        image_mean = [0.485, 0.456, 0.406]  # ImageNet mean
    if image_std is None:
        image_std = [0.229, 0.224, 0.225]  # ImageNet std

    # Resizing options
    self.do_resize = do_resize
    self.size = size
    self.max_size = max_size
    self.resample = resample
    self.size_divisibility = size_divisibility

    # Normalization options
    self.do_normalize = do_normalize
    self.image_mean = image_mean
    self.image_std = image_std

    # Segmentation map options
    self.ignore_index = ignore_index
    self.reduce_labels = reduce_labels
|
|
||||||
|
|
||||||
def _resize_with_size_divisibility(self, image, size, target=None, max_size=None):
    """
    Resize the image to the given size, then round both sides up to a multiple of `self.size_divisibility`.

    Args:
        image:
            The image to resize. Anything that is not a `PIL.Image.Image` is first passed through
            `self.to_pil_image`.
        size (`int` or sequence):
            If an int, the smaller edge of the image is matched to this number and the aspect ratio is kept.
            If a list or tuple, it is used as `(width, height)` as is and `max_size` is not consulted.
        target (*optional*):
            Segmentation map to resize to the same final size, using nearest-neighbour resampling.
        max_size (`int`, *optional*):
            Upper bound for the longer edge when `size` is an int; the short edge is reduced so that the longer
            one does not exceed it.

    Returns:
        A pair `(image, target)`; `target` is `None` when no target was given.
    """
    if not isinstance(image, Image.Image):
        image = self.to_pil_image(image)

    def get_size_with_aspect_ratio(image_size, size, max_size=None):
        # `image_size` is PIL's (width, height); the result is (height, width).
        width, height = image_size
        if max_size is not None:
            min_original_size = float(min((width, height)))
            max_original_size = float(max((width, height)))
            # Shrink the requested short edge if the long edge would otherwise exceed `max_size`.
            if max_original_size / min_original_size * size > max_size:
                size = int(round(max_size * min_original_size / max_original_size))

        # The short edge already has the requested length: nothing to rescale.
        if (width <= height and width == size) or (height <= width and height == size):
            return (height, width)

        if width < height:
            output_width = size
            output_height = int(size * height / width)
        else:
            output_height = size
            output_width = int(size * width / height)

        return (output_height, output_width)

    def get_size(image_size, size, max_size=None):
        if isinstance(size, (list, tuple)):
            return size
        # size returned must be (width, height) since we use PIL to resize images
        # so we revert the tuple
        return get_size_with_aspect_ratio(image_size, size, max_size)[::-1]

    width, height = get_size(image.size, size, max_size)

    if self.size_divisibility > 0:
        # Round each side up to the next multiple of the divisor.
        height = int(np.ceil(height / self.size_divisibility)) * self.size_divisibility
        width = int(np.ceil(width / self.size_divisibility)) * self.size_divisibility

    size = (width, height)
    image = self.resize(image, size=size, resample=self.resample)

    if target is not None:
        # Nearest-neighbour keeps label ids intact. The enum is used instead of the module-level
        # `Image.NEAREST` alias for consistency with the rest of the file.
        target = self.resize(target, size=size, resample=PILImageResampling.NEAREST)

    return image, target
|
|
||||||
|
|
||||||
def __call__(
    self,
    images: ImageInput,
    segmentation_maps: ImageInput = None,
    pad_and_return_pixel_mask: Optional[bool] = True,
    instance_id_to_semantic_id: Optional[Union[List[Dict[int, int]], Dict[int, int]]] = None,
    return_tensors: Optional[Union[str, TensorType]] = None,
    **kwargs,
) -> BatchFeature:
    """
    Main method to prepare for the model one or several image(s) and optional annotations. Images are by default
    padded up to the largest image in a batch, and a pixel mask is created that indicates which pixels are
    real/which are padding.

    Segmentation maps can be instance, semantic or panoptic segmentation maps. In case of instance and panoptic
    segmentation, one needs to provide `instance_id_to_semantic_id`, which is a mapping from instance/segment ids
    to semantic category ids.

    MaskFormer addresses all 3 forms of segmentation (instance, semantic and panoptic) in the same way, namely by
    converting the segmentation maps to a set of binary masks with corresponding classes.

    In case of instance segmentation, the segmentation maps contain the instance ids, and
    `instance_id_to_semantic_id` maps instance IDs to their corresponding semantic category.

    In case of semantic segmentation, the segmentation maps contain the semantic category ids. Let's see an
    example, assuming `segmentation_maps = [[2,6,7,9]]`, the output will contain `mask_labels =
    [[1,0,0,0],[0,1,0,0],[0,0,1,0],[0,0,0,1]]` (four binary masks) and `class_labels = [2,6,7,9]`, the labels for
    each mask.

    In case of panoptic segmentation, the segmentation maps contain the segment ids, and
    `instance_id_to_semantic_id` maps segment IDs to their corresponding semantic category.

    <Tip warning={true}>

    NumPy arrays and PyTorch tensors are converted to PIL images when resizing, so the most efficient is to pass
    PIL images.

    </Tip>

    Args:
        images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
            The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
            tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a
            number of channels, H and W are image height and width. A list or tuple passed here is copied, never
            modified.

        segmentation_maps (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`, *optional*):
            The corresponding segmentation maps with the pixel-wise instance id, semantic id or segment id
            annotations. Assumed to be semantic segmentation maps if no `instance_id_to_semantic_id map` is
            provided.

        pad_and_return_pixel_mask (`bool`, *optional*, defaults to `True`):
            Whether or not to pad images up to the largest image in a batch and create a pixel mask. It is
            forwarded to `encode_inputs`.

            If left to the default, will return a pixel mask that is:

            - 1 for pixels that are real (i.e. **not masked**),
            - 0 for pixels that are padding (i.e. **masked**).

        instance_id_to_semantic_id (`List[Dict[int, int]]` or `Dict[int, int]`, *optional*):
            A mapping between instance/segment ids and semantic category ids. If passed, `segmentation_maps` is
            treated as an instance or panoptic segmentation map where each pixel represents an instance or segment
            id. Can be provided as a single dictionary with a global / dataset-level mapping or as a list of
            dictionaries (one per image), to map instance ids in each image separately. Note that this assumes a
            mapping before reduction of labels.

        return_tensors (`str` or [`~file_utils.TensorType`], *optional*):
            Type of the returned tensors. Only `'pt'` / `TensorType.PYTORCH` is accepted; any other value raises
            a `ValueError`. NOTE(review): the default `None` is also handed to `TensorType(...)`, which
            presumably rejects it - confirm against `TensorType`.

    Returns:
        [`BatchFeature`]: A [`BatchFeature`] with the following fields:

        - **pixel_values** -- Pixel values to be fed to a model.
        - **pixel_mask** -- Pixel mask to be fed to a model (when `pad_and_return_pixel_mask=True` or if
          `pixel_mask` is in `self.model_input_names`).
        - **mask_labels** -- Optional list of mask labels of shape `(num_class_labels, height, width)` to be fed to
          a model (when `annotations` are provided).
        - **class_labels** -- Optional list of class labels of shape `(num_class_labels)` to be fed to a model
          (when `annotations` are provided). They identify the labels of `mask_labels`, e.g. the label of
          `mask_labels[i][j]` if `class_labels[i][j]`.
    """

    def is_image_like(value):
        return isinstance(value, (Image.Image, np.ndarray)) or is_torch_tensor(value)

    def is_valid_input(value):
        # Either a single image, or a list/tuple that is empty or starts with an image.
        if is_image_like(value):
            return True
        if isinstance(value, (list, tuple)):
            return len(value) == 0 or is_image_like(value[0])
        return False

    # Input type checking for clearer error
    if not is_valid_input(images):
        raise ValueError(
            "Images must of type `PIL.Image.Image`, `np.ndarray` or `torch.Tensor` (single example), "
            "`List[PIL.Image.Image]`, `List[np.ndarray]` or `List[torch.Tensor]` (batch of examples)."
        )
    if segmentation_maps is not None and not is_valid_input(segmentation_maps):
        raise ValueError(
            "Segmentation maps must of type `PIL.Image.Image`, `np.ndarray` or `torch.Tensor` (single"
            " example),`List[PIL.Image.Image]`, `List[np.ndarray]` or `List[torch.Tensor]` (batch of"
            " examples)."
        )

    # Reject an unsupported output type before doing any resizing / normalization / padding work.
    tensor_type = return_tensors
    if not isinstance(tensor_type, TensorType):
        tensor_type = TensorType(tensor_type)
    if not tensor_type == TensorType.PYTORCH:
        raise ValueError("Only PyTorch is supported for the moment.")
    if not is_torch_available():
        raise ImportError("Unable to convert output to PyTorch tensors format, PyTorch is not installed.")

    # NOTE: an empty list passes the validation above but fails on `images[0]` here.
    is_batched = bool(isinstance(images, (list, tuple)) and is_image_like(images[0]))

    if is_batched:
        # Work on private lists: the resize loop assigns by index, which would fail on tuples and would
        # otherwise overwrite the caller's own list.
        images = list(images)
        if isinstance(segmentation_maps, (list, tuple)):
            segmentation_maps = list(segmentation_maps)
    else:
        images = [images]
        if segmentation_maps is not None:
            segmentation_maps = [segmentation_maps]

    # transformations (resizing + normalization)
    if self.do_resize and self.size is not None:
        if segmentation_maps is not None:
            for idx, (image, target) in enumerate(zip(images, segmentation_maps)):
                image, target = self._resize_with_size_divisibility(
                    image=image, target=target, size=self.size, max_size=self.max_size
                )
                images[idx] = image
                segmentation_maps[idx] = target
        else:
            for idx, image in enumerate(images):
                images[idx] = self._resize_with_size_divisibility(
                    image=image, target=None, size=self.size, max_size=self.max_size
                )[0]

    if self.do_normalize:
        images = [self.normalize(image=image, mean=self.image_mean, std=self.image_std) for image in images]

    # NOTE: images always get padded here since they have to be stacked in the batch dimension
    return self.encode_inputs(
        images,
        segmentation_maps,
        pad_and_return_pixel_mask,
        instance_id_to_semantic_id=instance_id_to_semantic_id,
        return_tensors=return_tensors,
    )
|
|
||||||
|
|
||||||
def _max_by_axis(self, the_list: List[List[int]]) -> List[int]:
|
|
||||||
maxes = the_list[0]
|
|
||||||
for sublist in the_list[1:]:
|
|
||||||
for index, item in enumerate(sublist):
|
|
||||||
maxes[index] = max(maxes[index], item)
|
|
||||||
return maxes
|
|
||||||
|
|
||||||
def convert_segmentation_map_to_binary_masks(
    self,
    segmentation_map: "np.ndarray",
    instance_id_to_semantic_id: Optional[Dict[int, int]] = None,
):
    """
    Split a segmentation map into one binary mask per id, together with the class label of every mask.

    Args:
        segmentation_map (`np.ndarray`):
            Map holding one id per pixel. It is not modified.
        instance_id_to_semantic_id (`Dict[int, int]`, *optional*):
            Mapping from the ids found in the map (before label reduction) to class ids (before label
            reduction). When omitted, the ids of the map are used as class labels directly.

    Raises:
        `ValueError`: Raised when `self.reduce_labels` is set while `self.ignore_index` is `None`.

    Returns:
        A pair `(binary_masks, labels)`: a `float32` array of shape `(num_labels, *segmentation_map.shape)` and
        an `int64` array of shape `(num_labels,)`. `num_labels` is 0 when every pixel carries the ignore index.
    """
    # Reduce labels, if requested
    if self.reduce_labels:
        if self.ignore_index is None:
            raise ValueError("`ignore_index` must be set when `reduce_labels` is `True`.")
        # The reduction below works in place, so do it on a copy and leave the caller's array alone.
        segmentation_map = np.array(segmentation_map)
        # Background (0) becomes the ignore index; every other id shifts down by one, and the ignore index,
        # shifted along with them, is restored afterwards.
        segmentation_map[segmentation_map == 0] = self.ignore_index
        segmentation_map -= 1
        segmentation_map[segmentation_map == self.ignore_index - 1] = self.ignore_index

    # Get unique ids (instance, class ids or segment ids based on input)
    all_labels = np.unique(segmentation_map)

    # Remove ignored label
    if self.ignore_index is not None:
        all_labels = all_labels[all_labels != self.ignore_index]

    # Generate a binary mask for each object instance
    if all_labels.shape[0] > 0:
        binary_masks = np.stack([segmentation_map == i for i in all_labels], axis=0)
    else:
        # `np.stack` rejects an empty sequence; a map made only of ignored pixels yields zero masks.
        binary_masks = np.zeros((0, *segmentation_map.shape), dtype=bool)

    # Convert instance/segment ids to class ids
    if instance_id_to_semantic_id is not None:
        labels = np.zeros(all_labels.shape[0])
        for index, label in enumerate(all_labels):
            # The mapping is expressed in un-reduced ids on both sides, hence the +1 / -1.
            class_id = instance_id_to_semantic_id[label + 1 if self.reduce_labels else label]
            labels[index] = class_id - 1 if self.reduce_labels else class_id
    else:
        labels = all_labels

    return binary_masks.astype(np.float32), labels.astype(np.int64)
|
|
||||||
|
|
||||||
def encode_inputs(
    self,
    pixel_values_list: Union[List["np.ndarray"], List["torch.Tensor"]],
    segmentation_maps: ImageInput = None,
    pad_and_return_pixel_mask: bool = True,
    instance_id_to_semantic_id: Optional[Union[List[Dict[int, int]], Dict[int, int]]] = None,
    return_tensors: Optional[Union[str, TensorType]] = None,
):
    """
    Encode a list of pixel values and an optional list of corresponding segmentation maps.

    This method is useful if you have resized and normalized your images and segmentation maps yourself, using a
    library like [torchvision](https://pytorch.org/vision/stable/transforms.html) or
    [albumentations](https://albumentations.ai/).

    Images are padded up to the largest image in a batch, and a corresponding `pixel_mask` is created.

    Segmentation maps can be instance, semantic or panoptic segmentation maps. In case of instance and panoptic
    segmentation, one needs to provide `instance_id_to_semantic_id`, which is a mapping from instance/segment ids
    to semantic category ids.

    MaskFormer addresses all 3 forms of segmentation (instance, semantic and panoptic) in the same way, namely by
    converting the segmentation maps to a set of binary masks with corresponding classes.

    In case of instance segmentation, the segmentation maps contain the instance ids, and
    `instance_id_to_semantic_id` maps instance IDs to their corresponding semantic category.

    In case of semantic segmentation, the segmentation maps contain the semantic category ids. Let's see an
    example, assuming `segmentation_maps = [[2,6,7,9]]`, the output will contain `mask_labels =
    [[1,0,0,0],[0,1,0,0],[0,0,1,0],[0,0,0,1]]` (four binary masks) and `class_labels = [2,6,7,9]`, the labels for
    each mask.

    In case of panoptic segmentation, the segmentation maps contain the segment ids, and
    `instance_id_to_semantic_id` maps segment IDs to their corresponding semantic category.

    Args:
        pixel_values_list (`List[np.ndarray]` or `List[torch.Tensor]`):
            List of images (pixel values) to be padded. Each image should be a tensor of shape `(channels, height,
            width)`.

        segmentation_maps (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`, *optional*):
            The corresponding segmentation maps with the pixel-wise instance id, semantic id or segment id
            annotations. Assumed to be semantic segmentation maps if no `instance_id_to_semantic_id map` is
            provided.

        pad_and_return_pixel_mask (`bool`, *optional*, defaults to `True`):
            Whether or not to pad images up to the largest image in a batch and create a pixel mask. This method
            does not consult the flag: padding and the pixel mask are always produced.

            The pixel mask is:

            - 1 for pixels that are real (i.e. **not masked**),
            - 0 for pixels that are padding (i.e. **masked**).

        instance_id_to_semantic_id (`List[Dict[int, int]]` or `Dict[int, int]`, *optional*):
            A mapping between instance/segment ids and semantic category ids. If passed, `segmentation_maps` is
            treated as an instance or panoptic segmentation map where each pixel represents an instance or segment
            id. Can be provided as a single dictionary with a global / dataset-level mapping or as a list of
            dictionaries (one per image), to map instance ids in each image separately. Note that this assumes a
            mapping before reduction of labels.

        return_tensors (`str` or [`~file_utils.TensorType`], *optional*):
            Forwarded to [`BatchFeature`] as the tensor type of `pixel_values` and `pixel_mask`. `mask_labels`
            and `class_labels` are always lists of `torch.Tensor`.

    Returns:
        [`BatchFeature`]: A [`BatchFeature`] with the following fields:

        - **pixel_values** -- Pixel values to be fed to a model.
        - **pixel_mask** -- Pixel mask to be fed to a model.
        - **mask_labels** -- Optional list of mask labels of shape `(labels, height, width)` to be fed to a model
          (when `segmentation_maps` are provided).
        - **class_labels** -- Optional list of class labels of shape `(labels)` to be fed to a model (when
          `segmentation_maps` are provided). They identify the labels of `mask_labels`, e.g. the label of
          `mask_labels[i][j]` if `class_labels[i][j]`.
    """
    # Largest extent of the batch along every axis: (channels, height, width)
    channels, height, width = self._max_by_axis([list(image.shape) for image in pixel_values_list])

    annotations = None
    if segmentation_maps is not None:
        annotations = []
        for i, segmentation_map in enumerate(map(np.array, segmentation_maps)):
            # Use the per-image mapping when a list of mappings was given, the shared one otherwise
            if isinstance(instance_id_to_semantic_id, list):
                mapping = instance_id_to_semantic_id[i]
            else:
                mapping = instance_id_to_semantic_id
            masks, classes = self.convert_segmentation_map_to_binary_masks(segmentation_map, mapping)
            annotations.append({"masks": masks, "classes": classes})

    # Masks are padded with `ignore_index`. Without one, pad with 0 instead of handing `None` to `np.pad`.
    mask_pad_value = 0 if self.ignore_index is None else self.ignore_index

    pixel_values = []
    pixel_mask = []
    for idx, image in enumerate(pixel_values_list):
        # Extent of the real content; remembered before padding so the pixel mask can mark it.
        real_height, real_width = image.shape[1], image.shape[2]

        # create padded image
        padded_image = np.zeros((channels, height, width), dtype=np.float32)
        padded_image[: image.shape[0], :real_height, :real_width] = np.asarray(image)
        pixel_values.append(padded_image)

        # if we have a target, pad it
        if annotations:
            annotation = annotations[idx]
            masks = annotation["masks"]
            annotation["masks"] = np.pad(
                masks,
                ((0, 0), (0, height - masks.shape[1]), (0, width - masks.shape[2])),
                constant_values=mask_pad_value,
            )

        # create pixel mask: 1 over the real content, 0 over the padding
        mask = np.zeros((height, width), dtype=np.int64)
        mask[:real_height, :real_width] = True
        pixel_mask.append(mask)

    # return as BatchFeature
    data = {"pixel_values": pixel_values, "pixel_mask": pixel_mask}
    encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors)

    # we cannot batch them since they don't share a common class size
    if annotations:
        encoded_inputs["mask_labels"] = [torch.from_numpy(label["masks"]) for label in annotations]
        encoded_inputs["class_labels"] = [torch.from_numpy(label["classes"]) for label in annotations]

    return encoded_inputs
|
|
||||||
|
|
||||||
def post_process_segmentation(
    self, outputs: "MaskFormerForInstanceSegmentationOutput", target_size: Optional[Tuple[int, int]] = None
) -> "torch.Tensor":
    """
    Converts the output of [`MaskFormerForInstanceSegmentationOutput`] into image segmentation predictions. Only
    supports PyTorch.

    This method is deprecated and emits a `FutureWarning` on every call; use
    `post_process_instance_segmentation` instead.

    Args:
        outputs ([`MaskFormerForInstanceSegmentationOutput`]):
            The outputs from [`MaskFormerForInstanceSegmentation`].

        target_size (`Tuple[int, int]`, *optional*):
            If set, the `masks_queries_logits` will be resized to `target_size`.

    Returns:
        `torch.Tensor`:
            A tensor of shape (`batch_size, num_class_labels, height, width`).
    """
    # Imported locally so the block does not depend on a module-level `warnings` import.
    import warnings

    # `FutureWarning` is a warning category, not a log-format argument: handing it to `logger.warning` makes
    # the logging module try `msg % (FutureWarning,)`, which fails because the message has no placeholder.
    warnings.warn(
        "`post_process_segmentation` is deprecated and will be removed in v5 of Transformers, please use"
        " `post_process_instance_segmentation`",
        FutureWarning,
    )

    # class_queries_logits has shape [BATCH, QUERIES, CLASSES + 1]
    class_queries_logits = outputs.class_queries_logits
    # masks_queries_logits has shape [BATCH, QUERIES, HEIGHT, WIDTH]
    masks_queries_logits = outputs.masks_queries_logits
    if target_size is not None:
        masks_queries_logits = interpolate(
            masks_queries_logits,
            size=target_size,
            mode="bilinear",
            align_corners=False,
        )
    # remove the null class `[..., :-1]`
    masks_classes = class_queries_logits.softmax(dim=-1)[..., :-1]
    # mask probs has shape [BATCH, QUERIES, HEIGHT, WIDTH]
    masks_probs = masks_queries_logits.sigmoid()
    # now we want to sum over the queries,
    # $ out_{c,h,w} = \sum_q p_{q,c} * m_{q,h,w} $
    # where $ softmax(p) \in R^{q, c} $ is the mask classes
    # and $ sigmoid(m) \in R^{q, h, w}$ is the mask probabilities
    # b(atch)q(uery)c(lasses), b(atch)q(uery)h(eight)w(idth)
    segmentation = torch.einsum("bqc, bqhw -> bchw", masks_classes, masks_probs)

    return segmentation
|
|
||||||
|
|
||||||
def post_process_semantic_segmentation(
    self, outputs, target_sizes: Optional[List[Tuple[int, int]]] = None
) -> "torch.Tensor":
    """
    Turns raw [`MaskFormerForInstanceSegmentation`] outputs into one semantic segmentation map per image. Only
    supports PyTorch.

    Args:
        outputs ([`MaskFormerForInstanceSegmentation`]):
            Raw outputs of the model; `class_queries_logits` and `masks_queries_logits` are read from it.
        target_sizes (`List[Tuple[int, int]]`, *optional*):
            One `(height, width)` per batch item. When given, the per-class scores of each image are bilinearly
            resized to that size before the arg-max is taken. When `None`, maps keep the mask resolution.

    Returns:
        `List[torch.Tensor]`:
            A list with one entry per batch item. Each entry is a `(height, width)` tensor whose values are
            semantic class ids.

    Raises:
        `ValueError`: If `target_sizes` is given and its length differs from the batch size.
    """
    class_logits = outputs.class_queries_logits  # [batch_size, num_queries, num_classes+1]
    mask_logits = outputs.masks_queries_logits  # [batch_size, num_queries, height, width]

    # The last class index is the null class; drop it before mixing class and mask probabilities.
    class_probs = class_logits.softmax(dim=-1)[..., :-1]
    mask_probs = mask_logits.sigmoid()

    # Per-class score maps of shape (batch_size, num_classes, height, width).
    class_score_maps = torch.einsum("bqc, bqhw -> bchw", class_probs, mask_probs)
    num_images = class_logits.shape[0]

    if target_sizes is None:
        label_maps = class_score_maps.argmax(dim=1)
        return [label_maps[index] for index in range(label_maps.shape[0])]

    if num_images != len(target_sizes):
        raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the logits")

    per_image_maps = []
    for index in range(num_images):
        # `interpolate` expects a batch dimension, hence the temporary unsqueeze.
        upsampled = torch.nn.functional.interpolate(
            class_score_maps[index].unsqueeze(dim=0),
            size=target_sizes[index],
            mode="bilinear",
            align_corners=False,
        )
        per_image_maps.append(upsampled[0].argmax(dim=0))
    return per_image_maps
|
|
||||||
|
|
||||||
def post_process_instance_segmentation(
    self,
    outputs,
    threshold: float = 0.5,
    mask_threshold: float = 0.5,
    overlap_mask_area_threshold: float = 0.8,
    target_sizes: Optional[List[Tuple[int, int]]] = None,
    return_coco_annotation: Optional[bool] = False,
) -> List[Dict]:
    """
    Converts the output of [`MaskFormerForInstanceSegmentationOutput`] into instance segmentation predictions. Only
    supports PyTorch.

    Args:
        outputs ([`MaskFormerForInstanceSegmentation`]):
            Raw outputs of the model.
        threshold (`float`, *optional*, defaults to 0.5):
            The probability score threshold to keep predicted instance masks.
        mask_threshold (`float`, *optional*, defaults to 0.5):
            Threshold to use when turning the predicted masks into binary values.
        overlap_mask_area_threshold (`float`, *optional*, defaults to 0.8):
            The overlap mask area threshold to merge or discard small disconnected parts within each binary
            instance mask.
        target_sizes (`List[Tuple]`, *optional*):
            List of length (batch_size), where each list item (`Tuple[int, int]`) corresponds to the requested
            final size (height, width) of each prediction. If left to None, predictions will not be resized.
        return_coco_annotation (`bool`, *optional*, defaults to `False`):
            If set to `True`, segmentation maps are returned in COCO run-length encoding (RLE) format.

    Returns:
        `List[Dict]`: A list of dictionaries, one per image, each dictionary containing two keys:
        - **segmentation** -- A tensor of shape `(height, width)` where each pixel represents a `segment_id`, or
          the run-length encoding (RLE) of the segmentation map if `return_coco_annotation` is set to `True`.
          If no mask is found above `threshold`, a `(height, width)` tensor filled with `-1` is returned
          instead (never RLE-encoded).
        - **segments_info** -- Additional information on each segment (an empty list when no mask is kept).
            - **id** -- An integer representing the `segment_id`.
            - **label_id** -- An integer representing the label / semantic class id corresponding to `segment_id`.
            - **score** -- Prediction score of segment with `segment_id`.
    """
    class_queries_logits = outputs.class_queries_logits  # [batch_size, num_queries, num_classes+1]
    masks_queries_logits = outputs.masks_queries_logits  # [batch_size, num_queries, height, width]

    batch_size = class_queries_logits.shape[0]
    num_labels = class_queries_logits.shape[-1] - 1

    mask_probs = masks_queries_logits.sigmoid()  # [batch_size, num_queries, height, width]

    # Predicted label and score of each query (batch_size, num_queries)
    pred_scores, pred_labels = nn.functional.softmax(class_queries_logits, dim=-1).max(-1)

    # Loop over items in batch size
    results: List[Dict] = []

    for i in range(batch_size):
        mask_probs_item, pred_scores_item, pred_labels_item = remove_low_and_no_objects(
            mask_probs[i], pred_scores[i], pred_labels[i], threshold, num_labels
        )

        # No mask found: emit a map filled with -1 at the requested (or native) resolution
        if mask_probs_item.shape[0] <= 0:
            height, width = target_sizes[i] if target_sizes is not None else mask_probs_item.shape[1:]
            segmentation = torch.zeros((height, width)) - 1
            results.append({"segmentation": segmentation, "segments_info": []})
            continue

        # Get segmentation map and segment information of batch item
        target_size = target_sizes[i] if target_sizes is not None else None
        segmentation, segments = compute_segments(
            mask_probs_item,
            pred_scores_item,
            pred_labels_item,
            mask_threshold,
            overlap_mask_area_threshold,
            # The sixth positional slot of `compute_segments` is the set of label ids to fuse (see the panoptic
            # post-processing call). Instance segmentation fuses nothing, so pass an empty set to keep
            # `target_size` in its own (seventh) slot instead of being mistaken for label ids.
            set(),
            target_size,
        )

        # Return segmentation map in run-length encoding (RLE) format
        if return_coco_annotation:
            segmentation = convert_segmentation_to_rle(segmentation)

        results.append({"segmentation": segmentation, "segments_info": segments})
    return results
|
|
||||||
|
|
||||||
def post_process_panoptic_segmentation(
    self,
    outputs,
    threshold: float = 0.5,
    mask_threshold: float = 0.5,
    overlap_mask_area_threshold: float = 0.8,
    label_ids_to_fuse: Optional[Set[int]] = None,
    target_sizes: Optional[List[Tuple[int, int]]] = None,
) -> List[Dict]:
    """
    Builds panoptic segmentation predictions from [`MaskFormerForInstanceSegmentationOutput`]. Only supports
    PyTorch.

    Args:
        outputs ([`MaskFormerForInstanceSegmentationOutput`]):
            The outputs from [`MaskFormerForInstanceSegmentation`].
        threshold (`float`, *optional*, defaults to 0.5):
            Minimum class probability a query needs for its mask to be kept.
        mask_threshold (`float`, *optional*, defaults to 0.5):
            Threshold used when binarising the predicted masks.
        overlap_mask_area_threshold (`float`, *optional*, defaults to 0.8):
            Overlap mask area threshold used to merge or discard small disconnected parts within each binary
            instance mask.
        label_ids_to_fuse (`Set[int]`, *optional*):
            Label ids whose instances are fused into a single segment (for example a single "sky" region, as
            opposed to several distinct persons). When `None`, a warning is logged and nothing is fused.
        target_sizes (`List[Tuple]`, *optional*):
            One `(height, width)` per batch item giving the requested size of each prediction. When `None`,
            predictions are not resized.

    Returns:
        `List[Dict]`: One dictionary per image with two keys:
        - **segmentation** -- a `(height, width)` tensor in which each pixel holds a `segment_id`. When no mask
          passes `threshold`, the tensor is filled with `-1`. If `target_sizes` is given, the map has the
          corresponding target size.
        - **segments_info** -- information on each segment (an empty list when no mask is kept).
            - **id** -- an integer representing the `segment_id`.
            - **label_id** -- an integer representing the label / semantic class id of `segment_id`.
            - **was_fused** -- `True` if `label_id` was in `label_ids_to_fuse`, `False` otherwise.
            - **score** -- prediction score of the segment with `segment_id`.
    """
    if label_ids_to_fuse is None:
        logger.warning("`label_ids_to_fuse` unset. No instance will be fused.")
        label_ids_to_fuse = set()

    class_logits = outputs.class_queries_logits  # [batch_size, num_queries, num_classes+1]
    mask_logits = outputs.masks_queries_logits  # [batch_size, num_queries, height, width]

    num_images = class_logits.shape[0]
    num_labels = class_logits.shape[-1] - 1

    all_mask_probs = mask_logits.sigmoid()  # [batch_size, num_queries, height, width]

    # Best class and its probability for every query: both (batch_size, num_queries)
    best_scores, best_labels = nn.functional.softmax(class_logits, dim=-1).max(-1)

    resize_requested = target_sizes is not None
    predictions = []

    for image_index in range(num_images):
        kept_masks, kept_scores, kept_labels = remove_low_and_no_objects(
            all_mask_probs[image_index],
            best_scores[image_index],
            best_labels[image_index],
            threshold,
            num_labels,
        )

        if kept_masks.shape[0] > 0:
            # At least one query survived: build the segment map for this image.
            requested_size = target_sizes[image_index] if resize_requested else None
            segment_map, segment_infos = compute_segments(
                kept_masks,
                kept_scores,
                kept_labels,
                mask_threshold,
                overlap_mask_area_threshold,
                label_ids_to_fuse,
                requested_size,
            )
            predictions.append({"segmentation": segment_map, "segments_info": segment_infos})
        else:
            # Nothing survived: return a map filled with -1 and no segment information.
            height, width = target_sizes[image_index] if resize_requested else kept_masks.shape[1:]
            empty_map = torch.zeros((height, width)) - 1
            predictions.append({"segmentation": empty_map, "segments_info": []})

    return predictions
|
|
||||||
|
|||||||
1143
src/transformers/models/maskformer/image_processing_maskformer.py
Normal file
1143
src/transformers/models/maskformer/image_processing_maskformer.py
Normal file
File diff suppressed because it is too large
Load Diff
@@ -47,6 +47,7 @@ except OptionalDependencyNotAvailable:
|
|||||||
pass
|
pass
|
||||||
else:
|
else:
|
||||||
_import_structure["feature_extraction_owlvit"] = ["OwlViTFeatureExtractor"]
|
_import_structure["feature_extraction_owlvit"] = ["OwlViTFeatureExtractor"]
|
||||||
|
_import_structure["image_processing_owlvit"] = ["OwlViTImageProcessor"]
|
||||||
|
|
||||||
try:
|
try:
|
||||||
if not is_torch_available():
|
if not is_torch_available():
|
||||||
@@ -80,6 +81,7 @@ if TYPE_CHECKING:
|
|||||||
pass
|
pass
|
||||||
else:
|
else:
|
||||||
from .feature_extraction_owlvit import OwlViTFeatureExtractor
|
from .feature_extraction_owlvit import OwlViTFeatureExtractor
|
||||||
|
from .image_processing_owlvit import OwlViTImageProcessor
|
||||||
|
|
||||||
try:
|
try:
|
||||||
if not is_torch_available():
|
if not is_torch_available():
|
||||||
|
|||||||
@@ -14,317 +14,11 @@
|
|||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
"""Feature extractor class for OwlViT."""
|
"""Feature extractor class for OwlViT."""
|
||||||
|
|
||||||
from typing import List, Optional, Union
|
from ...utils import logging
|
||||||
|
from .image_processing_owlvit import OwlViTImageProcessor
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
from PIL import Image
|
|
||||||
|
|
||||||
from transformers.image_utils import PILImageResampling
|
|
||||||
|
|
||||||
from ...feature_extraction_utils import BatchFeature, FeatureExtractionMixin
|
|
||||||
from ...image_transforms import center_to_corners_format
|
|
||||||
from ...image_utils import ImageFeatureExtractionMixin
|
|
||||||
from ...utils import TensorType, is_torch_available, is_torch_tensor, logging
|
|
||||||
|
|
||||||
|
|
||||||
if is_torch_available():
|
|
||||||
import torch
|
|
||||||
|
|
||||||
logger = logging.get_logger(__name__)
|
logger = logging.get_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
# Copied from transformers.models.detr.modeling_detr._upcast
|
OwlViTFeatureExtractor = OwlViTImageProcessor
|
||||||
def _upcast(t):
    """
    Return `t` in a dtype wide enough to protect multiplications from numerical overflow.

    Floating-point tensors other than float32/float64 are converted with `.float()`; non-floating tensors other
    than int32/int64 are converted with `.int()`. Tensors that already have one of those dtypes are returned
    unchanged.
    """
    if t.is_floating_point():
        already_wide = t.dtype in (torch.float32, torch.float64)
        return t if already_wide else t.float()

    already_wide = t.dtype in (torch.int32, torch.int64)
    return t if already_wide else t.int()
|
|
||||||
|
|
||||||
|
|
||||||
def box_area(boxes):
    """
    Computes the area of each bounding box in a set given in (x1, y1, x2, y2) corner format.

    Args:
        boxes (`torch.FloatTensor` of shape `(number_of_boxes, 4)`):
            Boxes for which the area will be computed. They are expected to be in (x1, y1, x2, y2) format with
            `0 <= x1 < x2` and `0 <= y1 < y2`.

    Returns:
        `torch.FloatTensor`: a tensor containing the area for each box.
    """
    # Upcast first so that width * height is computed in a wider dtype.
    wide_boxes = _upcast(boxes)
    widths = wide_boxes[:, 2] - wide_boxes[:, 0]
    heights = wide_boxes[:, 3] - wide_boxes[:, 1]
    return widths * heights
|
|
||||||
|
|
||||||
|
|
||||||
def box_iou(boxes1, boxes2):
    """
    Computes the pairwise intersection-over-union between two sets of (x1, y1, x2, y2) boxes.

    Args:
        boxes1: boxes indexed as `[N, 4]`.
        boxes2: boxes indexed as `[M, 4]`.

    Returns:
        A tuple `(iou, union)`, both indexed as `[N, M]`.
    """
    areas_first = box_area(boxes1)
    areas_second = box_area(boxes2)

    # Corners of the overlap rectangle for every (box1, box2) pair: [N,M,2]
    overlap_top_left = torch.max(boxes1[:, None, :2], boxes2[:, :2])
    overlap_bottom_right = torch.min(boxes1[:, None, 2:], boxes2[:, 2:])

    # Disjoint boxes give negative extents; clamp them to zero so their intersection is empty.
    overlap_sizes = (overlap_bottom_right - overlap_top_left).clamp(min=0)  # [N,M,2]
    intersection = overlap_sizes[:, :, 0] * overlap_sizes[:, :, 1]  # [N,M]

    union = areas_first[:, None] + areas_second - intersection

    return intersection / union, union
|
|
||||||
|
|
||||||
|
|
||||||
class OwlViTFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin):
    r"""
    Constructs an OWL-ViT feature extractor.

    This feature extractor inherits from [`FeatureExtractionMixin`] which contains most of the main methods. Users
    should refer to this superclass for more information regarding those methods.

    Args:
        do_resize (`bool`, *optional*, defaults to `True`):
            Whether to resize the input to a certain `size`.
        size (`int` or `Tuple[int, int]`, *optional*, defaults to (768, 768)):
            The size to use for resizing the image. Only has an effect if `do_resize` is set to `True`. If `size` is a
            sequence like (h, w), output size will be matched to this. If `size` is an int, then image will be resized
            to (size, size).
        resample (`int`, *optional*, defaults to `PIL.Image.Resampling.BICUBIC`):
            An optional resampling filter. This can be one of `PIL.Image.Resampling.NEAREST`,
            `PIL.Image.Resampling.BOX`, `PIL.Image.Resampling.BILINEAR`, `PIL.Image.Resampling.HAMMING`,
            `PIL.Image.Resampling.BICUBIC` or `PIL.Image.Resampling.LANCZOS`. Only has an effect if `do_resize` is set
            to `True`.
        do_center_crop (`bool`, *optional*, defaults to `False`):
            Whether to crop the input at the center. If the input size is smaller than `crop_size` along any edge, the
            image is padded with 0's and then center cropped.
        crop_size (`int`, *optional*, defaults to 768):
            Desired output size when applying center-cropping. Only has an effect if `do_center_crop` is set to
            `True`.
        do_normalize (`bool`, *optional*, defaults to `True`):
            Whether or not to normalize the input with `image_mean` and `image_std`.
        image_mean (`List[float]`, *optional*, defaults to `[0.48145466, 0.4578275, 0.40821073]`):
            The sequence of means for each channel, to be used when normalizing images.
        image_std (`List[float]`, *optional*, defaults to `[0.26862954, 0.26130258, 0.27577711]`):
            The sequence of standard deviations for each channel, to be used when normalizing images.
    """

    model_input_names = ["pixel_values"]

    def __init__(
        self,
        do_resize=True,
        size=(768, 768),
        resample=PILImageResampling.BICUBIC,
        crop_size=768,
        do_center_crop=False,
        do_normalize=True,
        image_mean=None,
        image_std=None,
        **kwargs
    ):
        # Early versions of the OWL-ViT config on the hub had "rescale" as a flag. This clashes with the
        # vision feature extractor method `rescale` as it would be set as an attribute during the super().__init__
        # call. This is for backwards compatibility.
        if "rescale" in kwargs:
            rescale_val = kwargs.pop("rescale")
            kwargs["do_rescale"] = rescale_val

        super().__init__(**kwargs)
        self.size = size
        self.resample = resample
        self.crop_size = crop_size
        self.do_resize = do_resize
        self.do_center_crop = do_center_crop
        self.do_normalize = do_normalize
        self.image_mean = image_mean if image_mean is not None else [0.48145466, 0.4578275, 0.40821073]
        self.image_std = image_std if image_std is not None else [0.26862954, 0.26130258, 0.27577711]

    def post_process(self, outputs, target_sizes):
        """
        Converts the output of [`OwlViTForObjectDetection`] into the format expected by the COCO api.

        Args:
            outputs ([`OwlViTObjectDetectionOutput`]):
                Raw outputs of the model.
            target_sizes (`torch.Tensor`):
                Tensor of shape (batch_size, 2) where each entry is the (height, width) of the corresponding image in
                the batch. Predicted normalized bounding boxes are rescaled to these sizes.

        Returns:
            `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image
            in the batch as predicted by the model.

        Raises:
            `ValueError`: If `target_sizes` does not have one `(height, width)` row per batch item.
        """
        logits, boxes = outputs.logits, outputs.pred_boxes

        if len(logits) != len(target_sizes):
            raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the logits")
        if target_sizes.shape[1] != 2:
            raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch")

        # Best-scoring label per predicted box; the sigmoid turns its logit into a score.
        probs = torch.max(logits, dim=-1)
        scores = torch.sigmoid(probs.values)
        labels = probs.indices

        # Convert to [x0, y0, x1, y1] format
        boxes = center_to_corners_format(boxes)

        # Convert from relative [0, 1] to absolute [0, height] coordinates
        img_h, img_w = target_sizes.unbind(1)
        scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1)
        boxes = boxes * scale_fct[:, None, :]

        results = [{"scores": s, "labels": l, "boxes": b} for s, l, b in zip(scores, labels, boxes)]

        return results

    def post_process_image_guided_detection(self, outputs, threshold=0.6, nms_threshold=0.3, target_sizes=None):
        """
        Converts the output of [`OwlViTForObjectDetection.image_guided_detection`] into the format expected by the COCO
        api.

        Args:
            outputs ([`OwlViTImageGuidedObjectDetectionOutput`]):
                Raw outputs of the model.
            threshold (`float`, *optional*, defaults to 0.6):
                Minimum confidence threshold to use to filter out predicted boxes.
            nms_threshold (`float`, *optional*, defaults to 0.3):
                IoU threshold for non-maximum suppression of overlapping boxes.
            target_sizes (`torch.Tensor`, *optional*):
                Tensor of shape (batch_size, 2) where each entry is the (height, width) of the corresponding image in
                the batch. If set, predicted normalized bounding boxes are rescaled to the target sizes. If left to
                None, predictions will not be unnormalized.

        Returns:
            `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image
            in the batch as predicted by the model. All labels are set to None as
            `OwlViTForObjectDetection.image_guided_detection` perform one-shot object detection.

        Raises:
            `ValueError`: If `target_sizes` is given but does not have one `(height, width)` row per batch item.
        """
        logits, target_boxes = outputs.logits, outputs.target_pred_boxes

        # `target_sizes` is optional: only validate it (and later rescale with it) when it was provided.
        if target_sizes is not None:
            if len(logits) != len(target_sizes):
                raise ValueError(
                    "Make sure that you pass in as many target sizes as the batch dimension of the logits"
                )
            if target_sizes.shape[1] != 2:
                raise ValueError(
                    "Each element of target_sizes must contain the size (h, w) of each image of the batch"
                )

        probs = torch.max(logits, dim=-1)
        scores = torch.sigmoid(probs.values)

        # Convert to [x0, y0, x1, y1] format
        target_boxes = center_to_corners_format(target_boxes)

        # Apply non-maximum suppression (NMS)
        if nms_threshold < 1.0:
            for idx in range(target_boxes.shape[0]):
                # Visit boxes from highest to lowest score; already-suppressed boxes (score 0) are skipped.
                for i in torch.argsort(-scores[idx]):
                    if not scores[idx][i]:
                        continue

                    ious = box_iou(target_boxes[idx][i, :].unsqueeze(0), target_boxes[idx])[0][0]
                    ious[i] = -1.0  # Mask self-IoU.
                    scores[idx][ious > nms_threshold] = 0.0

        # Convert from relative [0, 1] to absolute [0, height] coordinates
        if target_sizes is not None:
            img_h, img_w = target_sizes.unbind(1)
            scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1)
            target_boxes = target_boxes * scale_fct[:, None, :]

        # Compute box display alphas based on prediction scores
        results = []
        alphas = torch.zeros_like(scores)

        for idx in range(target_boxes.shape[0]):
            # Select scores for boxes matching the current query:
            query_scores = scores[idx]
            # NOTE(review): an image whose scores are all zero is skipped entirely, so `results` can be shorter
            # than the batch - confirm callers expect this.
            if not query_scores.nonzero().numel():
                continue

            # Scale box alpha such that the best box for each query has alpha 1.0 and the worst box has alpha 0.1.
            # All other boxes will either belong to a different query, or will not be shown.
            max_score = torch.max(query_scores) + 1e-6
            query_alphas = (query_scores - (max_score * 0.1)) / (max_score * 0.9)
            query_alphas[query_alphas < threshold] = 0.0
            query_alphas = torch.clip(query_alphas, 0.0, 1.0)
            alphas[idx] = query_alphas

            mask = alphas[idx] > 0
            box_scores = alphas[idx][mask]
            boxes = target_boxes[idx][mask]
            results.append({"scores": box_scores, "labels": None, "boxes": boxes})

        return results

    def __call__(
        self,
        images: Union[
            Image.Image, np.ndarray, "torch.Tensor", List[Image.Image], List[np.ndarray], List["torch.Tensor"]  # noqa
        ],
        return_tensors: Optional[Union[str, TensorType]] = None,
        **kwargs
    ) -> BatchFeature:
        """
        Main method to prepare for the model one or several image(s).

        <Tip warning={true}>

        NumPy arrays and PyTorch tensors are converted to PIL images when resizing, so the most efficient is to pass
        PIL images.

        </Tip>

        Args:
            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
                tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W) or (H, W, C),
                where C is a number of channels, H and W are image height and width.

            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors of a particular framework. Acceptable values are:
                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return NumPy `np.ndarray` objects.
                - `'jax'`: Return JAX `jnp.ndarray` objects.

        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **pixel_values** -- Pixel values to be fed to a model.

        Raises:
            `ValueError`: If `images` is neither a supported image type nor a list/tuple whose first element is one.
        """
        # Input type checking for clearer error
        valid_images = False

        # Check that images has a valid type
        if isinstance(images, (Image.Image, np.ndarray)) or is_torch_tensor(images):
            valid_images = True
        elif isinstance(images, (list, tuple)):
            if isinstance(images[0], (Image.Image, np.ndarray)) or is_torch_tensor(images[0]):
                valid_images = True

        if not valid_images:
            raise ValueError(
                "Images must of type `PIL.Image.Image`, `np.ndarray` or `torch.Tensor` (single example), "
                "`List[PIL.Image.Image]`, `List[np.ndarray]` or `List[torch.Tensor]` (batch of examples)."
            )

        is_batched = bool(
            isinstance(images, (list, tuple))
            and (isinstance(images[0], (Image.Image, np.ndarray)) or is_torch_tensor(images[0]))
        )

        # A single image is wrapped so the transformations below always work on a list.
        if not is_batched:
            images = [images]

        # transformations (resizing + center cropping + normalization)
        if self.do_resize and self.size is not None and self.resample is not None:
            images = [
                self.resize(image=image, size=self.size, resample=self.resample, default_to_square=True)
                for image in images
            ]
        if self.do_center_crop and self.crop_size is not None:
            images = [self.center_crop(image, self.crop_size) for image in images]
        if self.do_normalize:
            images = [self.normalize(image=image, mean=self.image_mean, std=self.image_std) for image in images]

        # return as BatchFeature
        data = {"pixel_values": images}
        encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors)

        return encoded_inputs
|
|
||||||
|
|||||||
445
src/transformers/models/owlvit/image_processing_owlvit.py
Normal file
445
src/transformers/models/owlvit/image_processing_owlvit.py
Normal file
@@ -0,0 +1,445 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
"""Image processor class for OwlViT"""
|
||||||
|
|
||||||
|
from typing import Dict, List, Optional, Union
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from transformers.image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
|
||||||
|
from transformers.image_transforms import (
|
||||||
|
center_crop,
|
||||||
|
center_to_corners_format,
|
||||||
|
normalize,
|
||||||
|
rescale,
|
||||||
|
resize,
|
||||||
|
to_channel_dimension_format,
|
||||||
|
to_numpy_array,
|
||||||
|
)
|
||||||
|
from transformers.image_utils import ChannelDimension, ImageInput, PILImageResampling, is_batched, valid_images
|
||||||
|
from transformers.utils import TensorType, is_torch_available, logging
|
||||||
|
|
||||||
|
|
||||||
|
if is_torch_available():
|
||||||
|
import torch
|
||||||
|
|
||||||
|
|
||||||
|
logger = logging.get_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
# Copied from transformers.models.detr.modeling_detr._upcast
|
||||||
|
def _upcast(t):
|
||||||
|
# Protects from numerical overflows in multiplications by upcasting to the equivalent higher type
|
||||||
|
if t.is_floating_point():
|
||||||
|
return t if t.dtype in (torch.float32, torch.float64) else t.float()
|
||||||
|
else:
|
||||||
|
return t if t.dtype in (torch.int32, torch.int64) else t.int()
|
||||||
|
|
||||||
|
|
||||||
|
def box_area(boxes):
    """
    Compute the area of each bounding box given in corner (x1, y1, x2, y2) format.

    Args:
        boxes (`torch.FloatTensor` of shape `(number_of_boxes, 4)`):
            Boxes whose area is wanted, expected in (x1, y1, x2, y2) format with `0 <= x1 < x2` and `0 <= y1 < y2`.

    Returns:
        `torch.FloatTensor`: a tensor holding one area per box.
    """
    # Upcast first so that width * height cannot overflow a narrow dtype.
    boxes = _upcast(boxes)
    widths = boxes[:, 2] - boxes[:, 0]
    heights = boxes[:, 3] - boxes[:, 1]
    return widths * heights
|
||||||
|
|
||||||
|
|
||||||
|
def box_iou(boxes1, boxes2):
    """
    Compute the pairwise intersection-over-union between two sets of corner-format (x1, y1, x2, y2) boxes.

    Args:
        boxes1: tensor indexed as `[N, 4]`.
        boxes2: tensor indexed as `[M, 4]`.

    Returns:
        A tuple `(iou, union)`, both indexed as `[N, M]`: the IoU of every `boxes1`/`boxes2` pair and the area of
        their union.
    """
    areas_first = box_area(boxes1)
    areas_second = box_area(boxes2)

    # Corners of the overlap rectangle for every pair, broadcast to [N, M, 2].
    overlap_top_left = torch.max(boxes1[:, None, :2], boxes2[:, :2])
    overlap_bottom_right = torch.min(boxes1[:, None, 2:], boxes2[:, 2:])

    # Negative extents mean the pair does not overlap; clamp them to zero.
    overlap_w, overlap_h = (overlap_bottom_right - overlap_top_left).clamp(min=0).unbind(-1)
    intersection = overlap_w * overlap_h  # [N, M]

    union = areas_first[:, None] + areas_second - intersection
    return intersection / union, union
|
||||||
|
|
||||||
|
|
||||||
|
class OwlViTImageProcessor(BaseImageProcessor):
    r"""
    Constructs an OWL-ViT image processor.

    This image processor inherits from [`BaseImageProcessor`]. Users should refer to that superclass for more
    information regarding the methods it provides.

    Args:
        do_resize (`bool`, *optional*, defaults to `True`):
            Whether to resize the input to `size`.
        size (`Dict[str, int]`, *optional*, defaults to {"height": 768, "width": 768}):
            The size to use for resizing the image. Only has an effect if `do_resize` is set to `True`. The value is
            normalized with `get_size_dict(size, default_to_square=True)` and must end up with `height` and `width`
            keys.
        resample (`int`, *optional*, defaults to `PIL.Image.Resampling.BICUBIC`):
            An optional resampling filter. This can be one of `PIL.Image.Resampling.NEAREST`,
            `PIL.Image.Resampling.BOX`, `PIL.Image.Resampling.BILINEAR`, `PIL.Image.Resampling.HAMMING`,
            `PIL.Image.Resampling.BICUBIC` or `PIL.Image.Resampling.LANCZOS`. Only has an effect if `do_resize` is set
            to `True`.
        do_center_crop (`bool`, *optional*, defaults to `False`):
            Whether to crop the input at the center. If the input size is smaller than `crop_size` along any edge, the
            image is padded with 0's and then center cropped.
        crop_size (`Dict[str, int]`, *optional*, defaults to {"height": 768, "width": 768}):
            The size to use for center cropping the image. Only has an effect if `do_center_crop` is set to `True`.
        do_rescale (`bool`, *optional*, defaults to `True`):
            Whether to rescale the input by a certain factor.
        rescale_factor (`float`, *optional*, defaults to `1/255`):
            The factor to use for rescaling the image. Only has an effect if `do_rescale` is set to `True`.
        do_normalize (`bool`, *optional*, defaults to `True`):
            Whether or not to normalize the input with `image_mean` and `image_std`.
        image_mean (`List[float]`, *optional*, defaults to `[0.48145466, 0.4578275, 0.40821073]`):
            The sequence of means for each channel, to be used when normalizing images.
        image_std (`List[float]`, *optional*, defaults to `[0.26862954, 0.26130258, 0.27577711]`):
            The sequence of standard deviations for each channel, to be used when normalizing images.
    """

    model_input_names = ["pixel_values"]

    def __init__(
        self,
        do_resize=True,
        size=None,
        resample=PILImageResampling.BICUBIC,
        do_center_crop=False,
        crop_size=None,
        do_rescale=True,
        rescale_factor=1 / 255,
        do_normalize=True,
        image_mean=None,
        image_std=None,
        **kwargs
    ):
        size = size if size is not None else {"height": 768, "width": 768}
        size = get_size_dict(size, default_to_square=True)

        crop_size = crop_size if crop_size is not None else {"height": 768, "width": 768}
        crop_size = get_size_dict(crop_size, default_to_square=True)

        # Early versions of the OWL-ViT config on the hub had "rescale" as a flag. Left in kwargs it would be set as
        # an attribute by super().__init__ and shadow the `rescale` method, so it is removed here. For backwards
        # compatibility its value is used as `do_rescale`; it has to be bound to the local variable, otherwise the
        # `self.do_rescale = do_rescale` assignment below would silently discard it.
        if "rescale" in kwargs:
            do_rescale = kwargs.pop("rescale")

        super().__init__(**kwargs)
        self.do_resize = do_resize
        self.size = size
        self.resample = resample
        self.do_center_crop = do_center_crop
        self.crop_size = crop_size
        self.do_rescale = do_rescale
        self.rescale_factor = rescale_factor
        self.do_normalize = do_normalize
        self.image_mean = image_mean if image_mean is not None else [0.48145466, 0.4578275, 0.40821073]
        self.image_std = image_std if image_std is not None else [0.26862954, 0.26130258, 0.27577711]

    def resize(
        self,
        image: np.ndarray,
        size: Dict[str, int],
        resample: PILImageResampling = PILImageResampling.BICUBIC,
        data_format: Optional[Union[str, ChannelDimension]] = None,
        **kwargs
    ) -> np.ndarray:
        """
        Resize an image to `(size["height"], size["width"])`.

        Args:
            image (`np.ndarray`):
                Image to resize.
            size (`Dict[str, int]`):
                Target size; after `get_size_dict` normalization it must contain `height` and `width`.
            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
                Resampling filter to use.
            data_format (`str` or `ChannelDimension`, *optional*):
                Forwarded to the underlying `resize` transform.

        Raises:
            ValueError: If `size` lacks a `height` or `width` key.
        """
        size = get_size_dict(size, default_to_square=True)
        if "height" not in size or "width" not in size:
            raise ValueError("size dictionary must contain height and width keys")

        return resize(image, (size["height"], size["width"]), resample=resample, data_format=data_format, **kwargs)

    def center_crop(
        self,
        image: np.ndarray,
        crop_size: Dict[str, int],
        data_format: Optional[Union[str, ChannelDimension]] = None,
        **kwargs
    ) -> np.ndarray:
        """
        Center crop an image to `(crop_size["height"], crop_size["width"])`.

        Raises:
            ValueError: If `crop_size` lacks a `height` or `width` key.
        """
        crop_size = get_size_dict(crop_size, default_to_square=True)
        if "height" not in crop_size or "width" not in crop_size:
            raise ValueError("crop_size dictionary must contain height and width keys")

        return center_crop(image, (crop_size["height"], crop_size["width"]), data_format=data_format, **kwargs)

    def rescale(
        self,
        image: np.ndarray,
        rescale_factor: float,
        data_format: Optional[Union[str, ChannelDimension]] = None,
        **kwargs
    ) -> np.ndarray:
        """
        Rescale an image by a certain factor.
        """
        return rescale(image, rescale_factor, data_format=data_format, **kwargs)

    def normalize(
        self,
        image: np.ndarray,
        mean: List[float],
        std: List[float],
        data_format: Optional[Union[str, ChannelDimension]] = None,
        **kwargs
    ) -> np.ndarray:
        """
        Normalize an image with a certain mean and standard deviation.
        """
        return normalize(image, mean, std, data_format=data_format, **kwargs)

    def preprocess(
        self,
        images: ImageInput,
        do_resize: Optional[bool] = None,
        size: Optional[Dict[str, int]] = None,
        resample: PILImageResampling = None,
        do_center_crop: Optional[bool] = None,
        crop_size: Optional[Dict[str, int]] = None,
        do_rescale: Optional[bool] = None,
        rescale_factor: Optional[float] = None,
        do_normalize: Optional[bool] = None,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        return_tensors: Optional[Union[TensorType, str]] = None,
        data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,
        **kwargs
    ) -> BatchFeature:
        """
        Prepares an image or batch of images for the model.

        Every optional argument left as `None` falls back to the value stored on the image processor.

        Args:
            images (`ImageInput`):
                The image or batch of images to be prepared.
            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
                Whether or not to resize the input. If `True`, will resize the input to the size specified by `size`.
            size (`Dict[str, int]`, *optional*, defaults to `self.size`):
                The size to resize the input to. Only has an effect if `do_resize` is set to `True`.
            resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
                The resampling filter to use when resizing the input. Only has an effect if `do_resize` is set to
                `True`.
            do_center_crop (`bool`, *optional*, defaults to `self.do_center_crop`):
                Whether or not to center crop the input. If `True`, will center crop the input to the size specified by
                `crop_size`.
            crop_size (`Dict[str, int]`, *optional*, defaults to `self.crop_size`):
                The size to center crop the input to. Only has an effect if `do_center_crop` is set to `True`.
            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
                Whether or not to rescale the input by `rescale_factor`.
            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
                The factor to rescale the input by. Only has an effect if `do_rescale` is set to `True`.
            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
                Whether or not to normalize the input with `image_mean` and `image_std`.
            image_mean (`Union[float, List[float]]`, *optional*, defaults to `self.image_mean`):
                The mean to use when normalizing. Only has an effect if `do_normalize` is set to `True`.
            image_std (`Union[float, List[float]]`, *optional*, defaults to `self.image_std`):
                The standard deviation to use when normalizing. Only has an effect if `do_normalize` is set to `True`.
            return_tensors (`str` or `TensorType`, *optional*):
                The type of tensors to return. Can be one of:
                    - Unset: Return a list of `np.ndarray`.
                    - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
                    - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
                    - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
                    - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
                The channel dimension format for the output image. Can be one of:
                    - `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                    - `ChannelDimension.LAST`: image in (height, width, num_channels) format.

        Returns:
            [`BatchFeature`]: A batch whose `pixel_values` entry holds the processed images.

        Raises:
            ValueError: If an enabled step is missing the values it needs, or if `images` contains an unsupported
                type.
        """
        do_resize = do_resize if do_resize is not None else self.do_resize
        size = size if size is not None else self.size
        resample = resample if resample is not None else self.resample
        do_center_crop = do_center_crop if do_center_crop is not None else self.do_center_crop
        crop_size = crop_size if crop_size is not None else self.crop_size
        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
        image_mean = image_mean if image_mean is not None else self.image_mean
        image_std = image_std if image_std is not None else self.image_std

        # A step's settings are only required when that step is actually enabled.
        if do_resize and size is None:
            raise ValueError("Size must be specified if do_resize is True.")

        if do_center_crop and crop_size is None:
            raise ValueError("Crop size must be specified if do_center_crop is True.")

        if do_rescale and rescale_factor is None:
            raise ValueError("Rescale factor must be specified if do_rescale is True.")

        if do_normalize and (image_mean is None or image_std is None):
            raise ValueError("Image mean and std must be specified if do_normalize is True.")

        if not is_batched(images):
            images = [images]

        if not valid_images(images):
            raise ValueError(
                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
                "torch.Tensor, tf.Tensor or jax.ndarray."
            )

        # All transformations expect numpy arrays
        images = [to_numpy_array(image) for image in images]

        if do_resize:
            images = [self.resize(image, size=size, resample=resample) for image in images]

        if do_center_crop:
            images = [self.center_crop(image, crop_size=crop_size) for image in images]

        if do_rescale:
            images = [self.rescale(image, rescale_factor=rescale_factor) for image in images]

        if do_normalize:
            images = [self.normalize(image, mean=image_mean, std=image_std) for image in images]

        images = [to_channel_dimension_format(image, data_format) for image in images]
        return BatchFeature(data={"pixel_values": images}, tensor_type=return_tensors)

    def post_process(self, outputs, target_sizes):
        """
        Converts the output of [`OwlViTForObjectDetection`] into the format expected by the COCO api.

        Args:
            outputs ([`OwlViTObjectDetectionOutput`]):
                Raw outputs of the model.
            target_sizes (`torch.Tensor` of shape `(batch_size, 2)`):
                Tensor containing the size (h, w) of each image of the batch. For evaluation, this must be the original
                image size (before any data augmentation). For visualization, this should be the image size after data
                augment, but before padding.

        Returns:
            `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image
            in the batch as predicted by the model.

        Raises:
            ValueError: If `target_sizes` does not have one `(h, w)` row per element of the batch.
        """
        # TODO: (amy) add support for other frameworks
        logits, boxes = outputs.logits, outputs.pred_boxes

        if len(logits) != len(target_sizes):
            raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the logits")
        if target_sizes.shape[1] != 2:
            raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch")

        # Keep the best-scoring class for every predicted box.
        probs = torch.max(logits, dim=-1)
        scores = torch.sigmoid(probs.values)
        labels = probs.indices

        # Convert to [x0, y0, x1, y1] format
        boxes = center_to_corners_format(boxes)

        # Convert from relative [0, 1] to absolute [0, height] coordinates
        img_h, img_w = target_sizes.unbind(1)
        scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device)
        boxes = boxes * scale_fct[:, None, :]

        return [{"scores": s, "labels": l, "boxes": b} for s, l, b in zip(scores, labels, boxes)]

    # TODO: (Amy) Make compatible with other frameworks
    def post_process_image_guided_detection(self, outputs, threshold=0.6, nms_threshold=0.3, target_sizes=None):
        """
        Converts the output of [`OwlViTForObjectDetection.image_guided_detection`] into the format expected by the COCO
        api.

        Args:
            outputs ([`OwlViTImageGuidedObjectDetectionOutput`]):
                Raw outputs of the model.
            threshold (`float`, *optional*, defaults to 0.6):
                Minimum confidence threshold to use to filter out predicted boxes.
            nms_threshold (`float`, *optional*, defaults to 0.3):
                IoU threshold for non-maximum suppression of overlapping boxes.
            target_sizes (`torch.Tensor`, *optional*):
                Tensor of shape (batch_size, 2) where each entry is the (height, width) of the corresponding image in
                the batch. If set, predicted normalized bounding boxes are rescaled to the target sizes. If left to
                None, predictions will not be unnormalized.

        Returns:
            `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image
            in the batch as predicted by the model. All labels are set to None as
            `OwlViTForObjectDetection.image_guided_detection` perform one-shot object detection.

        Raises:
            ValueError: If `target_sizes` is given but does not have one `(h, w)` row per element of the batch.
        """
        logits, target_boxes = outputs.logits, outputs.target_pred_boxes

        # `target_sizes` is optional: only validate it when the caller supplied one.
        if target_sizes is not None:
            if len(logits) != len(target_sizes):
                raise ValueError(
                    "Make sure that you pass in as many target sizes as the batch dimension of the logits"
                )
            if target_sizes.shape[1] != 2:
                raise ValueError(
                    "Each element of target_sizes must contain the size (h, w) of each image of the batch"
                )

        probs = torch.max(logits, dim=-1)
        scores = torch.sigmoid(probs.values)

        # Convert to [x0, y0, x1, y1] format
        target_boxes = center_to_corners_format(target_boxes)

        # Apply non-maximum suppression (NMS)
        if nms_threshold < 1.0:
            for idx in range(target_boxes.shape[0]):
                # Visit boxes from highest to lowest score; a box already suppressed (score 0) suppresses nothing.
                for i in torch.argsort(-scores[idx]):
                    if not scores[idx][i]:
                        continue

                    ious = box_iou(target_boxes[idx][i, :].unsqueeze(0), target_boxes[idx])[0][0]
                    ious[i] = -1.0  # Mask self-IoU.
                    scores[idx][ious > nms_threshold] = 0.0

        # Convert from relative [0, 1] to absolute [0, height] coordinates
        if target_sizes is not None:
            img_h, img_w = target_sizes.unbind(1)
            scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(target_boxes.device)
            target_boxes = target_boxes * scale_fct[:, None, :]

        # Compute box display alphas based on prediction scores
        results = []
        alphas = torch.zeros_like(scores)

        for idx in range(target_boxes.shape[0]):
            # Select scores for boxes matching the current query:
            query_scores = scores[idx]
            # NOTE: an image whose scores are all zero contributes no entry to `results`.
            if not query_scores.nonzero().numel():
                continue

            # Scale box alpha such that the best box for each query has alpha 1.0 and the worst box has alpha 0.1.
            # All other boxes will either belong to a different query, or will not be shown.
            max_score = torch.max(query_scores) + 1e-6
            query_alphas = (query_scores - (max_score * 0.1)) / (max_score * 0.9)
            query_alphas[query_alphas < threshold] = 0.0
            query_alphas = torch.clip(query_alphas, 0.0, 1.0)
            alphas[idx] = query_alphas

            mask = alphas[idx] > 0
            box_scores = alphas[idx][mask]
            boxes = target_boxes[idx][mask]
            results.append({"scores": box_scores, "labels": None, "boxes": boxes})

        return results
|
||||||
@@ -15,6 +15,7 @@
|
|||||||
"""
|
"""
|
||||||
Image/Text processor class for OWL-ViT
|
Image/Text processor class for OWL-ViT
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
@@ -33,15 +34,15 @@ class OwlViTProcessor(ProcessorMixin):
|
|||||||
|
|
||||||
Args:
|
Args:
|
||||||
feature_extractor ([`OwlViTFeatureExtractor`]):
|
feature_extractor ([`OwlViTFeatureExtractor`]):
|
||||||
The feature extractor is a required input.
|
The image processor is a required input.
|
||||||
tokenizer ([`CLIPTokenizer`, `CLIPTokenizerFast`]):
|
tokenizer ([`CLIPTokenizer`, `CLIPTokenizerFast`]):
|
||||||
The tokenizer is a required input.
|
The tokenizer is a required input.
|
||||||
"""
|
"""
|
||||||
feature_extractor_class = "OwlViTFeatureExtractor"
|
feature_extractor_class = "OwlViTFeatureExtractor"
|
||||||
tokenizer_class = ("CLIPTokenizer", "CLIPTokenizerFast")
|
tokenizer_class = ("CLIPTokenizer", "CLIPTokenizerFast")
|
||||||
|
|
||||||
def __init__(self, feature_extractor, tokenizer):
|
def __init__(self, *args, **kwargs):
|
||||||
super().__init__(feature_extractor, tokenizer)
|
super().__init__(*args, **kwargs)
|
||||||
|
|
||||||
def __call__(self, text=None, images=None, query_images=None, padding="max_length", return_tensors="np", **kwargs):
|
def __call__(self, text=None, images=None, query_images=None, padding="max_length", return_tensors="np", **kwargs):
|
||||||
"""
|
"""
|
||||||
|
|||||||
@@ -287,7 +287,6 @@ class SegformerImageProcessor(BaseImageProcessor):
|
|||||||
do_reduce_labels: bool = None,
|
do_reduce_labels: bool = None,
|
||||||
do_resize: bool = None,
|
do_resize: bool = None,
|
||||||
size: Dict[str, int] = None,
|
size: Dict[str, int] = None,
|
||||||
resample: PILImageResampling = None,
|
|
||||||
) -> np.ndarray:
|
) -> np.ndarray:
|
||||||
"""Preprocesses a single mask."""
|
"""Preprocesses a single mask."""
|
||||||
segmentation_map = to_numpy_array(segmentation_map)
|
segmentation_map = to_numpy_array(segmentation_map)
|
||||||
@@ -301,7 +300,7 @@ class SegformerImageProcessor(BaseImageProcessor):
|
|||||||
image=segmentation_map,
|
image=segmentation_map,
|
||||||
do_reduce_labels=do_reduce_labels,
|
do_reduce_labels=do_reduce_labels,
|
||||||
do_resize=do_resize,
|
do_resize=do_resize,
|
||||||
resample=PIL.Image.NEAREST,
|
resample=PILImageResampling.NEAREST,
|
||||||
size=size,
|
size=size,
|
||||||
do_rescale=False,
|
do_rescale=False,
|
||||||
do_normalize=False,
|
do_normalize=False,
|
||||||
@@ -438,7 +437,6 @@ class SegformerImageProcessor(BaseImageProcessor):
|
|||||||
segmentation_map=segmentation_map,
|
segmentation_map=segmentation_map,
|
||||||
do_reduce_labels=do_reduce_labels,
|
do_reduce_labels=do_reduce_labels,
|
||||||
do_resize=do_resize,
|
do_resize=do_resize,
|
||||||
resample=PIL.Image.NEAREST,
|
|
||||||
size=size,
|
size=size,
|
||||||
)
|
)
|
||||||
for segmentation_map in segmentation_maps
|
for segmentation_map in segmentation_maps
|
||||||
|
|||||||
@@ -29,6 +29,7 @@ except OptionalDependencyNotAvailable:
|
|||||||
pass
|
pass
|
||||||
else:
|
else:
|
||||||
_import_structure["feature_extraction_yolos"] = ["YolosFeatureExtractor"]
|
_import_structure["feature_extraction_yolos"] = ["YolosFeatureExtractor"]
|
||||||
|
_import_structure["image_processing_yolos"] = ["YolosImageProcessor"]
|
||||||
|
|
||||||
try:
|
try:
|
||||||
if not is_torch_available():
|
if not is_torch_available():
|
||||||
@@ -54,6 +55,7 @@ if TYPE_CHECKING:
|
|||||||
pass
|
pass
|
||||||
else:
|
else:
|
||||||
from .feature_extraction_yolos import YolosFeatureExtractor
|
from .feature_extraction_yolos import YolosFeatureExtractor
|
||||||
|
from .image_processing_yolos import YolosImageProcessor
|
||||||
|
|
||||||
try:
|
try:
|
||||||
if not is_torch_available():
|
if not is_torch_available():
|
||||||
|
|||||||
@@ -14,694 +14,11 @@
|
|||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
"""Feature extractor class for YOLOS."""
|
"""Feature extractor class for YOLOS."""
|
||||||
|
|
||||||
import pathlib
|
from ...utils import logging
|
||||||
import warnings
|
from .image_processing_yolos import YolosImageProcessor
|
||||||
from typing import Dict, List, Optional, Tuple, Union
|
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
from PIL import Image
|
|
||||||
|
|
||||||
from ...feature_extraction_utils import BatchFeature, FeatureExtractionMixin
|
|
||||||
from ...image_transforms import center_to_corners_format, corners_to_center_format, rgb_to_id
|
|
||||||
from ...image_utils import ImageFeatureExtractionMixin, is_torch_tensor
|
|
||||||
from ...utils import TensorType, is_torch_available, logging
|
|
||||||
|
|
||||||
|
|
||||||
if is_torch_available():
|
|
||||||
import torch
|
|
||||||
from torch import nn
|
|
||||||
|
|
||||||
logger = logging.get_logger(__name__)
|
logger = logging.get_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
ImageInput = Union[Image.Image, np.ndarray, "torch.Tensor", List[Image.Image], List[np.ndarray], List["torch.Tensor"]]
|
YolosFeatureExtractor = YolosImageProcessor
|
||||||
|
|
||||||
|
|
||||||
# Copied from transformers.models.detr.feature_extraction_detr.masks_to_boxes
|
|
||||||
def masks_to_boxes(masks):
    """
    Compute the bounding boxes around the provided panoptic segmentation masks.

    The masks should be in format [N, H, W] where N is the number of masks, (H, W) are the spatial dimensions.

    Returns a [N, 4] array, with the boxes in corner (xyxy) format. An empty `masks` array yields a (0, 4) array of
    zeros.
    """
    if masks.size == 0:
        return np.zeros((0, 4))

    height, width = masks.shape[-2:]
    row_coords = np.arange(0, height, dtype=np.float32)
    col_coords = np.arange(0, width, dtype=np.float32)
    # see https://github.com/pytorch/pytorch/issues/50276
    row_grid, col_grid = np.meshgrid(row_coords, col_coords, indexing="ij")

    # True wherever a pixel does NOT belong to the mask.
    background = ~(np.array(masks, dtype=bool))

    def _axis_extent(grid):
        # Coordinate of every mask pixel along one axis (zero outside the mask).
        weighted = masks * np.expand_dims(grid, axis=0)
        highest = weighted.reshape(weighted.shape[0], -1).max(-1)
        # Background pixels are filled with a huge value so they can never win the minimum.
        lowest = np.ma.array(weighted, mask=background).filled(fill_value=1e8)
        lowest = lowest.reshape(lowest.shape[0], -1).min(-1)
        return lowest, highest

    x_min, x_max = _axis_extent(col_grid)
    y_min, y_max = _axis_extent(row_grid)

    return np.stack([x_min, y_min, x_max, y_max], 1)
|
|
||||||
|
|
||||||
|
|
||||||
class YolosFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin):
|
|
||||||
r"""
|
|
||||||
Constructs a YOLOS feature extractor.
|
|
||||||
|
|
||||||
This feature extractor inherits from [`FeatureExtractionMixin`] which contains most of the main methods. Users
|
|
||||||
should refer to this superclass for more information regarding those methods.
|
|
||||||
|
|
||||||
|
|
||||||
Args:
|
|
||||||
format (`str`, *optional*, defaults to `"coco_detection"`):
|
|
||||||
Data format of the annotations. One of "coco_detection" or "coco_panoptic".
|
|
||||||
do_resize (`bool`, *optional*, defaults to `True`):
|
|
||||||
Whether to resize the input to a certain `size`.
|
|
||||||
size (`int`, *optional*, defaults to 800):
|
|
||||||
Resize the input to the given size. Only has an effect if `do_resize` is set to `True`. If size is a
|
|
||||||
sequence like `(width, height)`, output size will be matched to this. If size is an int, smaller edge of
|
|
||||||
the image will be matched to this number. i.e, if `height > width`, then image will be rescaled to `(size *
|
|
||||||
height / width, size)`.
|
|
||||||
max_size (`int`, *optional*, defaults to `1333`):
|
|
||||||
The largest size an image dimension can have (otherwise it's capped). Only has an effect if `do_resize` is
|
|
||||||
set to `True`.
|
|
||||||
do_normalize (`bool`, *optional*, defaults to `True`):
|
|
||||||
Whether or not to normalize the input with mean and standard deviation.
|
|
||||||
image_mean (`int`, *optional*, defaults to `[0.485, 0.456, 0.406]`):
|
|
||||||
The sequence of means for each channel, to be used when normalizing images. Defaults to the ImageNet mean.
|
|
||||||
image_std (`int`, *optional*, defaults to `[0.229, 0.224, 0.225]`):
|
|
||||||
The sequence of standard deviations for each channel, to be used when normalizing images. Defaults to the
|
|
||||||
ImageNet std.
|
|
||||||
"""
|
|
||||||
|
|
||||||
model_input_names = ["pixel_values"]
|
|
||||||
|
|
||||||
# Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor.__init__
|
|
||||||
def __init__(
|
|
||||||
self,
|
|
||||||
format="coco_detection",
|
|
||||||
do_resize=True,
|
|
||||||
size=800,
|
|
||||||
max_size=1333,
|
|
||||||
do_normalize=True,
|
|
||||||
image_mean=None,
|
|
||||||
image_std=None,
|
|
||||||
**kwargs
|
|
||||||
):
|
|
||||||
super().__init__(**kwargs)
|
|
||||||
self.format = self._is_valid_format(format)
|
|
||||||
self.do_resize = do_resize
|
|
||||||
self.size = size
|
|
||||||
self.max_size = max_size
|
|
||||||
self.do_normalize = do_normalize
|
|
||||||
self.image_mean = image_mean if image_mean is not None else [0.485, 0.456, 0.406] # ImageNet mean
|
|
||||||
self.image_std = image_std if image_std is not None else [0.229, 0.224, 0.225] # ImageNet std
|
|
||||||
|
|
||||||
# Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor._is_valid_format
|
|
||||||
def _is_valid_format(self, format):
|
|
||||||
if format not in ["coco_detection", "coco_panoptic"]:
|
|
||||||
raise ValueError(f"Format {format} not supported")
|
|
||||||
return format
|
|
||||||
|
|
||||||
# Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor.prepare
|
|
||||||
def prepare(self, image, target, return_segmentation_masks=False, masks_path=None):
    """
    Dispatch annotation conversion according to `self.format`.

    `"coco_detection"` is handled by `prepare_coco_detection` (which receives
    `return_segmentation_masks`) and `"coco_panoptic"` by `prepare_coco_panoptic`
    (which receives `masks_path`).

    Returns:
        The `(image, target)` pair produced by the selected converter.

    Raises:
        ValueError: If `self.format` is not one of the two supported formats.
    """
    annotation_format = self.format

    if annotation_format == "coco_detection":
        prepared_image, prepared_target = self.prepare_coco_detection(image, target, return_segmentation_masks)
        return prepared_image, prepared_target

    if annotation_format == "coco_panoptic":
        prepared_image, prepared_target = self.prepare_coco_panoptic(image, target, masks_path)
        return prepared_image, prepared_target

    raise ValueError(f"Format {self.format} not supported")
|
|
||||||
|
|
||||||
# Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor.convert_coco_poly_to_mask
|
|
||||||
def convert_coco_poly_to_mask(self, segmentations, height, width):
    """
    Turn per-object COCO polygon segmentations into a stack of masks.

    Each entry of `segmentations` is encoded with `pycocotools.mask.frPyObjects`, decoded, and
    collapsed along axis 2 with `np.any`, giving one mask per object.

    Returns:
        The per-object masks stacked along a new leading axis, or an all-zero `uint8` array of
        shape `(0, height, width)` when `segmentations` is empty.

    Raises:
        ImportError: If `pycocotools` cannot be imported.
    """
    try:
        from pycocotools import mask as coco_mask
    except ImportError:
        raise ImportError("Pycocotools is not installed in your environment.")

    def rasterize(polygons):
        decoded = coco_mask.decode(coco_mask.frPyObjects(polygons, height, width))
        # A decoded array with fewer than three axes gets a trailing axis so axis 2 always exists.
        if len(decoded.shape) < 3:
            decoded = decoded[..., None]
        return np.any(np.asarray(decoded, dtype=np.uint8), axis=2)

    object_masks = [rasterize(polygons) for polygons in segmentations]

    if not object_masks:
        return np.zeros((0, height, width), dtype=np.uint8)
    return np.stack(object_masks, axis=0)
|
|
||||||
|
|
||||||
# Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor.prepare_coco_detection
|
|
||||||
def prepare_coco_detection(self, image, target, return_segmentation_masks=False):
    """
    Convert the target in COCO format into the format expected by DETR.

    Args:
        image: Image object exposing a `size` attribute that unpacks as `(width, height)`.
        target (`Dict`): Dictionary with the keys `"image_id"` and `"annotations"`; each annotation
            provides `"bbox"`, `"category_id"` and `"area"`, and optionally `"iscrowd"`,
            `"segmentation"` and `"keypoints"`.
        return_segmentation_masks (`bool`, *optional*, defaults to `False`):
            Whether to also convert the `"segmentation"` entries into masks.

    Returns:
        `Tuple`: The unchanged `image` and a new target dictionary with the keys `"boxes"`
        (corner format, clipped to the image), `"class_labels"`, `"image_id"`, `"area"`,
        `"iscrowd"`, `"orig_size"`, `"size"` and, when available, `"masks"` and `"keypoints"`.
        Keypoints are returned flattened to shape `(-1, 3)`.
    """
    w, h = image.size

    image_id = target["image_id"]
    image_id = np.asarray([image_id], dtype=np.int64)

    # get all COCO annotations for the given image, skipping those flagged as crowd
    anno = target["annotations"]
    anno = [obj for obj in anno if "iscrowd" not in obj or obj["iscrowd"] == 0]

    boxes = [obj["bbox"] for obj in anno]
    # guard against no boxes via resizing
    boxes = np.asarray(boxes, dtype=np.float32).reshape(-1, 4)
    # add the first two columns to the last two: (x, y, width, height) -> (x0, y0, x1, y1)
    boxes[:, 2:] += boxes[:, :2]
    boxes[:, 0::2] = boxes[:, 0::2].clip(min=0, max=w)
    boxes[:, 1::2] = boxes[:, 1::2].clip(min=0, max=h)

    classes = [obj["category_id"] for obj in anno]
    classes = np.asarray(classes, dtype=np.int64)

    if return_segmentation_masks:
        segmentations = [obj["segmentation"] for obj in anno]
        masks = self.convert_coco_poly_to_mask(segmentations, h, w)

    # only boxes with strictly positive width and height survive
    keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0])

    keypoints = None
    if anno and "keypoints" in anno[0]:
        keypoints = np.asarray([obj["keypoints"] for obj in anno], dtype=np.float32)
        # Filter while there is still one row per object. Flattening to (-1, 3) first would make the
        # leading axis longer than `keep` and the boolean indexing would fail.
        keypoints = keypoints[keep]
        if keypoints.shape[0]:
            keypoints = keypoints.reshape((-1, 3))

    boxes = boxes[keep]
    classes = classes[keep]
    if return_segmentation_masks:
        masks = masks[keep]

    target = {}
    target["boxes"] = boxes
    target["class_labels"] = classes
    if return_segmentation_masks:
        target["masks"] = masks
    target["image_id"] = image_id
    if keypoints is not None:
        target["keypoints"] = keypoints

    # for conversion to coco api
    area = np.asarray([obj["area"] for obj in anno], dtype=np.float32)
    iscrowd = np.asarray([obj["iscrowd"] if "iscrowd" in obj else 0 for obj in anno], dtype=np.int64)
    target["area"] = area[keep]
    target["iscrowd"] = iscrowd[keep]

    target["orig_size"] = np.asarray([int(h), int(w)], dtype=np.int64)
    target["size"] = np.asarray([int(h), int(w)], dtype=np.int64)

    return image, target
|
|
||||||
|
|
||||||
# Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor.prepare_coco_panoptic
|
|
||||||
def prepare_coco_panoptic(self, image, target, masks_path, return_masks=True):
    """
    Convert a COCO panoptic annotation into a target dictionary.

    Args:
        image: Image object exposing a `size` attribute that unpacks as `(width, height)`.
        target (`Dict`): Annotation with the keys `"file_name"`, `"segments_info"` and either
            `"image_id"` or `"id"`.
        masks_path: Directory containing the PNG file named by `target["file_name"]`.
        return_masks (`bool`, *optional*, defaults to `True`):
            Whether to store the per-segment masks under `"masks"`.

    Returns:
        `Tuple`: The unchanged `image` and a new target dictionary with the keys `"image_id"`,
        `"class_labels"`, `"boxes"`, `"size"`, `"orig_size"`, `"iscrowd"`, `"area"` and, when
        `return_masks` is set, `"masks"`.

    Raises:
        ValueError: If `target` has no `"segments_info"` entry. Masks, labels and boxes are all
            derived from it, so nothing can be produced without it.
    """
    w, h = image.size
    ann_info = target.copy()
    ann_path = pathlib.Path(masks_path) / ann_info["file_name"]

    # Previously a missing key surfaced later as an UnboundLocalError on `masks`/`labels`.
    if "segments_info" not in ann_info:
        raise ValueError("Panoptic annotations must contain a 'segments_info' key describing the image segments.")
    segments_info = ann_info["segments_info"]

    # Read the pixel data inside the context manager so the file handle is always released.
    with Image.open(ann_path) as mask_image:
        masks = np.asarray(mask_image, dtype=np.uint32)
    masks = rgb_to_id(masks)

    # one mask per segment: compare the id map against every segment id
    ids = np.array([ann["id"] for ann in segments_info])
    masks = masks == ids[:, None, None]
    masks = np.asarray(masks, dtype=np.uint8)

    labels = np.asarray([ann["category_id"] for ann in segments_info], dtype=np.int64)

    target = {}
    target["image_id"] = np.asarray(
        [ann_info["image_id"] if "image_id" in ann_info else ann_info["id"]], dtype=np.int64
    )
    if return_masks:
        target["masks"] = masks
    target["class_labels"] = labels

    target["boxes"] = masks_to_boxes(masks)

    target["size"] = np.asarray([int(h), int(w)], dtype=np.int64)
    target["orig_size"] = np.asarray([int(h), int(w)], dtype=np.int64)
    target["iscrowd"] = np.asarray([ann["iscrowd"] for ann in segments_info], dtype=np.int64)
    target["area"] = np.asarray([ann["area"] for ann in segments_info], dtype=np.float32)

    return image, target
|
|
||||||
|
|
||||||
# Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor._resize
|
|
||||||
def _resize(self, image, size, target=None, max_size=None):
    """
    Resize the image to the given size. Size can be min_size (scalar) or (w, h) tuple. If size is an int, smaller
    edge of the image will be matched to this number.

    If given, also resize the target accordingly: `"boxes"` and `"area"` are scaled by the width/height
    ratios, `"size"` is set to the new `(height, width)` and `"masks"` are interpolated to the new size.

    Returns:
        `Tuple`: The resized image and the updated copy of `target` (`None` when no target is given).
    """
    if not isinstance(image, Image.Image):
        image = self.to_pil_image(image)

    def shortest_edge_output_size(image_size, shortest_edge, longest_edge=None):
        # Returns (width, height), the order PIL expects.
        width, height = image_size

        if longest_edge is not None:
            short_side = float(min((width, height)))
            long_side = float(max((width, height)))
            # Shrink the requested edge when the other edge would exceed `longest_edge`.
            if long_side / short_side * shortest_edge > longest_edge:
                shortest_edge = int(round(longest_edge * short_side / long_side))

        already_sized = (width <= height and width == shortest_edge) or (
            height <= width and height == shortest_edge
        )
        if already_sized:
            return (width, height)

        if width < height:
            return (shortest_edge, int(shortest_edge * height / width))
        return (int(shortest_edge * width / height), shortest_edge)

    # An explicit list/tuple is used as given; a scalar is expanded while keeping the aspect ratio.
    if not isinstance(size, (list, tuple)):
        size = shortest_edge_output_size(image.size, size, max_size)

    rescaled_image = self.resize(image, size=size)

    if target is None:
        return rescaled_image, None

    new_width, new_height = rescaled_image.size
    old_width, old_height = image.size
    ratio_width = float(new_width) / float(old_width)
    ratio_height = float(new_height) / float(old_height)

    target = target.copy()

    if "boxes" in target:
        box_scale = np.asarray([ratio_width, ratio_height, ratio_width, ratio_height], dtype=np.float32)
        target["boxes"] = target["boxes"] * box_scale

    if "area" in target:
        target["area"] = target["area"] * (ratio_width * ratio_height)

    w, h = size
    target["size"] = np.asarray([h, w], dtype=np.int64)

    if "masks" in target:
        # use PyTorch as current workaround
        # TODO replace by self.resize
        mask_batch = torch.from_numpy(target["masks"][:, None]).float()
        resized_masks = nn.functional.interpolate(mask_batch, size=(h, w), mode="nearest")[:, 0] > 0.5
        target["masks"] = resized_masks.numpy()

    return rescaled_image, target
|
|
||||||
|
|
||||||
# Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor._normalize
|
|
||||||
def _normalize(self, image, mean, std, target=None):
    """
    Normalize the image with a certain mean and std.

    If given, also normalize the target bounding boxes based on the size of the image: boxes are passed
    through `corners_to_center_format` and divided by `[width, height, width, height]`, where width and
    height are the last two dimensions of the normalized image.

    Returns:
        `Tuple`: The normalized image and the updated copy of `target` (`None` when no target is given).
    """
    normalized_image = self.normalize(image, mean=mean, std=std)

    if target is None:
        return normalized_image, None

    target = target.copy()
    height, width = normalized_image.shape[-2:]

    if "boxes" in target:
        center_boxes = corners_to_center_format(target["boxes"])
        box_scale = np.asarray([width, height, width, height], dtype=np.float32)
        target["boxes"] = center_boxes / box_scale

    return normalized_image, target
|
|
||||||
|
|
||||||
def __call__(
    self,
    images: ImageInput,
    annotations: Union[List[Dict], List[List[Dict]]] = None,
    return_segmentation_masks: Optional[bool] = False,
    masks_path: Optional[pathlib.Path] = None,
    padding: Optional[bool] = True,
    return_tensors: Optional[Union[str, TensorType]] = None,
    **kwargs,
) -> BatchFeature:
    """
    Main method to prepare for the model one or several image(s) and optional annotations. Images are by default
    padded up to the largest image in a batch.

    <Tip warning={true}>

    NumPy arrays and PyTorch tensors are converted to PIL images when resizing, so the most efficient is to pass
    PIL images.

    </Tip>

    Args:
        images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
            The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
            tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a
            number of channels, H and W are image height and width. The sequence passed in is never modified.

        annotations (`Dict`, `List[Dict]`, *optional*):
            The corresponding annotations in COCO format.

            In case [`DetrFeatureExtractor`] was initialized with `format = "coco_detection"`, the annotations for
            each image should have the following format: {'image_id': int, 'annotations': [annotation]}, with the
            annotations being a list of COCO object annotations.

            In case [`DetrFeatureExtractor`] was initialized with `format = "coco_panoptic"`, the annotations for
            each image should have the following format: {'image_id': int, 'file_name': str, 'segments_info':
            [segment_info]} with segments_info being a list of COCO panoptic annotations.

        return_segmentation_masks (`bool`, *optional*, defaults to `False`):
            Whether to also include instance segmentation masks as part of the labels in case `format =
            "coco_detection"`.

        masks_path (`pathlib.Path`, *optional*):
            Path to the directory containing the PNG files that store the class-agnostic image segmentations. Only
            relevant in case [`DetrFeatureExtractor`] was initialized with `format = "coco_panoptic"`.

        padding (`bool`, *optional*, defaults to `True`):
            Whether or not to pad images up to the largest image in a batch.

        return_tensors (`str` or [`~utils.TensorType`], *optional*):
            If set, will return tensors instead of NumPy arrays. If set to `'pt'`, return PyTorch `torch.Tensor`
            objects.

    Returns:
        [`BatchFeature`]: A [`BatchFeature`] with the following fields:

        - **pixel_values** -- Pixel values to be fed to a model.
        - **labels** -- Optional labels to be fed to a model (when `annotations` are provided)

    Raises:
        ValueError: If `images`, `annotations` or `masks_path` have an unsupported type or layout, if the
            number of annotations differs from the number of images, or if annotations are given and
            `return_tensors` does not select PyTorch.
        ImportError: If annotations are given and PyTorch is not available.
    """
    # Input type checking for clearer error

    valid_images = False
    valid_annotations = False
    valid_masks_path = False

    # Check that images has a valid type
    if isinstance(images, (Image.Image, np.ndarray)) or is_torch_tensor(images):
        valid_images = True
    elif isinstance(images, (list, tuple)):
        if len(images) == 0 or isinstance(images[0], (Image.Image, np.ndarray)) or is_torch_tensor(images[0]):
            valid_images = True

    if not valid_images:
        raise ValueError(
            "Images must be of type `PIL.Image.Image`, `np.ndarray` or `torch.Tensor` (single example), "
            "`List[PIL.Image.Image]`, `List[np.ndarray]` or `List[torch.Tensor]` (batch of examples)."
        )

    # NOTE(review): an empty list passes the check above but `images[0]` below raises IndexError;
    # confirm whether empty batches are meant to be supported.
    is_batched = bool(
        isinstance(images, (list, tuple))
        and (isinstance(images[0], (Image.Image, np.ndarray)) or is_torch_tensor(images[0]))
    )

    # Check that annotations has a valid type
    if annotations is not None:
        if not is_batched:
            if self.format == "coco_detection":
                if isinstance(annotations, dict) and "image_id" in annotations and "annotations" in annotations:
                    if isinstance(annotations["annotations"], (list, tuple)):
                        # an image can have no annotations
                        if len(annotations["annotations"]) == 0 or isinstance(annotations["annotations"][0], dict):
                            valid_annotations = True
            elif self.format == "coco_panoptic":
                if isinstance(annotations, dict) and "image_id" in annotations and "segments_info" in annotations:
                    if isinstance(annotations["segments_info"], (list, tuple)):
                        # an image can have no segments (?)
                        if len(annotations["segments_info"]) == 0 or isinstance(
                            annotations["segments_info"][0], dict
                        ):
                            valid_annotations = True
        else:
            if isinstance(annotations, (list, tuple)):
                if len(images) != len(annotations):
                    raise ValueError("There must be as many annotations as there are images")
                # only the first element is inspected
                if isinstance(annotations[0], dict):
                    if self.format == "coco_detection":
                        if isinstance(annotations[0]["annotations"], (list, tuple)):
                            valid_annotations = True
                    elif self.format == "coco_panoptic":
                        if isinstance(annotations[0]["segments_info"], (list, tuple)):
                            valid_annotations = True

        if not valid_annotations:
            raise ValueError(
                "Annotations must be of type `Dict` (single image) or `List[Dict]` (batch of images). In case of"
                " object detection, each dictionary should contain the keys 'image_id' and 'annotations', with"
                " the latter being a list of annotations in COCO format. In case of panoptic segmentation, each"
                " dictionary should contain the keys 'file_name', 'image_id' and 'segments_info', with the latter"
                " being a list of annotations in COCO format."
            )

    # Check that masks_path has a valid type
    if masks_path is not None:
        if self.format == "coco_panoptic":
            if isinstance(masks_path, pathlib.Path):
                valid_masks_path = True
        if not valid_masks_path:
            raise ValueError(
                "The path to the directory containing the mask PNG files should be provided as a"
                " `pathlib.Path` object."
            )

    if not is_batched:
        images = [images]
        if annotations is not None:
            annotations = [annotations]
    else:
        # The steps below assign by index, so work on shallow list copies: tuples (accepted by the
        # validation above) would otherwise raise TypeError and the caller's lists would be overwritten.
        images = list(images)
        if annotations is not None:
            annotations = list(annotations)

    # prepare (COCO annotations as a list of Dict -> DETR target as a single Dict per image)
    if annotations is not None:
        for idx, (image, target) in enumerate(zip(images, annotations)):
            if not isinstance(image, Image.Image):
                image = self.to_pil_image(image)
            image, target = self.prepare(image, target, return_segmentation_masks, masks_path)
            images[idx] = image
            annotations[idx] = target

    # transformations (resizing + normalization)
    if self.do_resize and self.size is not None:
        if annotations is not None:
            for idx, (image, target) in enumerate(zip(images, annotations)):
                image, target = self._resize(image=image, target=target, size=self.size, max_size=self.max_size)
                images[idx] = image
                annotations[idx] = target
        else:
            for idx, image in enumerate(images):
                images[idx] = self._resize(image=image, target=None, size=self.size, max_size=self.max_size)[0]

    if self.do_normalize:
        if annotations is not None:
            for idx, (image, target) in enumerate(zip(images, annotations)):
                image, target = self._normalize(
                    image=image, mean=self.image_mean, std=self.image_std, target=target
                )
                images[idx] = image
                annotations[idx] = target
        else:
            images = [
                self._normalize(image=image, mean=self.image_mean, std=self.image_std)[0] for image in images
            ]

    if padding:
        # pad images up to largest image in batch
        max_size = self._max_by_axis([list(image.shape) for image in images])
        c, h, w = max_size
        padded_images = []
        for image in images:
            # create padded image
            padded_image = np.zeros((c, h, w), dtype=np.float32)
            padded_image[: image.shape[0], : image.shape[1], : image.shape[2]] = np.copy(image)
            padded_images.append(padded_image)
        images = padded_images

    # return as BatchFeature
    data = {}
    data["pixel_values"] = images
    encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors)

    if annotations is not None:
        # Convert to TensorType
        tensor_type = return_tensors
        if not isinstance(tensor_type, TensorType):
            tensor_type = TensorType(tensor_type)

        if not tensor_type == TensorType.PYTORCH:
            raise ValueError("Only PyTorch is supported for the moment.")
        else:
            if not is_torch_available():
                raise ImportError("Unable to convert output to PyTorch tensors format, PyTorch is not installed.")

            encoded_inputs["labels"] = [
                {k: torch.from_numpy(v) for k, v in target.items()} for target in annotations
            ]

    return encoded_inputs
|
|
||||||
|
|
||||||
# Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor._max_by_axis
|
|
||||||
def _max_by_axis(self, the_list):
|
|
||||||
# type: (List[List[int]]) -> List[int]
|
|
||||||
maxes = the_list[0]
|
|
||||||
for sublist in the_list[1:]:
|
|
||||||
for index, item in enumerate(sublist):
|
|
||||||
maxes[index] = max(maxes[index], item)
|
|
||||||
return maxes
|
|
||||||
|
|
||||||
def pad(self, pixel_values_list: List["torch.Tensor"], return_tensors: Optional[Union[str, TensorType]] = None):
    """
    Pad images up to the largest image in a batch.

    Every image is copied into the top-left corner of a zero-filled `float32` array whose shape is the
    per-axis maximum over all input shapes.

    Args:
        pixel_values_list (`List[torch.Tensor]`):
            List of images (pixel values) to be padded. Each image should be a tensor of shape (C, H, W).
        return_tensors (`str` or [`~utils.TensorType`], *optional*):
            If set, will return tensors instead of NumPy arrays. If set to `'pt'`, return PyTorch `torch.Tensor`
            objects.

    Returns:
        [`BatchFeature`]: A [`BatchFeature`] with the following field:

        - **pixel_values** -- Pixel values to be fed to a model.

    """
    shapes = [list(image.shape) for image in pixel_values_list]
    channels, max_height, max_width = self._max_by_axis(shapes)

    def pad_to_canvas(image):
        canvas = np.zeros((channels, max_height, max_width), dtype=np.float32)
        canvas[: image.shape[0], : image.shape[1], : image.shape[2]] = np.copy(image)
        return canvas

    padded_images = [pad_to_canvas(image) for image in pixel_values_list]

    # return as BatchFeature
    return BatchFeature(data={"pixel_values": padded_images}, tensor_type=return_tensors)
|
|
||||||
|
|
||||||
# Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor.post_process
|
|
||||||
def post_process(self, outputs, target_sizes):
    """
    Converts the output of [`DetrForObjectDetection`] into the format expected by the COCO api. Only supports
    PyTorch.

    Deprecated: emits a `FutureWarning` pointing to `post_process_object_detection`.

    Args:
        outputs ([`DetrObjectDetectionOutput`]):
            Raw outputs of the model.
        target_sizes (`torch.Tensor` of shape `(batch_size, 2)`):
            Tensor containing the size (height, width) of each image of the batch. For evaluation, this must be the
            original image size (before any data augmentation). For visualization, this should be the image size
            after data augment, but before padding.

    Returns:
        `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image
        in the batch as predicted by the model.

    Raises:
        ValueError: If the number of target sizes differs from the batch dimension of the logits, or if
            `target_sizes` does not have two columns.
    """
    warnings.warn(
        "`post_process` is deprecated and will be removed in v5 of Transformers, please use"
        " `post_process_object_detection`",
        FutureWarning,
    )

    logits = outputs.logits
    pred_boxes = outputs.pred_boxes

    if len(logits) != len(target_sizes):
        raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the logits")
    if target_sizes.shape[1] != 2:
        raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch")

    # The last class index is excluded before taking the best class per query.
    probabilities = nn.functional.softmax(logits, -1)
    scores, labels = probabilities[..., :-1].max(-1)

    # convert to [x0, y0, x1, y1] format
    corner_boxes = center_to_corners_format(pred_boxes)
    # and from relative [0, 1] to absolute [0, height] coordinates
    heights, widths = target_sizes.unbind(1)
    scale = torch.stack([widths, heights, widths, heights], dim=1)
    corner_boxes = corner_boxes * scale[:, None, :]

    results = []
    for image_scores, image_labels, image_boxes in zip(scores, labels, corner_boxes):
        results.append({"scores": image_scores, "labels": image_labels, "boxes": image_boxes})
    return results
|
|
||||||
|
|
||||||
# Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor.post_process_object_detection with Detr->Yolos
|
|
||||||
def post_process_object_detection(
    self, outputs, threshold: float = 0.5, target_sizes: Union[TensorType, List[Tuple]] = None
):
    """
    Converts the output of [`YolosForObjectDetection`] into the format expected by the COCO api. Only supports
    PyTorch.

    Args:
        outputs ([`YolosObjectDetectionOutput`]):
            Raw outputs of the model.
        threshold (`float`, *optional*):
            Score threshold to keep object detection predictions.
        target_sizes (`torch.Tensor` or `List[Tuple[int, int]]`, *optional*, defaults to `None`):
            Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size
            (height, width) of each image in the batch. If left to None, predictions will not be resized.

    Returns:
        `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image
        in the batch as predicted by the model.

    Raises:
        ValueError: If `target_sizes` is given and its length differs from the batch dimension of the logits.
    """
    logits = outputs.logits
    pred_boxes = outputs.pred_boxes

    if target_sizes is not None and len(logits) != len(target_sizes):
        raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the logits")

    # The last class index is excluded before taking the best class per query.
    probabilities = nn.functional.softmax(logits, -1)
    scores, labels = probabilities[..., :-1].max(-1)

    # Convert to [x0, y0, x1, y1] format
    boxes = center_to_corners_format(pred_boxes)

    # Convert from relative [0, 1] to absolute [0, height] coordinates
    if target_sizes is not None:
        if isinstance(target_sizes, List):
            heights = torch.Tensor([size[0] for size in target_sizes])
            widths = torch.Tensor([size[1] for size in target_sizes])
        else:
            heights, widths = target_sizes.unbind(1)

        scale = torch.stack([widths, heights, widths, heights], dim=1).to(boxes.device)
        boxes = boxes * scale[:, None, :]

    results = []
    for image_scores, image_labels, image_boxes in zip(scores, labels, boxes):
        # one mask per image, reused for scores, labels and boxes
        keep = image_scores > threshold
        results.append(
            {"scores": image_scores[keep], "labels": image_labels[keep], "boxes": image_boxes[keep]}
        )

    return results
|
|
||||||
|
|||||||
1214
src/transformers/models/yolos/image_processing_yolos.py
Normal file
1214
src/transformers/models/yolos/image_processing_yolos.py
Normal file
File diff suppressed because it is too large
Load Diff
@@ -64,6 +64,13 @@ class ConditionalDetrFeatureExtractor(metaclass=DummyObject):
|
|||||||
requires_backends(self, ["vision"])
|
requires_backends(self, ["vision"])
|
||||||
|
|
||||||
|
|
||||||
|
class ConditionalDetrImageProcessor(metaclass=DummyObject):
    # Dummy object that declares a dependency on the "vision" backend.
    _backends = ["vision"]

    def __init__(self, *args, **kwargs):
        # Arguments are ignored; construction only runs the backend check.
        # NOTE(review): presumably `requires_backends` raises when "vision" is unavailable; confirm.
        requires_backends(self, ["vision"])
|
||||||
|
|
||||||
|
|
||||||
class ConvNextFeatureExtractor(metaclass=DummyObject):
|
class ConvNextFeatureExtractor(metaclass=DummyObject):
|
||||||
_backends = ["vision"]
|
_backends = ["vision"]
|
||||||
|
|
||||||
@@ -85,6 +92,13 @@ class DeformableDetrFeatureExtractor(metaclass=DummyObject):
|
|||||||
requires_backends(self, ["vision"])
|
requires_backends(self, ["vision"])
|
||||||
|
|
||||||
|
|
||||||
|
class DeformableDetrImageProcessor(metaclass=DummyObject):
    # Dummy object that declares a dependency on the "vision" backend.
    _backends = ["vision"]

    def __init__(self, *args, **kwargs):
        # Arguments are ignored; construction only runs the backend check.
        # NOTE(review): presumably `requires_backends` raises when "vision" is unavailable; confirm.
        requires_backends(self, ["vision"])
|
||||||
|
|
||||||
|
|
||||||
class DeiTFeatureExtractor(metaclass=DummyObject):
|
class DeiTFeatureExtractor(metaclass=DummyObject):
|
||||||
_backends = ["vision"]
|
_backends = ["vision"]
|
||||||
|
|
||||||
@@ -106,6 +120,13 @@ class DetrFeatureExtractor(metaclass=DummyObject):
|
|||||||
requires_backends(self, ["vision"])
|
requires_backends(self, ["vision"])
|
||||||
|
|
||||||
|
|
||||||
|
class DetrImageProcessor(metaclass=DummyObject):
    # Dummy object that declares a dependency on the "vision" backend.
    _backends = ["vision"]

    def __init__(self, *args, **kwargs):
        # Arguments are ignored; construction only runs the backend check.
        # NOTE(review): presumably `requires_backends` raises when "vision" is unavailable; confirm.
        requires_backends(self, ["vision"])
|
||||||
|
|
||||||
|
|
||||||
class DonutFeatureExtractor(metaclass=DummyObject):
|
class DonutFeatureExtractor(metaclass=DummyObject):
|
||||||
_backends = ["vision"]
|
_backends = ["vision"]
|
||||||
|
|
||||||
@@ -232,6 +253,13 @@ class MaskFormerFeatureExtractor(metaclass=DummyObject):
|
|||||||
requires_backends(self, ["vision"])
|
requires_backends(self, ["vision"])
|
||||||
|
|
||||||
|
|
||||||
|
class MaskFormerImageProcessor(metaclass=DummyObject):
    # Dummy object that declares a dependency on the "vision" backend.
    _backends = ["vision"]

    def __init__(self, *args, **kwargs):
        # Arguments are ignored; construction only runs the backend check.
        # NOTE(review): presumably `requires_backends` raises when "vision" is unavailable; confirm.
        requires_backends(self, ["vision"])
|
||||||
|
|
||||||
|
|
||||||
class MobileNetV1FeatureExtractor(metaclass=DummyObject):
|
class MobileNetV1FeatureExtractor(metaclass=DummyObject):
|
||||||
_backends = ["vision"]
|
_backends = ["vision"]
|
||||||
|
|
||||||
@@ -281,6 +309,13 @@ class OwlViTFeatureExtractor(metaclass=DummyObject):
|
|||||||
requires_backends(self, ["vision"])
|
requires_backends(self, ["vision"])
|
||||||
|
|
||||||
|
|
||||||
|
class OwlViTImageProcessor(metaclass=DummyObject):
    # Dummy object that declares a dependency on the "vision" backend.
    _backends = ["vision"]

    def __init__(self, *args, **kwargs):
        # Arguments are ignored; construction only runs the backend check.
        # NOTE(review): presumably `requires_backends` raises when "vision" is unavailable; confirm.
        requires_backends(self, ["vision"])
|
||||||
|
|
||||||
|
|
||||||
class PerceiverFeatureExtractor(metaclass=DummyObject):
|
class PerceiverFeatureExtractor(metaclass=DummyObject):
|
||||||
_backends = ["vision"]
|
_backends = ["vision"]
|
||||||
|
|
||||||
@@ -377,3 +412,10 @@ class YolosFeatureExtractor(metaclass=DummyObject):
|
|||||||
|
|
||||||
def __init__(self, *args, **kwargs):
|
def __init__(self, *args, **kwargs):
|
||||||
requires_backends(self, ["vision"])
|
requires_backends(self, ["vision"])
|
||||||
|
|
||||||
|
|
||||||
|
class YolosImageProcessor(metaclass=DummyObject):
    # Dummy object that declares a dependency on the "vision" backend.
    _backends = ["vision"]

    def __init__(self, *args, **kwargs):
        # Arguments are ignored; construction only runs the backend check.
        # NOTE(review): presumably `requires_backends` raises when "vision" is unavailable; confirm.
        requires_backends(self, ["vision"])
|
||||||
|
|||||||
@@ -44,12 +44,16 @@ class ConditionalDetrFeatureExtractionTester(unittest.TestCase):
|
|||||||
min_resolution=30,
|
min_resolution=30,
|
||||||
max_resolution=400,
|
max_resolution=400,
|
||||||
do_resize=True,
|
do_resize=True,
|
||||||
size=18,
|
size=None,
|
||||||
max_size=1333, # by setting max_size > max_resolution we're effectively not testing this :p
|
|
||||||
do_normalize=True,
|
do_normalize=True,
|
||||||
image_mean=[0.5, 0.5, 0.5],
|
image_mean=[0.5, 0.5, 0.5],
|
||||||
image_std=[0.5, 0.5, 0.5],
|
image_std=[0.5, 0.5, 0.5],
|
||||||
|
do_rescale=True,
|
||||||
|
rescale_factor=1 / 255,
|
||||||
|
do_pad=True,
|
||||||
):
|
):
|
||||||
|
# by setting size["longest_edge"] > max_resolution we're effectively not testing this :p
|
||||||
|
size = size if size is not None else {"shortest_edge": 18, "longest_edge": 1333}
|
||||||
self.parent = parent
|
self.parent = parent
|
||||||
self.batch_size = batch_size
|
self.batch_size = batch_size
|
||||||
self.num_channels = num_channels
|
self.num_channels = num_channels
|
||||||
@@ -57,19 +61,23 @@ class ConditionalDetrFeatureExtractionTester(unittest.TestCase):
|
|||||||
self.max_resolution = max_resolution
|
self.max_resolution = max_resolution
|
||||||
self.do_resize = do_resize
|
self.do_resize = do_resize
|
||||||
self.size = size
|
self.size = size
|
||||||
self.max_size = max_size
|
|
||||||
self.do_normalize = do_normalize
|
self.do_normalize = do_normalize
|
||||||
self.image_mean = image_mean
|
self.image_mean = image_mean
|
||||||
self.image_std = image_std
|
self.image_std = image_std
|
||||||
|
self.do_rescale = do_rescale
|
||||||
|
self.rescale_factor = rescale_factor
|
||||||
|
self.do_pad = do_pad
|
||||||
|
|
||||||
def prepare_feat_extract_dict(self):
|
def prepare_feat_extract_dict(self):
|
||||||
return {
|
return {
|
||||||
"do_resize": self.do_resize,
|
"do_resize": self.do_resize,
|
||||||
"size": self.size,
|
"size": self.size,
|
||||||
"max_size": self.max_size,
|
|
||||||
"do_normalize": self.do_normalize,
|
"do_normalize": self.do_normalize,
|
||||||
"image_mean": self.image_mean,
|
"image_mean": self.image_mean,
|
||||||
"image_std": self.image_std,
|
"image_std": self.image_std,
|
||||||
|
"do_rescale": self.do_rescale,
|
||||||
|
"rescale_factor": self.rescale_factor,
|
||||||
|
"do_pad": self.do_pad,
|
||||||
}
|
}
|
||||||
|
|
||||||
def get_expected_values(self, image_inputs, batched=False):
|
def get_expected_values(self, image_inputs, batched=False):
|
||||||
@@ -84,14 +92,14 @@ class ConditionalDetrFeatureExtractionTester(unittest.TestCase):
|
|||||||
else:
|
else:
|
||||||
h, w = image.shape[1], image.shape[2]
|
h, w = image.shape[1], image.shape[2]
|
||||||
if w < h:
|
if w < h:
|
||||||
expected_height = int(self.size * h / w)
|
expected_height = int(self.size["shortest_edge"] * h / w)
|
||||||
expected_width = self.size
|
expected_width = self.size["shortest_edge"]
|
||||||
elif w > h:
|
elif w > h:
|
||||||
expected_height = self.size
|
expected_height = self.size["shortest_edge"]
|
||||||
expected_width = int(self.size * w / h)
|
expected_width = int(self.size["shortest_edge"] * w / h)
|
||||||
else:
|
else:
|
||||||
expected_height = self.size
|
expected_height = self.size["shortest_edge"]
|
||||||
expected_width = self.size
|
expected_width = self.size["shortest_edge"]
|
||||||
|
|
||||||
else:
|
else:
|
||||||
expected_values = []
|
expected_values = []
|
||||||
@@ -124,7 +132,6 @@ class ConditionalDetrFeatureExtractionTest(FeatureExtractionSavingTestMixin, uni
|
|||||||
self.assertTrue(hasattr(feature_extractor, "do_normalize"))
|
self.assertTrue(hasattr(feature_extractor, "do_normalize"))
|
||||||
self.assertTrue(hasattr(feature_extractor, "do_resize"))
|
self.assertTrue(hasattr(feature_extractor, "do_resize"))
|
||||||
self.assertTrue(hasattr(feature_extractor, "size"))
|
self.assertTrue(hasattr(feature_extractor, "size"))
|
||||||
self.assertTrue(hasattr(feature_extractor, "max_size"))
|
|
||||||
|
|
||||||
def test_batch_feature(self):
|
def test_batch_feature(self):
|
||||||
pass
|
pass
|
||||||
@@ -230,7 +237,7 @@ class ConditionalDetrFeatureExtractionTest(FeatureExtractionSavingTestMixin, uni
|
|||||||
def test_equivalence_pad_and_create_pixel_mask(self):
|
def test_equivalence_pad_and_create_pixel_mask(self):
|
||||||
# Initialize feature_extractors
|
# Initialize feature_extractors
|
||||||
feature_extractor_1 = self.feature_extraction_class(**self.feat_extract_dict)
|
feature_extractor_1 = self.feature_extraction_class(**self.feat_extract_dict)
|
||||||
feature_extractor_2 = self.feature_extraction_class(do_resize=False, do_normalize=False)
|
feature_extractor_2 = self.feature_extraction_class(do_resize=False, do_normalize=False, do_rescale=False)
|
||||||
# create random PyTorch tensors
|
# create random PyTorch tensors
|
||||||
image_inputs = prepare_image_inputs(self.feature_extract_tester, equal_resolution=False, torchify=True)
|
image_inputs = prepare_image_inputs(self.feature_extract_tester, equal_resolution=False, torchify=True)
|
||||||
for image in image_inputs:
|
for image in image_inputs:
|
||||||
@@ -331,7 +338,7 @@ class ConditionalDetrFeatureExtractionTest(FeatureExtractionSavingTestMixin, uni
|
|||||||
expected_class_labels = torch.tensor([17, 17, 63, 75, 75, 93])
|
expected_class_labels = torch.tensor([17, 17, 63, 75, 75, 93])
|
||||||
self.assertTrue(torch.allclose(encoding["labels"][0]["class_labels"], expected_class_labels))
|
self.assertTrue(torch.allclose(encoding["labels"][0]["class_labels"], expected_class_labels))
|
||||||
# verify masks
|
# verify masks
|
||||||
expected_masks_sum = 822338
|
expected_masks_sum = 822873
|
||||||
self.assertEqual(encoding["labels"][0]["masks"].sum().item(), expected_masks_sum)
|
self.assertEqual(encoding["labels"][0]["masks"].sum().item(), expected_masks_sum)
|
||||||
# verify orig_size
|
# verify orig_size
|
||||||
expected_orig_size = torch.tensor([480, 640])
|
expected_orig_size = torch.tensor([480, 640])
|
||||||
|
|||||||
@@ -44,12 +44,16 @@ class DeformableDetrFeatureExtractionTester(unittest.TestCase):
|
|||||||
min_resolution=30,
|
min_resolution=30,
|
||||||
max_resolution=400,
|
max_resolution=400,
|
||||||
do_resize=True,
|
do_resize=True,
|
||||||
size=18,
|
size=None,
|
||||||
max_size=1333, # by setting max_size > max_resolution we're effectively not testing this :p
|
|
||||||
do_normalize=True,
|
do_normalize=True,
|
||||||
image_mean=[0.5, 0.5, 0.5],
|
image_mean=[0.5, 0.5, 0.5],
|
||||||
image_std=[0.5, 0.5, 0.5],
|
image_std=[0.5, 0.5, 0.5],
|
||||||
|
do_rescale=True,
|
||||||
|
rescale_factor=1 / 255,
|
||||||
|
do_pad=True,
|
||||||
):
|
):
|
||||||
|
# by setting size["longest_edge"] > max_resolution we're effectively not testing this :p
|
||||||
|
size = size if size is not None else {"shortest_edge": 18, "longest_edge": 1333}
|
||||||
self.parent = parent
|
self.parent = parent
|
||||||
self.batch_size = batch_size
|
self.batch_size = batch_size
|
||||||
self.num_channels = num_channels
|
self.num_channels = num_channels
|
||||||
@@ -57,19 +61,23 @@ class DeformableDetrFeatureExtractionTester(unittest.TestCase):
|
|||||||
self.max_resolution = max_resolution
|
self.max_resolution = max_resolution
|
||||||
self.do_resize = do_resize
|
self.do_resize = do_resize
|
||||||
self.size = size
|
self.size = size
|
||||||
self.max_size = max_size
|
|
||||||
self.do_normalize = do_normalize
|
self.do_normalize = do_normalize
|
||||||
self.image_mean = image_mean
|
self.image_mean = image_mean
|
||||||
self.image_std = image_std
|
self.image_std = image_std
|
||||||
|
self.do_rescale = do_rescale
|
||||||
|
self.rescale_factor = rescale_factor
|
||||||
|
self.do_pad = do_pad
|
||||||
|
|
||||||
def prepare_feat_extract_dict(self):
|
def prepare_feat_extract_dict(self):
|
||||||
return {
|
return {
|
||||||
"do_resize": self.do_resize,
|
"do_resize": self.do_resize,
|
||||||
"size": self.size,
|
"size": self.size,
|
||||||
"max_size": self.max_size,
|
|
||||||
"do_normalize": self.do_normalize,
|
"do_normalize": self.do_normalize,
|
||||||
"image_mean": self.image_mean,
|
"image_mean": self.image_mean,
|
||||||
"image_std": self.image_std,
|
"image_std": self.image_std,
|
||||||
|
"do_rescale": self.do_rescale,
|
||||||
|
"rescale_factor": self.rescale_factor,
|
||||||
|
"do_pad": self.do_pad,
|
||||||
}
|
}
|
||||||
|
|
||||||
def get_expected_values(self, image_inputs, batched=False):
|
def get_expected_values(self, image_inputs, batched=False):
|
||||||
@@ -84,14 +92,14 @@ class DeformableDetrFeatureExtractionTester(unittest.TestCase):
|
|||||||
else:
|
else:
|
||||||
h, w = image.shape[1], image.shape[2]
|
h, w = image.shape[1], image.shape[2]
|
||||||
if w < h:
|
if w < h:
|
||||||
expected_height = int(self.size * h / w)
|
expected_height = int(self.size["shortest_edge"] * h / w)
|
||||||
expected_width = self.size
|
expected_width = self.size["shortest_edge"]
|
||||||
elif w > h:
|
elif w > h:
|
||||||
expected_height = self.size
|
expected_height = self.size["shortest_edge"]
|
||||||
expected_width = int(self.size * w / h)
|
expected_width = int(self.size["shortest_edge"] * w / h)
|
||||||
else:
|
else:
|
||||||
expected_height = self.size
|
expected_height = self.size["shortest_edge"]
|
||||||
expected_width = self.size
|
expected_width = self.size["shortest_edge"]
|
||||||
|
|
||||||
else:
|
else:
|
||||||
expected_values = []
|
expected_values = []
|
||||||
@@ -123,8 +131,9 @@ class DeformableDetrFeatureExtractionTest(FeatureExtractionSavingTestMixin, unit
|
|||||||
self.assertTrue(hasattr(feature_extractor, "image_std"))
|
self.assertTrue(hasattr(feature_extractor, "image_std"))
|
||||||
self.assertTrue(hasattr(feature_extractor, "do_normalize"))
|
self.assertTrue(hasattr(feature_extractor, "do_normalize"))
|
||||||
self.assertTrue(hasattr(feature_extractor, "do_resize"))
|
self.assertTrue(hasattr(feature_extractor, "do_resize"))
|
||||||
|
self.assertTrue(hasattr(feature_extractor, "do_rescale"))
|
||||||
|
self.assertTrue(hasattr(feature_extractor, "do_pad"))
|
||||||
self.assertTrue(hasattr(feature_extractor, "size"))
|
self.assertTrue(hasattr(feature_extractor, "size"))
|
||||||
self.assertTrue(hasattr(feature_extractor, "max_size"))
|
|
||||||
|
|
||||||
def test_batch_feature(self):
|
def test_batch_feature(self):
|
||||||
pass
|
pass
|
||||||
@@ -230,7 +239,8 @@ class DeformableDetrFeatureExtractionTest(FeatureExtractionSavingTestMixin, unit
|
|||||||
def test_equivalence_pad_and_create_pixel_mask(self):
|
def test_equivalence_pad_and_create_pixel_mask(self):
|
||||||
# Initialize feature_extractors
|
# Initialize feature_extractors
|
||||||
feature_extractor_1 = self.feature_extraction_class(**self.feat_extract_dict)
|
feature_extractor_1 = self.feature_extraction_class(**self.feat_extract_dict)
|
||||||
feature_extractor_2 = self.feature_extraction_class(do_resize=False, do_normalize=False)
|
feature_extractor_2 = self.feature_extraction_class(do_resize=False, do_normalize=False, do_rescale=False)
|
||||||
|
|
||||||
# create random PyTorch tensors
|
# create random PyTorch tensors
|
||||||
image_inputs = prepare_image_inputs(self.feature_extract_tester, equal_resolution=False, torchify=True)
|
image_inputs = prepare_image_inputs(self.feature_extract_tester, equal_resolution=False, torchify=True)
|
||||||
for image in image_inputs:
|
for image in image_inputs:
|
||||||
@@ -331,7 +341,7 @@ class DeformableDetrFeatureExtractionTest(FeatureExtractionSavingTestMixin, unit
|
|||||||
expected_class_labels = torch.tensor([17, 17, 63, 75, 75, 93])
|
expected_class_labels = torch.tensor([17, 17, 63, 75, 75, 93])
|
||||||
self.assertTrue(torch.allclose(encoding["labels"][0]["class_labels"], expected_class_labels))
|
self.assertTrue(torch.allclose(encoding["labels"][0]["class_labels"], expected_class_labels))
|
||||||
# verify masks
|
# verify masks
|
||||||
expected_masks_sum = 822338
|
expected_masks_sum = 822873
|
||||||
self.assertEqual(encoding["labels"][0]["masks"].sum().item(), expected_masks_sum)
|
self.assertEqual(encoding["labels"][0]["masks"].sum().item(), expected_masks_sum)
|
||||||
# verify orig_size
|
# verify orig_size
|
||||||
expected_orig_size = torch.tensor([480, 640])
|
expected_orig_size = torch.tensor([480, 640])
|
||||||
|
|||||||
@@ -44,12 +44,16 @@ class DetrFeatureExtractionTester(unittest.TestCase):
|
|||||||
min_resolution=30,
|
min_resolution=30,
|
||||||
max_resolution=400,
|
max_resolution=400,
|
||||||
do_resize=True,
|
do_resize=True,
|
||||||
size=18,
|
size=None,
|
||||||
max_size=1333, # by setting max_size > max_resolution we're effectively not testing this :p
|
do_rescale=True,
|
||||||
|
rescale_factor=1 / 255,
|
||||||
do_normalize=True,
|
do_normalize=True,
|
||||||
image_mean=[0.5, 0.5, 0.5],
|
image_mean=[0.5, 0.5, 0.5],
|
||||||
image_std=[0.5, 0.5, 0.5],
|
image_std=[0.5, 0.5, 0.5],
|
||||||
|
do_pad=True,
|
||||||
):
|
):
|
||||||
|
# by setting size["longest_edge"] > max_resolution we're effectively not testing this :p
|
||||||
|
size = size if size is not None else {"shortest_edge": 18, "longest_edge": 1333}
|
||||||
self.parent = parent
|
self.parent = parent
|
||||||
self.batch_size = batch_size
|
self.batch_size = batch_size
|
||||||
self.num_channels = num_channels
|
self.num_channels = num_channels
|
||||||
@@ -57,19 +61,23 @@ class DetrFeatureExtractionTester(unittest.TestCase):
|
|||||||
self.max_resolution = max_resolution
|
self.max_resolution = max_resolution
|
||||||
self.do_resize = do_resize
|
self.do_resize = do_resize
|
||||||
self.size = size
|
self.size = size
|
||||||
self.max_size = max_size
|
self.do_rescale = do_rescale
|
||||||
|
self.rescale_factor = rescale_factor
|
||||||
self.do_normalize = do_normalize
|
self.do_normalize = do_normalize
|
||||||
self.image_mean = image_mean
|
self.image_mean = image_mean
|
||||||
self.image_std = image_std
|
self.image_std = image_std
|
||||||
|
self.do_pad = do_pad
|
||||||
|
|
||||||
def prepare_feat_extract_dict(self):
|
def prepare_feat_extract_dict(self):
|
||||||
return {
|
return {
|
||||||
"do_resize": self.do_resize,
|
"do_resize": self.do_resize,
|
||||||
"size": self.size,
|
"size": self.size,
|
||||||
"max_size": self.max_size,
|
"do_rescale": self.do_rescale,
|
||||||
|
"rescale_factor": self.rescale_factor,
|
||||||
"do_normalize": self.do_normalize,
|
"do_normalize": self.do_normalize,
|
||||||
"image_mean": self.image_mean,
|
"image_mean": self.image_mean,
|
||||||
"image_std": self.image_std,
|
"image_std": self.image_std,
|
||||||
|
"do_pad": self.do_pad,
|
||||||
}
|
}
|
||||||
|
|
||||||
def get_expected_values(self, image_inputs, batched=False):
|
def get_expected_values(self, image_inputs, batched=False):
|
||||||
@@ -84,14 +92,14 @@ class DetrFeatureExtractionTester(unittest.TestCase):
|
|||||||
else:
|
else:
|
||||||
h, w = image.shape[1], image.shape[2]
|
h, w = image.shape[1], image.shape[2]
|
||||||
if w < h:
|
if w < h:
|
||||||
expected_height = int(self.size * h / w)
|
expected_height = int(self.size["shortest_edge"] * h / w)
|
||||||
expected_width = self.size
|
expected_width = self.size["shortest_edge"]
|
||||||
elif w > h:
|
elif w > h:
|
||||||
expected_height = self.size
|
expected_height = self.size["shortest_edge"]
|
||||||
expected_width = int(self.size * w / h)
|
expected_width = int(self.size["shortest_edge"] * w / h)
|
||||||
else:
|
else:
|
||||||
expected_height = self.size
|
expected_height = self.size["shortest_edge"]
|
||||||
expected_width = self.size
|
expected_width = self.size["shortest_edge"]
|
||||||
|
|
||||||
else:
|
else:
|
||||||
expected_values = []
|
expected_values = []
|
||||||
@@ -122,9 +130,11 @@ class DetrFeatureExtractionTest(FeatureExtractionSavingTestMixin, unittest.TestC
|
|||||||
self.assertTrue(hasattr(feature_extractor, "image_mean"))
|
self.assertTrue(hasattr(feature_extractor, "image_mean"))
|
||||||
self.assertTrue(hasattr(feature_extractor, "image_std"))
|
self.assertTrue(hasattr(feature_extractor, "image_std"))
|
||||||
self.assertTrue(hasattr(feature_extractor, "do_normalize"))
|
self.assertTrue(hasattr(feature_extractor, "do_normalize"))
|
||||||
|
self.assertTrue(hasattr(feature_extractor, "do_rescale"))
|
||||||
|
self.assertTrue(hasattr(feature_extractor, "rescale_factor"))
|
||||||
self.assertTrue(hasattr(feature_extractor, "do_resize"))
|
self.assertTrue(hasattr(feature_extractor, "do_resize"))
|
||||||
self.assertTrue(hasattr(feature_extractor, "size"))
|
self.assertTrue(hasattr(feature_extractor, "size"))
|
||||||
self.assertTrue(hasattr(feature_extractor, "max_size"))
|
self.assertTrue(hasattr(feature_extractor, "do_pad"))
|
||||||
|
|
||||||
def test_batch_feature(self):
|
def test_batch_feature(self):
|
||||||
pass
|
pass
|
||||||
@@ -230,7 +240,7 @@ class DetrFeatureExtractionTest(FeatureExtractionSavingTestMixin, unittest.TestC
|
|||||||
def test_equivalence_pad_and_create_pixel_mask(self):
|
def test_equivalence_pad_and_create_pixel_mask(self):
|
||||||
# Initialize feature_extractors
|
# Initialize feature_extractors
|
||||||
feature_extractor_1 = self.feature_extraction_class(**self.feat_extract_dict)
|
feature_extractor_1 = self.feature_extraction_class(**self.feat_extract_dict)
|
||||||
feature_extractor_2 = self.feature_extraction_class(do_resize=False, do_normalize=False)
|
feature_extractor_2 = self.feature_extraction_class(do_resize=False, do_normalize=False, do_rescale=False)
|
||||||
# create random PyTorch tensors
|
# create random PyTorch tensors
|
||||||
image_inputs = prepare_image_inputs(self.feature_extract_tester, equal_resolution=False, torchify=True)
|
image_inputs = prepare_image_inputs(self.feature_extract_tester, equal_resolution=False, torchify=True)
|
||||||
for image in image_inputs:
|
for image in image_inputs:
|
||||||
@@ -331,7 +341,7 @@ class DetrFeatureExtractionTest(FeatureExtractionSavingTestMixin, unittest.TestC
|
|||||||
expected_class_labels = torch.tensor([17, 17, 63, 75, 75, 93])
|
expected_class_labels = torch.tensor([17, 17, 63, 75, 75, 93])
|
||||||
self.assertTrue(torch.allclose(encoding["labels"][0]["class_labels"], expected_class_labels))
|
self.assertTrue(torch.allclose(encoding["labels"][0]["class_labels"], expected_class_labels))
|
||||||
# verify masks
|
# verify masks
|
||||||
expected_masks_sum = 822338
|
expected_masks_sum = 822873
|
||||||
self.assertEqual(encoding["labels"][0]["masks"].sum().item(), expected_masks_sum)
|
self.assertEqual(encoding["labels"][0]["masks"].sum().item(), expected_masks_sum)
|
||||||
# verify orig_size
|
# verify orig_size
|
||||||
expected_orig_size = torch.tensor([480, 640])
|
expected_orig_size = torch.tensor([480, 640])
|
||||||
|
|||||||
@@ -31,7 +31,7 @@ if is_torch_available():
|
|||||||
|
|
||||||
if is_vision_available():
|
if is_vision_available():
|
||||||
from transformers import MaskFormerFeatureExtractor
|
from transformers import MaskFormerFeatureExtractor
|
||||||
from transformers.models.maskformer.feature_extraction_maskformer import binary_mask_to_rle
|
from transformers.models.maskformer.image_processing_maskformer import binary_mask_to_rle
|
||||||
from transformers.models.maskformer.modeling_maskformer import MaskFormerForInstanceSegmentationOutput
|
from transformers.models.maskformer.modeling_maskformer import MaskFormerForInstanceSegmentationOutput
|
||||||
|
|
||||||
if is_vision_available():
|
if is_vision_available():
|
||||||
@@ -46,9 +46,8 @@ class MaskFormerFeatureExtractionTester(unittest.TestCase):
|
|||||||
num_channels=3,
|
num_channels=3,
|
||||||
min_resolution=30,
|
min_resolution=30,
|
||||||
max_resolution=400,
|
max_resolution=400,
|
||||||
|
size=None,
|
||||||
do_resize=True,
|
do_resize=True,
|
||||||
size=32,
|
|
||||||
max_size=1333, # by setting max_size > max_resolution we're effectively not testing this :p
|
|
||||||
do_normalize=True,
|
do_normalize=True,
|
||||||
image_mean=[0.5, 0.5, 0.5],
|
image_mean=[0.5, 0.5, 0.5],
|
||||||
image_std=[0.5, 0.5, 0.5],
|
image_std=[0.5, 0.5, 0.5],
|
||||||
@@ -62,12 +61,11 @@ class MaskFormerFeatureExtractionTester(unittest.TestCase):
|
|||||||
self.min_resolution = min_resolution
|
self.min_resolution = min_resolution
|
||||||
self.max_resolution = max_resolution
|
self.max_resolution = max_resolution
|
||||||
self.do_resize = do_resize
|
self.do_resize = do_resize
|
||||||
self.size = size
|
self.size = {"shortest_edge": 32, "longest_edge": 1333} if size is None else size
|
||||||
self.max_size = max_size
|
|
||||||
self.do_normalize = do_normalize
|
self.do_normalize = do_normalize
|
||||||
self.image_mean = image_mean
|
self.image_mean = image_mean
|
||||||
self.image_std = image_std
|
self.image_std = image_std
|
||||||
self.size_divisibility = 0
|
self.size_divisor = 0
|
||||||
# for the post_process_functions
|
# for the post_process_functions
|
||||||
self.batch_size = 2
|
self.batch_size = 2
|
||||||
self.num_queries = 3
|
self.num_queries = 3
|
||||||
@@ -82,11 +80,10 @@ class MaskFormerFeatureExtractionTester(unittest.TestCase):
|
|||||||
return {
|
return {
|
||||||
"do_resize": self.do_resize,
|
"do_resize": self.do_resize,
|
||||||
"size": self.size,
|
"size": self.size,
|
||||||
"max_size": self.max_size,
|
|
||||||
"do_normalize": self.do_normalize,
|
"do_normalize": self.do_normalize,
|
||||||
"image_mean": self.image_mean,
|
"image_mean": self.image_mean,
|
||||||
"image_std": self.image_std,
|
"image_std": self.image_std,
|
||||||
"size_divisibility": self.size_divisibility,
|
"size_divisor": self.size_divisor,
|
||||||
"num_labels": self.num_labels,
|
"num_labels": self.num_labels,
|
||||||
"reduce_labels": self.reduce_labels,
|
"reduce_labels": self.reduce_labels,
|
||||||
"ignore_index": self.ignore_index,
|
"ignore_index": self.ignore_index,
|
||||||
@@ -104,14 +101,14 @@ class MaskFormerFeatureExtractionTester(unittest.TestCase):
|
|||||||
else:
|
else:
|
||||||
h, w = image.shape[1], image.shape[2]
|
h, w = image.shape[1], image.shape[2]
|
||||||
if w < h:
|
if w < h:
|
||||||
expected_height = int(self.size * h / w)
|
expected_height = int(self.size["shortest_edge"] * h / w)
|
||||||
expected_width = self.size
|
expected_width = self.size["shortest_edge"]
|
||||||
elif w > h:
|
elif w > h:
|
||||||
expected_height = self.size
|
expected_height = self.size["shortest_edge"]
|
||||||
expected_width = int(self.size * w / h)
|
expected_width = int(self.size["shortest_edge"] * w / h)
|
||||||
else:
|
else:
|
||||||
expected_height = self.size
|
expected_height = self.size["shortest_edge"]
|
||||||
expected_width = self.size
|
expected_width = self.size["shortest_edge"]
|
||||||
|
|
||||||
else:
|
else:
|
||||||
expected_values = []
|
expected_values = []
|
||||||
@@ -260,7 +257,7 @@ class MaskFormerFeatureExtractionTest(FeatureExtractionSavingTestMixin, unittest
|
|||||||
# Initialize feature_extractors
|
# Initialize feature_extractors
|
||||||
feature_extractor_1 = self.feature_extraction_class(**self.feat_extract_dict)
|
feature_extractor_1 = self.feature_extraction_class(**self.feat_extract_dict)
|
||||||
feature_extractor_2 = self.feature_extraction_class(
|
feature_extractor_2 = self.feature_extraction_class(
|
||||||
do_resize=False, do_normalize=False, num_labels=self.feature_extract_tester.num_classes
|
do_resize=False, do_normalize=False, do_rescale=False, num_labels=self.feature_extract_tester.num_classes
|
||||||
)
|
)
|
||||||
# create random PyTorch tensors
|
# create random PyTorch tensors
|
||||||
image_inputs = prepare_image_inputs(self.feature_extract_tester, equal_resolution=False, torchify=True)
|
image_inputs = prepare_image_inputs(self.feature_extract_tester, equal_resolution=False, torchify=True)
|
||||||
@@ -283,23 +280,23 @@ class MaskFormerFeatureExtractionTest(FeatureExtractionSavingTestMixin, unittest
|
|||||||
):
|
):
|
||||||
feature_extractor = self.feature_extraction_class(**self.feat_extract_dict)
|
feature_extractor = self.feature_extraction_class(**self.feat_extract_dict)
|
||||||
# prepare image and target
|
# prepare image and target
|
||||||
batch_size = self.feature_extract_tester.batch_size
|
|
||||||
num_labels = self.feature_extract_tester.num_labels
|
num_labels = self.feature_extract_tester.num_labels
|
||||||
annotations = None
|
annotations = None
|
||||||
instance_id_to_semantic_id = None
|
instance_id_to_semantic_id = None
|
||||||
|
image_inputs = prepare_image_inputs(self.feature_extract_tester, equal_resolution=False)
|
||||||
if with_segmentation_maps:
|
if with_segmentation_maps:
|
||||||
high = num_labels
|
high = num_labels
|
||||||
if is_instance_map:
|
if is_instance_map:
|
||||||
high * 2
|
|
||||||
labels_expanded = list(range(num_labels)) * 2
|
labels_expanded = list(range(num_labels)) * 2
|
||||||
instance_id_to_semantic_id = {
|
instance_id_to_semantic_id = {
|
||||||
instance_id: label_id for instance_id, label_id in enumerate(labels_expanded)
|
instance_id: label_id for instance_id, label_id in enumerate(labels_expanded)
|
||||||
}
|
}
|
||||||
annotations = [np.random.randint(0, high, (384, 384)).astype(np.uint8) for _ in range(batch_size)]
|
annotations = [
|
||||||
|
np.random.randint(0, high * 2, (img.size[1], img.size[0])).astype(np.uint8) for img in image_inputs
|
||||||
|
]
|
||||||
if segmentation_type == "pil":
|
if segmentation_type == "pil":
|
||||||
annotations = [Image.fromarray(annotation) for annotation in annotations]
|
annotations = [Image.fromarray(annotation) for annotation in annotations]
|
||||||
|
|
||||||
image_inputs = prepare_image_inputs(self.feature_extract_tester, equal_resolution=False)
|
|
||||||
inputs = feature_extractor(
|
inputs = feature_extractor(
|
||||||
image_inputs,
|
image_inputs,
|
||||||
annotations,
|
annotations,
|
||||||
@@ -313,18 +310,18 @@ class MaskFormerFeatureExtractionTest(FeatureExtractionSavingTestMixin, unittest
|
|||||||
def test_init_without_params(self):
|
def test_init_without_params(self):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def test_with_size_divisibility(self):
|
def test_with_size_divisor(self):
|
||||||
size_divisibilities = [8, 16, 32]
|
size_divisors = [8, 16, 32]
|
||||||
weird_input_sizes = [(407, 802), (582, 1094)]
|
weird_input_sizes = [(407, 802), (582, 1094)]
|
||||||
for size_divisibility in size_divisibilities:
|
for size_divisor in size_divisors:
|
||||||
feat_extract_dict = {**self.feat_extract_dict, **{"size_divisibility": size_divisibility}}
|
feat_extract_dict = {**self.feat_extract_dict, **{"size_divisor": size_divisor}}
|
||||||
feature_extractor = self.feature_extraction_class(**feat_extract_dict)
|
feature_extractor = self.feature_extraction_class(**feat_extract_dict)
|
||||||
for weird_input_size in weird_input_sizes:
|
for weird_input_size in weird_input_sizes:
|
||||||
inputs = feature_extractor([np.ones((3, *weird_input_size))], return_tensors="pt")
|
inputs = feature_extractor([np.ones((3, *weird_input_size))], return_tensors="pt")
|
||||||
pixel_values = inputs["pixel_values"]
|
pixel_values = inputs["pixel_values"]
|
||||||
# check if divisible
|
# check if divisible
|
||||||
self.assertTrue((pixel_values.shape[-1] % size_divisibility) == 0)
|
self.assertTrue((pixel_values.shape[-1] % size_divisor) == 0)
|
||||||
self.assertTrue((pixel_values.shape[-2] % size_divisibility) == 0)
|
self.assertTrue((pixel_values.shape[-2] % size_divisor) == 0)
|
||||||
|
|
||||||
def test_call_with_segmentation_maps(self):
|
def test_call_with_segmentation_maps(self):
|
||||||
def common(is_instance_map=False, segmentation_type=None):
|
def common(is_instance_map=False, segmentation_type=None):
|
||||||
|
|||||||
@@ -43,9 +43,9 @@ class OwlViTFeatureExtractionTester(unittest.TestCase):
|
|||||||
min_resolution=30,
|
min_resolution=30,
|
||||||
max_resolution=400,
|
max_resolution=400,
|
||||||
do_resize=True,
|
do_resize=True,
|
||||||
size=20,
|
size=None,
|
||||||
do_center_crop=True,
|
do_center_crop=True,
|
||||||
crop_size=18,
|
crop_size=None,
|
||||||
do_normalize=True,
|
do_normalize=True,
|
||||||
image_mean=[0.48145466, 0.4578275, 0.40821073],
|
image_mean=[0.48145466, 0.4578275, 0.40821073],
|
||||||
image_std=[0.26862954, 0.26130258, 0.27577711],
|
image_std=[0.26862954, 0.26130258, 0.27577711],
|
||||||
@@ -58,9 +58,9 @@ class OwlViTFeatureExtractionTester(unittest.TestCase):
|
|||||||
self.min_resolution = min_resolution
|
self.min_resolution = min_resolution
|
||||||
self.max_resolution = max_resolution
|
self.max_resolution = max_resolution
|
||||||
self.do_resize = do_resize
|
self.do_resize = do_resize
|
||||||
self.size = size
|
self.size = size if size is not None else {"height": 18, "width": 18}
|
||||||
self.do_center_crop = do_center_crop
|
self.do_center_crop = do_center_crop
|
||||||
self.crop_size = crop_size
|
self.crop_size = crop_size if crop_size is not None else {"height": 18, "width": 18}
|
||||||
self.do_normalize = do_normalize
|
self.do_normalize = do_normalize
|
||||||
self.image_mean = image_mean
|
self.image_mean = image_mean
|
||||||
self.image_std = image_std
|
self.image_std = image_std
|
||||||
@@ -119,8 +119,8 @@ class OwlViTFeatureExtractionTest(FeatureExtractionSavingTestMixin, unittest.Tes
|
|||||||
(
|
(
|
||||||
1,
|
1,
|
||||||
self.feature_extract_tester.num_channels,
|
self.feature_extract_tester.num_channels,
|
||||||
self.feature_extract_tester.crop_size,
|
self.feature_extract_tester.crop_size["height"],
|
||||||
self.feature_extract_tester.crop_size,
|
self.feature_extract_tester.crop_size["width"],
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -131,8 +131,8 @@ class OwlViTFeatureExtractionTest(FeatureExtractionSavingTestMixin, unittest.Tes
|
|||||||
(
|
(
|
||||||
self.feature_extract_tester.batch_size,
|
self.feature_extract_tester.batch_size,
|
||||||
self.feature_extract_tester.num_channels,
|
self.feature_extract_tester.num_channels,
|
||||||
self.feature_extract_tester.crop_size,
|
self.feature_extract_tester.crop_size["height"],
|
||||||
self.feature_extract_tester.crop_size,
|
self.feature_extract_tester.crop_size["width"],
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -151,8 +151,8 @@ class OwlViTFeatureExtractionTest(FeatureExtractionSavingTestMixin, unittest.Tes
|
|||||||
(
|
(
|
||||||
1,
|
1,
|
||||||
self.feature_extract_tester.num_channels,
|
self.feature_extract_tester.num_channels,
|
||||||
self.feature_extract_tester.crop_size,
|
self.feature_extract_tester.crop_size["height"],
|
||||||
self.feature_extract_tester.crop_size,
|
self.feature_extract_tester.crop_size["width"],
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -163,8 +163,8 @@ class OwlViTFeatureExtractionTest(FeatureExtractionSavingTestMixin, unittest.Tes
|
|||||||
(
|
(
|
||||||
self.feature_extract_tester.batch_size,
|
self.feature_extract_tester.batch_size,
|
||||||
self.feature_extract_tester.num_channels,
|
self.feature_extract_tester.num_channels,
|
||||||
self.feature_extract_tester.crop_size,
|
self.feature_extract_tester.crop_size["height"],
|
||||||
self.feature_extract_tester.crop_size,
|
self.feature_extract_tester.crop_size["width"],
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -183,8 +183,8 @@ class OwlViTFeatureExtractionTest(FeatureExtractionSavingTestMixin, unittest.Tes
|
|||||||
(
|
(
|
||||||
1,
|
1,
|
||||||
self.feature_extract_tester.num_channels,
|
self.feature_extract_tester.num_channels,
|
||||||
self.feature_extract_tester.crop_size,
|
self.feature_extract_tester.crop_size["height"],
|
||||||
self.feature_extract_tester.crop_size,
|
self.feature_extract_tester.crop_size["width"],
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -195,7 +195,7 @@ class OwlViTFeatureExtractionTest(FeatureExtractionSavingTestMixin, unittest.Tes
|
|||||||
(
|
(
|
||||||
self.feature_extract_tester.batch_size,
|
self.feature_extract_tester.batch_size,
|
||||||
self.feature_extract_tester.num_channels,
|
self.feature_extract_tester.num_channels,
|
||||||
self.feature_extract_tester.crop_size,
|
self.feature_extract_tester.crop_size["height"],
|
||||||
self.feature_extract_tester.crop_size,
|
self.feature_extract_tester.crop_size["width"],
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -44,12 +44,16 @@ class YolosFeatureExtractionTester(unittest.TestCase):
|
|||||||
min_resolution=30,
|
min_resolution=30,
|
||||||
max_resolution=400,
|
max_resolution=400,
|
||||||
do_resize=True,
|
do_resize=True,
|
||||||
size=18,
|
size=None,
|
||||||
max_size=1333, # by setting max_size > max_resolution we're effectively not testing this :p
|
|
||||||
do_normalize=True,
|
do_normalize=True,
|
||||||
image_mean=[0.5, 0.5, 0.5],
|
image_mean=[0.5, 0.5, 0.5],
|
||||||
image_std=[0.5, 0.5, 0.5],
|
image_std=[0.5, 0.5, 0.5],
|
||||||
|
do_rescale=True,
|
||||||
|
rescale_factor=1 / 255,
|
||||||
|
do_pad=True,
|
||||||
):
|
):
|
||||||
|
# by setting size["longest_edge"] > max_resolution we're effectively not testing this :p
|
||||||
|
size = size if size is not None else {"shortest_edge": 18, "longest_edge": 1333}
|
||||||
self.parent = parent
|
self.parent = parent
|
||||||
self.batch_size = batch_size
|
self.batch_size = batch_size
|
||||||
self.num_channels = num_channels
|
self.num_channels = num_channels
|
||||||
@@ -57,19 +61,23 @@ class YolosFeatureExtractionTester(unittest.TestCase):
|
|||||||
self.max_resolution = max_resolution
|
self.max_resolution = max_resolution
|
||||||
self.do_resize = do_resize
|
self.do_resize = do_resize
|
||||||
self.size = size
|
self.size = size
|
||||||
self.max_size = max_size
|
|
||||||
self.do_normalize = do_normalize
|
self.do_normalize = do_normalize
|
||||||
self.image_mean = image_mean
|
self.image_mean = image_mean
|
||||||
self.image_std = image_std
|
self.image_std = image_std
|
||||||
|
self.do_rescale = do_rescale
|
||||||
|
self.rescale_factor = rescale_factor
|
||||||
|
self.do_pad = do_pad
|
||||||
|
|
||||||
def prepare_feat_extract_dict(self):
|
def prepare_feat_extract_dict(self):
|
||||||
return {
|
return {
|
||||||
"do_resize": self.do_resize,
|
"do_resize": self.do_resize,
|
||||||
"size": self.size,
|
"size": self.size,
|
||||||
"max_size": self.max_size,
|
|
||||||
"do_normalize": self.do_normalize,
|
"do_normalize": self.do_normalize,
|
||||||
"image_mean": self.image_mean,
|
"image_mean": self.image_mean,
|
||||||
"image_std": self.image_std,
|
"image_std": self.image_std,
|
||||||
|
"do_rescale": self.do_rescale,
|
||||||
|
"rescale_factor": self.rescale_factor,
|
||||||
|
"do_pad": self.do_pad,
|
||||||
}
|
}
|
||||||
|
|
||||||
def get_expected_values(self, image_inputs, batched=False):
|
def get_expected_values(self, image_inputs, batched=False):
|
||||||
@@ -84,14 +92,14 @@ class YolosFeatureExtractionTester(unittest.TestCase):
|
|||||||
else:
|
else:
|
||||||
h, w = image.shape[1], image.shape[2]
|
h, w = image.shape[1], image.shape[2]
|
||||||
if w < h:
|
if w < h:
|
||||||
expected_height = int(self.size * h / w)
|
expected_height = int(self.size["shortest_edge"] * h / w)
|
||||||
expected_width = self.size
|
expected_width = self.size["shortest_edge"]
|
||||||
elif w > h:
|
elif w > h:
|
||||||
expected_height = self.size
|
expected_height = self.size["shortest_edge"]
|
||||||
expected_width = int(self.size * w / h)
|
expected_width = int(self.size["shortest_edge"] * w / h)
|
||||||
else:
|
else:
|
||||||
expected_height = self.size
|
expected_height = self.size["shortest_edge"]
|
||||||
expected_width = self.size
|
expected_width = self.size["shortest_edge"]
|
||||||
|
|
||||||
else:
|
else:
|
||||||
expected_values = []
|
expected_values = []
|
||||||
@@ -124,7 +132,6 @@ class YolosFeatureExtractionTest(FeatureExtractionSavingTestMixin, unittest.Test
|
|||||||
self.assertTrue(hasattr(feature_extractor, "do_normalize"))
|
self.assertTrue(hasattr(feature_extractor, "do_normalize"))
|
||||||
self.assertTrue(hasattr(feature_extractor, "do_resize"))
|
self.assertTrue(hasattr(feature_extractor, "do_resize"))
|
||||||
self.assertTrue(hasattr(feature_extractor, "size"))
|
self.assertTrue(hasattr(feature_extractor, "size"))
|
||||||
self.assertTrue(hasattr(feature_extractor, "max_size"))
|
|
||||||
|
|
||||||
def test_batch_feature(self):
|
def test_batch_feature(self):
|
||||||
pass
|
pass
|
||||||
@@ -230,7 +237,7 @@ class YolosFeatureExtractionTest(FeatureExtractionSavingTestMixin, unittest.Test
|
|||||||
def test_equivalence_padding(self):
|
def test_equivalence_padding(self):
|
||||||
# Initialize feature_extractors
|
# Initialize feature_extractors
|
||||||
feature_extractor_1 = self.feature_extraction_class(**self.feat_extract_dict)
|
feature_extractor_1 = self.feature_extraction_class(**self.feat_extract_dict)
|
||||||
feature_extractor_2 = self.feature_extraction_class(do_resize=False, do_normalize=False)
|
feature_extractor_2 = self.feature_extraction_class(do_resize=False, do_normalize=False, do_rescale=False)
|
||||||
# create random PyTorch tensors
|
# create random PyTorch tensors
|
||||||
image_inputs = prepare_image_inputs(self.feature_extract_tester, equal_resolution=False, torchify=True)
|
image_inputs = prepare_image_inputs(self.feature_extract_tester, equal_resolution=False, torchify=True)
|
||||||
for image in image_inputs:
|
for image in image_inputs:
|
||||||
@@ -328,7 +335,7 @@ class YolosFeatureExtractionTest(FeatureExtractionSavingTestMixin, unittest.Test
|
|||||||
expected_class_labels = torch.tensor([17, 17, 63, 75, 75, 93])
|
expected_class_labels = torch.tensor([17, 17, 63, 75, 75, 93])
|
||||||
self.assertTrue(torch.allclose(encoding["labels"][0]["class_labels"], expected_class_labels))
|
self.assertTrue(torch.allclose(encoding["labels"][0]["class_labels"], expected_class_labels))
|
||||||
# verify masks
|
# verify masks
|
||||||
expected_masks_sum = 822338
|
expected_masks_sum = 822873
|
||||||
self.assertEqual(encoding["labels"][0]["masks"].sum().item(), expected_masks_sum)
|
self.assertEqual(encoding["labels"][0]["masks"].sum().item(), expected_masks_sum)
|
||||||
# verify orig_size
|
# verify orig_size
|
||||||
expected_orig_size = torch.tensor([480, 640])
|
expected_orig_size = torch.tensor([480, 640])
|
||||||
|
|||||||
Reference in New Issue
Block a user