Update object detection with latest resize and pad strategies (#30955)
* Update with new resizing and pad strategy * Return pixel mask param * Update inference in guide * Fix empty compose * Update guide
This commit is contained in:
committed by
GitHub
parent
a25f7d3c12
commit
15585b81a5
@@ -206,10 +206,10 @@ Instantiate the image processor from the same checkpoint as the model you want t
|
|||||||
|
|
||||||
>>> image_processor = AutoImageProcessor.from_pretrained(
|
>>> image_processor = AutoImageProcessor.from_pretrained(
|
||||||
... MODEL_NAME,
|
... MODEL_NAME,
|
||||||
... # At this moment we recommend using external transform to pad and resize images.
|
... do_resize=True,
|
||||||
... # It`s faster and yields better results for object-detection models.
|
... size={"max_height": MAX_SIZE, "max_width": MAX_SIZE},
|
||||||
... do_pad=False,
|
... do_pad=True,
|
||||||
... do_resize=False,
|
... pad_size={"height": MAX_SIZE, "width": MAX_SIZE},
|
||||||
... )
|
... )
|
||||||
```
|
```
|
||||||
|
|
||||||
@@ -217,38 +217,28 @@ Before passing the images to the `image_processor`, apply two preprocessing tran
|
|||||||
- Augmenting images
|
- Augmenting images
|
||||||
- Reformatting annotations to meet DETR expectations
|
- Reformatting annotations to meet DETR expectations
|
||||||
|
|
||||||
First, to make sure the model does not overfit on the training data, you can apply image augmentation with any data augmentation library. Here we use [Albumentations](https://albumentations.ai/docs/) ...
|
First, to make sure the model does not overfit on the training data, you can apply image augmentation with any data augmentation library. Here we use [Albumentations](https://albumentations.ai/docs/).
|
||||||
This library ensures that transformations affect the image and update the bounding boxes accordingly.
|
This library ensures that transformations affect the image and update the bounding boxes accordingly.
|
||||||
The 🤗 Datasets library documentation has a detailed [guide on how to augment images for object detection](https://huggingface.co/docs/datasets/object_detection),
|
The 🤗 Datasets library documentation has a detailed [guide on how to augment images for object detection](https://huggingface.co/docs/datasets/object_detection),
|
||||||
and it uses the exact same dataset as an example. Apply the same approach here, resize each image to (480, 480),
|
and it uses the exact same dataset as an example. Apply some geometric and color transformations to the image. For additional augmentation options, explore the [Albumentations Demo Space](https://huggingface.co/spaces/qubvel-hf/albumentations-demo).
|
||||||
flip it horizontally, and brighten it. For additional augmentation options, explore the [Albumentations Demo Space](https://huggingface.co/spaces/qubvel-hf/albumentations-demo).
|
|
||||||
|
|
||||||
```py
|
```py
|
||||||
>>> import albumentations as A
|
>>> import albumentations as A
|
||||||
|
|
||||||
>>> max_size = IMAGE_SIZE
|
>>> max_size = IMAGE_SIZE
|
||||||
|
|
||||||
>>> # Resize image longest edge to 480 and then pad image to square 480x480.
|
|
||||||
>>> # This padding and resizing strategy give better results, see
|
|
||||||
>>> # https://github.com/huggingface/transformers/pull/30422#discussion_r1584647408
|
|
||||||
>>> basic_transforms = [
|
|
||||||
... A.LongestMaxSize(max_size=max_size),
|
|
||||||
... A.PadIfNeeded(max_size, max_size, border_mode=0, value=(128, 128, 128), position="top_left"),
|
|
||||||
... ]
|
|
||||||
|
|
||||||
>>> train_augment_and_transform = A.Compose(
|
>>> train_augment_and_transform = A.Compose(
|
||||||
... [
|
... [
|
||||||
... A.Perspective(p=0.1),
|
... A.Perspective(p=0.1),
|
||||||
... A.HorizontalFlip(p=0.5),
|
... A.HorizontalFlip(p=0.5),
|
||||||
... A.RandomBrightnessContrast(p=0.5),
|
... A.RandomBrightnessContrast(p=0.5),
|
||||||
... A.HueSaturationValue(p=0.1),
|
... A.HueSaturationValue(p=0.1),
|
||||||
... *basic_transforms,
|
|
||||||
... ],
|
... ],
|
||||||
... bbox_params=A.BboxParams(format="coco", label_fields=["category"], clip=True, min_area=25),
|
... bbox_params=A.BboxParams(format="coco", label_fields=["category"], clip=True, min_area=25),
|
||||||
... )
|
... )
|
||||||
|
|
||||||
>>> validation_transform = A.Compose(
|
>>> validation_transform = A.Compose(
|
||||||
... basic_transforms,
|
... [A.NoOp()],
|
||||||
... bbox_params=A.BboxParams(format="coco", label_fields=["category"], clip=True),
|
... bbox_params=A.BboxParams(format="coco", label_fields=["category"], clip=True),
|
||||||
... )
|
... )
|
||||||
```
|
```
|
||||||
@@ -294,7 +284,7 @@ The `image_processor` expects the annotations to be in the following format: `{'
|
|||||||
Now you can combine the image and annotation transformations to use on a batch of examples:
|
Now you can combine the image and annotation transformations to use on a batch of examples:
|
||||||
|
|
||||||
```py
|
```py
|
||||||
>>> def augment_and_transform_batch(examples, transform, image_processor):
|
>>> def augment_and_transform_batch(examples, transform, image_processor, return_pixel_mask=False):
|
||||||
... """Apply augmentations and format annotations in COCO format for object detection task"""
|
... """Apply augmentations and format annotations in COCO format for object detection task"""
|
||||||
|
|
||||||
... images = []
|
... images = []
|
||||||
@@ -315,6 +305,9 @@ Now you can combine the image and annotation transformations to use on a batch o
|
|||||||
... # Apply the image processor transformations: resizing, rescaling, normalization
|
... # Apply the image processor transformations: resizing, rescaling, normalization
|
||||||
... result = image_processor(images=images, annotations=annotations, return_tensors="pt")
|
... result = image_processor(images=images, annotations=annotations, return_tensors="pt")
|
||||||
|
|
||||||
|
... if not return_pixel_mask:
|
||||||
|
... result.pop("pixel_mask", None)
|
||||||
|
|
||||||
... return result
|
... return result
|
||||||
```
|
```
|
||||||
|
|
||||||
@@ -1485,25 +1478,12 @@ Now that you have finetuned a model, evaluated it, and uploaded it to the Huggin
|
|||||||
```py
|
```py
|
||||||
>>> import torch
|
>>> import torch
|
||||||
>>> import requests
|
>>> import requests
|
||||||
>>> import numpy as np
|
|
||||||
>>> import albumentations as A
|
|
||||||
|
|
||||||
>>> from PIL import Image
|
>>> from PIL import Image, ImageDraw
|
||||||
>>> from transformers import AutoImageProcessor, AutoModelForObjectDetection
|
>>> from transformers import AutoImageProcessor, AutoModelForObjectDetection
|
||||||
|
|
||||||
>>> url = "https://images.pexels.com/photos/8413299/pexels-photo-8413299.jpeg?auto=compress&cs=tinysrgb&w=630&h=375&dpr=2"
|
>>> url = "https://images.pexels.com/photos/8413299/pexels-photo-8413299.jpeg?auto=compress&cs=tinysrgb&w=630&h=375&dpr=2"
|
||||||
>>> image = Image.open(requests.get(url, stream=True).raw)
|
>>> image = Image.open(requests.get(url, stream=True).raw)
|
||||||
|
|
||||||
>>> # Define transformations for inference
|
|
||||||
>>> resize_and_pad = A.Compose([
|
|
||||||
... A.LongestMaxSize(max_size=max_size),
|
|
||||||
... A.PadIfNeeded(max_size, max_size, border_mode=0, value=(128, 128, 128), position="top_left"),
|
|
||||||
... ])
|
|
||||||
|
|
||||||
>>> # This one is for visualization with no padding
|
|
||||||
>>> resize_only = A.Compose([
|
|
||||||
... A.LongestMaxSize(max_size=max_size),
|
|
||||||
... ])
|
|
||||||
```
|
```
|
||||||
|
|
||||||
Load the model and image processor from the Hugging Face Hub (skip this step to use the ones already trained in this session):
|
Load the model and image processor from the Hugging Face Hub (skip this step to use the ones already trained in this session):
|
||||||
@@ -1519,12 +1499,11 @@ Load model and image processor from the Hugging Face Hub (skip to use already tr
|
|||||||
And detect bounding boxes:
|
And detect bounding boxes:
|
||||||
|
|
||||||
```py
|
```py
|
||||||
>>> np_preprocessed_image = resize_and_pad(image=np.array(image))["image"]
|
|
||||||
|
|
||||||
>>> with torch.no_grad():
|
>>> with torch.no_grad():
|
||||||
... inputs = image_processor(images=[np_preprocessed_image], return_tensors="pt")
|
... inputs = image_processor(images=[image], return_tensors="pt")
|
||||||
... outputs = model(inputs["pixel_values"].to(device))
|
... outputs = model(**inputs.to(device))
|
||||||
... target_sizes = torch.tensor([np_preprocessed_image.shape[:2]])
|
... target_sizes = torch.tensor([[image.size[1], image.size[0]]])
|
||||||
... results = image_processor.post_process_object_detection(outputs, threshold=0.3, target_sizes=target_sizes)[0]
|
... results = image_processor.post_process_object_detection(outputs, threshold=0.3, target_sizes=target_sizes)[0]
|
||||||
|
|
||||||
>>> for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
|
>>> for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
|
||||||
@@ -1543,9 +1522,7 @@ Detected Coverall with confidence 0.391 at location [68.61, 126.66, 309.03, 318.
|
|||||||
Let's plot the result:
|
Let's plot the result:
|
||||||
|
|
||||||
```py
|
```py
|
||||||
>>> resized_image = resize_only(image=np.array(image))["image"]
|
>>> draw = ImageDraw.Draw(image)
|
||||||
>>> resized_image = Image.fromarray(resized_image)
|
|
||||||
>>> draw = ImageDraw.Draw(resized_image)
|
|
||||||
|
|
||||||
>>> for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
|
>>> for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
|
||||||
... box = [round(i, 2) for i in box.tolist()]
|
... box = [round(i, 2) for i in box.tolist()]
|
||||||
@@ -1553,7 +1530,7 @@ Let's plot the result:
|
|||||||
... draw.rectangle((x, y, x2, y2), outline="red", width=1)
|
... draw.rectangle((x, y, x2, y2), outline="red", width=1)
|
||||||
... draw.text((x, y), model.config.id2label[label.item()], fill="white")
|
... draw.text((x, y), model.config.id2label[label.item()], fill="white")
|
||||||
|
|
||||||
>>> resized_image
|
>>> image
|
||||||
```
|
```
|
||||||
|
|
||||||
<div class="flex justify-center">
|
<div class="flex justify-center">
|
||||||
|
|||||||
@@ -200,6 +200,7 @@ Where `metadata.jsonl` is a file with the following structure:
|
|||||||
{"file_name": "0002.jpg", "objects": {"bbox": [[810.0, 100.0, 57.0, 28.0]], "categories": [1], "id": [2], "area": [40.0]}}
|
{"file_name": "0002.jpg", "objects": {"bbox": [[810.0, 100.0, 57.0, 28.0]], "categories": [1], "id": [2], "area": [40.0]}}
|
||||||
...
|
...
|
||||||
```
|
```
|
||||||
|
The training script supports bounding boxes in COCO format (x_min, y_min, width, height).
|
||||||
|
|
||||||
Then, you can load the dataset with just a few lines of code:
|
Then, you can load the dataset with just a few lines of code:
|
||||||
|
|
||||||
|
|||||||
@@ -117,7 +117,10 @@ def convert_bbox_yolo_to_pascal(boxes: torch.Tensor, image_size: Tuple[int, int]
|
|||||||
|
|
||||||
|
|
||||||
def augment_and_transform_batch(
|
def augment_and_transform_batch(
|
||||||
examples: Mapping[str, Any], transform: A.Compose, image_processor: AutoImageProcessor
|
examples: Mapping[str, Any],
|
||||||
|
transform: A.Compose,
|
||||||
|
image_processor: AutoImageProcessor,
|
||||||
|
return_pixel_mask: bool = False,
|
||||||
) -> BatchFeature:
|
) -> BatchFeature:
|
||||||
"""Apply augmentations and format annotations in COCO format for object detection task"""
|
"""Apply augmentations and format annotations in COCO format for object detection task"""
|
||||||
|
|
||||||
@@ -139,6 +142,9 @@ def augment_and_transform_batch(
|
|||||||
# Apply the image processor transformations: resizing, rescaling, normalization
|
# Apply the image processor transformations: resizing, rescaling, normalization
|
||||||
result = image_processor(images=images, annotations=annotations, return_tensors="pt")
|
result = image_processor(images=images, annotations=annotations, return_tensors="pt")
|
||||||
|
|
||||||
|
if not return_pixel_mask:
|
||||||
|
result.pop("pixel_mask", None)
|
||||||
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
||||||
@@ -415,12 +421,10 @@ def main():
|
|||||||
)
|
)
|
||||||
image_processor = AutoImageProcessor.from_pretrained(
|
image_processor = AutoImageProcessor.from_pretrained(
|
||||||
model_args.image_processor_name or model_args.model_name_or_path,
|
model_args.image_processor_name or model_args.model_name_or_path,
|
||||||
# At this moment we recommend using external transform to pad and resize images.
|
do_resize=True,
|
||||||
# It`s faster and yields much better results for object-detection models.
|
size={"max_height": data_args.image_square_size, "max_width": data_args.image_square_size},
|
||||||
do_pad=False,
|
do_pad=True,
|
||||||
do_resize=False,
|
pad_size={"height": data_args.image_square_size, "width": data_args.image_square_size},
|
||||||
# We will save image size parameter in config just for reference
|
|
||||||
size={"longest_edge": data_args.image_square_size},
|
|
||||||
**common_pretrained_args,
|
**common_pretrained_args,
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -428,10 +432,6 @@ def main():
|
|||||||
# Define image augmentations and dataset transforms
|
# Define image augmentations and dataset transforms
|
||||||
# ------------------------------------------------------------------------------------------------
|
# ------------------------------------------------------------------------------------------------
|
||||||
max_size = data_args.image_square_size
|
max_size = data_args.image_square_size
|
||||||
basic_transforms = [
|
|
||||||
A.LongestMaxSize(max_size=max_size),
|
|
||||||
A.PadIfNeeded(max_size, max_size, border_mode=0, value=(128, 128, 128), position="top_left"),
|
|
||||||
]
|
|
||||||
train_augment_and_transform = A.Compose(
|
train_augment_and_transform = A.Compose(
|
||||||
[
|
[
|
||||||
A.Compose(
|
A.Compose(
|
||||||
@@ -453,12 +453,11 @@ def main():
|
|||||||
A.HorizontalFlip(p=0.5),
|
A.HorizontalFlip(p=0.5),
|
||||||
A.RandomBrightnessContrast(p=0.5),
|
A.RandomBrightnessContrast(p=0.5),
|
||||||
A.HueSaturationValue(p=0.1),
|
A.HueSaturationValue(p=0.1),
|
||||||
*basic_transforms,
|
|
||||||
],
|
],
|
||||||
bbox_params=A.BboxParams(format="coco", label_fields=["category"], clip=True, min_area=25),
|
bbox_params=A.BboxParams(format="coco", label_fields=["category"], clip=True, min_area=25),
|
||||||
)
|
)
|
||||||
validation_transform = A.Compose(
|
validation_transform = A.Compose(
|
||||||
basic_transforms,
|
[A.NoOp()],
|
||||||
bbox_params=A.BboxParams(format="coco", label_fields=["category"], clip=True),
|
bbox_params=A.BboxParams(format="coco", label_fields=["category"], clip=True),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -120,7 +120,10 @@ def convert_bbox_yolo_to_pascal(boxes: torch.Tensor, image_size: Tuple[int, int]
|
|||||||
|
|
||||||
# Copied from examples/pytorch/object-detection/run_object_detection.augment_and_transform_batch
|
# Copied from examples/pytorch/object-detection/run_object_detection.augment_and_transform_batch
|
||||||
def augment_and_transform_batch(
|
def augment_and_transform_batch(
|
||||||
examples: Mapping[str, Any], transform: A.Compose, image_processor: AutoImageProcessor
|
examples: Mapping[str, Any],
|
||||||
|
transform: A.Compose,
|
||||||
|
image_processor: AutoImageProcessor,
|
||||||
|
return_pixel_mask: bool = False,
|
||||||
) -> BatchFeature:
|
) -> BatchFeature:
|
||||||
"""Apply augmentations and format annotations in COCO format for object detection task"""
|
"""Apply augmentations and format annotations in COCO format for object detection task"""
|
||||||
|
|
||||||
@@ -142,6 +145,9 @@ def augment_and_transform_batch(
|
|||||||
# Apply the image processor transformations: resizing, rescaling, normalization
|
# Apply the image processor transformations: resizing, rescaling, normalization
|
||||||
result = image_processor(images=images, annotations=annotations, return_tensors="pt")
|
result = image_processor(images=images, annotations=annotations, return_tensors="pt")
|
||||||
|
|
||||||
|
if not return_pixel_mask:
|
||||||
|
result.pop("pixel_mask", None)
|
||||||
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
||||||
@@ -473,12 +479,10 @@ def main():
|
|||||||
)
|
)
|
||||||
image_processor = AutoImageProcessor.from_pretrained(
|
image_processor = AutoImageProcessor.from_pretrained(
|
||||||
args.model_name_or_path,
|
args.model_name_or_path,
|
||||||
# At this moment we recommend using external transform to pad and resize images.
|
do_resize=True,
|
||||||
# It`s faster and yields much better results for object-detection models.
|
size={"max_height": args.image_square_size, "max_width": args.image_square_size},
|
||||||
do_pad=False,
|
do_pad=True,
|
||||||
do_resize=False,
|
pad_size={"height": args.image_square_size, "width": args.image_square_size},
|
||||||
# We will save image size parameter in config just for reference
|
|
||||||
size={"longest_edge": args.image_square_size},
|
|
||||||
**common_pretrained_args,
|
**common_pretrained_args,
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -486,10 +490,6 @@ def main():
|
|||||||
# Define image augmentations and dataset transforms
|
# Define image augmentations and dataset transforms
|
||||||
# ------------------------------------------------------------------------------------------------
|
# ------------------------------------------------------------------------------------------------
|
||||||
max_size = args.image_square_size
|
max_size = args.image_square_size
|
||||||
basic_transforms = [
|
|
||||||
A.LongestMaxSize(max_size=max_size),
|
|
||||||
A.PadIfNeeded(max_size, max_size, border_mode=0, value=(128, 128, 128), position="top_left"),
|
|
||||||
]
|
|
||||||
train_augment_and_transform = A.Compose(
|
train_augment_and_transform = A.Compose(
|
||||||
[
|
[
|
||||||
A.Compose(
|
A.Compose(
|
||||||
@@ -511,12 +511,11 @@ def main():
|
|||||||
A.HorizontalFlip(p=0.5),
|
A.HorizontalFlip(p=0.5),
|
||||||
A.RandomBrightnessContrast(p=0.5),
|
A.RandomBrightnessContrast(p=0.5),
|
||||||
A.HueSaturationValue(p=0.1),
|
A.HueSaturationValue(p=0.1),
|
||||||
*basic_transforms,
|
|
||||||
],
|
],
|
||||||
bbox_params=A.BboxParams(format="coco", label_fields=["category"], clip=True, min_area=25),
|
bbox_params=A.BboxParams(format="coco", label_fields=["category"], clip=True, min_area=25),
|
||||||
)
|
)
|
||||||
validation_transform = A.Compose(
|
validation_transform = A.Compose(
|
||||||
basic_transforms,
|
[A.NoOp()],
|
||||||
bbox_params=A.BboxParams(format="coco", label_fields=["category"], clip=True),
|
bbox_params=A.BboxParams(format="coco", label_fields=["category"], clip=True),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user