Update examples with image processors (#21155)

* Update examples to use image processors * Small fixes * Resolve conflicts
2023-01-19 15:14:58 +00:00
parent fc8a93507c
commit 4bc18e7a83
12 changed files with 124 additions and 137 deletions
--- a/examples/pytorch/semantic-segmentation/README.md
+++ b/examples/pytorch/semantic-segmentation/README.md
@@ -40,7 +40,7 @@ from datasets import Dataset, DatasetDict, Image

 # your images can of course have a different extension
 # semantic segmentation maps are typically stored in the png format
-image_paths_train = ["path/to/image_1.jpg/jpg", "path/to/image_2.jpg/jpg", ..., "path/to/image_n.jpg/jpg"] 
+image_paths_train = ["path/to/image_1.jpg/jpg", "path/to/image_2.jpg/jpg", ..., "path/to/image_n.jpg/jpg"]
 label_paths_train = ["path/to/annotation_1.png", "path/to/annotation_2.png", ..., "path/to/annotation_n.png"]

 # same for validation
@@ -52,7 +52,7 @@ def create_dataset(image_paths, label_paths):
                                "label": sorted(label_paths)})
    dataset = dataset.cast_column("image", Image())
    dataset = dataset.cast_column("label", Image())
-    
+
    return dataset

 # step 1: create Dataset objects
@@ -91,7 +91,7 @@ You can easily upload this by clicking on "Add file" in the "Files and versions"

 ## PyTorch version, Trainer

-Based on the script [`run_semantic_segmentation.py`](https://github.com/huggingface/transformers/blob/main/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py). 
+Based on the script [`run_semantic_segmentation.py`](https://github.com/huggingface/transformers/blob/main/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py).

 The script leverages the [🤗 Trainer API](https://huggingface.co/docs/transformers/main_classes/trainer) to automatically take care of the training for you, running on distributed environments right away.

@@ -130,7 +130,7 @@ Note that you can replace the model and dataset by simply setting the `model_nam

 Based on the script [`run_semantic_segmentation_no_trainer.py`](https://github.com/huggingface/transformers/blob/main/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py).

-The script leverages [🤗 `Accelerate`](https://github.com/huggingface/accelerate), which allows to write your own training loop in PyTorch, but have it run instantly on any (distributed) environment, including CPU, multi-CPU, GPU, multi-GPU and TPU. It also supports mixed precision. 
+The script leverages [🤗 `Accelerate`](https://github.com/huggingface/accelerate), which allows you to write your own training loop in PyTorch, but have it run instantly on any (distributed) environment, including CPU, multi-CPU, GPU, multi-GPU and TPU. It also supports mixed precision.

 First, run:

@@ -161,11 +161,11 @@ The resulting model can be seen here: https://huggingface.co/nielsr/segformer-fi
 This means that after training, you can easily load your trained model as follows:

 ```python
-from transformers import AutoFeatureExtractor, AutoModelForSemanticSegmentation
+from transformers import AutoImageProcessor, AutoModelForSemanticSegmentation

 model_name = "name_of_repo_on_the_hub_or_path_to_local_folder"

-feature_extractor = AutoFeatureExtractor.from_pretrained(model_name)
+image_processor = AutoImageProcessor.from_pretrained(model_name)
 model = AutoModelForSemanticSegmentation.from_pretrained(model_name)
 ```

@@ -180,7 +180,7 @@ url = "http://images.cocodataset.org/val2017/000000039769.jpg"
 image = Image.open(requests.get(url, stream=True).raw)

 # prepare image for the model
-inputs = feature_extractor(images=image, return_tensors="pt")
+inputs = image_processor(images=image, return_tensors="pt")

 with torch.no_grad():
    outputs = model(**inputs)
@@ -201,4 +201,4 @@ For visualization of the segmentation maps, we refer to the [example notebook](h

 Some datasets, like [`scene_parse_150`](https://huggingface.co/datasets/scene_parse_150), contain a "background" label that is not part of the classes. The Scene Parse 150 dataset for instance contains labels between 0 and 150, with 0 being the background class, and 1 to 150 being actual class names (like "tree", "person", etc.). For these kinds of datasets, one replaces the background label (0) by 255, which is the `ignore_index` of the PyTorch model's loss function, and reduces all labels by 1. This way, the `labels` are PyTorch tensors containing values between 0 and 149, and 255 for all background/padding.

-In case you're training on such a dataset, make sure to set the ``reduce_labels`` flag, which will take care of this.
+In case you're training on such a dataset, make sure to set the ``reduce_labels`` flag, which will take care of this.
--- a/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py
+++ b/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py
@@ -34,7 +34,7 @@ import transformers
 from huggingface_hub import hf_hub_download
 from transformers import (
    AutoConfig,
-    AutoFeatureExtractor,
+    AutoImageProcessor,
    AutoModelForSemanticSegmentation,
    HfArgumentParser,
    Trainer,
@@ -240,7 +240,7 @@ class ModelArguments:
        default="main",
        metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
    )
-    feature_extractor_name: str = field(default=None, metadata={"help": "Name or path of preprocessor config."})
+    image_processor_name: str = field(default=None, metadata={"help": "Name or path of preprocessor config."})
    use_auth_token: bool = field(
        default=False,
        metadata={
@@ -358,7 +358,7 @@ def main():
            references=labels,
            num_labels=len(id2label),
            ignore_index=0,
-            reduce_labels=feature_extractor.do_reduce_labels,
+            reduce_labels=image_processor.do_reduce_labels,
        )
        # add per category metrics as individual key-value pairs
        per_category_accuracy = metrics.pop("per_category_accuracy").tolist()
@@ -385,8 +385,8 @@ def main():
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
-    feature_extractor = AutoFeatureExtractor.from_pretrained(
-        model_args.feature_extractor_name or model_args.model_name_or_path,
+    image_processor = AutoImageProcessor.from_pretrained(
+        model_args.image_processor_name or model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
@@ -395,11 +395,11 @@ def main():
    # Define torchvision transforms to be applied to each image + target.
    # Not that straightforward in torchvision: https://github.com/pytorch/vision/issues/9
    # Currently based on official torchvision references: https://github.com/pytorch/vision/blob/main/references/segmentation/transforms.py
-    if "shortest_edge" in feature_extractor.size:
+    if "shortest_edge" in image_processor.size:
        # We instead set the target size to (shortest_edge, shortest_edge) here to ensure all images are batchable.
-        size = (feature_extractor.size["shortest_edge"], feature_extractor.size["shortest_edge"])
+        size = (image_processor.size["shortest_edge"], image_processor.size["shortest_edge"])
    else:
-        size = (feature_extractor.size["height"], feature_extractor.size["width"])
+        size = (image_processor.size["height"], image_processor.size["width"])
    train_transforms = Compose(
        [
            ReduceLabels() if data_args.reduce_labels else Identity(),
@@ -407,7 +407,7 @@ def main():
            RandomHorizontalFlip(flip_prob=0.5),
            PILToTensor(),
            ConvertImageDtype(torch.float),
-            Normalize(mean=feature_extractor.image_mean, std=feature_extractor.image_std),
+            Normalize(mean=image_processor.image_mean, std=image_processor.image_std),
        ]
    )
    # Define torchvision transform to be applied to each image.
@@ -418,7 +418,7 @@ def main():
            Resize(size=size),
            PILToTensor(),
            ConvertImageDtype(torch.float),
-            Normalize(mean=feature_extractor.image_mean, std=feature_extractor.image_std),
+            Normalize(mean=image_processor.image_mean, std=image_processor.image_std),
        ]
    )

@@ -477,7 +477,7 @@ def main():
        train_dataset=dataset["train"] if training_args.do_train else None,
        eval_dataset=dataset["validation"] if training_args.do_eval else None,
        compute_metrics=compute_metrics,
-        tokenizer=feature_extractor,
+        tokenizer=image_processor,
        data_collator=default_data_collator,
    )

--- a/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py
+++ b/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py
@@ -39,7 +39,7 @@ from accelerate.utils import set_seed
 from huggingface_hub import Repository, create_repo, hf_hub_download
 from transformers import (
    AutoConfig,
-    AutoFeatureExtractor,
+    AutoImageProcessor,
    AutoModelForSemanticSegmentation,
    SchedulerType,
    default_data_collator,
@@ -397,20 +397,20 @@ def main():
    id2label = {int(k): v for k, v in id2label.items()}
    label2id = {v: k for k, v in id2label.items()}

-    # Load pretrained model and feature extractor
+    # Load pretrained model and image processor
    config = AutoConfig.from_pretrained(args.model_name_or_path, id2label=id2label, label2id=label2id)
-    feature_extractor = AutoFeatureExtractor.from_pretrained(args.model_name_or_path)
+    image_processor = AutoImageProcessor.from_pretrained(args.model_name_or_path)
    model = AutoModelForSemanticSegmentation.from_pretrained(args.model_name_or_path, config=config)

    # Preprocessing the datasets
    # Define torchvision transforms to be applied to each image + target.
    # Not that straightforward in torchvision: https://github.com/pytorch/vision/issues/9
    # Currently based on official torchvision references: https://github.com/pytorch/vision/blob/main/references/segmentation/transforms.py
-    if "shortest_edge" in feature_extractor.size:
+    if "shortest_edge" in image_processor.size:
        # We instead set the target size to (shortest_edge, shortest_edge) here to ensure all images are batchable.
-        size = (feature_extractor.size["shortest_edge"], feature_extractor.size["shortest_edge"])
+        size = (image_processor.size["shortest_edge"], image_processor.size["shortest_edge"])
    else:
-        size = (feature_extractor.size["height"], feature_extractor.size["width"])
+        size = (image_processor.size["height"], image_processor.size["width"])
    train_transforms = Compose(
        [
            ReduceLabels() if args.reduce_labels else Identity(),
@@ -418,7 +418,7 @@ def main():
            RandomHorizontalFlip(flip_prob=0.5),
            PILToTensor(),
            ConvertImageDtype(torch.float),
-            Normalize(mean=feature_extractor.image_mean, std=feature_extractor.image_std),
+            Normalize(mean=image_processor.image_mean, std=image_processor.image_std),
        ]
    )
    # Define torchvision transform to be applied to each image.
@@ -429,7 +429,7 @@ def main():
            Resize(size=size),
            PILToTensor(),
            ConvertImageDtype(torch.float),
-            Normalize(mean=feature_extractor.image_mean, std=feature_extractor.image_std),
+            Normalize(mean=image_processor.image_mean, std=image_processor.image_std),
        ]
    )

@@ -602,7 +602,7 @@ def main():
                            save_function=accelerator.save,
                        )
                        if accelerator.is_main_process:
-                            feature_extractor.save_pretrained(args.output_dir)
+                            image_processor.save_pretrained(args.output_dir)
                            repo.push_to_hub(
                                commit_message=f"Training in progress {completed_steps} steps",
                                blocking=False,
@@ -657,7 +657,7 @@ def main():
                args.output_dir, is_main_process=accelerator.is_main_process, save_function=accelerator.save
            )
            if accelerator.is_main_process:
-                feature_extractor.save_pretrained(args.output_dir)
+                image_processor.save_pretrained(args.output_dir)
                repo.push_to_hub(
                    commit_message=f"Training in progress epoch {epoch}", blocking=False, auto_lfs_prune=True
                )
@@ -678,7 +678,7 @@ def main():
            args.output_dir, is_main_process=accelerator.is_main_process, save_function=accelerator.save
        )
        if accelerator.is_main_process:
-            feature_extractor.save_pretrained(args.output_dir)
+            image_processor.save_pretrained(args.output_dir)
            if args.push_to_hub:
                repo.push_to_hub(commit_message="End of training", auto_lfs_prune=True)